# Setup

In [None]:
!pip install torchinfo -Uq

In [None]:
import torch
from torchinfo import summary

# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using Device : {device}")

Using Device : cpu


In [None]:
import pandas as pd

# Data download: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques
direc = "./"
train_data = pd.read_csv(direc + "train.csv")
test_data = pd.read_csv(direc + "test.csv")

# Inspect the data.
# DataFrame.info() writes its report to stdout itself and returns None, so it must not be
# wrapped in print() (doing so appends a stray "None" line to the cell output).
train_data.info()
print(train_data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Pre-processing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np

# Column selection: every column except the target and the row id is a feature.
# The target is transformed with log1p.
X = train_data.drop(["SalePrice", "Id"], axis=1)
y = np.log1p(train_data["SalePrice"])

# Separate numeric (int64 / float64) columns from categorical (object) columns.
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

# Numeric preprocessing pipeline: fill missing values with the median, then standardise.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical preprocessing pipeline: fill missing values with the most frequent value,
# then one-hot encode; categories not seen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Combine both pipelines, each applied to its own group of columns.
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
])

# Fit the preprocessing on the features and transform them.
X_preprocessed = preprocessor.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, TensorDataset

# Train/validation split.
# The raw rows are split first and the preprocessor is then (re-)fitted on the training
# rows only, so imputation values, scaling statistics and one-hot categories are not
# learned from the validation rows. A fixed random_state makes the split repeatable.
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train = preprocessor.fit_transform(X_train_raw)
X_valid = preprocessor.transform(X_valid_raw)


def _to_dense(matrix):
    """Return `matrix` as a dense array, whether it is sparse or already dense."""
    # ColumnTransformer may return either a sparse matrix or an ndarray depending on
    # the density of the result, so `.toarray()` cannot be called unconditionally.
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


# Tensor conversion; targets are reshaped to a column to match the model's (N, 1) output.
X_train_tensor = torch.tensor(_to_dense(X_train), dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32, device=device).view(-1, 1)
X_valid_tensor = torch.tensor(_to_dense(X_valid), dtype=torch.float32, device=device)
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32, device=device).view(-1, 1)

# DataLoader creation
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)

# The tensors were created directly on `device`, so loading must stay in the main
# process: handing CUDA tensors to DataLoader worker processes is not supported reliably.
workers = 0
train_loader = DataLoader(train_dataset, batch_size=32, num_workers=workers, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, num_workers=workers)

# Model definition

In [None]:
import torch.nn as nn
from torchinfo import summary

# Model parameter: the input width equals the number of columns of the training tensor.
input_dim = X_train_tensor.shape[1]

# Model initialisation: three hidden blocks (Linear -> BatchNorm -> ReLU -> Dropout)
# of widths 128, 64 and 32, followed by a single-unit output layer.
model = nn.Sequential(
    # Hidden block 1
    nn.Linear(in_features=input_dim, out_features=128),
    nn.BatchNorm1d(num_features=128),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    # Hidden block 2
    nn.Linear(in_features=128, out_features=64),
    nn.BatchNorm1d(num_features=64),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    # Hidden block 3
    nn.Linear(in_features=64, out_features=32),
    nn.BatchNorm1d(num_features=32),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    # Output layer
    nn.Linear(in_features=32, out_features=1),
)
model = model.to(device)

summary(model)

Layer (type:depth-idx)                   Param #
Sequential                               --
├─Linear: 1-1                            36,864
├─BatchNorm1d: 1-2                       256
├─ReLU: 1-3                              --
├─Dropout: 1-4                           --
├─Linear: 1-5                            8,256
├─BatchNorm1d: 1-6                       128
├─ReLU: 1-7                              --
├─Dropout: 1-8                           --
├─Linear: 1-9                            2,080
├─BatchNorm1d: 1-10                      64
├─ReLU: 1-11                             --
├─Dropout: 1-12                          --
├─Linear: 1-13                           33
Total params: 47,681
Trainable params: 47,681
Non-trainable params: 0

In [None]:
# Loss: mean squared error between the model output and the target.
criterion = nn.MSELoss()
# Optimiser: Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Training

In [None]:
def train_model(model, train_loader, valid_loader, criterion, optimizer, epochs=100, log_every=100):
    """Train `model` and periodically print the training and validation loss.

    Args:
        model: Module to optimise; switched between train and eval mode as needed.
        train_loader: Iterable of `(inputs, targets)` batches used for optimisation.
        valid_loader: Iterable of `(inputs, targets)` batches used for evaluation only.
        criterion: Loss callable `criterion(prediction, target)`. The reported values
            assume it returns the mean loss of the batch (as `nn.MSELoss()` does).
        optimizer: Optimiser bound to the parameters of `model`.
        epochs: Number of passes over `train_loader`.
        log_every: Report progress every `log_every` epochs (starting with the first).
            The final epoch is always reported as well.

    Returns:
        None. Progress is written to stdout.

    Raises:
        ValueError: If `log_every` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    def _per_sample_mean(total, count):
        # An empty loader has no meaningful loss; report NaN instead of dividing by zero.
        return total / count if count else float("nan")

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss, train_count = 0.0, 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            # Weight each batch mean by its size so that a shorter last batch does not
            # count as much as a full one in the epoch average.
            batch_size = len(y_batch)
            train_loss += loss.item() * batch_size
            train_count += batch_size

        # Progress report: on the logging interval and always on the last epoch, so the
        # caller does not need to pick an epoch count that happens to hit the interval.
        if epoch % log_every == 0 or epoch == epochs - 1:
            # Validation phase (no gradient tracking, eval-mode layers)
            model.eval()
            valid_loss, valid_count = 0.0, 0
            with torch.no_grad():
                for X_batch, y_batch in valid_loader:
                    y_pred = model(X_batch)
                    loss = criterion(y_pred, y_batch)
                    batch_size = len(y_batch)
                    valid_loss += loss.item() * batch_size
                    valid_count += batch_size

            print(f"Epoch {epoch+1:4d}/{epochs:4d}, Train Loss: {_per_sample_mean(train_loss, train_count):.4f}, Valid Loss: {_per_sample_mean(valid_loss, valid_count):.4f}")

In [None]:
# Train the model for 901 epochs.
train_model(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=901,
)

Epoch    1/ 901, Train Loss: 139.4152, Valid Loss: 138.8673
Epoch  101/ 901, Train Loss: 1.8142, Valid Loss: 0.1048
Epoch  201/ 901, Train Loss: 1.3556, Valid Loss: 0.1008
Epoch  301/ 901, Train Loss: 0.9942, Valid Loss: 0.0505
Epoch  401/ 901, Train Loss: 0.6002, Valid Loss: 0.0377
Epoch  501/ 901, Train Loss: 0.3888, Valid Loss: 0.0316
Epoch  601/ 901, Train Loss: 0.2112, Valid Loss: 0.0278
Epoch  701/ 901, Train Loss: 0.1414, Valid Loss: 0.0224
Epoch  801/ 901, Train Loss: 0.0882, Valid Loss: 0.0189
Epoch  901/ 901, Train Loss: 0.0562, Valid Loss: 0.0176


# Testing

In [None]:
# Test-specific names are used so that the training-time `X` / `X_preprocessed` are not
# overwritten; earlier cells can then be re-run without silently picking up test data.
X_test = test_data.drop(columns=["Id"])
X_test_preprocessed = preprocessor.transform(X_test)
# The transformer output may be sparse or dense; only call .toarray() when it exists.
X_test_dense = (
    X_test_preprocessed.toarray()
    if hasattr(X_test_preprocessed, "toarray")
    else np.asarray(X_test_preprocessed)
)
X_test_tensor = torch.tensor(X_test_dense, dtype=torch.float32, device=device)

In [None]:
# Predict on the test set in eval mode and without gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)

# The target was transformed with np.log1p, so its exact inverse is np.expm1 (not np.exp).
# .cpu() is required before .numpy() when the model runs on a GPU; ravel() turns the
# (N, 1) output into the 1-D column expected by the DataFrame.
predicted_prices = np.expm1(outputs.cpu().numpy().ravel())

# The prediction column carries the name of the target column, "SalePrice".
df = pd.DataFrame({"Id": test_data["Id"].to_numpy(), "SalePrice": predicted_prices})
df

Unnamed: 0,Id,SalesPrice
0,1461,130380.398438
1,1462,164131.750000
2,1463,181121.968750
3,1464,196358.890625
4,1465,184740.796875
...,...,...
1454,2915,87800.460938
1455,2916,86715.093750
1456,2917,156362.796875
1457,2918,132525.953125


In [None]:
# Write the prediction table to sub.csv without the DataFrame index column.
df.to_csv(path_or_buf="sub.csv", index=False)