In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm # not sure how much of a performance hit this is since its python while everything else is c
import numpy as np

# --- Input files (paths are relative to the notebook's working directory) ---
DATA_PATH_TRAIN = 'data/train.csv'
DATA_PATH_TEST = 'data/test.csv'

# --- Training settings; presumably consumed by the training loop later in the
# notebook (not visible in this section) ---
BATCH_SIZE = 3_000
LEARNING_RATE = 1e-3
EPOCHS = 3_000

# Everything is pinned to the CPU.
DEVICE = torch.device("cpu")

In [22]:
# Load both CSV files in one pass: training frame first, test frame second.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (DATA_PATH_TRAIN, DATA_PATH_TEST)
)

# Pull out the regression target and the test-row identifiers before they are
# removed from the feature frames.
train_price, test_ids = train_df['SalePrice'], test_df['Id']

# Feature frames: no identifier column, and no target column for training.
train_df = train_df.drop(columns=['Id', 'SalePrice'])
test_df = test_df.drop(columns=['Id'])

# Preprocessing function
def preprocessdata(data, train_cols=None):
    """Impute missing values and one-hot encode a feature frame.

    The input frame is never modified; all work happens on a copy.

    Steps, in order:
      1. Drop every column with fewer than ``int(0.85 * len(data))`` non-null
         entries.
      2. Fill gaps in non-object columns with that column's median and gaps in
         object columns with the literal string ``'Unknown'``.
      3. One-hot encode the object columns with ``pd.get_dummies``.
      4. If ``train_cols`` is given, align the result to exactly those columns
         in that order: columns absent from the encoded frame are added and
         filled with 0, columns not listed in ``train_cols`` are discarded.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw feature frame.
    train_cols : list-like of column labels, optional
        Column layout to align to (e.g. the ``.columns`` of a previously
        processed frame). When ``None`` the encoded frame is returned as is.

    Returns
    -------
    pandas.DataFrame
        A new, imputed and one-hot encoded frame.

    Notes
    -----
    NOTE(review): the sparsity threshold and the medians are computed from
    ``data`` itself, even when ``train_cols`` is supplied. A column that is
    sparse only in the second frame is therefore dropped and comes back as
    all zeros after alignment. Confirm this is acceptable for the model.
    """
    min_non_null = int(0.85 * len(data))

    # Non-inplace dropna plus an explicit copy: the column assignments below
    # must never write through to the caller's DataFrame.
    data = data.dropna(axis=1, thresh=min_non_null).copy()

    numeric_cols = data.select_dtypes(exclude=['object']).columns
    categorical_cols = data.select_dtypes(include=['object']).columns

    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())
    data[categorical_cols] = data[categorical_cols].fillna('Unknown')

    data = pd.get_dummies(data, columns=categorical_cols, dummy_na=False)

    if train_cols is not None:
        # One reindex adds the missing columns (filled with 0), drops the
        # extras and enforces the reference column order.
        data = data.reindex(columns=train_cols, fill_value=0)

    return data

# Encode the training frame first; its resulting column index is then passed
# as `train_cols` so the test frame ends up with the same columns in the same
# order (absent columns are added as 0, columns unknown to `train` are dropped).
train = preprocessdata(train_df)
test = preprocessdata(test_df, train_cols=train.columns)

   MSSubClass  LotArea  OverallQual  OverallCond  YearBuilt  YearRemodAdd  \
0          60     8450            7            5       2003          2003   
1          20     9600            6            8       1976          1976   
2          60    11250            7            5       2001          2002   
3          70     9550            7            5       1915          1970   
4          60    14260            8            5       2000          2000   

   MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  ...  SaleType_ConLw  \
0       196.0         706           0        150  ...           False   
1         0.0         978           0        284  ...           False   
2       162.0         486           0        434  ...           False   
3         0.0         216           0        540  ...           False   
4       350.0         655           0        490  ...           False   

   SaleType_New  SaleType_Oth  SaleType_WD  SaleCondition_Abnorml  \
0         False         False