In [5]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn
from torch.utils.data import DataLoader


from kaggle.multisvd.nnflow import TabularDataset, FeedForwardNN

# Using only a subset of the variables.

In [6]:
data = pd.read_csv("train.csv", usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
                                         "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()

In [7]:
    categorical_features = ["MSSubClass", "MSZoning", "Street", "LotShape", "YearBuilt"]
output_feature = "SalePrice"

In [8]:
label_encoders = {}
for cat_col in categorical_features:
        label_encoders[cat_col] = LabelEncoder()
        data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])
   

In [9]:
dataset = TabularDataset(data=data, cat_cols=categorical_features,
                             output_col=output_feature)

In [10]:
batchsize = 64
dataloader = DataLoader(dataset, batchsize, shuffle=True, num_workers=1)

In [11]:
cat_dims = [int(data[col].nunique()) for col in categorical_features]
cat_dims

[15, 5, 2, 4, 112]

In [12]:
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]

In [13]:
emb_dims

[(15, 8), (5, 3), (2, 1), (4, 2), (112, 50)]

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FeedForwardNN(emb_dims, no_of_cont=4, lin_layer_sizes=[50, 100],
                          output_size=1, emb_dropout=0.04,
                          lin_layer_dropouts=[0.001,0.01]).to(device)


In [15]:
no_of_epochs = 5
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
for epoch in range(no_of_epochs):
      for y, cont_x, cat_x in dataloader:
          
        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)
        y  = y.to(device)

        # Forward Pass
        preds = model(cont_x, cat_x)
        loss = criterion(preds, y)

        # Backward Pass and Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
          

In [31]:
label, cont, cat = dataset[:]
label_tensor = torch.from_numpy(label)
cont_tensor = torch.from_numpy(cont)
cat_tensor = torch.from_numpy(cat)
label

array([[208500.],
       [181500.],
       [223500.],
       ...,
       [266500.],
       [142125.],
       [147500.]], dtype=float32)

In [30]:
preds: torch.Tensor = model(cont_tensor, cat_tensor)
preds

array([20603.738 ,  5667.6465, 24181.59  , ..., 28952.201 ,  4688.1914,
        5999.046 ], dtype=float32)

In [34]:
from sklearn.metrics import mean_squared_error

y_true = label.flatten()
y_preds = preds.data.numpy().flatten()
mean_squared_error(y_true, y_preds)

31844770000.0

In [None]:
# test_data = pd.read_csv("test.csv", usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
#                                          "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()


