In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
import sys
import torch
import torch.nn as nn
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [43]:
cv = KFold(n_splits=5, random_state=45)

In [2]:
sys.path.append('../experiments/')
from entity_embeddings import EmbeddingDataset, EmbeddingNN, train_embedding_model

In [3]:
data = pd.read_csv("../data/train_house.csv",
                   usecols=["SalePrice", "MSSubClass", "MSZoning",
                            "LotFrontage", "LotArea",
                            "Street", "YearBuilt", "LotShape",
                            "1stFlrSF", "2ndFlrSF"])\
    .dropna()

In [4]:
categorical_features = ["MSSubClass", "MSZoning", "Street",
                        "LotShape", "YearBuilt"]
output_feature = "SalePrice"
label_encoders = {}
for cat_col in categorical_features:
    label_encoders[cat_col] = LabelEncoder()
    data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])

In [5]:
dataset = EmbeddingDataset(data=data, cat_cols=categorical_features,
                             output_col=output_feature)

In [6]:
batchsize = 64
dataloader = DataLoader(dataset, batchsize, shuffle=True, num_workers=1)

In [7]:
cat_dims = [int(data[col].nunique()) for col in categorical_features]

emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EmbeddingNN(emb_dims, no_of_cont=4, lin_layer_sizes=[50, 100],
                          output_size=1, emb_dropout=0.04,
                          lin_layer_dropouts=[0.001, 0.01])\
    .to(device)

In [9]:
adam = torch.optim.Adam(model.parameters(), lr=0.1)
train_embedding_model(model, dataloader, 30, criterion=nn.MSELoss(), optimizer=adam)

loss on epoch 0 is 30464933888.0
loss on epoch 1 is 45675249664.0
loss on epoch 2 is 29134524416.0
loss on epoch 3 is 49721446400.0
loss on epoch 4 is 28392325120.0
loss on epoch 5 is 35265019904.0
loss on epoch 6 is 24488040448.0
loss on epoch 7 is 19568803840.0
loss on epoch 8 is 19931912192.0
loss on epoch 9 is 14278952960.0
loss on epoch 10 is 11385565184.0
loss on epoch 11 is 5977475584.0
loss on epoch 12 is 4427692544.0
loss on epoch 13 is 6609956352.0
loss on epoch 14 is 2248703488.0
loss on epoch 15 is 2946184704.0
loss on epoch 16 is 1694608384.0
loss on epoch 17 is 2272303104.0
loss on epoch 18 is 983650496.0
loss on epoch 19 is 634310400.0
loss on epoch 20 is 1768392448.0
loss on epoch 21 is 2190784512.0
loss on epoch 22 is 595133184.0
loss on epoch 23 is 1976399360.0
loss on epoch 24 is 1601166336.0
loss on epoch 25 is 1068084608.0
loss on epoch 26 is 601896064.0
loss on epoch 27 is 935945344.0
loss on epoch 28 is 429480192.0
loss on epoch 29 is 770927040.0


In [11]:
data_ohe = pd.get_dummies(data, columns=categorical_features)

In [37]:
data_ohe.shape

(1201, 143)

In [13]:
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)

In [54]:
pred = cross_val_predict(rf, data.drop(output_feature, axis=1), data[output_feature], cv=cv)

In [55]:
mean_absolute_error(data[output_feature], pred), mean_squared_error(data[output_feature], pred)

(22980.178090559453, 1347717706.3195999)

In [31]:
data_emb = data.copy()
for col in categorical_features:
    data_emb[col] = label_encoders[col].inverse_transform(data_emb[col])
    data_emb[col] = data_emb[col].map(model.get_embeddings(categorical_features, emb_dims, label_encoders)[col])

In [34]:
for col in categorical_features:
    tags = data_emb[col].apply(pd.Series)
    tags = tags.rename(columns = lambda x : f'{col}_' + str(x))
    data_emb = pd.concat([data_emb.drop(col, axis=1)[:], tags[:]], axis=1)

In [57]:
data_emb.shape

(1201, 69)

In [52]:
pred = cross_val_predict(rf, data_emb.drop(output_feature, axis=1), data_emb[output_feature], cv=cv)

In [53]:
mean_absolute_error(data[output_feature], pred), mean_squared_error(data[output_feature], pred)

(22957.46829722586, 1368014609.14349)