In [1]:
import numpy as np
import pandas as pd
from statistics import mean
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import torch
import graphviz
from sklearn import tree

Implement 5-fold nested CV to find optimal parameter configuration for the model. Report: (1) tree visualization (2) the set of best parameters on each split (3) testing mse on each outer split (3) important splits / features (4) algorithm runtime

In [2]:
data = pd.get_dummies(pd.read_csv('ManualPreprocessedAmesHousing.csv'))

X = data.drop('SalePrice', axis=1).to_numpy()
y = data['SalePrice'].to_numpy()

max_depth = range(5, 30)
min_samples_leaf = range(2, 10)

best = {}

outer_kfold = KFold(n_splits=5)
inner_kfold = KFold(n_splits=4)

for train_index, test_index in outer_kfold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    best_max_depth = 0
    best_min_samples_leaf = 0
    best_mse = float('inf')
    for depth in max_depth:
        for samples in min_samples_leaf:
            mse = []
            for train_inner_index, _ in inner_kfold.split(X_train):
                test_inner_index = np.array(list(set(train_index) - set(train_inner_index)))
                X_train_inner, X_test_inner = X[train_inner_index], X[test_inner_index]
                y_train_inner, y_test_inner = y[train_inner_index], y[test_inner_index]
                dtr = DecisionTreeRegressor(max_depth=depth, min_samples_leaf=samples)
                dtr.fit(X_train_inner, y_train_inner)
                mse.append(mean_squared_error(y_test_inner, dtr.predict(X_test_inner)))
            avg_mse = mean(mse)

            if avg_mse < best_mse:
                best_mse = avg_mse
                best_max_depth = depth
                best_min_samples_leaf = samples
    dtr = DecisionTreeRegressor(max_depth=best_max_depth, min_samples_leaf=best_min_samples_leaf)
    dtr.fit(X_train, y_train)
    score = mean_squared_error(y_test, dtr.predict(X_test))
    print('for this split, best tuning average mse is', best_mse, 'best max depth is', best_max_depth, 'best min samples leaf is', best_min_samples_leaf, 'the testing mse is', score)
    tree_viz = tree.export_graphviz(dtr, out_file=None, max_depth=3,
                                feature_names = data.drop('SalePrice', axis=1).columns,  
                                class_names = str(data['SalePrice']),
                                rounded = True,
                                special_characters = True,
                                filled = True)
    graph = graphviz.Source(tree_viz, format="png") 
    graph.render("decision tree"+" "+str(best_max_depth)+" "+str(best_min_samples_leaf)+".png")


for this split, best tuning average mse is 1161.3889511218356 best max depth is 10 best min samples leaf is 6 the testing mse is 1184.072415368033
for this split, best tuning average mse is 1240.1566045550055 best max depth is 10 best min samples leaf is 3 the testing mse is 816.6995644493976
for this split, best tuning average mse is 1205.4956022292404 best max depth is 17 best min samples leaf is 9 the testing mse is 1368.8059187141284
for this split, best tuning average mse is 1101.8455810768055 best max depth is 8 best min samples leaf is 3 the testing mse is 2051.211779566418
for this split, best tuning average mse is 1369.1129161089543 best max depth is 22 best min samples leaf is 6 the testing mse is 1058.4104642453954


## PyTorch (Hide for now)

In [None]:
# torch_data = torch.tensor(data.drop('SalePrice', axis=1).values).float().to(device)
# print(torch_data.shape)
# torch_label = torch.tensor(data['SalePrice'].values).float().to(device)

In [None]:
# class linearRegression(torch.nn.Module):
#     def __init__(self, inputDim, hiddenDim):
#         super(linearRegression, self).__init__()
#         layers = []
#         in_dim = inputDim
#         for hidden_dim in hiddenDim[:-1]:
#             layers.append(torch.nn.Linear(in_dim, hidden_dim))
#             layers.append(torch.nn.BatchNorm1d(hidden_dim))
#             layers.append(torch.nn.ReLU())
#             layers.append(torch.nn.Dropout(p=0.5))
#             in_dim = hidden_dim
#         layers.append(torch.nn.Linear(in_dim, hiddenDim[-1]))
#         layers.append(torch.nn.BatchNorm1d(hiddenDim[-1]))
#         layers.append(torch.nn.ReLU())
#         layers.append(torch.nn.Dropout(p=0.5))
#         self.net = torch.nn.Sequential(*layers)

#     def forward(self, x):
#         out = self.net(x)
#         return out

In [None]:
# hidden_dim = [176, 1]
# model = linearRegression(torch_data.shape[1], hidden_dim).to(device)
# # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR()
# optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# criterion = torch.nn.MSELoss()

In [None]:
# def evaluate(xTrain, xTest, yTrain, yTest, batch_size):
#   stats = {}
#   train_mse = []
#   test_mse = []

#   for epoch in range(750):
#     print(epoch, end=' ')
#     for i in range(0, xTrain.shape[0], batch_size):
#       optimizer.zero_grad()
#       y_pred = model(xTrain[i:(i+batch_size), :])
#       loss = torch.sqrt(criterion(y_pred, yTrain[i:(i+batch_size)].view(-1,1)))
#       loss.backward()
#       optimizer.step()
#     train_mse.append(loss.item())

#     with torch.no_grad():
#       y_hat = model(xTest)
#       print(y_hat[0], yTest[0])
#       loss = criterion(y_hat, yTest.view(-1,1))
#       test_mse.append(loss.item())

#   stats['train'] = train_mse
#   stats['test'] = test_mse

#   fig = plt.figure()
#   for k, v in stats.items():
#       plt.plot(range(1, len(v) + 1), v, label=k)

#   plt.legend()
#   plt.xlabel('Epochs')
#   plt.ylabel('MSE')
#   plt.title('Train Test MSE')
#   fig.show()

#   return stats

In [None]:
# kf = KFold(n_splits=3)
# for train_index, test_index in kf.split(torch_data):
#   X_train, X_test = torch_data[train_index], torch_data[test_index]
#   y_train, y_test = torch_label[train_index], torch_label[test_index]

#   _ = evaluate(X_train, X_test, y_train, y_test, batch_size=128)
#   # Stop here for now
#   x == 5