In [1]:
import pandas as pd
import numpy as np
import numpy.linalg as la
from sklearn.model_selection import train_test_split
from sklearn.base import clone
import timeit
import math

In [2]:
def pca(F, X):
    n, d = X.shape
    mu = np.zeros((d, 1))
    Z = np.zeros((d, F))
    for i in range(d):
        mu[i] = (1. / n) * np.sum(X[:, [i]])
    X = X - mu.T
    U, s, Vt = la.svd(X, False)
    g = s[:F]
    for i in range(F):
        g[i] = 1. / g[i]
    W = Vt[:F]
    Z = np.dot(W.T, np.diag(g))
    return (mu, Z)

def pca_proj(X,mu,Z):
    n, d = X.shape
    X = X - mu.T
    return np.dot(X, Z)

In [3]:
def k_fold(k, model, f, X, y, error_type="mse"):
    n, d = X.shape
    z = np.zeros((k, 1))
    for i in range(k):
        T = list(range(int((i * n) / k), int((n * (i + 1) / k))))
        S = [j for j in range(n) if j not in T]
        curr_model = clone(model)

        training_mu, training_Z = pca(f, X[S])
        training_X = pca_proj(X[S], training_mu, training_Z)

        curr_model.fit(training_X, y[S])

        test_X = pca_proj(X[T], training_mu, training_Z)

        # y[T] will be len(T) by 1
        # X[T] will be len(T) by d
        if error_type == "mse":
            z[i] = (1. / len(T)) * np.sum((y[T] - curr_model.predict(test_X)) ** 2)
        elif error_type == "log_mse":
            z[i] = (1. / len(T)) * np.sum((np.log(y[T] + 1) - np.log(curr_model.predict(test_X) + 1)) ** 2)
    return z

In [4]:
def bootstrapping(B, model, f, X, y, error_type="mse"):
    n, d = X.shape
    z = np.zeros((B, 1))
    for i in range(B):
        u = np.random.choice(n, n, replace=True)
        S = np.unique(u)
        T = np.setdiff1d(np.arange(n), S, assume_unique=True)
        curr_model = clone(model)

        training_mu, training_Z = pca(f, X[u])
        training_X = pca_proj(X[u], training_mu, training_Z)

        curr_model.fit(training_X, y[u])

        test_X = pca_proj(X[T], training_mu, training_Z)

        # y[T] will be len(T) by 1
        # X[T] will be len(T) by d
        # theta_hat will be d by 1
        if error_type == "mse":
            z[i] = (1. / len(T)) * np.sum((y[T] - curr_model.predict(test_X)) ** 2)
        elif error_type == "log_mse":
            z[i] = (1. / len(T)) * np.sum((np.log(y[T] + 1) - np.log(curr_model.predict(test_X) + 1)) ** 2)
    return z

In [5]:
def evaluate_model(model, f, X, y, k=5, B=5):
    ########################KFOLD###################
    print('Evaluating K-fold with %d folds.' % k)
    start_time = timeit.default_timer()
    k_fold_z = k_fold(k, model, f, X, y, error_type="log_mse")
    elapsed = timeit.default_timer() - start_time
    
    k_fold_mse = np.mean(k_fold_z)
    print('K-fold Mean Squared log Error: ', k_fold_mse)
    
    k_fold_rmse = math.sqrt(k_fold_mse)
    print('K-fold Square Root Mean Squared log Error: ', k_fold_rmse)

    print("Time elapsed for k-fold: ", elapsed)
    
    print()
    print()
    ###################BOOTSTRAPPING################
    print('Evaluating bootstrapping with %d bootstraps.' % B)
    start_time = timeit.default_timer()
    bootstrapping_z = bootstrapping(B, model, f, X, y)
    elapsed = timeit.default_timer() - start_time
    
    bootstrapping_mse = np.mean(bootstrapping_z)
    print('Bootstrapping Mean Squared Error: ', bootstrapping_mse)
    
    bootstrapping_rmse = math.sqrt(bootstrapping_mse)
    print('Bootstrapping Square Root Mean Squared Error: ', bootstrapping_rmse)

    print("Time elapsed for bootstrapping: ", elapsed)
    
    return (k_fold_z, k_fold_mse, k_fold_rmse, bootstrapping_z, bootstrapping_mse, bootstrapping_rmse)

# Data Processing

In [6]:
data = pd.read_csv("train.csv", header=0)
print(data.shape)

X = data.iloc[:,:-1]
Y = data.iloc[:,-1:]

print(X.shape)
print(Y.shape)

(1460, 81)
(1460, 80)
(1460, 1)


In [7]:
# this just sums up how many nulls per feature and divides to find percentage of nulls per feature
# if over 50% null then print the feature
data_keys = X.keys()
for i, b in enumerate((X.isnull().sum() / X.shape[0]) > 0.5):
    if b:
        print(data_keys[i])

Alley
PoolQC
Fence
MiscFeature


In [8]:
# data = data.drop(['Alley', 'MiscFeature', 'Fence', 'PoolQC'], axis=1)

In [9]:
# # Replaces categorical value in Quality columns with numerical scale
# qualityCols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
#               'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']

# X[qualityCols].head()

# for col in qualityCols:
#     # NA is never used since all NA's got converted to NaN objects when pandas read in the csv
#     X[col] = X[col].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po':1, 'NA': 0})

# X[qualityCols].head()

In [10]:
# categorical columns
catCols = set(list(X))-set(list(X._get_numeric_data()))
print(catCols)

# #TRY dropping all cat cols
# data = data.drop(columns=catCols)

{'Condition1', 'ExterQual', 'FireplaceQu', 'MiscFeature', 'Neighborhood', 'HouseStyle', 'BsmtFinType2', 'LotShape', 'RoofMatl', 'Utilities', 'MasVnrType', 'BsmtCond', 'CentralAir', 'GarageQual', 'BldgType', 'BsmtExposure', 'RoofStyle', 'PavedDrive', 'LotConfig', 'GarageFinish', 'Heating', 'LandContour', 'ExterCond', 'Street', 'BsmtQual', 'Electrical', 'Condition2', 'Exterior2nd', 'PoolQC', 'LandSlope', 'GarageType', 'KitchenQual', 'SaleCondition', 'Alley', 'MSZoning', 'Exterior1st', 'Functional', 'Foundation', 'BsmtFinType1', 'GarageCond', 'Fence', 'HeatingQC', 'SaleType'}


In [11]:
#Perform one hot encoding on all categorical columns
frames = []
for col in catCols:
    oneHot_encoded = pd.get_dummies(X[col])
    oneHot_encoded = oneHot_encoded.add_prefix(col + '_is_')
    frames.append(oneHot_encoded)

X = X.drop(catCols, axis=1)

X = pd.concat(frames, axis=1)

In [12]:
X.keys()

Index(['Condition1_is_Artery', 'Condition1_is_Feedr', 'Condition1_is_Norm',
       'Condition1_is_PosA', 'Condition1_is_PosN', 'Condition1_is_RRAe',
       'Condition1_is_RRAn', 'Condition1_is_RRNe', 'Condition1_is_RRNn',
       'ExterQual_is_Ex',
       ...
       'HeatingQC_is_TA', 'SaleType_is_COD', 'SaleType_is_CWD',
       'SaleType_is_Con', 'SaleType_is_ConLD', 'SaleType_is_ConLI',
       'SaleType_is_ConLw', 'SaleType_is_New', 'SaleType_is_Oth',
       'SaleType_is_WD'],
      dtype='object', length=252)

In [13]:
X.isnull().values.any()

False

In [14]:
# 80:20 train test ratio
test_size = 0.2
# This function splits the training and target sets into random train and test subsets.
# X_train and X_test are subsets of the training data
# y_train and y_test are subsets the the target data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)

# PCA Feature Selection

In [15]:
F = 50
f = 50

In [16]:
X_mu, X_Z = pca(F, X.values)
X_pca = pca_proj(X.values, X_mu, X_Z)

In [17]:
print(X_mu.shape)
print(X_Z.shape)
print(X_pca.shape)

(252, 1)
(252, 50)
(1460, 50)


In [18]:
X_train_mu, X_train_Z = pca(F, X_train.values)

In [19]:
print(X_train_mu.shape)
print(X_train_Z.shape)

(252, 1)
(252, 50)


In [20]:
X_train_pca = pca_proj(X_train.values, X_train_mu, X_train_Z)
print(X_train_pca.shape)

X_test_pca = pca_proj(X_test.values, X_train_mu, X_train_Z)
print(X_test_pca.shape)

(1168, 50)
(292, 50)


# AdaBoost

In [21]:
from sklearn.ensemble import AdaBoostRegressor
adaBoost = AdaBoostRegressor()
k_z, k_mse, k_rmse, b_z, b_mse, b_rmse = evaluate_model(adaBoost, f, X.values, Y.values.ravel(), k=5, B=5)

adaBoost.fit(X_train_pca, y_train.values.ravel())
adaBoost.score(X_test_pca, y_test.values.ravel())

Evaluating K-fold with 5 folds.
K-fold Mean Squared log Error:  0.0863238411416
K-fold Square Root Mean Squared log Error:  0.2938091917240796
Time elapsed for k-fold:  5.4119528


Evaluating bootstrapping with 5 bootstraps.
Bootstrapping Mean Squared Error:  2954947094.39
Bootstrapping Square Root Mean Squared Error:  54359.425074130515
Time elapsed for bootstrapping:  6.112511199999999


0.63078892308865608

In [22]:
#View Predicted values
predicted = adaBoost.predict(X_test_pca)
ada_pred = y_test.copy()
ada_pred['predicted'] = predicted
ada_pred.head()

Unnamed: 0,SalePrice,predicted
818,155000,146194.506203
1296,155000,172181.563981
555,113000,134192.379205
992,187000,227937.773333
199,274900,336290.652174


# XGBoost Regressor

In [23]:
#!pip3 install xgboost

In [24]:
from xgboost import XGBRegressor
xgb = XGBRegressor(max_depth=3, learning_rate=0.2, booster='gbtree', n_estimators=70)
k_z, k_mse, k_rmse, b_z, b_mse, b_rmse = evaluate_model(xgb, f, X.values, Y.values.ravel(), k=5, B=5)

xgb.fit(X_train_pca, y_train)
xgb.score(X_test_pca, y_test.values.ravel())

Evaluating K-fold with 5 folds.
K-fold Mean Squared log Error:  0.0485028647496
K-fold Square Root Mean Squared log Error:  0.22023365943827247
Time elapsed for k-fold:  2.841487899999999


Evaluating bootstrapping with 5 bootstraps.
Bootstrapping Mean Squared Error:  2005284003.48
Bootstrapping Square Root Mean Squared Error:  44780.39753594813
Time elapsed for bootstrapping:  3.4412237999999995


0.70327558378028088

In [25]:
predicted = xgb.predict(X_test_pca)
xgb_pred = y_test.copy()
xgb_pred['predicted'] = predicted
xgb_pred.head()

Unnamed: 0,SalePrice,predicted
818,155000,154907.546875
1296,155000,150216.578125
555,113000,128500.03125
992,187000,175113.140625
199,274900,371756.3125


# SVM (SVR)

In [26]:
from sklearn import svm
svr_model = svm.SVR(kernel="poly", coef0=-3500, gamma="auto")
# coef0 only works with poly and sigmoid kernels
# it just puts that value instead of the column of 1's

# without it, this model breaks for some reason

k_z, k_mse, k_rmse, b_z, b_mse, b_rmse = evaluate_model(svr_model, f, X.values, Y.values.ravel(), k=5, B=5)

# epsilon, degree
svr_model.fit(X_train_pca, y_train.values.ravel())
svr_model.score(X_test_pca, y_test.values.ravel())

Evaluating K-fold with 5 folds.
K-fold Mean Squared log Error:  0.054210540666
K-fold Square Root Mean Squared log Error:  0.23283157145453767
Time elapsed for k-fold:  2.0605405999999995


Evaluating bootstrapping with 5 bootstraps.
Bootstrapping Mean Squared Error:  2532535920.18
Bootstrapping Square Root Mean Squared Error:  50324.30744854705
Time elapsed for bootstrapping:  2.895757400000001


0.70304689818503441

In [27]:
svr_predicted = svr_model.predict(X_test_pca)
svr_pred = y_test.copy()
svr_pred["predicted"] = svr_predicted
svr_pred.head()

Unnamed: 0,SalePrice,predicted
818,155000,181326.349284
1296,155000,179206.964198
555,113000,136460.572245
992,187000,217230.809878
199,274900,315125.702647


# XGBoost seems the best so let's use it to submit to Kaggle.

In [33]:
data_kaggle = pd.read_csv("test.csv", header=0)
print(data_kaggle.shape)

X_kaggle_test = data_kaggle

(1459, 80)


In [29]:
# this just sums up how many nulls per feature and divides to find percentage of nulls per feature
# if over 50% null then print the feature
data_keys = X_kaggle_test.keys()
for i, b in enumerate((X_kaggle_test.isnull().sum() / X_kaggle_test.shape[0]) > 0.5):
    if b:
        print(data_kaggle[i])

In [30]:
# # Replaces categorical value in Quality columns with numerical scale
# qualityCols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
#               'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']

# X[qualityCols].head()

# for col in qualityCols:
#     # NA is never used since all NA's got converted to NaN objects when pandas read in the csv
#     X[col] = X[col].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po':1, 'NA': 0})

# X[qualityCols].head()

In [31]:
# categorical columns
catCols = set(list(X))-set(list(X._get_numeric_data()))
print(catCols)

# #TRY dropping all cat cols
# data = data.drop(columns=catCols)

set()


In [32]:
#Perform one hot encoding on all categorical columns
frames = []
for col in catCols:
    oneHot_encoded = pd.get_dummies(X[col])
    oneHot_encoded = oneHot_encoded.add_prefix(col + '_is_')
    frames.append(oneHot_encoded)

X = X.drop(catCols, axis=1)

X = pd.concat(frames, axis=1)

ValueError: No objects to concatenate

In [None]:
X.keys()

In [None]:
X.isnull().values.any()

In [None]:
# 80:20 train test ratio
test_size = 0.2
# This function splits the training and target sets into random train and test subsets.
# X_train and X_test are subsets of the training data
# y_train and y_test are subsets the the target data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)

In [34]:
F = 50
f = F

In [None]:
X_mu, X_Z = pca(F, X.values)
X_pca = pca_proj(X.values, X_mu, X_Z)

In [None]:
print(X_mu.shape)
print(X_Z.shape)
print(X_pca.shape)

In [None]:
X_train_mu, X_train_Z = pca(F, X_train.values)

In [None]:
print(X_train_mu.shape)
print(X_train_Z.shape)

In [None]:
X_train_pca = pca_proj(X_train.values, X_train_mu, X_train_Z)
print(X_train_pca.shape)

X_test_pca = pca_proj(X_test.values, X_train_mu, X_train_Z)
print(X_test_pca.shape)