In [1]:
import pandas as pd
import numpy as np
import numpy.linalg as la
from sklearn.model_selection import train_test_split
from sklearn.base import clone
import timeit
import math

In [2]:
def pca(F, X):
    """Fit a whitening PCA on ``X``.

    Parameters
    ----------
    F : int
        Number of leading principal components to keep.  Must satisfy
        ``0 <= F <= min(n, d)``.
    X : array-like of shape (n, d)
        Data matrix with one sample per row.  It is converted to a float
        array and is not modified.

    Returns
    -------
    mu : numpy.ndarray of shape (d, 1)
        Column means of ``X``.
    Z : numpy.ndarray of shape (d, F)
        The first ``F`` right singular vectors of the centred data, each
        scaled by the reciprocal of its singular value.  With this scaling
        ``(X - mu.T) @ Z`` equals the first ``F`` left singular vectors of
        the centred data.

    Raises
    ------
    ValueError
        If ``F`` is negative or larger than the number of singular values.
    """
    X = np.asarray(X, dtype=float)
    n, d = X.shape

    # Column means, kept as a (d, 1) column so callers can subtract ``mu.T``.
    mu = X.mean(axis=0).reshape(d, 1)
    centered = X - mu.T

    # Thin SVD: only the singular values and right singular vectors are used.
    _, s, Vt = la.svd(centered, full_matrices=False)
    if not 0 <= F <= s.shape[0]:
        raise ValueError(
            "F must be between 0 and %d (got %r)" % (s.shape[0], F)
        )

    # Scale each retained component by 1 / singular value (whitening).
    Z = Vt[:F].T * (1. / s[:F])
    return (mu, Z)

def pca_proj(X,mu,Z):
    """Centre ``X`` with ``mu`` and project it through ``Z``.

    ``X`` is an (n, d) array, ``mu`` a (d, 1) column of means and ``Z`` a
    (d, F) projection matrix, i.e. the pair returned by ``pca``.  Returns
    the (n, F) array ``(X - mu.T) . Z``.
    """
    _n_rows, _n_cols = X.shape  # unpacking fails unless X is two-dimensional
    return np.dot(X - mu.T, Z)

In [3]:
def k_fold(k, model, f, X, y, error_type="mse"):
    """Estimate the held-out error of ``model`` with PCA refit inside every fold.

    The rows are cut into ``k`` contiguous folds in their existing order (no
    shuffling).  For each fold a fresh clone of ``model`` is fitted on the
    PCA projection of the remaining rows and scored on the held-out rows,
    which are projected with the mean and components fitted on the training
    rows only.

    Parameters
    ----------
    k : int
        Number of folds.
    model : estimator
        Unfitted estimator; it is cloned per fold and never fitted itself.
    f : int
        Number of principal components passed to ``pca``.
    X : numpy.ndarray of shape (n, d)
        Feature matrix.
    y : numpy.ndarray
        Targets, one per row of ``X``; a flat vector or a single column.
    error_type : {"mse", "log_mse"}
        ``"mse"`` is the mean squared error; ``"log_mse"`` is the mean
        squared difference of ``log(value + 1)``.

    Returns
    -------
    numpy.ndarray of shape (k, 1)
        One error value per fold.

    Raises
    ------
    ValueError
        If ``error_type`` is not one of the supported names.
    """
    if error_type not in ("mse", "log_mse"):
        raise ValueError("Unknown error_type: %r" % (error_type,))

    n, d = X.shape
    z = np.zeros((k, 1))
    for i in range(k):
        # Held-out fold is the contiguous slice [start, stop); the training
        # rows are everything before and after it.
        start = int((i * n) / k)
        stop = int((n * (i + 1) / k))
        T = np.arange(start, stop)
        S = np.concatenate((np.arange(start), np.arange(stop, n)))
        curr_model = clone(model)

        # PCA is fitted on the training rows only.
        training_mu, training_Z = pca(f, X[S])
        training_X = pca_proj(X[S], training_mu, training_Z)

        curr_model.fit(training_X, y[S])

        test_X = pca_proj(X[T], training_mu, training_Z)

        # Flatten both sides so a column-shaped y cannot broadcast against a
        # flat prediction vector into a len(T)-by-len(T) matrix.
        actual = np.ravel(y[T])
        predicted = np.ravel(curr_model.predict(test_X))
        if error_type == "mse":
            z[i] = (1. / len(T)) * np.sum((actual - predicted) ** 2)
        else:
            z[i] = (1. / len(T)) * np.sum((np.log(actual + 1) - np.log(predicted + 1)) ** 2)
    return z

In [4]:
def bootstrapping(B, model, f, X, y, error_type="mse", random_state=None):
    """Estimate the out-of-bag error of ``model`` over ``B`` bootstrap resamples.

    Each replicate draws ``n`` row indices with replacement, fits PCA and a
    fresh clone of ``model`` on the drawn rows, and scores the clone on the
    rows that were never drawn, projected with the PCA fitted on the drawn
    rows.

    Parameters
    ----------
    B : int
        Number of bootstrap replicates.
    model : estimator
        Unfitted estimator; it is cloned per replicate and never fitted itself.
    f : int
        Number of principal components passed to ``pca``.
    X : numpy.ndarray of shape (n, d)
        Feature matrix.
    y : numpy.ndarray
        Targets, one per row of ``X``; a flat vector or a single column.
    error_type : {"mse", "log_mse"}
        ``"mse"`` is the mean squared error; ``"log_mse"`` is the mean
        squared difference of ``log(value + 1)``.
    random_state : int or None, optional
        Seed for the resampling.  ``None`` (the default) draws from NumPy's
        global random state, as before.

    Returns
    -------
    numpy.ndarray of shape (B, 1)
        One out-of-bag error value per replicate.

    Raises
    ------
    ValueError
        If ``error_type`` is not one of the supported names.
    """
    if error_type not in ("mse", "log_mse"):
        raise ValueError("Unknown error_type: %r" % (error_type,))

    # Without a seed, keep using the module-level generator so existing
    # np.random.seed(...) calls still control the draws.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n, d = X.shape
    z = np.zeros((B, 1))
    for i in range(B):
        u = rng.choice(n, n, replace=True)
        # Out-of-bag rows: indices that never appear in the resample.
        S = np.unique(u)
        T = np.setdiff1d(np.arange(n), S, assume_unique=True)
        curr_model = clone(model)

        training_mu, training_Z = pca(f, X[u])
        training_X = pca_proj(X[u], training_mu, training_Z)

        curr_model.fit(training_X, y[u])

        test_X = pca_proj(X[T], training_mu, training_Z)

        # Flatten both sides so a column-shaped y cannot broadcast against a
        # flat prediction vector into a len(T)-by-len(T) matrix.
        actual = np.ravel(y[T])
        predicted = np.ravel(curr_model.predict(test_X))
        if error_type == "mse":
            z[i] = (1. / len(T)) * np.sum((actual - predicted) ** 2)
        else:
            z[i] = (1. / len(T)) * np.sum((np.log(actual + 1) - np.log(predicted + 1)) ** 2)
    return z

In [5]:
def evaluate_model(model, f, X, y, k=5, B=5):
    """Run k-fold and bootstrap evaluation of ``model``, printing a report.

    K-fold is scored with ``error_type="log_mse"``; the bootstrap uses the
    default ``"mse"`` of ``bootstrapping``, so the two reported errors are on
    different scales.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``k_fold`` and ``bootstrapping``.
    f : int
        Number of principal components used inside both procedures.
    X, y : numpy.ndarray
        Feature matrix and targets.
    k : int
        Number of folds.
    B : int
        Number of bootstrap replicates.

    Returns
    -------
    tuple
        ``(k_fold_z, k_fold_mse, k_fold_rmse,
        bootstrapping_z, bootstrapping_mse, bootstrapping_rmse)``.
    """
    # ---------------- K-fold (log-scale error) ----------------
    print('Evaluating K-fold with %d folds.' % k)
    tic = timeit.default_timer()
    k_fold_z = k_fold(k, model, f, X, y, error_type="log_mse")
    k_fold_seconds = timeit.default_timer() - tic

    k_fold_mse = np.mean(k_fold_z)
    print('K-fold Mean Squared log Error: ', k_fold_mse)

    k_fold_rmse = math.sqrt(k_fold_mse)
    print('K-fold Square Root Mean Squared log Error: ', k_fold_rmse)

    print("Time elapsed for k-fold: ", k_fold_seconds)

    # Two blank lines between the two reports.
    print()
    print()

    # ---------------- Bootstrap (raw-scale error) ----------------
    print('Evaluating bootstrapping with %d bootstraps.' % B)
    tic = timeit.default_timer()
    bootstrapping_z = bootstrapping(B, model, f, X, y)
    bootstrap_seconds = timeit.default_timer() - tic

    bootstrapping_mse = np.mean(bootstrapping_z)
    print('Bootstrapping Mean Squared Error: ', bootstrapping_mse)

    bootstrapping_rmse = math.sqrt(bootstrapping_mse)
    print('Bootstrapping Square Root Mean Squared Error: ', bootstrapping_rmse)

    print("Time elapsed for bootstrapping: ", bootstrap_seconds)

    k_fold_results = (k_fold_z, k_fold_mse, k_fold_rmse)
    bootstrap_results = (bootstrapping_z, bootstrapping_mse, bootstrapping_rmse)
    return k_fold_results + bootstrap_results

# Loading and Processing Data

In [166]:
# Read the Kaggle CSVs; row 0 of each file holds the column names.
kaggle_train = pd.read_csv("train.csv", header=0)
kaggle_test = pd.read_csv("test.csv", header=0)

# The last column of the training file is the response; every column before it is a feature.
kaggle_train_X, kaggle_train_y = kaggle_train.iloc[:, :-1], kaggle_train.iloc[:, -1:]

# One shape per line: train features, train response, test features.
print(kaggle_train_X.shape, kaggle_train_y.shape, kaggle_test.shape, sep="\n")

(1460, 80)
(1460, 1)
(1459, 80)


## Joins Kaggle Train and Test data together so we can encode at once.

In [88]:
# Stack train features on top of the test features with a fresh 0..N-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
X_full = pd.concat([kaggle_train_X, kaggle_test], ignore_index=True)

## Drops columns that have more than 40% null values.

In [139]:
# Fraction of missing values per column, in column order.
data_keys = X_full.keys()
null_fraction = X_full.isnull().sum() / X_full.shape[0]

# Keep the label of every column whose missing fraction exceeds 40%.
cols_to_drop = [
    data_keys[position]
    for position, too_sparse in enumerate(null_fraction > 0.4)
    if too_sparse
]

X_full_dropped = X_full.drop(cols_to_drop, axis=1)

## Runs one hot encoding for categorical columns.

In [142]:
# Object-dtype (categorical) column labels, kept as a list in frame order.
# A set of strings iterates in an order that can change between interpreter
# runs, which would make the one-hot column order non-reproducible.
catCols = list(X_full_dropped.select_dtypes(include=['object']).columns)

In [143]:
# One indicator frame per categorical column; indicator columns are named
# "<column>_is_<level>".
frames = [
    pd.get_dummies(X_full_dropped[col]).add_prefix(col + '_is_')
    for col in catCols
]

# Replace the original categorical columns with their indicator columns.
non_categorical = X_full_dropped.drop(catCols, axis=1)
indicators = pd.concat(frames, axis=1)
X_full_ohe = pd.concat([non_categorical, indicators], axis=1)

## Splits data back into Kaggle Train and Kaggle Test.

In [156]:
# X_full was built by stacking the train rows on top of the test rows with a
# fresh index, and the later steps only change columns, so the first
# len(kaggle_train_X) rows are the training rows.  Splitting by position
# avoids hard-coding the Id ranges of this particular dataset.
n_train_rows = len(kaggle_train_X)
kaggle_train_X_psplit = X_full_ohe.iloc[:n_train_rows].copy()
kaggle_test_X_psplit = X_full_ohe.iloc[n_train_rows:].copy()

## Checks for null values and counts for both rows/columns.

In [162]:
# cols: number of columns containing at least one null
print(kaggle_train_X_psplit.isnull().any().sum())
print(kaggle_test_X_psplit.isnull().any().sum())

# rows: number of rows containing at least one null
print(kaggle_train_X_psplit.isnull().any(axis=1).sum())
print(kaggle_test_X_psplit.isnull().any(axis=1).sum())

3
11
339
313


## Replaces null values with medians of columns.

In [159]:
# Impute both sets with the medians of the TRAINING columns, so the test set
# is transformed with statistics learned from the training data only (the
# same way PCA is fitted on train and merely applied to test below).
train_medians = kaggle_train_X_psplit.median()
kaggle_train_X_processed = kaggle_train_X_psplit.fillna(train_medians)
kaggle_test_X_processed = kaggle_test_X_psplit.fillna(train_medians)

## Checks for null values and counts for both rows/columns once more.

In [163]:
# cols
print(kaggle_train_X_processed.loc[:, kaggle_train_X_processed.isnull().any()].shape[1])
print(kaggle_test_X_processed.loc[:, kaggle_test_X_processed.isnull().any()].shape[1])

# rows
print(kaggle_train_X_processed.loc[kaggle_train_X_processed.isnull().any(axis=1)].shape[0])
print(kaggle_test_X_processed.loc[kaggle_test_X_processed.isnull().any(axis=1)].shape[0])

0
0
0
0


## Split Kaggle Training into Train/Test since Kaggle Test has no response variables.

In [171]:
# 80:20 train test ratio
test_size = 0.2
# This function splits the training and target sets into random train and test subsets.
# X_train and X_test are subsets of the training data
# y_train and y_test are subsets the the target data
X_train, X_test, y_train, y_test = train_test_split(kaggle_train_X_processed, kaggle_train_y, test_size=test_size)

# PCA Feature Selection

## Selects 50 principal components.

In [168]:
# Number of principal components kept by every PCA fit below.
# `f` is the lower-case alias that is handed to evaluate_model.
F = f = 50

##  Runs PCA for 50 features on Kaggle train.

In [173]:
# Fit PCA on the full Kaggle training matrix, then project that same matrix.
X_kaggle_train_mu, X_kaggle_train_Z = pca(F=F, X=kaggle_train_X_processed.values)
X_kaggle_train_pca = pca_proj(
    X=kaggle_train_X_processed.values,
    mu=X_kaggle_train_mu,
    Z=X_kaggle_train_Z,
)

## Projects Kaggle Test onto the principal components fitted on Kaggle Train.

In [207]:
# Reuse the mean and projection matrix fitted on Kaggle train so both sets share one basis.
X_kaggle_test_pca = pca_proj(
    X=kaggle_test_X_processed.values,
    mu=X_kaggle_train_mu,
    Z=X_kaggle_train_Z,
)

##  Runs PCA for 50 features on the train split of Kaggle train.

In [174]:
# Fit PCA on the train split only, then project the train split.
X_train_mu, X_train_Z = pca(F=F, X=X_train.values)
X_train_pca = pca_proj(X=X_train.values, mu=X_train_mu, Z=X_train_Z)

## Projects the test split onto the principal components fitted on the train split of Kaggle Train.

In [176]:
# Project the held-out split with the mean and components fitted on the train split.
X_test_pca = pca_proj(X=X_test.values, mu=X_train_mu, Z=X_train_Z)

# AdaBoost

In [177]:
from sklearn.ensemble import AdaBoostRegressor

adaBoost = AdaBoostRegressor()

# K-fold and bootstrap error estimates on the full Kaggle training set.
# Both procedures work on clones, so `adaBoost` itself is still unfitted afterwards.
(ada_k_z, ada_k_msle, ada_k_rmsle,
 ada_b_z, ada_b_mse, ada_b_rmse) = evaluate_model(
    model=adaBoost,
    f=f,
    X=kaggle_train_X_processed.values,
    y=kaggle_train_y.values.ravel(),
    k=5,
    B=5,
)

# Hold-out check: fit on the projected train split, score on the projected test split.
adaBoost.fit(X_train_pca, y_train.values.ravel())
adaBoost.score(X_test_pca, y_test.values.ravel())

Evaluating K-fold with 5 folds.
K-fold Mean Squared log Error:  0.0561412467221
K-fold Square Root Mean Squared log Error:  0.23694144154646635
Time elapsed for k-fold:  2.7680442000000003


Evaluating bootstrapping with 5 bootstraps.
Bootstrapping Mean Squared Error:  1817940999.95
Bootstrapping Square Root Mean Squared Error:  42637.319333499414
Time elapsed for bootstrapping:  3.1517660000000003


0.71343711167870116

In [178]:
#View Predicted values
predicted = adaBoost.predict(X_test_pca)
ada_pred = y_test.copy()
ada_pred['predicted'] = predicted
ada_pred.head()

Unnamed: 0,SalePrice,predicted
1363,156932,141196.639004
951,119900,137759.678481
545,229000,260761.967794
479,89471,137759.678481
824,232600,225614.575851


# XGBoost

In [202]:
from xgboost import XGBRegressor

xgb = XGBRegressor(
    max_depth=3,
    learning_rate=0.2,
    booster='gbtree',
    n_estimators=70,
)

# K-fold and bootstrap error estimates on the full Kaggle training set.
# Both procedures work on clones, so `xgb` itself is still unfitted afterwards.
(xg_k_z, xg_k_msle, xg_k_rmsle,
 xg_b_z, xg_b_mse, xg_b_rmse) = evaluate_model(
    model=xgb,
    f=f,
    X=kaggle_train_X_processed.values,
    y=kaggle_train_y.values.ravel(),
    k=5,
    B=5,
)

# Hold-out check: fit on the projected train split, score on the projected test split.
xgb.fit(X_train_pca, y_train)
xgb.score(X_test_pca, y_test.values.ravel())

Evaluating K-fold with 5 folds.
K-fold Mean Squared log Error:  0.0265359306074
K-fold Square Root Mean Squared log Error:  0.16289852856111636
Time elapsed for k-fold:  2.279856300000006


Evaluating bootstrapping with 5 bootstraps.
Bootstrapping Mean Squared Error:  1277023873.24
Bootstrapping Square Root Mean Squared Error:  35735.47079917682
Time elapsed for bootstrapping:  2.2790066000000024


0.8329466263305898

In [180]:
# View the XGBoost predictions next to the true values of the held-out split.
predicted = xgb.predict(X_test_pca)
xgb_pred = y_test.assign(predicted=predicted)
xgb_pred.head()

Unnamed: 0,SalePrice,predicted
1363,156932,160344.21875
951,119900,114909.859375
545,229000,267066.65625
479,89471,127868.039062
824,232600,236539.28125


# SVM (SVR)

In [191]:
from sklearn import svm

# coef0 only has an effect for the poly and sigmoid kernels, where it is the
# independent (constant) term of the kernel function.
# NOTE(review): the original author reports that the model "breaks for some
# reason" without it; the cause has not been investigated.
svr_model = svm.SVR(kernel="poly", coef0=-2500, gamma="auto")

# K-fold and bootstrap error estimates on the full Kaggle training set.
# Both procedures work on clones, so `svr_model` itself is still unfitted afterwards.
(svr_k_z, svr_k_msle, svr_k_rmsle,
 svr_b_z, svr_b_mse, svr_b_rmse) = evaluate_model(
    model=svr_model,
    f=f,
    X=kaggle_train_X_processed.values,
    y=kaggle_train_y.values.ravel(),
    k=5,
    B=5,
)

# NOTE(review): original note here read "epsilon, degree" -- presumably
# hyper-parameters still to be tuned; confirm with the author.
svr_model.fit(X_train_pca, y_train.values.ravel())
svr_model.score(X_test_pca, y_test.values.ravel())

Evaluating K-fold with 5 folds.
K-fold Mean Squared log Error:  0.0221990711158
K-fold Square Root Mean Squared log Error:  0.14899352709370595
Time elapsed for k-fold:  1.1754397000000267


Evaluating bootstrapping with 5 bootstraps.
Bootstrapping Mean Squared Error:  1201120749.96
Bootstrapping Square Root Mean Squared Error:  34657.18900835281
Time elapsed for bootstrapping:  1.8984735999999884


0.85866173874046792

In [183]:
# View the SVR predictions next to the true values of the held-out split.
svr_predicted = svr_model.predict(X_test_pca)
svr_pred = y_test.assign(predicted=svr_predicted)
svr_pred.head()

Unnamed: 0,SalePrice,predicted
1363,156932,181801.864309
951,119900,133366.864751
545,229000,225840.562482
479,89471,133880.218847
824,232600,221183.595922


# Kaggle Submission

In [206]:
# Final model for the submission: same hyper-parameters as the XGBoost model
# evaluated above, trained on the PCA projection of the whole Kaggle training set.
kaggle_xgb = XGBRegressor(
    max_depth=3,
    learning_rate=0.2,
    booster='gbtree',
    n_estimators=70,
)

kaggle_xgb.fit(X_kaggle_train_pca, kaggle_train_y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=70,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [209]:
# Predict for the Kaggle test rows, using the test matrix that was projected
# with the PCA fitted on Kaggle train.
kaggle_predicted = kaggle_xgb.predict(X_kaggle_test_pca)

In [215]:
# Two-column submission frame: the test Ids plus the predicted sale price.
kaggle_predicted_complete = kaggle_test[["Id"]].assign(SalePrice=kaggle_predicted)

In [220]:
# Write the submission file without the DataFrame index column.
kaggle_predicted_complete.to_csv(path_or_buf='kaggle_predicted.csv', index=False)