In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_score
import timeit
import math

In [3]:
data = pd.read_csv("train.csv", header=0)
data.shape

(1460, 81)

In [4]:
# this just sums up how many nulls per feature and divides to find percentage of nulls per feature
# if over 50% null then print the feature
data_keys = data.keys()
for i, b in enumerate((data.isnull().sum() / data.shape[0]) > 0.5):
    if b:
        print(data_keys[i])

Alley
PoolQC
Fence
MiscFeature


In [5]:
data = data.drop(['Alley', 'MiscFeature', 'Fence', 'PoolQC'], axis=1)

In [6]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [7]:
# Replaces categorical value in Quality columns with numerical scale
qualityCols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
              'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']

data[qualityCols].head()

for col in qualityCols:
    # NA is never used since all NA's got converted to NaN objects when pandas read in the csv
    data[col] = data[col].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po':1, 'NA': 0})

data[qualityCols].head()

Unnamed: 0,ExterQual,ExterCond,BsmtQual,BsmtCond,HeatingQC,KitchenQual,FireplaceQu,GarageQual,GarageCond
0,4,3,4.0,3.0,5,4,,3.0,3.0
1,3,3,4.0,3.0,5,3,3.0,3.0,3.0
2,4,3,4.0,3.0,5,4,3.0,3.0,3.0
3,3,3,3.0,4.0,4,4,4.0,3.0,3.0
4,4,3,4.0,3.0,5,4,3.0,3.0,3.0


In [8]:
# categorical columns
catCols = set(list(data))-set(list(data._get_numeric_data()))
print(catCols)

# #TRY dropping all cat cols
# data = data.drop(columns=catCols)

{'Utilities', 'SaleType', 'RoofMatl', 'BsmtFinType2', 'PavedDrive', 'Foundation', 'LandSlope', 'Street', 'Neighborhood', 'GarageFinish', 'LotShape', 'MasVnrType', 'RoofStyle', 'Exterior1st', 'LotConfig', 'Condition1', 'BldgType', 'CentralAir', 'HouseStyle', 'Exterior2nd', 'LandContour', 'Heating', 'GarageType', 'Functional', 'SaleCondition', 'MSZoning', 'Electrical', 'BsmtExposure', 'BsmtFinType1', 'Condition2'}


In [9]:
#Perform one hot encoding on all categorical columns
frames = []
salePrice = data['SalePrice']
for col in catCols:
    oneHot_encoded = pd.get_dummies(data[col])
    oneHot_encoded = oneHot_encoded.add_prefix(col + '_is_')
    frames.append(oneHot_encoded)
frames.append(salePrice)

data = data.drop(catCols, axis=1)

data = pd.concat(frames, axis=1)

In [10]:
data.keys()

Index(['Utilities_is_AllPub', 'Utilities_is_NoSeWa', 'SaleType_is_COD',
       'SaleType_is_CWD', 'SaleType_is_Con', 'SaleType_is_ConLD',
       'SaleType_is_ConLI', 'SaleType_is_ConLw', 'SaleType_is_New',
       'SaleType_is_Oth',
       ...
       'BsmtFinType1_is_Unf', 'Condition2_is_Artery', 'Condition2_is_Feedr',
       'Condition2_is_Norm', 'Condition2_is_PosA', 'Condition2_is_PosN',
       'Condition2_is_RRAe', 'Condition2_is_RRAn', 'Condition2_is_RRNn',
       'SalePrice'],
      dtype='object', length=199)

In [11]:
data.isnull().values.any()

False

In [12]:
# Drop all Na's for now
data = data.dropna()
# Split into training and target sets
num_variables = len(data.columns)
training = data.iloc[:, 0:num_variables-1]
target = data.iloc[:,num_variables-1:]

# 80:20 train test ratio
test_size = 0.2
# This function splits the training and target sets into random train and test subsets.
# X_train and X_test are subsets of the training data
# y_train and y_test are subsets the the target data
# do we do this ourselves or should we be using scikit learn
X_train, X_test, y_train, y_test = train_test_split(training, target, test_size=test_size)

In [13]:
def k_fold(k, model, X, y):
    n, d = X.shape
    z = np.zeros((k, 1))
    for i in range(k):
        T = list(range(int((i * n) / k), int((n * (i + 1) / k))))
        S = [j for j in range(n) if j not in T]
        model.fit(X[S], y[S])
        # y[T] will be len(T) by 1
        # X[T] will be len(T) by d
        z[i] = (1. / len(T)) * np.sum((y[T] - model.predict(X[T])) ** 2)
    return z

In [14]:
def evaluateModel(model, splits=5):
    start_time = timeit.default_timer()
    
#     mae = cross_val_score(model, X_test, y_test.values.ravel(), cv=splits, scoring='neg_mean_absolute_error')
#     mae = np.mean(mae)
#     print('Mean Absolute Error: ', -mae)
    
    mse = cross_val_score(model, X_test, y_test.values.ravel(), cv=splits, scoring='neg_mean_squared_error')
    print('Mean Squared Error: ', np.mean(mse * -1))
    
    rmse = math.sqrt(np.mean(mse*-1))
    print('Root Mean Squared Error: ', rmse)

    elapsed = timeit.default_timer() - start_time


# AdaBoost

In [15]:
from sklearn.ensemble import AdaBoostRegressor
adaBoost = AdaBoostRegressor()
adaBoost.fit(X_train, y_train.values.ravel())

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None)

In [16]:
evaluateModel(adaBoost)

Mean Squared Error:  3710813965.24
Root Mean Squared Error:  60916.45069468376


In [17]:
#View Predicted values
predicted = adaBoost.predict(X_test)
ada_pred = y_test.copy()
ada_pred['predicted'] = predicted
ada_pred.head()

Unnamed: 0,SalePrice,predicted
467,146500,187362.808333
92,163500,177158.831081
122,136000,177257.40445
673,257500,238016.669355
780,176000,214606.368567


In [18]:
ada_z = k_fold(5, adaBoost, training.values, target.values.ravel())
np.mean(ada_z)

3675427185.9484329

# XGBoost Regressor

In [19]:
#!pip3 install xgboost

In [20]:
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
evaluateModel(xgb)

Mean Squared Error:  2916037175.1
Root Mean Squared Error:  54000.34421282115


In [21]:
predicted = xgb.predict(X_test)
xgb_pred = y_test.copy()
xgb_pred['predicted'] = predicted
xgb_pred.head()

Unnamed: 0,SalePrice,predicted
467,146500,168132.984375
92,163500,172793.171875
122,136000,149014.09375
673,257500,274777.375
780,176000,174593.125


In [22]:
xgb_z = k_fold(5, xgb, training.values, target.values.ravel())
np.mean(xgb_z)

1864338879.9963806

# SVM (SVC just to test)

In [23]:
from sklearn import svm

svc_model = svm.SVC(kernel="rbf", C=1.0)

# change C (error) in hypertuning
svc_model.fit(X_train, y_train.values.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
# we get a warning because svm is splitting the data into "classes" and because saleprice is numeric there are many prices where there are only 1 of that "class"...
evaluateModel(svc_model, splits=5)
svc_predicted = svc_model.predict(X_test)
svc_pred = y_test.copy()
svc_pred["predicted"] = svc_predicted
svc_pred.head()



Mean Squared Error:  3533357303.51
Root Mean Squared Error:  59442.04996051698


Unnamed: 0,SalePrice,predicted
467,146500,140000
92,163500,140000
122,136000,140000
673,257500,140000
780,176000,140000


In [25]:
svc_z = k_fold(5, svc_model, training.values, target.values.ravel())
np.mean(svc_z)

8012372536.532877

# SVM (SVR)

In [26]:
from sklearn import svm

svr_model = svm.SVR(kernel="poly", shrinking=False, coef0=-2000)
# coef0 only works with poly and sigmoid kernels
# it just puts that value instead of the column of 1's

# without it, this model breaks for some reason

# epsilon, degree
svr_model.fit(X_train, y_train.values.ravel())

SVR(C=1.0, cache_size=200, coef0=-2000, degree=3, epsilon=0.1, gamma='auto',
  kernel='poly', max_iter=-1, shrinking=False, tol=0.001, verbose=False)

In [27]:
evaluateModel(svr_model, splits=5)
svr_predicted = svr_model.predict(X_test)
svr_pred = y_test.copy()
svr_pred["predicted"] = svr_predicted
svr_pred.head()

Mean Squared Error:  3226357205.43
Root Mean Squared Error:  56801.031728607595


Unnamed: 0,SalePrice,predicted
467,146500,147945.452485
92,163500,167576.90122
122,136000,155205.90321
673,257500,285091.688516
780,176000,158608.723975


In [28]:
svr_z = k_fold(5, svr_model, training.values, target.values.ravel())
np.mean(svr_z)

1876966269.1482024

In [29]:
# looks like order of least to greatest error with the currently tuned models goes:
# XgBoost, SVR, AdaBoost, SVC
print(np.mean(ada_z))
print(np.mean(xgb_z))
print(np.mean(svc_z))
print(np.mean(svr_z))

3675427185.95
1864338880.0
8012372536.53
1876966269.15
