In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_score
import timeit
import math



In [2]:
data = pd.read_csv("train.csv", header=0)
data.shape

(1460, 81)

In [3]:
# this just sums up how many nulls per feature and divides to find percentage of nulls per feature
# if over 50% null then print the feature
data_keys = data.keys()
for i, b in enumerate((data.isnull().sum() / data.shape[0]) > 0.5):
    if b:
        print(data_keys[i])

Alley
PoolQC
Fence
MiscFeature


In [4]:
data = data.drop(['Alley', 'MiscFeature', 'Fence', 'PoolQC'], axis=1)

In [5]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [6]:
# Replaces categorical value in Quality columns with numerical scale
qualityCols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
              'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']

data[qualityCols].head()

for col in qualityCols:
    # NA is never used since all NA's got converted to NaN objects when pandas read in the csv
    data[col] = data[col].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po':1, 'NA': 0})

data[qualityCols].head()

Unnamed: 0,ExterQual,ExterCond,BsmtQual,BsmtCond,HeatingQC,KitchenQual,FireplaceQu,GarageQual,GarageCond
0,4,3,4.0,3.0,5,4,,3.0,3.0
1,3,3,4.0,3.0,5,3,3.0,3.0,3.0
2,4,3,4.0,3.0,5,4,3.0,3.0,3.0
3,3,3,3.0,4.0,4,4,4.0,3.0,3.0
4,4,3,4.0,3.0,5,4,3.0,3.0,3.0


In [7]:
# categorical columns
catCols = set(list(data))-set(list(data._get_numeric_data()))
print(catCols)

# #TRY dropping all cat cols
# data = data.drop(columns=catCols)

{'SaleCondition', 'LotShape', 'Heating', 'BsmtFinType1', 'Neighborhood', 'BsmtFinType2', 'RoofMatl', 'LandSlope', 'MSZoning', 'BldgType', 'Foundation', 'Exterior1st', 'GarageFinish', 'LandContour', 'SaleType', 'Functional', 'Street', 'CentralAir', 'HouseStyle', 'Exterior2nd', 'MasVnrType', 'LotConfig', 'GarageType', 'RoofStyle', 'PavedDrive', 'BsmtExposure', 'Utilities', 'Condition1', 'Electrical', 'Condition2'}


In [8]:
#Perform one hot encoding on all categorical columns
frames = []
salePrice = data['SalePrice']
for col in catCols:
    oneHot_encoded = pd.get_dummies(data[col])
    oneHot_encoded = oneHot_encoded.add_prefix(col + '_is_')
    frames.append(oneHot_encoded)
frames.append(salePrice)

data = data.drop(catCols, axis=1)

data = pd.concat(frames, axis=1)

In [9]:
data.keys()

Index(['SaleCondition_is_Abnorml', 'SaleCondition_is_AdjLand',
       'SaleCondition_is_Alloca', 'SaleCondition_is_Family',
       'SaleCondition_is_Normal', 'SaleCondition_is_Partial',
       'LotShape_is_IR1', 'LotShape_is_IR2', 'LotShape_is_IR3',
       'LotShape_is_Reg',
       ...
       'Electrical_is_SBrkr', 'Condition2_is_Artery', 'Condition2_is_Feedr',
       'Condition2_is_Norm', 'Condition2_is_PosA', 'Condition2_is_PosN',
       'Condition2_is_RRAe', 'Condition2_is_RRAn', 'Condition2_is_RRNn',
       'SalePrice'],
      dtype='object', length=199)

In [10]:
# Drop all Na's for now
data = data.dropna()
# Split into training and target sets
num_variables = len(data.columns)
training = data.iloc[:, 0:num_variables-1]
target = data.iloc[:,num_variables-1:]

# 80:20 train test ratio
test_size = 0.2
# This function splits the training and target sets into random train and test subsets.
# X_train and X_test are subsets of the training data
# y_train and y_test are subsets the the target data
# do we do this ourselves or should we be using scikit learn
X_train, X_test, y_train, y_test = train_test_split(training, target, test_size=test_size)

In [23]:
def evaluateModel(model, splits=5):
    start_time = timeit.default_timer()
    
#     mae = cross_val_score(model, X_test, y_test.values.ravel(), cv=splits, scoring='neg_mean_absolute_error')
#     mae = np.mean(mae)
#     print('Mean Absolute Error: ', -mae)
    
    mse = cross_val_score(model, X_test, y_test.values.ravel(), cv=splits, scoring='neg_mean_squared_error')
    print('Mean Squared Error: ', np.mean(mse * -1))
    
    rmse = math.sqrt(np.mean(mse*-1))
    print('Root Mean Squared Error: ', rmse)

    elapsed = timeit.default_timer() - start_time


# AdaBoost

In [48]:
from sklearn.ensemble import AdaBoostRegressor
adaBoost = AdaBoostRegressor()
adaBoost.fit(X_train, y_train.values.ravel())

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None)

In [24]:
evaluateModel(adaBoost)

Mean Squared Error:  2315639161.49
Root Mean Squared Error:  48121.0885318903


In [23]:
def evaluateModel(model, splits=5):
    start_time = timeit.default_timer()
    
#     mae = cross_val_score(model, X_test, y_test.values.ravel(), cv=splits, scoring='neg_mean_absolute_error')
#     mae = np.mean(mae)
#     print('Mean Absolute Error: ', -mae)
    
    mse = cross_val_score(model, X_test, y_test.values.ravel(), cv=splits, scoring='neg_mean_squared_error')
    print('Mean Squared Error: ', np.mean(mse * -1))
    
    rmse = math.sqrt(np.mean(mse*-1))
    print('Root Mean Squared Error: ', rmse)

    elapsed = timeit.default_timer() - start_time


In [14]:
#View Predicted values
predicted = adaBoost.predict(X_test)
ada_pred = y_test.copy()
ada_pred['predicted'] = predicted
ada_pred.head()

Unnamed: 0,SalePrice,predicted
542,213250,206697.674785
489,86000,193508.479212
760,127500,174041.801724
4,250000,315090.895954
690,141000,210472.893333


# XGBoost Regressor

In [15]:
#!pip3 install xgboost

In [55]:
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
evaluateModel(xgb)

Mean Squared Error:  1591264442.99
Root Mean Squared Error:  39890.656086286755


In [56]:
predicted = xgb.predict(X_test)
xgb_pred = y_test.copy()
xgb_pred['predicted'] = predicted
xgb_pred.head()

Unnamed: 0,SalePrice,predicted
542,213250,207564.75
489,86000,96091.960938
760,127500,124707.359375
4,250000,309618.15625
690,141000,179287.484375


# SVM (SVC just to test)

In [18]:
from sklearn import svm

svc_model = svm.SVC(kernel="rbf", C=1.0)

# change C (error) in hypertuning
svc_model.fit(X_train, y_train.values.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [25]:
# we get a warning because svm is splitting the data into "classes" and because saleprice is numeric there are many prices where there are only 1 of that "class"...
evaluateModel(svc_model, splits=5)
svc_predicted = svc_model.predict(X_test)
svc_pred = y_test.copy()
svc_pred["predicted"] = svc_predicted
svc_pred.head()



Mean Squared Error:  2572714225.96
Root Mean Squared Error:  50721.930424271435


Unnamed: 0,SalePrice,predicted
542,213250,135000
489,86000,135000
760,127500,135000
4,250000,135000
690,141000,135000


# SVM (SVR)

In [71]:
from sklearn import svm

svr_model = svm.SVR(kernel="poly", shrinking=False, coef0=-2000)
# coef0 only works with poly and sigmoid kernels
# it just puts that value instead of the column of 1's

# without it, this model breaks for some reason

# epsilon, degree
svr_model.fit(X_train, y_train.values.ravel())

SVR(C=1.0, cache_size=200, coef0=-2000, degree=3, epsilon=0.1, gamma='auto',
  kernel='poly', max_iter=-1, shrinking=False, tol=0.001, verbose=False)

In [69]:
evaluateModel(svr_model, splits=5)
svr_predicted = svr_model.predict(X_test)
svr_pred = y_test.copy()
svr_pred["predicted"] = svr_predicted
svr_pred.head()

Mean Squared Error:  1936931228.15
Root Mean Squared Error:  44010.58086586259


Unnamed: 0,SalePrice,predicted
542,213250,232062.181808
489,86000,90184.807437
760,127500,131833.060401
4,250000,299414.401135
690,141000,172440.696141
