In [48]:
## """
# CSC665 Artificial Section 01 Spring 2019
# Kaggle Project – Regression Analysis
# Team #27
# Prof. Alex Kalinin
# Team Members:
# Ratna Lama			Team Leader 	rlama7@mail.sfsu.edu
# Rohit Nair
# Michael Winata
# Alexey Sergeev
## """

In [49]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import os
from sklearn.preprocessing import LabelEncoder # categorical values encoder
from sklearn.model_selection import train_test_split # split data into train and test data

from sklearn.ensemble import RandomForestRegressor

# read CSV

In [50]:
df = pd.read_csv("train.csv")

In [51]:
df.shape

(1460, 81)

In [52]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Feature Engineering

In [53]:
# Split data into X and y
X = df.iloc[:, 1:-1]  # features (inputs)
y = df.iloc[:,-1]     # predictions

In [54]:
# Check for Null Values
X.isnull().sum()

MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
                 ... 
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond

In [55]:
# Fill numerical columns missing values with median. Fill character missing values with the most used value count
col_miss_val = [col for col in df.columns if df[col].isnull().any()]
print(col_miss_val)

['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


In [56]:
# Replace NaN with the most frequent values

In [57]:
for col in col_miss_val:
    if(X[col].dtype==np.dtype('O')):
        X[col]=X[col].fillna(X[col].value_counts().index[0])
    else:
        X[col]=df[col].fillna(X[col].median())

In [58]:
X.isnull().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
                ..
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual      0
TotRmsAbvGrd     0
Functional       0
Fireplaces       0
FireplaceQu      0
GarageType       0
GarageYrBlt      0
GarageFinish     0
GarageCars       0
GarageArea       0
GarageQual       0
GarageCond       0
PavedDrive       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea    

# Encode a Categorical Values

In [59]:
"""
LabelEncoder can be used to normalize labels.

It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels.
"""

'\nLabelEncoder can be used to normalize labels.\n\nIt can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels.\n'

In [60]:
# find a column that contains character value
X.select_dtypes(include=['object'])

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
1,RL,Pave,Grvl,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
2,RL,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
3,RL,Pave,Grvl,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Abnorml
4,RL,Pave,Grvl,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
5,RL,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,...,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
6,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,...,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
7,RL,Pave,Grvl,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,PosN,...,Attchd,RFn,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
8,RM,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Artery,...,Detchd,Unf,Fa,TA,Y,Gd,MnPrv,Shed,WD,Abnorml
9,RL,Pave,Grvl,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Artery,...,Attchd,RFn,Gd,TA,Y,Gd,MnPrv,Shed,WD,Normal


In [61]:
encoder = LabelEncoder()

for col in X.select_dtypes(include=['object']):
    X[col] = encoder.fit_transform(X[col])

In [62]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,0,3,3,0,4,...,0,0,2,2,2,0,2,2008,8,4
1,20,3,80.0,9600,1,0,3,3,0,2,...,0,0,2,2,2,0,5,2007,8,4
2,60,3,68.0,11250,1,0,0,3,0,4,...,0,0,2,2,2,0,9,2008,8,4
3,70,3,60.0,9550,1,0,0,3,0,0,...,0,0,2,2,2,0,2,2006,8,0
4,60,3,84.0,14260,1,0,0,3,0,2,...,0,0,2,2,2,0,12,2008,8,4


In [63]:
# check for the missing values in NULL
y.isnull().sum()

0

# Split the Data into Training and Testing Unit

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [65]:
# train_test_split

In [66]:
X.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1460, 79), (1095, 79), (365, 79), (1095,), (365,))

# Random Forest Regressor

In [67]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

%time rf.fit(X_train, y_train)

Wall time: 1.36 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [68]:
rf.score(X_train, y_train)

0.977670115907448

In [69]:
rf.score(X_test, y_test)

0.8932725808281445

In [70]:
y_hat = rf.predict(X_test)
y_hat

array([140459.  , 312966.81, 117886.  , 162820.9 , 315749.88,  84028.54,
       210890.34, 150462.  ,  86415.04, 129901.03, 154595.  , 121601.93,
       108012.5 , 210011.05, 177802.4 , 128511.  , 195687.62, 135602.25,
       118973.5 , 207703.1 , 161331.1 , 225139.35, 182315.52, 123260.  ,
       199347.28, 171795.61, 184845.09, 104834.83, 177097.47, 192503.3 ,
       124613.26, 248431.49, 174920.64, 112897.  , 252367.89, 146395.  ,
       138288.09, 202039.74, 316108.26, 105624.65, 121969.5 , 240217.77,
       119876.  , 373737.85, 132993.1 , 149528.69, 115379.49, 125403.93,
       395224.88, 144296.39, 121658.5 , 201989.  , 123077.65, 352664.55,
       139540.  , 235801.25, 196310.55, 154239.75, 143834.12, 110206.83,
        76968.  , 151411.5 , 306820.48, 280726.33, 287942.65, 215355.69,
       113162.5 , 312734.44, 113260.  , 166673.16, 127035.77, 131205.19,
       110875.5 ,  91985.5 , 437482.49, 172426.22, 306475.39, 306861.05,
       138330.75, 124605.83, 100975.04, 103028.  , 

In [71]:
y_test

892     154500
1105    325000
413     115000
522     159000
1036    315500
614      75500
218     311500
1160    146000
649      84500
887     135500
576     145000
1252    130000
1061     81000
567     214000
1108    181000
1113    134500
168     183500
1102    135000
1120    118400
67      226000
1040    155000
453     210000
670     173500
1094    129000
192     192000
123     153900
415     181134
277     141000
433     181000
1317    208900
         ...  
83      126500
1274    139000
48      113000
155      79000
973     182000
1110    188000
950     129000
1030    160000
1435    174000
1196    219210
1442    310000
481     374000
179     100000
859     250000
1213    145000
583     325000
477     380000
1405    275000
1146    180000
1166    245350
346     151500
1214    134500
1137     94000
1365    216000
654     350000
988     195000
243     120000
1342    228500
1057    248000
1418    124000
Name: SalePrice, Length: 365, dtype: int64

In [72]:
y_hat.shape, y_test.shape

((365,), (365,))

# MSE, RMSE, R^2 Score calculations

In [73]:
mse = ((y_hat - y_test) ** 2).mean()

In [74]:
rmse = np.sqrt(mse)

In [75]:
v = ((y_test - y_test.mean()) ** 2).mean()

In [76]:
mse, rmse, v

(747658555.6574131, 27343.345729032742, 7005309052.339321)

In [77]:
# R^2
score = (1 - (mse/v))
score

0.8932725808281444

# Fine Tunning

In [78]:
"""
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
"""

"\nfrom sklearn.model_selection import RandomizedSearchCV\n# Number of trees in random forest\nn_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]\n# Number of features to consider at every split\nmax_features = ['auto', 'sqrt']\n# Maximum number of levels in tree\nmax_depth = [int(x) for x in np.linspace(10, 110, num = 11)]\nmax_depth.append(None)\n# Minimum number of samples required to split a node\nmin_samples_split = [2, 5, 10]\n# Minimum number of samples required at each leaf node\nmin_samples_leaf = [1, 2, 4]\n# Method of selecting samples for training each tree\nbootstrap = [True, False]\n# Create the random grid\nrandom_grid = {'n_estimators': n_estimators,\n               'max_features': max_features,\n               'max_depth': max_depth,\n               'min_samples_split': min_samples_split,\n               'min_samples_leaf': min_samples_leaf,\n               'bootstrap': bootstrap}\nprint(random_grid)\n"

# Random Search Traning

In [79]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune

"""
rf = RandomForestRegressor()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model

rf_random.fit(X_train, y_train)
"""

'\nrf = RandomForestRegressor()\n\n# Random search of parameters, using 3 fold cross validation, \n# search across 100 different combinations, and use all available cores\n\nrf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)\n\n# Fit the random search model\n\nrf_random.fit(X_train, y_train)\n'

In [80]:
# rf_random.best_params_

# Evaluate Random Search

In [81]:
# Determine if random search yielded a better model, we compare base model with the best random search model.

# Computationally expensive so this section is commented out

"""
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy
"""


"\ndef evaluate(model, X_test, y_test):\n    predictions = model.predict(X_test)\n    errors = abs(predictions - y_test)\n    mape = 100 * np.mean(errors / y_test)\n    accuracy = 100 - mape\n    print('Model Performance')\n    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))\n    print('Accuracy = {:0.2f}%.'.format(accuracy))\n    \n    return accuracy\n"

In [82]:
# Base Model Performance



# base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
# base_model.fit(X_train, y_train)
# base_accuracy = evaluate(base_model, X_test, y_test)

In [83]:
# Best Random Model Performance

# Computationally expensive so this section is commented out
"""
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)
"""

'\nbest_random = rf_random.best_estimator_\nrandom_accuracy = evaluate(best_random, X_test, y_test)\n'

# Test DATA Set

In [84]:
test_dataset = pd.read_csv('test.csv')
test_dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [85]:
test_dataset.isnull().sum()
test_dataset = test_dataset.iloc[:,1:]

In [86]:
test_dataset.isnull().sum()

MSSubClass          0
MSZoning            4
LotFrontage       227
LotArea             0
Street              0
Alley            1352
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         16
MasVnrArea         15
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           44
                 ... 
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         1
TotRmsAbvGrd        0
Functional          2
Fireplaces          0
FireplaceQu       730
GarageType         76
GarageYrBlt        78
GarageFinish       78
GarageCars          1
GarageArea          1
GarageQual         78
GarageCond

# Feature Engineering of Test DATA setS

In [87]:
# Fill numerical columns missing values with median. Fill character missing values with the most used value count
test_col_miss_val = [col for col in test_dataset.columns if test_dataset[col].isnull().any()]
print(test_col_miss_val)

['MSZoning', 'LotFrontage', 'Alley', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType']


In [88]:
# Replace NaN with the most frequent values
for col in test_col_miss_val:
    if(test_dataset[col].dtype==np.dtype('O')):
        test_dataset[col]=test_dataset[col].fillna(
            test_dataset[col].value_counts().index[0])
    else:
        test_dataset[col]=test_dataset[col].fillna(test_dataset[col].median())

# Encode a Categorical data

In [89]:
encoder = LabelEncoder()

for col in test_dataset.select_dtypes(include=['object']):
    test_dataset[col] = encoder.fit_transform(test_dataset[col])

In [90]:
test_dataset.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,2,80.0,11622,1,0,3,3,0,4,...,120,0,0,2,2,0,6,2010,8,4
1,20,3,81.0,14267,1,0,0,3,0,0,...,0,0,0,2,0,12500,6,2010,8,4
2,60,3,74.0,13830,1,0,0,3,0,4,...,0,0,0,2,2,0,3,2010,8,4
3,60,3,78.0,9978,1,0,0,3,0,4,...,0,0,0,2,2,0,6,2010,8,4
4,120,3,43.0,5005,1,0,0,1,0,4,...,144,0,0,2,2,0,1,2010,8,4


# Prediction of Test Data Set

In [91]:
prediction = rf.predict(test_dataset)

In [92]:
print(prediction)

[125985.5  153392.25 179903.   ... 154867.53 117980.04 227227.55]


# Submission CSV of Prediction

In [93]:
submission_sample = pd.read_csv('sample_submission.csv')
# prediction = rf.predict()
# submission_sample.shape,prediction.shape

In [94]:
output = pd.DataFrame({'Id':submission_sample.Id, 'SalePrice':prediction})
output.to_csv('submission.csv', index=False)
output.head()

Unnamed: 0,Id,SalePrice
0,1461,125985.5
1,1462,153392.25
2,1463,179903.0
3,1464,184841.6
4,1465,211803.86
