In [1]:
import os
import pandas as pd
import numpy as np
import sklearn.ensemble
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

%matplotlib inline
cwd = os.getcwd()
print("Current working directory: {0}".format(cwd))

hp = pd.read_csv("./data/HousePrice2.csv", index_col = 0)
hp_res= hp[hp['MSZoning'].isin(['FV','RH','RL','RM'])]
hp_res = hp_res.dropna()
hp_res.shape

Current working directory: /Users/chereowang27/NYCDSA/Machine_Learning_I/Machine_Learning_Project/Ames_Housing_ML


(2530, 83)

In [2]:
hp_res['bath'] = hp_res['FullBath'] + 0.5*hp_res['HalfBath']

In [3]:
def label_nb(row):
    if row['Neighborhood'] in (['NridgHt','NoRidge','Veenker','StoneBr']):      
        return 'N'
    if row['Neighborhood']      \
             in (['Blmngtn','Gilbert','OldTown', 'NWAmes','NAmes','Blmngtn','BrDale','Somerst','BrkSide','Greens','NPkVill']):
        return 'NE'
    if row['Neighborhood']      \
            in (['CollgCr','SWISU', 'Edwards','SawyerW','Sawyer','ClearCr']):
        return 'W'                      
    if row['Neighborhood'] in (['Crawfor','MeadowV','Timber','Mitchel','IDOTRR']):     
        return 'S'
    
hp_res['Location']=hp_res.apply(lambda row: label_nb(row), axis = 1)
hp_res.columns

Index(['address', 'latitude', 'longitude', 'MapRefNo', 'Prop_Addr',
       'Unnamed: 0.1', 'PID', 'GrLivArea', 'SalePrice', 'MSSubClass',
       'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', '

In [4]:
hp_res = hp_res.drop(['Neighborhood','address', 'latitude', 'longitude', 'MapRefNo', 'Prop_Addr',\
       'Unnamed: 0.1',
                  'FullBath', 'HalfBath', \
                  'BsmtFullBath', 'BsmtHalfBath', \
                  'BsmtFinSF1','ExterQual', \
                  'Condition1','Condition2', \
                  'BsmtFinType1','BsmtFinType2', \
                  'BsmtQual', 'GarageQual','MoSold',   \
                  'BsmtFinSF2', 'BsmtUnfSF'], axis = 1)

catCol = ['MSSubClass', 'MSZoning','Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'BldgType', 'HouseStyle','OverallCond','RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterCond', 'Foundation',
       'BsmtCond', 'BsmtExposure', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish','GarageCond', 'PavedDrive',
       'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'MiscVal','SaleType',
       'SaleCondition','Location']


In [7]:
y = hp_res['SalePrice']
features = hp_res.drop(['SalePrice','PID'],axis = 1)
features = pd.get_dummies(features, columns = catCol, drop_first = True)

In [8]:
features.columns

Index(['GrLivArea', 'LotFrontage', 'LotArea', 'OverallQual', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       ...
       'SaleType_VWD', 'SaleType_WD ', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial', 'Location_NE', 'Location_S', 'Location_W'],
      dtype='object', length=539)

In [25]:
import time
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_test, Y_train, \
Y_test = train_test_split(features, y, test_size = 0.3, random_state = 0)

tree = tree.DecisionTreeRegressor()
randomForest = ensemble.RandomForestClassifier()
bagging      = ensemble.BaggingClassifier()

In [26]:
start= time.time()
tree.fit(X_train, Y_train)
print(time.time() - start)

0.08796286582946777


In [14]:
grid_para_forest = [{
    "n_estimators": [25,50,100],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": range(1,10),
    "min_samples_split": np.linspace(start = 2, stop = 30, num = 15, dtype = int),
    "random_state":[42]
}]
grid_search_forest = GridSearchCV(randomForest, grid_para_forest, scoring = 'accuracy', \
                                  cv = 5, n_jobs = -1 )
start= time.time()
grid_search_forest.fit(X_train, Y_train)
print(time.time() - start)



340.8156077861786


{'criterion': 'entropy',
 'min_samples_leaf': 6,
 'min_samples_split': 22,
 'n_estimators': 50,
 'random_state': 42}

In [17]:
grid_search_forest.best_score_

0.05138219145380759

In [19]:
print("The training error is: %.5f" % (1 - grid_search_forest.score(X_train, Y_train)))
print("The test     error is: %.5f" % (1 - grid_search_forest.score(X_test, Y_test)))

The training error is: 0.10954
The test     error is: 0.96706
