# House Prices - Advanced Regression Techniques (Kaggle)

Run after following notebooks are run:
1. 01 Data Cleaning
2. 02 Exploratory Data Analysis

#### Development Ideas
*  Need to scale discrete numerical variables as well

In [4]:
### set up libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
import seaborn as sns

pd.options.display.max_rows = 100

## Load Data

In [5]:
### load data (cleaned)
train = pd.read_csv('train_clean_truncated.csv')

### convert all object datatypes to category
for col in train.select_dtypes(include='object'):
    train[col] = train[col].astype('category')

### convert additional columns to category datatype
train['MSSubClass'] = train['MSSubClass'].astype('category')

train.shape

(1460, 67)

In [6]:
## separate in to predictor and response tables (x and y)
x_train = train.drop('SalePrice', axis=1)
y_train = train.SalePrice

## One Hot Encoding

In [7]:
x_train_onehot = pd.get_dummies(x_train, drop_first=True)
print(x_train_onehot.shape)
x_train_onehot.columns.to_list()

(1460, 201)


['LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 'MoSold',
 'YrSold',
 'age_sold',
 'age_remodel_sold',
 'age_garage_sold',
 'bsmt_bath',
 'bath',
 'MSSubClass_30',
 'MSSubClass_40',
 'MSSubClass_45',
 'MSSubClass_50',
 'MSSubClass_60',
 'MSSubClass_70',
 'MSSubClass_75',
 'MSSubClass_80',
 'MSSubClass_85',
 'MSSubClass_90',
 'MSSubClass_120',
 'MSSubClass_160',
 'MSSubClass_180',
 'MSSubClass_190',
 'MSZoning_FV',
 'MSZoning_RH',
 'MSZoning_RL',
 'MSZoning_RM',
 'Alley_Pave',
 'LotShape_Reg',
 'LandContour_HLS',
 'LandContour_Low',
 'LandContour_Lvl',
 'LotConfig_CulDSac',
 'LotConfig_FR',
 'LotConfig_Inside',
 'LandSlope_MOrS',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_C

## Modelling - Cross-Validation and Optimisation

### KNN

In [51]:
## instantiate knn algorithm
knn = KNeighborsRegressor(n_neighbors=8)

## cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=8)
knn_cv_scores = cross_val_score(knn, x_train_onehot.values, y_train.values, cv=kf)
knn_cv_scores, np.mean(knn_cv_scores)

(array([0.71725548, 0.72156853, 0.66662937, 0.7579926 , 0.75451236]),
 0.7235916668881759)

### Random Forest

In [30]:
## instantiate rf algorithm
rf = RandomForestRegressor()

## cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=8)
rf_cv_scores = cross_val_score(rf, x_train_onehot.values, y_train.values, cv=kf)
print(rf_cv_scores, np.mean(rf_cv_scores))

rf.fit(x_train_onehot.values, y_train.values)

[0.86977555 0.84720375 0.84399046 0.88920021 0.88200938] 0.8664358709518467


In [29]:
## List feature importance
rf_ft_imp = pd.DataFrame(data={'Features': x_train_onehot.columns, 'Gini Importance':rf.feature_importances_*100})
rf_ft_imp.sort_values(by='Gini Importance', ascending=False).head(10)

Unnamed: 0,Features,Gini Importance
2,OverallQual,53.892179
11,GrLivArea,11.373565
8,TotalBsmtSF,5.179511
16,GarageCars,4.515188
17,GarageArea,2.648732
5,BsmtFinSF1,2.399813
9,1stFlrSF,2.197964
166,CentralAir_Y,1.478233
1,LotArea,1.229211
3,OverallCond,1.196751
