# House Prices - Advanced Regression Techniques (Kaggle)

Run this notebook after the following notebooks have been run:
1. 01 Data Cleaning
2. 02 Exploratory Data Analysis

#### Development Ideas
* Discrete numerical variables need to be scaled as well

In [30]:
### set up libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score
import seaborn as sns

### show up to 100 rows when pandas displays a frame or series
pd.set_option('display.max_rows', 100)

## Load Data

In [31]:
### read the cleaned training data
train = pd.read_csv('train_clean_truncated.csv')

### cast to category: every object-dtype column, plus MSSubClass, which the
### object-dtype selection does not pick up (presumably stored as numeric codes)
categorical_cols = train.select_dtypes(include='object').columns.to_list() + ['MSSubClass']
train = train.astype({name: 'category' for name in categorical_cols})

### (rows, columns) of the loaded table
train.shape

(1460, 67)

In [32]:
## split the table into the response (y) and its predictors (x)
y_train = train['SalePrice']
x_train = train.drop(columns='SalePrice')

## One Hot Encoding

In [33]:
## One-hot encode the categorical predictors; drop_first=True omits the first
## level of each variable so its dummy columns are not perfectly collinear.
## dtype=float keeps the whole frame numeric: pandas >= 2.0 emits boolean dummies
## by default, and `.values` on a frame mixing bool and numeric columns yields an
## object-dtype array rather than a float matrix.
x_train_onehot = pd.get_dummies(x_train, drop_first=True, dtype=float)
print(x_train_onehot.shape)
x_train_onehot.columns.to_list()

(1460, 201)


['LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 'MoSold',
 'YrSold',
 'age_sold',
 'age_remodel_sold',
 'age_garage_sold',
 'bsmt_bath',
 'bath',
 'MSSubClass_30',
 'MSSubClass_40',
 'MSSubClass_45',
 'MSSubClass_50',
 'MSSubClass_60',
 'MSSubClass_70',
 'MSSubClass_75',
 'MSSubClass_80',
 'MSSubClass_85',
 'MSSubClass_90',
 'MSSubClass_120',
 'MSSubClass_160',
 'MSSubClass_180',
 'MSSubClass_190',
 'MSZoning_FV',
 'MSZoning_RH',
 'MSZoning_RL',
 'MSZoning_RM',
 'Alley_Pave',
 'LotShape_Reg',
 'LandContour_HLS',
 'LandContour_Low',
 'LandContour_Lvl',
 'LotConfig_CulDSac',
 'LotConfig_FR',
 'LotConfig_Inside',
 'LandSlope_MOrS',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_C

## Modelling - Cross-Validation and Optimisation

### KNN

In [34]:
## 5-fold cross-validation, shuffled with a fixed seed so the folds are reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=8)

## KNN regressor that uses the 10 nearest neighbours
knn = KNeighborsRegressor(n_neighbors=10)

## one score per fold; no `scoring` is passed, so the estimator's default scorer is used
knn_cv_scores = cross_val_score(
    estimator=knn,
    X=x_train_onehot.to_numpy(),
    y=y_train.to_numpy(),
    cv=kf,
)
knn_cv_scores

array([0.71570021, 0.70393951, 0.67314234, 0.74896047, 0.75320094])