# First run (light jog) at the house price kaggle challenge

A medium AI approach to the Kaggle house price prediction challenge, following the approach laid out in the first few lessons of the fast.ai machine learning course.

In [1]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split

import matplotlib as plt
#import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

### Configuring global variables 

In [2]:
# Locations of the Kaggle CSV files, relative to the notebook's working directory.
train_path, test_path = './input/train.csv', './input/test.csv'

In [3]:
# Raise pandas' display limits so wide or long frames are shown without truncation.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

### Load train & test data

At this stage no special flags are passed when loading the data. Because the files are small (and I don't yet know how to do it), the data hasn't been converted to feather format.

In [4]:
# Read the train and test CSVs with pandas' default parsing options.
raw_df, test_raw_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [5]:
# (rows, columns) of the train frame, then of the test frame.
tuple(frame.shape for frame in (raw_df, test_raw_df))

((1460, 81), (1459, 80))

In [6]:
# Transpose the last ten rows so each feature becomes a row of the display.
raw_df.tail(10).transpose()

Unnamed: 0,1450,1451,1452,1453,1454,1455,1456,1457,1458,1459
Id,1451,1452,1453,1454,1455,1456,1457,1458,1459,1460
MSSubClass,90,20,180,20,20,60,20,70,20,20
MSZoning,RL,RL,RM,RL,FV,RL,RL,RL,RL,RL
LotFrontage,60,78,35,90,62,62,85,66,68,75
LotArea,9000,9262,3675,17217,7500,7917,13175,9042,9717,9937
Street,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave
Alley,,,,,Pave,,,,,
LotShape,Reg,Reg,Reg,Reg,Reg,Reg,Reg,Reg,Reg,Reg
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub


### Handling null values in raw data

Since the plan is to use one-hot encoding, we need to remove null values from the data sets. From reviewing the data, there seem to be three simple options:

1. Drop feature
For features that have a large number of null values and don't appear to be important we can just remove them from the data set

2. Fill missing with mean
For continuous features we can simply take the mean value

3. Create 'missing' category
For categorical features we can create a value for missing data. This makes sense since a missing value may actually hold meaning, e.g. the property does not have a pool

In [7]:
# Null count per training-set column, keeping only the columns that have any nulls.
null_features_count = raw_df.isna().sum().loc[lambda counts: counts > 0]
null_features_count

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

#### Test set difference 

We need to repeat the same process for the test set, i.e. make all the same transformations. 
In particular we need to see where the test set differs from the training set so that we don't miss transformations

In [8]:
# Null count per test-set column, keeping only the columns that have any nulls.
test_null_features_count = test_raw_df.isnull().sum()
test_null_features_count = test_null_features_count[test_null_features_count > 0]

# Features that have nulls in the test set but not in the training set.
# Only this direction is collected: a feature with nulls solely in the training
# set has no entry in `test_null_features_count`, so adding it to this index
# would give the lookup below a label it cannot find. (`Index.append` also
# returns a new Index rather than modifying in place, so calling it without
# assigning the result has no effect.)
test_train_feature_diff = test_null_features_count.index.difference(null_features_count.index)

# Null counts for the test-only features.
test_null_features_count[test_train_feature_diff]

BsmtFinSF1      1
BsmtFinSF2      1
BsmtFullBath    2
BsmtHalfBath    2
BsmtUnfSF       1
Exterior1st     1
Exterior2nd     1
Functional      2
GarageArea      1
GarageCars      1
KitchenQual     1
MSZoning        4
SaleType        1
TotalBsmtSF     1
Utilities       2
dtype: int64

In [9]:
# Created lists of feature names for missing values
# Feature names grouped by how their missing values are handled.

# Features removed from the data set entirely.
large_null_features = ['MiscFeature']

# Features whose nulls are filled with the column mean.
fill_mean_features = [
    'MasVnrArea', 'GarageYrBlt', 'LotFrontage',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
    'GarageArea', 'GarageCars', 'TotalBsmtSF',
]

# Categorical features whose nulls are filled with the most frequent category.
# TODO: so far this only covers the test set; review the training set to see
# which other features could be moved here.
fill_most_common_cat_features = [
    'Exterior1st', 'Exterior2nd', 'Functional', 'KitchenQual',
    'MSZoning', 'SaleType', 'Utilities',
]

# Categorical features whose nulls become an explicit 'NA' category.
create_NA_cat_features = [
    'Alley', 'PoolQC',
    'BsmtCond', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'BsmtExposure',
    'Fence', 'FireplaceQu',
    'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
]

# Categorical features whose nulls become an explicit 'None' category.
create_none_cat_features = ['MasVnrType']

In [10]:
def handle_missing_values(df):
    """Fill or drop missing values in ``df`` in place.

    The strategy for each column is taken from the module-level feature lists:

    * ``create_NA_cat_features``        -> nulls become the string ``'NA'``
    * ``create_none_cat_features``      -> nulls become the string ``'None'``
    * ``fill_mean_features``            -> nulls become the mean of that column in ``df``
    * ``Electrical``                    -> nulls become ``'Mix'``
    * ``large_null_features``           -> columns are dropped
    * ``fill_most_common_cat_features`` -> nulls become the most frequent value
      of that column in ``df``

    Fill statistics (mean / most frequent value) are computed from ``df``
    itself, so train and test frames are each filled from their own data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    # Assign the filled column back instead of calling
    # `df[col].fillna(..., inplace=True)`: that chained form acts on an
    # intermediate Series and does not update `df` under pandas Copy-on-Write.
    for col in create_NA_cat_features:
        df[col] = df[col].fillna('NA')

    for col in create_none_cat_features:
        df[col] = df[col].fillna('None')

    for col in fill_mean_features:
        df[col] = df[col].fillna(df[col].mean())

    df['Electrical'] = df['Electrical'].fillna('Mix')

    # errors='ignore' lets the function be re-run on a frame whose columns
    # were already dropped by a previous call.
    df.drop(columns=large_null_features, inplace=True, errors='ignore')

    for col in fill_most_common_cat_features:
        df[col] = df[col].fillna(df[col].value_counts().idxmax())

In [11]:
# Clean the train frame first, then the test frame; both are modified in place.
for frame in (raw_df, test_raw_df):
    handle_missing_values(frame)

### Create first model

Firstly we need to take the log of the Sale Price so that errors on expensive houses don't dominate errors on cheaper ones (the competition is also scored on the RMSE of the log price).

Then we will create a random forest with just the default values to see what score we can achieve

In [12]:
# Replace SalePrice with its natural log.
# NOTE: this overwrites the column, so re-running the cell applies the log again.
raw_df['SalePrice'] = np.log(raw_df['SalePrice'])

In [13]:
# One-hot encode the train and test frames independently with pandas' defaults.
one_hot_encoded_df, test_one_hot_encoded_df = (
    pd.get_dummies(frame) for frame in (raw_df, test_raw_df)
)

#### Categories in train set that don't appear in test set

This is interesting, there are more features in the train set than the test set. This is caused by the one hot encoding. The training set must have categories that don't appear in the test set. I think the best way to handle this would be to use train_cats on the features, but I'll try that another time, and just drop the differences for now

In [14]:
# (rows, columns) of the encoded train frame, then of the encoded test frame.
tuple(frame.shape for frame in (one_hot_encoded_df, test_one_hot_encoded_df))

((1460, 299), (1459, 281))

In [15]:
# Encoded training columns that the encoded test frame lacks, excluding the
# target 'SalePrice' so that it stays in the training frame.
test_missing_features = (
    one_hot_encoded_df.columns
    .difference(test_one_hot_encoded_df.columns)
    .drop('SalePrice')
)
# Remove those train-only columns from the training frame.
one_hot_encoded_df.drop(columns=test_missing_features, inplace=True)

In [16]:
# Remove the target from the feature frame. errors='ignore' keeps the cell
# re-runnable once 'SalePrice' has already been dropped.
one_hot_encoded_df.drop('SalePrice', axis=1, inplace=True, errors='ignore')

# Hold out part of the training data for validation (default split size).
# A fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    one_hot_encoded_df, raw_df.SalePrice, random_state=42
)

In [17]:
# Random forest with default settings apart from the tree count.
# A fixed random_state makes the fitted forest, and therefore its score,
# repeatable for a given train/validation split.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X=X_train, y=y_train)

# Score of the fitted model on the held-out split.
rf.score(X_test, y_test)

0.8977040328146759

In [18]:
# Root-mean-squared error of the predictions on the held-out split.
y_pred = rf.predict(X_test)
holdout_mse = metrics.mean_squared_error(y_test, y_pred)
np.sqrt(holdout_mse)

0.1310735771846903

### Create test set predictions for Kaggle

Create predictions from model for Kaggle submission 

In [24]:
# Align the test columns to the columns the model was fitted on: same set, same
# order. A column missing from the test frame is added as all zeros and a
# column the model never saw is left out. When the columns already match, this
# leaves the data unchanged.
kaggle_features = test_one_hot_encoded_df.reindex(columns=X_train.columns, fill_value=0)

# The model predicts log(SalePrice); exponentiate to get back to prices.
kaggle_preds = np.exp(rf.predict(kaggle_features))
kaggle_preds.shape

(1459,)

In [25]:
# Two-column submission frame: the test-set Id next to the predicted SalePrice.
submission = test_raw_df[['Id']].assign(SalePrice=kaggle_preds)
submission.head(5)

Unnamed: 0,Id,SalePrice
0,1461,124255.930605
1,1462,147710.207473
2,1463,177514.835879
3,1464,176131.949354
4,1465,193339.92355


In [26]:
# Write the submission without the index column and confirm the file name.
filename = 'Housing Predictions 1.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')

Saved file: Housing Predictions 1.csv
