In [1]:
# Imports

import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

pd.options.display.max_rows = 80

In [2]:
# Data 

data_path = Path('../data/project')
X = pd.read_csv(data_path/'X_train_preprocessed.csv', index_col = 'Id')
y = pd.read_csv(data_path/'y_train_preprocessed.csv', index_col = 'Id')
X_test = pd.read_csv(data_path/'X_test_preprocessed.csv', index_col = 'Id')

X_full = pd.read_csv(data_path/'train.csv', index_col = 'Id')

# Feature engineering
* Total frontage 

In [3]:
# Finding out which features are not standard at houses

mask = [False if i == 0 else True for i in X_full.isna().sum()]
has_na_cols = X_full.columns[mask]
has_na_cols

Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')

In [4]:
def create_features(X):

    X = X.copy()

    # Total square feet
    X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']

    # Boolean for if the property has a various features
    feats = ['LotFrontage', 'Alley', 'MasVnrType', 'BsmtQual', 'Electrical', 'FireplaceQu', 'GarageType', 'PoolQC', 'Fence',
    'MiscFeature']
    for feature in feats:
        X['has_'+feature] = X_full[feature].apply(lambda x: 1 if x == x else 0)

    return X

X_engineered = create_features(X)
X_test_engineered = create_features(X_test)

In [5]:
# Testing to see if the binary 'has_feature' engineering worked

feats = ['LotFrontage', 'Alley', 'MasVnrType', 'BsmtQual', 'Electrical', 'FireplaceQu', 'GarageType', 'PoolQC', 'Fence',
    'MiscFeature']
for i in feats:
    print(f'{i} percentage that isn\'t NA:', X_engineered['has_'+i].sum() / X_engineered.shape[0]*100)

LotFrontage percentage that isn't NA: 82.26027397260273
Alley percentage that isn't NA: 6.232876712328768
MasVnrType percentage that isn't NA: 99.45205479452055
BsmtQual percentage that isn't NA: 97.46575342465754
Electrical percentage that isn't NA: 99.93150684931507
FireplaceQu percentage that isn't NA: 52.73972602739726
GarageType percentage that isn't NA: 94.45205479452055
PoolQC percentage that isn't NA: 0.4794520547945206
Fence percentage that isn't NA: 19.246575342465754
MiscFeature percentage that isn't NA: 3.6986301369863015


In [6]:
X_train.info("")

NameError: name 'X_train' is not defined

In [7]:
# Baseline linear model

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(X_engineered, y, test_size = 0.2)

model_rf = RandomForestRegressor(n_jobs=-1)
model_rf.fit(X_train, y_train)

  model_rf.fit(X_train, y_train)


RandomForestRegressor(n_jobs=-1)

In [8]:
# Getting baseline RMSE

from sklearn.metrics import mean_squared_error as mse

y_pred = model_rf.predict(X_val)

# Reversing the log target
y_pred = np.exp(y_pred)

y_pred_train = model_rf.predict(X_train)
y_pred_train = np.exp(y_pred_train)

y_val = np.exp(y_val)
rmse_baseline = mse(y_val, y_pred, squared=False)
rmse_train = mse(np.exp(y_train), y_pred_train, squared=False)

print(f'The training RMSE of my RF baseline is: {rmse_train}')
print(f'The validation RMSE of my RF baseline is: {rmse_baseline}')

The training RMSE of my RF baseline is: 10808.99275828359
The validation RMSE of my RF baseline is: 32431.358892526747


In [13]:
X_val

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,has_LotFrontage,has_Alley,has_MasVnrType,has_BsmtQual,has_Electrical,has_FireplaceQu,has_GarageType,has_PoolQC,has_Fence,has_MiscFeature
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
541,20,85.0,14601,9,5,2006,2006,584.0,1260,0,...,1,0,1,1,1,1,1,0,0,0
323,60,86.0,10380,7,5,1986,1987,172.0,28,1474,...,1,0,1,1,1,1,1,0,1,0
256,60,66.0,8738,7,5,1999,1999,302.0,0,0,...,1,0,1,1,1,1,1,0,0,0
881,20,60.0,7024,5,5,2005,2006,0.0,980,0,...,1,0,1,1,1,0,1,0,0,0
1102,20,61.0,9758,5,5,1971,1971,0.0,412,287,...,1,0,1,1,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,60,100.0,13000,6,6,1968,1968,576.0,448,0,...,1,0,1,1,1,1,1,0,0,0
459,70,0.0,5100,8,7,1925,1996,0.0,0,0,...,0,1,1,1,1,1,1,0,1,0
386,120,43.0,3182,8,5,2004,2005,16.0,24,0,...,1,0,1,1,1,1,1,0,0,0
41,20,84.0,8658,6,5,1965,1965,101.0,643,0,...,1,0,1,1,1,1,1,0,1,0


In [9]:
# Checking out the feature premutation importances
import eli5
from eli5.sklearn import PermutationImportance

permuter = PermutationImportance(
    model_rf,
    n_iter = 5
)
permuter.fit(X_val, y_val)

PermutationImportance(estimator=RandomForestRegressor(n_jobs=-1))

In [14]:
# Viewing the top permutation importance features
feature_names = X_train.columns.to_list()

eli5.show_weights(
    permuter,
    top=10,
    feature_names = feature_names
)

Weight,Feature
0.0000  ± 0.0000,TotalSF
0.0000  ± 0.0000,OverallQual
0.0000  ± 0.0000,YearRemodAdd
0.0000  ± 0.0000,GarageCars
0.0000  ± 0.0000,BsmtFinSF1
0.0000  ± 0.0000,GarageArea
0.0000  ± 0.0000,GrLivArea
0.0000  ± 0.0000,KitchenQual
0.0000  ± 0.0000,LotArea
0.0000  ± 0.0000,YearBuilt


In [66]:
# Manually removing any features that a below 0.00001 permutation importance score
cutoff = 0.00001

mask = permuter.feature_importances_ >= cutoff

new_features = X_engineered.columns[mask]

X_trans = X_engineered.copy()
X_trans = X_trans[new_features]
X_test_trans = X_test_engineered.copy()
X_test_trans = X_test_trans[new_features]
print(f'Shape of X_train before removing features: {X_engineered.shape}')
print('Shape after removing features: ', X_trans.shape)

Shape of X_train before removing features: (1460, 90)
Shape after removing features:  (1460, 49)


In [None]:
# Think about removing any outliers:

# Remove the most extreme 1% prices,
# the most extreme .1% latitudes, &
# the most extreme .1% longitudes
df = df[(df['price'] >= np.percentile(df['price'], 0.5)) & 
        (df['price'] <= np.percentile(df['price'], 99.5)) & 
        (df['latitude'] >= np.percentile(df['latitude'], 0.05)) & 
        (df['latitude'] < np.percentile(df['latitude'], 99.95)) &
        (df['longitude'] >= np.percentile(df['longitude'], 0.05)) & 
        (df['longitude'] <= np.percentile(df['longitude'], 99.95))]


In [68]:
# Export data
path = '../data/project/'

# Training data
X_enginee.to_csv(path+'X_train_engineered.csv')
y.to_csv(path+'y_train_engineered.csv')

# Test data
X_test_trans.to_csv(path+'X_test_engineered.csv')