In [6]:
# Imports

import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split, KFold, cross_val_score, validation_curve, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder, BinaryEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import SelectKBest, f_classif, chi2, SelectPercentile
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
pd.options.display.max_rows = 80


from sklearn.metrics import mean_squared_error as mse

In [7]:
# Importing the data 

data_path = Path('../data/project')
X = pd.read_csv(data_path/'X_train_engineered.csv', index_col = 'Id')
y = pd.read_csv(data_path/'y_train_engineered.csv', index_col = 'Id')
X_test = pd.read_csv(data_path/'X_test_engineered.csv', index_col = 'Id')

In [8]:
my_data = pd.concat([X, y], axis=1)
my_data.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,has_Alley,has_MasVnrType,has_BsmtQual,has_Electrical,has_FireplaceQu,has_GarageType,has_PoolQC,has_Fence,has_MiscFeature,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,1,1,1,0,1,0,0,0,12.247694
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,1,1,1,1,1,0,0,0,12.109011
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,1,1,1,1,1,0,0,0,12.317167
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,1,1,1,1,1,0,0,0,11.849398
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,1,1,1,1,1,0,0,0,12.429216


In [9]:
# Splitting out data into training and validation sets
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2)

# Un-log our y_val
y_val = np.exp(y_val)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1168, 90), (292, 90), (1168, 1), (292, 1))

# Fitting some models:
* Baseline linear model
* Random Forest
* XGBoost Regression
* Ridge regression

# BASIC LINEAR REGRESSOR WITH NON ENGINEERED DATA

In [10]:
# Baseline average guesser
data_path = Path('../data/project')
df = pd.read_csv(data_path/'train.csv', index_col = 'Id')
df_test = pd.read_csv(data_path/'test.csv', index_col = 'Id')

# Creating our dataset splits
target = 'SalePrice'
X_full = df.drop(columns=target)
y_full = df[target]
X_t, X_v, y_t, y_v = train_test_split(X_full, y_full, test_size = 0.2, random_state = 42)

# Building our predictions
y_pred = pd.Series([y_full.mean()] * len(y_v))

# Getting RMSE
rmse_mean_baseline = mse(y_v, y_pred, squared=False)

print(f'The validation RMSE of my mean baseline model is: {rmse_mean_baseline}')

The validation RMSE of my mean baseline model is: 87605.1275187076


# LINEAR REGRESSION MODEL WITH NON ENGINEERED DATA

In [11]:
# Baseline linear model

model_lr = Pipeline(steps=[
    ('encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('randomforestregressor', LinearRegression(n_jobs = -1))
])
model_lr.fit(X_t, y_t);

NameError: name 'LinearRegression' is not defined

In [12]:
# Getting baseline RMSE


y_pred = model_lr.predict(X_v)

# Reversing the log target
#y_pred = np.exp(y_pred)
#y_pred = pd.Series(y_pred)
rmse_baseline = mse(y_v, y_pred, squared=False)

print(f'The validation RMSE of my baseline is: {rmse_baseline}')

NameError: name 'model_lr' is not defined

# RANDOM FOREST MODEL WITH ENGINEERED DATA

In [13]:
X_train.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'MSZoning', 'Street', 'Alley', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
       'Electrical', 'K

In [14]:
# Baseline linear model

model_rf = Pipeline(steps=[
    ('randomforestregressor', RandomForestRegressor(n_jobs = -1))
])
model_rf.fit(X_train, y_train);

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [15]:
# Results: 

y_pred = model_rf.predict(X_val)
y_pred_train = model_rf.predict(X_train)

# Reversing the log target
y_pred = np.exp(y_pred)
y_pred_train = np.exp(y_pred_train)

rmse_rf = mse(y_val, y_pred, squared=False)
rmse_rf_train = mse(np.exp(y_train), y_pred_train, squared=False)

print(f'The training RMSE of my Random Forest model is: {rmse_rf_train}')
print(f'The validation RMSE of my Random Forest model is: {rmse_rf}')

The training RMSE of my Random Forest model is: 12000.013284947223
The validation RMSE of my Random Forest model is: 23384.888945257793


In [16]:
# XGBoost regression
from xgboost import XGBRegressor

model_gb = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy = 'constant', fill_value = 0), 
    XGBRegressor(n_estimators=200, objective='reg:squarederror', n_jobs=-1)
)
model_gb.fit(X_train, y_train)
# Results: 

y_pred = model_gb.predict(X_val)

# Reversing the log target
y_pred = np.exp(y_pred)

rmse_gb = mse(y_val, y_pred, squared=False)

print(f'The validation RMSE of my XGBoost model is: {rmse_gb}')

The validation RMSE of my XGBoost model is: 23253.774617677627


In [17]:
# Feature selection using feature permutation:


In [18]:
# Hyper paramter tuning:

from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

param_distributions = { 
    'randomforestregressor__n_estimators': randint(50, 500), 
    'randomforestregressor__max_depth': [5, 10, 15, 20, None], 
    'randomforestregressor__max_features': uniform(0, 1), 
}

search = RandomizedSearchCV(
    model_rf, 
    param_distributions=param_distributions, 
    n_iter=5, 
    cv=2, 
    scoring='neg_mean_absolute_error', 
    verbose=10, 
    return_train_score=True, 
    n_jobs=-1
)

search.fit(X_train, y_train);

Fitting 2 folds for each of 5 candidates, totalling 10 fits
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [19]:
print('Best hyperparameters', search.best_params_)
print('Cross-validation MAE', -search.best_score_)
model = search.best_estimator_

Best hyperparameters {'randomforestregressor__max_depth': 15, 'randomforestregressor__max_features': 0.11447064133198703, 'randomforestregressor__n_estimators': 494}
Cross-validation MAE 0.09839972763119673


In [20]:
y_pred = model.predict(X_val)
y_pred_train = model.predict(X_train)

y_pred = np.exp(y_pred)
y_pred_train = np.exp(y_pred_train)


rmse_rf = mse(y_val, y_pred, squared=False)
rmse_rf_train = mse(np.exp(y_train), y_pred_train, squared=False)

print(f'The training RMSE of my Random Forest model is: {rmse_rf_train}')
print(f'The validation RMSE of my Random Forest model is: {rmse_rf}')

The training RMSE of my Random Forest model is: 11953.99858110574
The validation RMSE of my Random Forest model is: 22946.131089422874


# Creating a predict function that also displays the shapley scores/plot