# Production Model and Insights - Garage Crafters

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer

import statsmodels.api as sm

## Functions

## Load clean training and test data

In [2]:
train = pd.read_csv('../data/cleaned_data/train_clean.csv')
test = pd.read_csv('../data/cleaned_data/test_clean.csv')

## Predictors and target Variables

In [3]:
predictors = [
'Garage Area',
'Garage Cars',
'Garage Cond',
'Garage Finish',
'Garage Qual',
'Garage Type',
'Garage Yr Blt',
'Gr Liv Area',
'Neighborhood',
    
    
'Bldg Type',
 'Bsmt Cond',
 'Bsmt Exposure',
 'Bsmt Qual',
 'BsmtFin Type 1',
 'BsmtFin Type 2',
 'Central Air',
 'Electrical',
 'Exter Qual',
 'Exter Cond',
 'Fence',
 'Fireplace Qu',
 'Foundation',
 'Heating QC',
 'House Style',
 'Kitchen Qual',
 'Land Contour',
 'Lot Shape',
 'MS Zoning',
 'Mas Vnr Type',
 'Paved Drive',
 'Street'
]

In [4]:
X = train[predictors]
y = train['SalePrice']

In [5]:
#HACKY METHOD ALERT: renaming no_garage so that the onehotencoder drops this column first
X.replace('no_garage', '0_no_garage', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('no_garage', '0_no_garage', inplace=True)


## Split data into train and test sets

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 21)

In [7]:
X_train.shape, y_test.shape

((1640, 31), (411,))

## Column Transformer

In [8]:
X_cat_cols = X.select_dtypes(exclude=['number'])

In [9]:
X_cat_col_garage_mask = X_cat_cols.apply(lambda col: 'no_garage' in col.values)

In [10]:
X_cat_col_garage_mask

Garage Cond       False
Garage Finish     False
Garage Qual       False
Garage Type       False
Neighborhood      False
Bldg Type         False
Bsmt Cond         False
Bsmt Exposure     False
Bsmt Qual         False
BsmtFin Type 1    False
BsmtFin Type 2    False
Central Air       False
Electrical        False
Exter Qual        False
Exter Cond        False
Fence             False
Fireplace Qu      False
Foundation        False
Heating QC        False
House Style       False
Kitchen Qual      False
Land Contour      False
Lot Shape         False
MS Zoning         False
Mas Vnr Type      False
Paved Drive       False
Street            False
dtype: bool

In [11]:
X_cat_col_garage_list = X_cat_col_garage_mask[X_cat_col_garage_mask].index.tolist()


In [12]:
X_cat_col_other_list = X_cat_col_garage_mask[X_cat_col_garage_mask==False].index.tolist()


In [13]:
X_cat_col_garage_list

[]

In [14]:
X['Garage Cond'].unique()

array(['TA', 'Fa', '0_no_garage', 'Po', 'Gd', 'Ex'], dtype=object)

In [15]:
X['Garage Finish'].unique()

array(['RFn', 'Unf', 'Fin', '0_no_garage'], dtype=object)

In [16]:
X['Garage Qual'].unique()

array(['TA', 'Fa', '0_no_garage', 'Gd', 'Ex', 'Po'], dtype=object)

In [17]:
X['Garage Type'].unique()

array(['Attchd', 'Detchd', 'BuiltIn', 'Basment', '0_no_garage', '2Types',
       'CarPort'], dtype=object)

In [18]:
X_cat_col_other_list

['Garage Cond',
 'Garage Finish',
 'Garage Qual',
 'Garage Type',
 'Neighborhood',
 'Bldg Type',
 'Bsmt Cond',
 'Bsmt Exposure',
 'Bsmt Qual',
 'BsmtFin Type 1',
 'BsmtFin Type 2',
 'Central Air',
 'Electrical',
 'Exter Qual',
 'Exter Cond',
 'Fence',
 'Fireplace Qu',
 'Foundation',
 'Heating QC',
 'House Style',
 'Kitchen Qual',
 'Land Contour',
 'Lot Shape',
 'MS Zoning',
 'Mas Vnr Type',
 'Paved Drive',
 'Street']

In [19]:
OHE_garage = OneHotEncoder(sparse=False, handle_unknown='ignore', drop=['first'])

In [20]:
OHE_other = OneHotEncoder(sparse=False, handle_unknown='ignore', drop='first')

In [21]:
ct = make_column_transformer(
    (OHE_garage, X_cat_col_garage_list),
    (OHE_other, X_cat_col_other_list),
    remainder='passthrough'
)

## Transform training and testing data

In [22]:
X_train_ct = ct.fit_transform(X_train)



In [23]:
X_test_ct = ct.transform(X_test)



In [24]:
column_names = ct.get_feature_names_out()

In [25]:
column_names

array(['onehotencoder-2__Garage Cond_Ex',
       'onehotencoder-2__Garage Cond_Fa',
       'onehotencoder-2__Garage Cond_Gd',
       'onehotencoder-2__Garage Cond_Po',
       'onehotencoder-2__Garage Cond_TA',
       'onehotencoder-2__Garage Finish_Fin',
       'onehotencoder-2__Garage Finish_RFn',
       'onehotencoder-2__Garage Finish_Unf',
       'onehotencoder-2__Garage Qual_Ex',
       'onehotencoder-2__Garage Qual_Fa',
       'onehotencoder-2__Garage Qual_Gd',
       'onehotencoder-2__Garage Qual_Po',
       'onehotencoder-2__Garage Qual_TA',
       'onehotencoder-2__Garage Type_2Types',
       'onehotencoder-2__Garage Type_Attchd',
       'onehotencoder-2__Garage Type_Basment',
       'onehotencoder-2__Garage Type_BuiltIn',
       'onehotencoder-2__Garage Type_CarPort',
       'onehotencoder-2__Garage Type_Detchd',
       'onehotencoder-2__Neighborhood_Blueste',
       'onehotencoder-2__Neighborhood_BrDale',
       'onehotencoder-2__Neighborhood_BrkSide',
       'onehotencoder-2

## Initialize and train Linear Regression Model

In [26]:
model = LinearRegression()
model.fit(X_train_ct, y_train)

## Evaluate Model

In [27]:
# scoring model on training set
model.score(X_train_ct, y_train)

0.894954893458562

In [28]:
#scoring model on testing set
model.score(X_test_ct, y_test)

-757405448859732.9

In [29]:
np.mean(cross_val_score(model, X_test_ct, y_test, cv=5))

-1.5478124340430253e+17

In [30]:
# setting baseline
y_pred_baseline = [np.mean(y_train)] * len(y_test)

In [31]:
# scoring mean baseline on testing set
r2_score(y_test, y_pred_baseline)

-0.014887741251467856

## Transform and Make Predictions on Unseen Testing Set

In [32]:
test_features = test[predictors]
test_features_ct = ct.transform(test_features)



In [33]:
predictions = model.predict(test_features_ct)

## OLS

In [34]:
X_train_ct_df = pd.DataFrame(X_train_ct, columns=column_names)

In [35]:
y_train.reset_index(drop=True, inplace=True)
X_train_ct_df.reset_index(drop=True, inplace=True)

In [36]:
X_train_ct_df = sm.add_constant(X_train_ct_df)

In [37]:
ols = sm.OLS(y_train, X_train_ct_df).fit()

In [38]:
ols_summary = ols.summary()

In [39]:
ols_summary

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.895
Model:,OLS,Adj. R-squared:,0.886
Method:,Least Squares,F-statistic:,97.27
Date:,"Sat, 07 Oct 2023",Prob (F-statistic):,0.0
Time:,08:49:15,Log-Likelihood:,-19008.0
No. Observations:,1640,AIC:,38280.0
Df Residuals:,1507,BIC:,39000.0
Df Model:,132,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.66e+05,3.92e+04,4.230,0.000,8.9e+04,2.43e+05
onehotencoder-2__Garage Cond_Ex,-8.526e+04,4.22e+04,-2.019,0.044,-1.68e+05,-2439.238
onehotencoder-2__Garage Cond_Fa,5.431e+04,2.76e+04,1.965,0.050,85.733,1.09e+05
onehotencoder-2__Garage Cond_Gd,5.419e+04,2.86e+04,1.897,0.058,-1854.313,1.1e+05
onehotencoder-2__Garage Cond_Po,5.786e+04,2.89e+04,2.000,0.046,1114.103,1.15e+05
onehotencoder-2__Garage Cond_TA,6.558e+04,2.77e+04,2.364,0.018,1.12e+04,1.2e+05
onehotencoder-2__Garage Finish_Fin,5.25e+04,4.39e+04,1.195,0.232,-3.37e+04,1.39e+05
onehotencoder-2__Garage Finish_RFn,4.643e+04,4.38e+04,1.059,0.290,-3.96e+04,1.32e+05
onehotencoder-2__Garage Finish_Unf,4.776e+04,4.38e+04,1.091,0.275,-3.81e+04,1.34e+05

0,1,2,3
Omnibus:,529.286,Durbin-Watson:,1.996
Prob(Omnibus):,0.0,Jarque-Bera (JB):,33387.619
Skew:,-0.643,Prob(JB):,0.0
Kurtosis:,25.067,Cond. No.,1.02e+16
