# Production Model and Insights - Garage Crafters

In [386]:
import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer

import statsmodels.api as sm

## Load clean training and test data

In [387]:
train = pd.read_csv('../data/cleaned_data/train_clean.csv')
test = pd.read_csv('../data/cleaned_data/test_clean.csv')

## Predictors and target Variables

In [388]:
predictors = [
'Garage Area',
'Garage Cars',
'Garage Cond',
'Garage Finish',
'Garage Qual',
'Garage Type',
'Garage Yr Blt',
'Gr Liv Area',
'Neighborhood',
]

In [389]:
X = train[predictors]
y = train['SalePrice']

## Split data into train and test sets

In [390]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 21)

In [391]:
X_train.shape, y_test.shape

((1640, 9), (411,))

## Column Transformer

In [392]:
categorical_cols = train[predictors].select_dtypes(exclude=['number']).columns.tolist()

In [393]:
categorical_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore', drop='first')

In [394]:
ct = make_column_transformer(
    (categorical_transformer, categorical_cols),
    remainder='passthrough'
)

## Transform training and testing data

In [395]:
X_train_ct = ct.fit_transform(X_train)



In [396]:
X_test_ct = ct.transform(X_test)

In [397]:
column_names = ct.get_feature_names_out()

In [398]:
column_names

array(['onehotencoder__Garage Cond_Fa', 'onehotencoder__Garage Cond_Gd',
       'onehotencoder__Garage Cond_Po', 'onehotencoder__Garage Cond_TA',
       'onehotencoder__Garage Cond_no_garage',
       'onehotencoder__Garage Finish_RFn',
       'onehotencoder__Garage Finish_Unf',
       'onehotencoder__Garage Finish_no_garage',
       'onehotencoder__Garage Qual_Fa', 'onehotencoder__Garage Qual_Gd',
       'onehotencoder__Garage Qual_Po', 'onehotencoder__Garage Qual_TA',
       'onehotencoder__Garage Qual_no_garage',
       'onehotencoder__Garage Type_Attchd',
       'onehotencoder__Garage Type_Basment',
       'onehotencoder__Garage Type_BuiltIn',
       'onehotencoder__Garage Type_CarPort',
       'onehotencoder__Garage Type_Detchd',
       'onehotencoder__Garage Type_no_garage',
       'onehotencoder__Neighborhood_Blueste',
       'onehotencoder__Neighborhood_BrDale',
       'onehotencoder__Neighborhood_BrkSide',
       'onehotencoder__Neighborhood_ClearCr',
       'onehotencoder__Nei

In [399]:
X_train_ct_df = pd.DataFrame(X_train_ct, columns=column_names)

## Drop 'no_garage' Dummy Columns

In [400]:
# This code written with assistance from ChatGPT
no_garage_columns = [col for col in X_train_ct_df.columns if 'no_garage' in col]

if no_garage_columns:
    X_train_ct_df.drop(columns=no_garage_columns, inplace=True)
else:
    categorical_cols = train[predictors].select_dtypes(exclude=['number']).columns.tolist()
    first_columns_to_drop = [f"{col}_1" for col in categorical_cols if f"{col}_1" in X_train_ct_df.columns]
    X_train_ct_df.drop(columns=first_columns_to_drop, inplace=True)


In [401]:
no_garage_columns = [col for col in X_train_ct_df.columns if 'no_garage' in col]

In [402]:
X_train_ct_df.drop(columns=no_garage_columns, inplace=True)

## Initialize and train Linear Regression Model

In [403]:
model = LinearRegression()
model.fit(X_train_ct, y_train)

## Evaluate Model

In [404]:
# scoring model on training set
model.score(X_train_ct, y_train)

0.8009134303214196

In [405]:
#scoring model on testing set
model.score(X_test_ct, y_test)

0.7393897169564246

In [406]:
np.mean(cross_val_score(model, X_test_ct, y_test, cv=5))

-1.2610089418255112e+17

In [407]:
# setting baseline
y_pred_baseline = [np.mean(y_train)] * len(y_test)

In [408]:
# scoring mean baseline on testing set
r2_score(y_test, y_pred_baseline)

-0.014887741251467856

## Transform and Make Predictions on Unseen Testing Set

In [409]:
test_features = test[predictors]
test_features_ct = ct.transform(test_features)

In [410]:
predictions = model.predict(test_features_ct)

In [411]:
predictions_df = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})

## OLS

In [412]:
y_train.reset_index(drop=True, inplace=True)
X_train_ct_df.reset_index(drop=True, inplace=True)

In [413]:
X_train_ct_df = sm.add_constant(X_train_ct_df)

In [414]:
ols = sm.OLS(y_train, X_train_ct_df).fit()

In [415]:
ols_summary = ols.summary()

In [416]:
ols_summary

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.801
Model:,OLS,Adj. R-squared:,0.795
Method:,Least Squares,F-statistic:,139.3
Date:,"Fri, 06 Oct 2023",Prob (F-statistic):,0.0
Time:,06:50:19,Log-Likelihood:,-19533.0
No. Observations:,1640,AIC:,39160.0
Df Residuals:,1593,BIC:,39410.0
Df Model:,46,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.423e+04,1.02e+04,5.317,0.000,3.42e+04,7.42e+04
onehotencoder__Garage Cond_Fa,1.545e+05,4.6e+04,3.356,0.001,6.42e+04,2.45e+05
onehotencoder__Garage Cond_Gd,1.477e+05,4.78e+04,3.089,0.002,5.39e+04,2.41e+05
onehotencoder__Garage Cond_Po,1.381e+05,4.83e+04,2.862,0.004,4.35e+04,2.33e+05
onehotencoder__Garage Cond_TA,1.62e+05,4.54e+04,3.570,0.000,7.3e+04,2.51e+05
onehotencoder__Garage Finish_RFn,-1.892e+04,2683.528,-7.050,0.000,-2.42e+04,-1.37e+04
onehotencoder__Garage Finish_Unf,-2.106e+04,3151.191,-6.683,0.000,-2.72e+04,-1.49e+04
onehotencoder__Garage Qual_Fa,-1.827e+05,3.74e+04,-4.885,0.000,-2.56e+05,-1.09e+05
onehotencoder__Garage Qual_Gd,-1.45e+05,3.91e+04,-3.710,0.000,-2.22e+05,-6.83e+04

0,1,2,3
Omnibus:,314.99,Durbin-Watson:,2.032
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5744.391
Skew:,0.355,Prob(JB):,0.0
Kurtosis:,12.141,Cond. No.,316000.0


In [417]:
#This cell written with the help of ChatGPT
ols_summary_df = pd.read_html(ols_summary.tables[1].as_html(), header=0, index_col=0)[0]

In [418]:
ols_summary_df.head(60)

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
const,54230.0,10200.0,5.317,0.0,34200.0,74200.0
onehotencoder__Garage Cond_Fa,154500.0,46000.0,3.356,0.001,64200.0,245000.0
onehotencoder__Garage Cond_Gd,147700.0,47800.0,3.089,0.002,53900.0,241000.0
onehotencoder__Garage Cond_Po,138100.0,48300.0,2.862,0.004,43500.0,233000.0
onehotencoder__Garage Cond_TA,162000.0,45400.0,3.57,0.0,73000.0,251000.0
onehotencoder__Garage Finish_RFn,-18920.0,2683.528,-7.05,0.0,-24200.0,-13700.0
onehotencoder__Garage Finish_Unf,-21060.0,3151.191,-6.683,0.0,-27200.0,-14900.0
onehotencoder__Garage Qual_Fa,-182700.0,37400.0,-4.885,0.0,-256000.0,-109000.0
onehotencoder__Garage Qual_Gd,-145000.0,39100.0,-3.71,0.0,-222000.0,-68300.0
onehotencoder__Garage Qual_Po,-193900.0,54800.0,-3.541,0.0,-301000.0,-86500.0


In [419]:
ols_summary_df.sort_values(by='coef', ascending=False).head(60)

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
onehotencoder__Garage Cond_TA,162000.0,45400.0,3.57,0.0,73000.0,251000.0
onehotencoder__Garage Cond_Fa,154500.0,46000.0,3.356,0.001,64200.0,245000.0
onehotencoder__Garage Cond_Gd,147700.0,47800.0,3.089,0.002,53900.0,241000.0
onehotencoder__Garage Cond_Po,138100.0,48300.0,2.862,0.004,43500.0,233000.0
onehotencoder__Neighborhood_GrnHill,112500.0,27300.0,4.119,0.0,58900.0,166000.0
onehotencoder__Neighborhood_StoneBr,95450.0,10800.0,8.849,0.0,74300.0,117000.0
onehotencoder__Neighborhood_NridgHt,82230.0,9498.121,8.657,0.0,63600.0,101000.0
onehotencoder__Garage Type_Attchd,60000.0,9599.862,6.25,0.0,41200.0,78800.0
const,54230.0,10200.0,5.317,0.0,34200.0,74200.0
onehotencoder__Garage Type_BuiltIn,50550.0,10400.0,4.864,0.0,30200.0,70900.0
