In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [433]:
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.pipeline import Pipeline

from sklearn.model_selection import \
    KFold, RepeatedKFold, GridSearchCV, \
    cross_validate, train_test_split

from ipywidgets import *
from IPython.display import display

import warnings



In [412]:
df_train = pd.read_csv('./datasets/df_train.csv')
df_test = pd.read_csv('./datasets/df_test.csv')

In [413]:
df_train.columns

Index(['id', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area', 'street',
       'alley', 'lot_shape', 'land_contour', 'utilities', 'lot_config',
       'land_slope', 'neighborhood', 'condition_1', 'condition_2', 'bldg_type',
       'house_style', 'overall_qual', 'overall_cond', 'year_built',
       'year_remod/add', 'roof_style', 'roof_matl', 'exterior_1st',
       'exterior_2nd', 'mas_vnr_type', 'mas_vnr_area', 'exter_qual',
       'exter_cond', 'foundation', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure',
       'bsmtfin_type_1', 'bsmtfin_sf_1', 'bsmtfin_type_2', 'bsmtfin_sf_2',
       'bsmt_unf_sf', 'total_bsmt_sf', 'heating', 'heating_qc', 'central_air',
       'electrical', '1st_flr_sf', '2nd_flr_sf', 'low_qual_fin_sf',
       'gr_liv_area', 'bsmt_full_bath', 'bsmt_half_bath', 'full_bath',
       'half_bath', 'bedroom_abvgr', 'kitchen_abvgr', 'kitchen_qual',
       'totrms_abvgrd', 'functional', 'fireplaces', 'fireplace_qu',
       'garage_type', 'garage_yr_blt', 'garage_finish',

In [426]:
df_train = df_train.dropna(how='any',axis=1)

In [427]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 72 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               2051 non-null   int64 
 1   ms_subclass      2051 non-null   int64 
 2   ms_zoning        2051 non-null   int64 
 3   lot_area         2051 non-null   int64 
 4   street           2051 non-null   int64 
 5   alley            2051 non-null   int64 
 6   lot_shape        2051 non-null   int64 
 7   land_contour     2051 non-null   int64 
 8   utilities        2051 non-null   int64 
 9   lot_config       2051 non-null   int64 
 10  land_slope       2051 non-null   int64 
 11  neighborhood     2051 non-null   object
 12  condition_1      2051 non-null   int64 
 13  condition_2      2051 non-null   int64 
 14  bldg_type        2051 non-null   int64 
 15  house_style      2051 non-null   int64 
 16  overall_qual     2051 non-null   int64 
 17  overall_cond     2051 non-null   

In [457]:
X = df_train.drop(columns='saleprice')
y = df_train['saleprice']

In [458]:
X_train,X_val,y_train,y_val = train_test_split(X, y, random_state = 42)

In [459]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1538 entries, 532 to 860
Data columns (total 71 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               1538 non-null   int64 
 1   ms_subclass      1538 non-null   int64 
 2   ms_zoning        1538 non-null   int64 
 3   lot_area         1538 non-null   int64 
 4   street           1538 non-null   int64 
 5   alley            1538 non-null   int64 
 6   lot_shape        1538 non-null   int64 
 7   land_contour     1538 non-null   int64 
 8   utilities        1538 non-null   int64 
 9   lot_config       1538 non-null   int64 
 10  land_slope       1538 non-null   int64 
 11  neighborhood     1538 non-null   object
 12  condition_1      1538 non-null   int64 
 13  condition_2      1538 non-null   int64 
 14  bldg_type        1538 non-null   int64 
 15  house_style      1538 non-null   int64 
 16  overall_qual     1538 non-null   int64 
 17  overall_cond     1538 non-null  

In [460]:
ctx = ColumnTransformer(transformers = [
    ('ohe',OneHotEncoder(
        drop='first',
        sparse=False,
        handle_unknown='ignore'),
     ['neighborhood','roof_style','roof_matl','exterior_1st','exterior_2nd',
      'heating','foundation','sale_type'])
],remainder='passthrough')

In [461]:
pipe = Pipeline(
    steps = [
    ('ctx',ctx),
    ('ss',StandardScaler()),
])

In [462]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('ctx',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore',
                                                                sparse=False),
                                                  ['neighborhood', 'roof_style',
                                                   'roof_matl', 'exterior_1st',
                                                   'exterior_2nd', 'heating',
                                                   'foundation',
                                                   'sale_type'])])),
                ('ss', StandardScaler())])

In [463]:
X_train = pipe.transform(X_train)
X_val = pipe.transform(X_val)

In [464]:
X_train = pd.DataFrame(data = X_train, columns = pipe.get_feature_names_out())
X_val = pd.DataFrame(data = X_val, columns = pipe.get_feature_names_out())

In [465]:
x = [(X_train.columns[col], variance_inflation_factor(X_train,col)) for col in range(len(X_train.columns))]

  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)


In [466]:
sorted(x,key=lambda x: x[1],reverse=True)

[('remainder__1st_flr_sf', inf),
 ('remainder__id', 257.89400639229234),
 ('remainder__utilities', nan),
 ('remainder__2nd_flr_sf', inf),
 ('remainder__low_qual_fin_sf', inf),
 ('remainder__gr_liv_area', inf),
 ('remainder__yr_sold', 245.87688986417564),
 ('remainder__pid_1', 163.407102843969),
 ('ohe__exterior_2nd_VinylSd', 129.91226643404457),
 ('ohe__exterior_1st_VinylSd', 109.11720613532914),
 ('ohe__exterior_2nd_MetalSd', 95.46770870792264),
 ('remainder__pid_2', 87.18021250074874),
 ('ohe__exterior_1st_MetalSd', 84.02964640704673),
 ('ohe__roof_style_Gable', 77.54629431336515),
 ('ohe__roof_style_Hip', 74.05325815388464),
 ('ohe__exterior_2nd_HdBoard', 58.064169462467376),
 ('ohe__exterior_2nd_Wd Sdng', 55.19960706370978),
 ('ohe__exterior_1st_HdBoard', 51.17821876761946),
 ('ohe__exterior_1st_Wd Sdng', 46.73171732777966),
 ('ohe__exterior_2nd_CmentBd', 46.70294493598491),
 ('ohe__exterior_1st_CemntBd', 43.14450701984665),
 ('ohe__exterior_2nd_Plywood', 38.457948696336295),
 ('oh

In [467]:
X_train.drop(columns=['remainder__1st_flr_sf','remainder__id','remainder__utilities',
                      'remainder__2nd_flr_sf','remainder__low_qual_fin_sf'],inplace=True)
X_val.drop(columns=['remainder__1st_flr_sf','remainder__id','remainder__utilities',
                      'remainder__2nd_flr_sf','remainder__low_qual_fin_sf'],inplace=True)

In [468]:
poly = PolynomialFeatures(
degree = 2,
include_bias=False)

In [469]:
poly.fit(X_train,y_train)

PolynomialFeatures(include_bias=False)

In [470]:
X_train = poly.transform(X_train)

In [471]:
X_val = poly.transform(X_val)

In [475]:
X_train = pd.DataFrame(data = X_train, columns = poly.get_feature_names_out())
X_val = pd.DataFrame(data = X_val, columns = poly.get_feature_names_out())

In [476]:

X_train.to_csv('./datasets/X_train.csv')

In [477]:
X_val.to_csv('./datasets/X_val.csv')

In [478]:
y_train.to_csv('./datasets/y_train.csv')
y_val.to_csv('./datasets/y_val.csv')

In [377]:
ctx = ColumnTransformer(transformers = [
    ('ohe',OneHotEncoder(
        drop='first',
        sparse=False,
        handle_unknown='ignore'),
     ['neighborhood','roof_style','roof_matl','exterior_1st','exterior_2nd',
      'heating','foundation','sale_type'])
],remainder='passthrough')

In [378]:
pipe = Pipeline(
    steps = [
    ('ctx',ctx),
    ('ss',StandardScaler()),
])

In [379]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('ctx',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore',
                                                                sparse=False),
                                                  ['neighborhood', 'roof_style',
                                                   'roof_matl', 'exterior_1st',
                                                   'exterior_2nd', 'heating',
                                                   'foundation',
                                                   'sale_type'])])),
                ('ss', StandardScaler())])

In [380]:
X_train = pipe.transform(X_train)
X_val = pipe.transform(X_val)

In [381]:
X_train = pd.DataFrame(data = X_train, columns = pipe.get_feature_names_out())

In [382]:
X_val = pd.DataFrame(data = X_val, columns = pipe.get_feature_names_out())

In [383]:
x = [(X_train.columns[col], variance_inflation_factor(X_train,col)) for col in range(len(X_train.columns))]

  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)


In [384]:
sorted(x,key=lambda x: x[1],reverse=True)

[('remainder__1st_flr_sf', inf),
 ('remainder__id', 257.89400639229234),
 ('remainder__utilities', nan),
 ('remainder__2nd_flr_sf', inf),
 ('remainder__low_qual_fin_sf', inf),
 ('remainder__gr_liv_area', inf),
 ('remainder__yr_sold', 245.87688986417564),
 ('remainder__pid_1', 163.407102843969),
 ('ohe__exterior_2nd_VinylSd', 129.91226643404457),
 ('ohe__exterior_1st_VinylSd', 109.11720613532914),
 ('ohe__exterior_2nd_MetalSd', 95.46770870792264),
 ('remainder__pid_2', 87.18021250074874),
 ('ohe__exterior_1st_MetalSd', 84.02964640704673),
 ('ohe__roof_style_Gable', 77.54629431336515),
 ('ohe__roof_style_Hip', 74.05325815388464),
 ('ohe__exterior_2nd_HdBoard', 58.064169462467376),
 ('ohe__exterior_2nd_Wd Sdng', 55.19960706370978),
 ('ohe__exterior_1st_HdBoard', 51.17821876761946),
 ('ohe__exterior_1st_Wd Sdng', 46.73171732777966),
 ('ohe__exterior_2nd_CmentBd', 46.70294493598491),
 ('ohe__exterior_1st_CemntBd', 43.14450701984665),
 ('ohe__exterior_2nd_Plywood', 38.457948696336295),
 ('oh

In [385]:
X_train.drop(columns=['remainder__1st_flr_sf','remainder__id','remainder__utilities',
                      'remainder__2nd_flr_sf','remainder__low_qual_fin_sf'],inplace=True)
X_val.drop(columns=['remainder__1st_flr_sf','remainder__id','remainder__utilities',
                      'remainder__2nd_flr_sf','remainder__low_qual_fin_sf'],inplace=True)
x = [(X_train.columns[col], variance_inflation_factor(X_train,col)) for col in range(len(X_train.columns))]
sorted(x,key=lambda x: x[1],reverse=True)

[('ohe__exterior_2nd_VinylSd', 129.6634575010401),
 ('ohe__exterior_1st_VinylSd', 108.94044100764502),
 ('ohe__exterior_2nd_MetalSd', 95.3901807983008),
 ('remainder__pid_1', 92.7089493512871),
 ('ohe__exterior_1st_MetalSd', 83.97683040870534),
 ('ohe__roof_style_Gable', 77.4167879649698),
 ('ohe__roof_style_Hip', 73.96509785674297),
 ('ohe__exterior_2nd_HdBoard', 57.87402812472054),
 ('ohe__exterior_2nd_Wd Sdng', 55.1320241627533),
 ('remainder__pid_2', 54.89715715534771),
 ('ohe__exterior_1st_HdBoard', 50.96400527720432),
 ('ohe__exterior_1st_Wd Sdng', 46.688003346802525),
 ('ohe__exterior_2nd_CmentBd', 46.26491367183146),
 ('ohe__exterior_1st_CemntBd', 42.711326799631266),
 ('ohe__exterior_2nd_Plywood', 38.28630081874023),
 ('ohe__roof_matl_CompShg', 32.53584730100593),
 ('ohe__exterior_1st_Plywood', 27.362609662784347),
 ('ohe__roof_matl_Tar&Grv', 21.18621530697453),
 ('ohe__neighborhood_NAmes', 19.956902523250182),
 ('ohe__neighborhood_CollgCr', 16.90755981699915),
 ('ohe__neighbo

##  Step 2:  Baseline scores after Step 1

In [386]:
lr = LinearRegression()
model = 'lr'
lr.fit(X_train,y_train)


LinearRegression()

In [387]:
X_train.shape,y_train.shape
X_val.shape,y_val.shape

((513, 140), (513,))

In [388]:
train = lr.score(X_train,y_train)
y_pred = lr.predict(X_val)
test = lr.score(X_val,y_val)
mse = mean_squared_error(y_val,y_pred,squared=False)
scores = [model,train,test,mse]

In [389]:
scores

['lr', 0.9019003495758131, 0.882760260655939, 26830.111795289635]

In [390]:
ridge = RidgeCV()
model = 'ridge'
ridge.fit(X_train,y_train)
train = ridge.score(X_train,y_train)
y_pred = ridge.predict(X_val)
test = ridge.score(X_val,y_val)
mse = mean_squared_error(y_val,y_pred,squared=False)
scores = [model,train,test,mse]
scores

['ridge', 0.9006643401946709, 0.8816150074721487, 26960.83792871951]

In [391]:
lasso = LassoCV()
model = 'lasso'
lasso.fit(X_train,y_train)
train = lasso.score(X_train,y_train)
y_pred = lasso.predict(X_val)
test = lasso.score(X_val,y_val)
mse = mean_squared_error(y_val,y_pred,squared=False)
scores = [model,train,test,mse]
scores

['lasso', 0.8796674896308648, 0.8828839017946517, 26815.960532831206]

##  Step 3 Get best columns from Lasso

In [347]:
lasso_col = zip(X_train.columns, lasso.coef_)

In [348]:
lasso_col = [col for col,coef in lasso_col if abs(coef) > 0.0001]

In [276]:
enet = ElasticNet(
alpha = 0.01,
l1_ratio=0.99)

model = 'enet'
enet.fit(X_train,y_train)
train = enet.score(X_train,y_train)
y_pred = enet.predict(X_val)
test = enet.score(X_val,y_val)
mse = mean_squared_error(y_val,y_pred,squared=False)
scores = [model,train,test,mse]
scores

  model = cd_fast.enet_coordinate_descent(


['enet', 0.9018996454981228, 0.882756965498898, 26830.488838138266]

In [277]:
enet.coef_

array([ 3.97454600e+02,  1.76334341e+03,  3.30075691e+03,  6.06366156e+02,
        8.88230971e+02,  4.70304794e+03,  3.27963507e+02, -2.66083201e+01,
        1.17526955e+02,  4.02002662e+03,  1.92256128e+03,  2.39930564e+02,
        2.25027629e+03,  4.27140175e+03,  3.87273232e+03,  1.82793170e+03,
        1.48074182e+03,  6.03214511e+03,  1.11063034e+04,  1.97997461e+03,
        1.27083318e+03,  2.46857069e+03,  4.26783895e+02,  3.16366098e+03,
        8.12556650e+03,  2.20773232e+03,  6.87916191e+02,  3.13452996e+03,
        4.85949392e+02,  8.01438271e+03, -2.12876371e+03, -5.68153792e+02,
        4.95469686e+04,  1.19564523e+04,  3.70544404e+04,  2.23591374e+04,
        2.16283747e+04, -6.71867863e+01, -6.61560962e+02,  8.94603473e+02,
        9.21561738e+01,  7.54577065e+02, -2.33183655e+03, -2.16216687e+02,
       -9.92570218e+02, -4.52782273e+02, -8.93994672e+02, -9.38039753e+02,
        1.01916832e+03, -2.71876198e+03, -1.07151907e+03,  6.84825652e+02,
        4.23487535e+02, -

## Step 4:  Poly with best features

In [349]:
X_train.shape,X_val.shape,y_train.shape,y_val.shape

((1538, 140), (513, 140), (1538,), (513,))

In [350]:
X_train = X_train.loc[:,lasso_col]

In [352]:
X_val = X_val.loc[:,lasso_col]

In [354]:
poly = PolynomialFeatures(
degree = 2,
include_bias=False)

In [363]:
poly.fit(X_train,y_train)
X_train = poly.transform(X_train)
X_val = poly.transform(X_val)

(1538, 2210)

In [365]:
lr = LinearRegression()
model = 'lr'
lr.fit(X_train,y_train)
train = lr.score(X_train,y_train)
y_pred = lr.predict(X_val)
test = lr.score(X_val,y_val)
mse = mean_squared_error(y_val,y_pred,squared=False)
scores = [model,train,test,mse]
scores

['lr', 0.9921538387155522, -8.711101944497426e+21, 7313438409358058.0]

In [366]:
ridge = RidgeCV()
model = 'ridge'
ridge.fit(X_train,y_train)
train = ridge.score(X_train,y_train)
y_pred = ridge.predict(X_val)
test = ridge.score(X_val,y_val)
mse = mean_squared_error(y_val,y_pred,squared=False)
scores = [model,train,test,mse]
scores

['ridge', 0.9909020843789949, 0.543709647902167, 52930.464253423765]

In [367]:
lasso = LassoCV()
model = 'lasso'
lasso.fit(X_train,y_train)
train = lasso.score(X_train,y_train)
y_pred = lasso.predict(X_val)
test = lasso.score(X_val,y_val)
mse = mean_squared_error(y_val,y_pred,squared=False)
scores = [model,train,test,mse]
scores

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


['lasso', 0.9552825192514054, 0.832505502357755, 32068.993712219726]

In [368]:
enet = ElasticNet(
alpha = 0.01,
l1_ratio=0.99)

model = 'enet'
enet.fit(X_train,y_train)
train = enet.score(X_train,y_train)
y_pred = enet.predict(X_val)
test = enet.score(X_val,y_val)
mse = mean_squared_error(y_val,y_pred,squared=False)
scores = [model,train,test,mse]
scores

  model = cd_fast.enet_coordinate_descent(


['enet', 0.9939178321992128, -0.4932368568778973, 95752.34320601264]

In [123]:
alphas = [1400,1425,1450,1475,1500,1525,1550,1575,1600]
results = []
for a in alphas:
    pipe = Pipeline(
        steps = [
        ('ctx',ctx),
        ('ss',StandardScaler()),
        ('poly',poly),
        ('lasso',Lasso(alpha=a)),
    ])
    pipe.fit(X_train,y_train)
    train = pipe.score(X_train,y_train)
    test = pipe.score(X_val,y_val)
    y_pred = pipe.predict(X_val)
    mse = mean_squared_error(y_pred,y_val,squared=False)
    results.append([a,train,test,mse])

In [124]:
results

[[1400, 0.9512781589837208, 0.9203527378413868, 22114.161963670285],
 [1425, 0.950894538858513, 0.9204752956717416, 22097.141246011222],
 [1450, 0.9505109528838381, 0.9205602387342333, 22085.336735983958],
 [1475, 0.9501381730642383, 0.9206400420600506, 22074.2407422707],
 [1500, 0.9497735056038243, 0.9205284124554061, 22089.760362685596],
 [1525, 0.9494240158337535, 0.9204183988780468, 22105.044649729432],
 [1550, 0.9491025057874914, 0.9203614881285695, 22112.947166213344],
 [1575, 0.9487836238262699, 0.9203132001748817, 22119.650123940974],
 [1600, 0.9484787957456288, 0.9202831172687801, 22123.82497202926]]

In [86]:
pd.DataFrame(
    data= pipe.named_steps['lr'].coef_,
    index = pipe[1:-1].get_feature_names_out()
)

Unnamed: 0,0
x0,-20.190664
x1,-52.917506
x2,-40.790627
x3,9.399357
x4,-311.782806
...,...
x142 x143,783.258989
x142 x144,1595.167820
x143^2,342.731254
x143 x144,-130.317110
