### Ames Gridsearch

___

Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

Read in data

In [2]:
# Load the training data from train.csv (path is relative to the notebook's
# working directory) and preview the first five rows.
train = pd.read_csv('train.csv') 
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


EDA

In [3]:
# Candidate column to dummy-encode: 'Exter Qual'.
# Count the rows in each category to see which levels are present.
train['Exter Qual'].value_counts()

TA    1247
Gd     697
Ex      81
Fa      26
Name: Exter Qual, dtype: int64

In [4]:
# Candidate column to dummy-encode: 'Exter Cond'.
# Count the rows in each category to see which levels are present.
train['Exter Cond'].value_counts()

TA    1778
Gd     215
Fa      49
Ex       7
Po       2
Name: Exter Cond, dtype: int64

Create dummy variables

In [5]:
# One-hot encode the two exterior-rating columns. get_dummies prefixes each
# indicator column with its source column name (e.g. 'Exter Qual_Gd').
dummies = pd.get_dummies(train[['Exter Qual', 'Exter Cond']])
# Make the cell safe to re-run: drop indicator columns left over from an
# earlier execution before joining, because DataFrame.join raises when the
# two frames share column names. On a first run the drop is a no-op.
train = train.drop(columns=dummies.columns, errors='ignore').join(dummies)
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,SalePrice,Exter Qual_Ex,Exter Qual_Fa,Exter Qual_Gd,Exter Qual_TA,Exter Cond_Ex,Exter Cond_Fa,Exter Cond_Gd,Exter Cond_Po,Exter Cond_TA
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,130500,0,0,1,0,0,0,0,0,1
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,220000,0,0,1,0,0,0,0,0,1
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,109000,0,0,0,1,0,0,1,0,0
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,174000,0,0,0,1,0,0,0,0,1
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,138500,0,0,0,1,0,0,0,0,1


Combine a few columns

In [8]:
# Preview the raw columns used to build the engineered features.
train[['Full Bath', 'Half Bath', 'Year Built', 'Overall Cond', 'Bedroom AbvGr']].head()

Unnamed: 0,Full Bath,Half Bath,Year Built,Overall Cond,Bedroom AbvGr
0,2,1,1976,8,3
1,2,1,1996,5,4
2,1,0,1953,7,3
3,2,1,2006,5,3
4,2,0,1900,8,3


In [10]:
# House age at the time of sale: year sold minus year built.
train['house_age'] = train['Yr Sold'] - train['Year Built']

In [12]:
# Total bathrooms, counting each half bath as 0.5 of a full bath.
train['baths'] = train['Full Bath'] + 0.5 * train['Half Bath']

In [13]:
# Spot-check the first few computed house ages.
train['house_age'].head()

0     34
1     13
2     57
3      4
4    110
Name: house_age, dtype: int64

In [14]:
# Spot-check the first few combined bathroom counts.
train['baths'].head()

0    2.5
1    2.5
2    1.0
3    2.5
4    2.0
Name: baths, dtype: float64

In [17]:
# Interaction term: number of bedrooms ('Bedroom AbvGr') multiplied by the
# overall condition rating ('Overall Cond').
train['weighted_beds'] = train['Bedroom AbvGr']* train['Overall Cond']
# Show the new column beside its two inputs to verify the product.
train[['weighted_beds', 'Bedroom AbvGr', 'Overall Cond']].head()

Unnamed: 0,weighted_beds,Bedroom AbvGr,Overall Cond
0,24,3,8
1,20,4,5
2,21,3,7
3,15,3,5
4,24,3,8


In [23]:
# Model features: the three engineered columns followed by every dummy column.
features = ['weighted_beds', 'baths', 'house_age'] + list(dummies.columns)
features

['weighted_beds',
 'baths',
 'house_age',
 'Exter Qual_Ex',
 'Exter Qual_Fa',
 'Exter Qual_Gd',
 'Exter Qual_TA',
 'Exter Cond_Ex',
 'Exter Cond_Fa',
 'Exter Cond_Gd',
 'Exter Cond_Po',
 'Exter Cond_TA']

In [22]:
# The dummy column names on their own, for reference.
list(dummies.columns)

['Exter Qual_Ex',
 'Exter Qual_Fa',
 'Exter Qual_Gd',
 'Exter Qual_TA',
 'Exter Cond_Ex',
 'Exter Cond_Fa',
 'Exter Cond_Gd',
 'Exter Cond_Po',
 'Exter Cond_TA']

In [24]:
# Feature matrix X (the selected feature columns) and target y (sale price).
X = train[features]
y = train['SalePrice']

Train test split

In [27]:
# Hold out 25% of the rows (scikit-learn's default test size, stated here
# explicitly); the fixed random_state keeps the split reproducible.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.25, random_state=23
)

Scale

In [26]:
# Create the StandardScaler used to scale the feature matrices.
ss = StandardScaler()

In [28]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to the holdout split so holdout data does not influence scaling.
X_train_sc = ss.fit_transform(X_train)
X_holdout_sc = ss.transform(X_holdout)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


Fit an ElasticNet model

In [29]:
# Baseline model: ElasticNet with its default hyperparameters, fit on the
# scaled training data.
enet = ElasticNet()

enet.fit(X_train_sc, y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

Calculate $R^2$ and RMSE scores

In [30]:
# R^2 of the baseline model on the scaled training data.
enet.score(X_train_sc, y_train)

0.5897847710190603

In [31]:
# R^2 of the baseline model on the scaled holdout data.
enet.score(X_holdout_sc, y_holdout)

0.625119842301051

In [32]:
# Baseline predictions (y-hats) for the training and holdout splits.
y_train_predictions = enet.predict(X_train_sc)
y_holdout_predictions = enet.predict(X_holdout_sc)

In [33]:
# Helper to calculate RMSE
def rmse(y, y_hat):
    """Return the root mean squared error between `y` and `y_hat`.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_hat : array-like
        Predicted target values, aligned with `y`.

    Returns
    -------
    The square root of ``mean_squared_error(y, y_hat)``.
    """
    return np.sqrt(mean_squared_error(y, y_hat))

In [34]:
# RMSE of the baseline model on the training split.
rmse(y_train, y_train_predictions)

49504.656247871106

In [35]:
# RMSE of the baseline model on the holdout split.
rmse(y_holdout, y_holdout_predictions)

51887.90695004524

Grid search time

In [39]:
# Search grid: 8 evenly spaced values per hyperparameter (8 x 8 = 64 combinations).
# l1_ratio covers both endpoints: 0 is a pure L2 penalty, 1 is a pure L1 penalty.
enet_params = dict(
    alpha=np.linspace(0.2, 1.0, num=8),
    l1_ratio=np.linspace(0.0, 1.0, num=8),
)

In [42]:
# Exhaustive grid search over enet_params using a fresh ElasticNet estimator.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# score (R^2 for this regressor).
enet_gridsearch = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=enet_params,
    cv=7,  # 7-fold cross-validation for every candidate
    verbose=1,  # print a progress summary while fitting
    n_jobs=2,  # run fits on two parallel workers
    return_train_score=False  # skip training-fold scores in cv_results_
)

In [43]:
# Run the grid search on the scaled training data.
# NOTE(review): X_train_sc was produced by a scaler fit on all of X_train, so
# every cross-validation fold's validation rows contributed to the scaling
# statistics. Wrapping the scaler and model in a Pipeline and searching over
# that would keep scaling inside each fold -- consider for a follow-up.
enet_gridsearch.fit(X_train_sc, y_train)

Fitting 7 folds for each of 64 candidates, totalling 448 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 448 out of 448 | elapsed:    3.0s finished


GridSearchCV(cv=7, error_score='raise-deprecating',
       estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'alpha': array([0.2    , 0.31429, 0.42857, 0.54286, 0.65714, 0.77143, 0.88571,
       1.     ]), 'l1_ratio': array([0.     , 0.14286, 0.28571, 0.42857, 0.57143, 0.71429, 0.85714,
       1.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=1)

In [44]:
# Hyperparameter combination with the best mean cross-validated score.
# NOTE(review): the selected alpha is the smallest value in the grid, so the
# optimum may lie below the searched range -- consider extending the grid lower.
enet_gridsearch.best_params_

{'alpha': 0.2, 'l1_ratio': 0.8571428571428571}

In [46]:
# Mean cross-validated R^2 achieved by the best parameter combination.
enet_gridsearch.best_score_

0.6087197198055743

In [47]:
# Keep the best model: the estimator refit with the best parameters
# (the search was created with the default refit=True).
best_enet = enet_gridsearch.best_estimator_

In [48]:
# R^2 of the tuned model on the scaled training data.
best_enet.score(X_train_sc, y_train)

0.6168850746608325

In [49]:
# R^2 of the tuned model on the scaled holdout data.
best_enet.score(X_holdout_sc, y_holdout)

0.670813761904924

In [50]:
# Predict with the tuned model on both splits.
y_train_enet = best_enet.predict(X_train_sc)
y_holdout_enet = best_enet.predict(X_holdout_sc)

# Print RMSE for the training split first, then the holdout split.
for actual, predicted in ((y_train, y_train_enet), (y_holdout, y_holdout_enet)):
    print(rmse(actual, predicted))


47841.48980572832
48622.88916404386
