## Model Tuning

### External LIbraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

  from pandas.core import datetools


### Read Cleaned and Preprocessed Data

In [2]:
clean_train = pd.read_csv('../data/Model_Benchmarks_train.csv')

In [3]:
clean_test = pd.read_csv('../data/Model_Benchmarks_test.csv')

### Train Test Split

In [4]:
features = ['overall_qual', 'total_sf','neighborhood', 'exter_qual', 'bsmt_qual', 'kitchen_qual', 'gr_liv_area',
            'garage_cars', 'garage_finish','fireplace_qu', 'full_bath', 'foundation', 'garage_type','mas_vnr_area']

- Using same features as previously used in linear regression 
- Once ran through regularization regression models (lasso, ridge, & Elastic net) features with little affect will be thrown out or reduced.

In [5]:
X = clean_train[features]
X = sm.add_constant(X)
y =  clean_train[['saleprice']]
X_kaggle_test = clean_test[features]
X_kaggle_test =sm.add_constant(X_kaggle_test)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 69)

### Transforming and Fitting Data

- Fitting and Transforming Polynomial features

In [6]:
poly = PolynomialFeatures(include_bias = False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.fit_transform(X_test)
X_kaggle_test_poly = poly.fit_transform(X_kaggle_test)

In [7]:
ss = StandardScaler()
ss.fit(X_train_poly)

X_train_poly_sc = ss.transform(X_train_poly)
X_test_poly_sc = ss.transform(X_test_poly)
X_kaggle_test_poly_sc = ss.transform(X_kaggle_test_poly)

### Inspecting four different models

In [8]:
lr = LinearRegression()
lasso = LassoCV()
ridge = RidgeCV()
en = ElasticNetCV(l1_ratio = [.1 , .5 , .7 , .9 , .95, .99, .1])

kf = KFold(n_splits = 7, shuffle = True, random_state = 69)

lr_cv = cross_val_score(lr, X_train_poly_sc, y_train, cv=kf).mean()
lasso_cv = cross_val_score(lasso, X_train_poly_sc, y_train, cv=kf).mean()
ridge_cv = cross_val_score(ridge, X_train_poly_sc, y_train, cv=kf).mean()
en_cv = cross_val_score(en, X_train_poly_sc, y_train, cv=kf).mean()

print('Linear Cross Value Score ', lr_cv)
print('Lasso Cross Value Score ', lasso_cv)
print('Ridge Cross Value Score ', ridge_cv)
print('Elastic Net Cross Value Score ', en_cv)

Linear Cross Value Score  0.877175744581021
Lasso Cross Value Score  0.8859616591548252
Ridge Cross Value Score  0.888248071988912
Elastic Net Cross Value Score  0.8835425721308179


- Examining these cross value scores it is clear that linear regression is obsolete. 
- Ridge has the highest cross value score, just barely beating out lasso and ridge.
    - Ridge will be used going foward on kaggle data

### Fit and Score Ridge Model

In [9]:
ridge_cv = cross_val_score(en, X_test_poly_sc, y_test, cv=kf).mean()
print('Ridge Cross Value Score ', ridge_cv)

Ridge Cross Value Score  0.8913763405052156


#### Takeaways:
- When ran on the test data the cross value score is almost identical
- model is not underfit or overfit

### Generating Predictions

In [10]:
en.fit(X_train_poly_sc, y_train)

ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 0.1], max_iter=1000,
       n_alphas=100, n_jobs=1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic',
       tol=0.0001, verbose=0)

In [11]:
y_test_preds = en.predict(X_kaggle_test_poly_sc)

In [12]:
predictions = pd.DataFrame([], columns = ['Id', 'SalePrice'])
predictions['Id'] = clean_test['id']
predictions['SalePrice'] = y_test_preds
predictions.head()

Unnamed: 0,Id,SalePrice
0,2658,144117.815184
1,2718,198012.505168
2,2414,192359.473185
3,1989,123903.68671
4,625,178795.491162


In [13]:
clean_train['saleprice'].describe()

count      2049.000000
mean     181479.018058
std       79295.913255
min       12789.000000
25%      129800.000000
50%      162500.000000
75%      214000.000000
max      611657.000000
Name: saleprice, dtype: float64

In [14]:
predictions['SalePrice'].describe()

count       879.000000
mean     179895.477997
std       74958.985588
min       60109.822344
25%      125967.770346
50%      161359.192224
75%      211869.984896
max      646995.948962
Name: SalePrice, dtype: float64

In [15]:
predictions['SalePrice'].describe()/clean_train['saleprice'].describe()

count    0.428990
mean     0.991274
std      0.945307
min      4.700119
25%      0.970476
50%      0.992980
75%      0.990047
max      1.057776
dtype: float64

In [16]:
predictions.to_csv('../data/predictions.csv', index=False)