# Linear Regression Modelling with Elastic Net
Build a pipeline to model an optimized Elastic Net solution.
Evaluate Feature Importances.

**Data Sources**

- `data/raw/train.csv`: Training set from [kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data).

**Changes**

- 2019-03-22: Start notebook



<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-libraries,-load-data" data-toc-modified-id="Import-libraries,-load-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import libraries, load data</a></span></li><li><span><a href="#Go-quick-&amp;-dirty" data-toc-modified-id="Go-quick-&amp;-dirty-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Go quick &amp; dirty</a></span></li><li><span><a href="#Pre-process-data-(outside-of-sklearn-pipeline)" data-toc-modified-id="Pre-process-data-(outside-of-sklearn-pipeline)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Pre-process data (outside of sklearn pipeline)</a></span><ul class="toc-item"><li><span><a href="#General-pre-processing" data-toc-modified-id="General-pre-processing-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>General pre-processing</a></span></li><li><span><a href="#Create-two-versions-of-training-data-for-different-outlier-treatment-(experiment)" data-toc-modified-id="Create-two-versions-of-training-data-for-different-outlier-treatment-(experiment)-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Create two versions of training data for different outlier treatment (experiment)</a></span></li><li><span><a href="#Split-train-&amp;-test-sets" data-toc-modified-id="Split-train-&amp;-test-sets-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Split train &amp; test sets</a></span></li></ul></li><li><span><a href="#Fit,-tune,-predict-(with-Pipelines)" data-toc-modified-id="Fit,-tune,-predict-(with-Pipelines)-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Fit, tune, predict (with Pipelines)</a></span><ul class="toc-item"><li><span><a href="#Build-Pipe" data-toc-modified-id="Build-Pipe-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Build Pipe</a></span></li><li><span><a href="#Fit-&amp;-Tune" data-toc-modified-id="Fit-&amp;-Tune-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Fit &amp; Tune</a></span><ul class="toc-item"><li><span><a href="#Explore-NaN-Handling" data-toc-modified-id="Explore-NaN-Handling-4.2.1"><span class="toc-item-num">4.2.1&nbsp;&nbsp;</span>Explore NaN-Handling</a></span></li><li><span><a href="#Explore-Outlier-Handling" data-toc-modified-id="Explore-Outlier-Handling-4.2.2"><span class="toc-item-num">4.2.2&nbsp;&nbsp;</span>Explore Outlier-Handling</a></span></li><li><span><a href="#Explore-Multicollinearity-Removal" data-toc-modified-id="Explore-Multicollinearity-Removal-4.2.3"><span class="toc-item-num">4.2.3&nbsp;&nbsp;</span>Explore Multicollinearity-Removal</a></span></li></ul></li><li><span><a href="#Final-Tuning-&amp;-Evaluation" data-toc-modified-id="Final-Tuning-&amp;-Evaluation-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Final Tuning &amp; Evaluation</a></span></li></ul></li></ul></div>

---

## Import libraries, load data

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm

from scipy import stats
from scipy.stats import norm, skew

from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# My functions
import EDA_functions as EDA
import cleaning_functions as cleaning
from linRegModel_class import LinRegModel

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns #, sns.set_style('whitegrid')
color = 'rebeccapurple'
%matplotlib inline

# Display settings
from IPython.display import display
pd.options.display.max_columns = 100

In [2]:
# Load data
raw_data = pd.read_csv('data/raw/train.csv')

# Check shape
display(raw_data.shape)

(1460, 81)

In [3]:
# Load variables from notebook 1
%store -r cols_to_del
%store -r cols_to_log
%store -r outliers_to_del
%store -r top_corr_columns

## Go quick & dirty
Use my 'quick & dirty' function for a baseline model on unprocessed data.

In [4]:
# Initialize a scikit-learn model object of choice
model_simple = ElasticNetCV(alphas=[0.5, 0.1, 1.5], copy_X=True, cv=5, eps=0.001, 
                            fit_intercept=True, l1_ratio=[0.2, 0.5, 0.8], max_iter=2000, 
                            n_alphas=100, n_jobs=-1)

# Create an instance of the LinRegModel class by passing df, target variable and model object
elastic_net_simple = LinRegModel(raw_data, 'SalePrice', model_simple)

# Output instance
display(elastic_net_simple)

ElasticNetCV(alphas=[0.5, 0.1, 1.5], copy_X=True, cv=5, eps=0.001,
       fit_intercept=True, l1_ratio=[0.2, 0.5, 0.8], max_iter=2000,
       n_alphas=100, n_jobs=-1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic',
       tol=0.0001, verbose=0)

In [5]:
# Perform the modelling
elastic_net_simple.go_quickDirty()





In [6]:
# Output result
elastic_net_simple

ElasticNetCV(alphas=[0.5, 0.1, 1.5], copy_X=True, cv=5, eps=0.001,
       fit_intercept=True, l1_ratio=[0.2, 0.5, 0.8], max_iter=2000,
       n_alphas=100, n_jobs=-1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic',
       tol=0.0001, verbose=0)

RMSE on test data 33771.86, r2-score 0.80.

In [7]:
# Check best alpha value
print(model_simple.alpha_)
print(model_simple.l1_ratio_)
print(model_simple.n_iter_)

0.1
0.8
2000


## Pre-process data (outside of sklearn pipeline)
Pre-processing steps that take place before data is pipelined

### General pre-processing

In [8]:
# Disable warning
pd.set_option('mode.chained_assignment', None)

# Create and clean training set with variables from the EDA notebook
train_data = (raw_data
              .pipe(cleaning.change_dtypes, cols_to_category=raw_data.select_dtypes(object))
              .pipe(cleaning.delete_columns, cols_to_delete=cols_to_del)
              .pipe(cleaning.apply_log, cols_to_transform=cols_to_log)
             )

train_data.drop(outliers_to_del, inplace=True)
train_data.dropna(subset=['MasVnrArea', 'MasVnrType', 'Electrical'], inplace=True);

'MiscFeature successfully deleted'

'PoolQC successfully deleted'

'FireplaceQu successfully deleted'

'Alley successfully deleted'

'Id successfully deleted'

'Fence successfully deleted'

In [9]:
# check results
display(train_data.shape)

(1447, 75)

### Create two versions of training data for different outlier treatment (experiment)

In [10]:
# Create List of Columns containing NaN
nan_cols = []
for col in train_data.columns:
    if train_data[col].isnull().sum() > 0:
        nan_cols.append(col)

In [11]:
# Check results
nan_cols

['LotFrontage',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond']

**Note:** All remaining cols with missing values but Lot Frontage are categorical.

In [12]:
# Create train set without missing values (drop nan_cols)
train_data_reduced = train_data.drop(nan_cols, axis=1)

In [13]:
# Check results - compare the two training sets
print("train set with NaN: ", train_data.shape[1])
print("train set without NaN: ", train_data_reduced.shape[1])

assert train_data_reduced.isnull().sum().sum() == 0
assert train_data_reduced.shape[1] == train_data.shape[1] - len(nan_cols)

train set with NaN:  75
train set without NaN:  64


### Split train & test sets

In [14]:
# Set with NaN
X_train = train_data.drop('SalePrice', axis=1)
y_train = train_data['SalePrice'].copy()

In [15]:
categorical_features = X_train.select_dtypes(include=['category']).columns
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns

assert len(categorical_features) + len(numeric_features) == train_data.shape[1] - 1

In [16]:
# Set without NaN
X_train_reduced = train_data_reduced.drop('SalePrice', axis=1)
y_train_reduced = train_data_reduced['SalePrice'].copy()

In [17]:
categorical_features_reduced = X_train_reduced.select_dtypes(include=['category']).columns
numeric_features_reduced = X_train_reduced.select_dtypes(include=['float64', 'int64']).columns

assert len(categorical_features_reduced) + len(numeric_features_reduced) \
        == train_data_reduced.shape[1] - 1

## Fit, tune, predict (with Pipelines)

### Build Pipe

In [18]:
# Assemble pipeline (define function)
def build_pipe(X_train, y_train, numeric_features, categorical_features, clf):
    """Build a pipeline for preprocessing and modelling.
    
    ARGUMENTS:
        X_train: training features (df or array)
        y_train: training labels (df or array)
        numeric_features: list of strings, numeric columns
        categorical_features: list of strings, categorical columns
        clf: classifier (sk-learn model object)
        
    RETURNS:
        full_pipe: pipeline object
    """
    # level 1 - two separate pipes for cat and num features
    numeric_transformer = Pipeline(steps=[
        ('imputer_n', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
            ])

    categorical_transformer = Pipeline(steps=[
        ('imputer_c', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
            ])

    # level 2 - wrap the two level 1 pipes into a ColumnTransformer
    preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features),
                         ])

    # level 3 - pipe it with a classifier
    full_pipe = Pipeline(steps=[
                       ('preprocessor', preprocessor),
                       ('clf', model_simple),
                               ]) 
    
    return full_pipe

In [19]:
# Build pipeline
full_pipe = build_pipe(X_train, y_train, numeric_features, 
                       categorical_features, 
                       elastic_net_simple) # optimized model from above

In [20]:
# Build pipeline for train set without NaN
full_pipe_reduced = build_pipe(X_train_reduced, y_train_reduced, 
                               numeric_features_reduced, 
                               categorical_features_reduced, 
                               elastic_net_simple) # optimized model from above

### Fit & Tune

#### Explore NaN-Handling

In [21]:
def fit_pipe(X_train, y_train, pipe, scorer, cv=StratifiedKFold(3)):
    """Fit training data to a pipeline with GridSearchCV
    for best parameter tuning.
    
    ARGUMENTS:
        X_train: training features (df or array)
        y_train: training labels (df or array)
        pipe: pipeline (sk-learn pipeline object)
        scorer: evaluation metric for validation
        cv: type of CV, default is StratifiedKFold(3)
        
    RETURNS:
        grid: grid search object
        grid_results: dict with grid search results
    """
    parameters = {
            'preprocessor__num__imputer_n__strategy': ['mean', 'median'],
            'preprocessor__num__scaler' : [None, StandardScaler()]
#             'classifier__C': [0.1, 1.0, 10, 100],

                 }

    cv = GridSearchCV(pipe, param_grid=parameters, scoring=scorer, n_jobs=-1, iid=False,
                      cv=cv, error_score='raise', return_train_score=False, verbose=1)

    grid = cv.fit(X_train, y_train) 
    grid_results = grid.cv_results_

    return grid, grid_results

In [22]:
scorer = make_scorer(mean_squared_error)
cv = 3

# Pipe with NaN
grid, grid_results = fit_pipe(X_train, y_train, full_pipe, scorer, cv=cv)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   10.4s finished


In [41]:
pd.DataFrame(grid_results)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessor__num__imputer_n__strategy,param_preprocessor__num__scaler,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,1.289927,0.293017,0.12021,0.028455,mean,,{'preprocessor__num__imputer_n__strategy': 'me...,0.02554,0.02673,0.025142,0.025804,0.000675,1
1,1.11764,0.148989,0.128726,0.022261,mean,"StandardScaler(copy=True, with_mean=True, with...",{'preprocessor__num__imputer_n__strategy': 'me...,0.019196,0.022004,0.018972,0.020057,0.00138,4
2,1.240462,0.27152,0.151623,0.05653,median,,{'preprocessor__num__imputer_n__strategy': 'me...,0.025539,0.026729,0.025139,0.025802,0.000676,2
3,0.901496,0.174241,0.069253,0.022113,median,"StandardScaler(copy=True, with_mean=True, with...",{'preprocessor__num__imputer_n__strategy': 'me...,0.019196,0.022006,0.018972,0.020058,0.001381,3


In [42]:
grid.best_score_

0.025804104273622258

In [47]:
brk
# Here I have to go for the smallest score actually
np.min(grid_results['mean_test_score'])

0.020057310582132556

In [25]:
# Pipe without NaN
grid_reduced, grid_results_reduced = fit_pipe(X_train_reduced, y_train_reduced, 
                                              full_pipe_reduced, scorer, cv=cv)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    3.3s finished


In [26]:
np.sqrt(grid_reduced.best_score_)

0.1602627550424679

**Result:** Results are really, really close. Imputing with Median is preferred to imputing with Mean (Grid Search), but the eliminiation of all columns with NaN scores slightly better.

#### Explore Outlier-Handling

In [27]:
# Remove Outliers for remaining top_corr_cols in the reduced data set
# GarageYrBlt was a the top_corr_features that was dropped above
top_corr_columns = set(train_data_reduced.columns).intersection(set(top_corr_columns))
train_data_outliers = cleaning.remove_outliers_IQR_method(train_data_reduced,
                                                         top_corr_columns)

YearRemodAdd
Rows removed: 0

FullBath
Rows removed: 0

OverallQual
Rows removed: 1

GrLivArea
Rows removed: 7

TotRmsAbvGrd
Rows removed: 26

GarageCars
Rows removed: 4

GarageArea
Rows removed: 14

TotalBsmtSF
Rows removed: 54

1stFlrSF
Rows removed: 1

YearBuilt
Rows removed: 6

SalePrice
Rows removed: 13


Rows removed in total: 126



In [28]:
X_train_outliers = train_data.drop('SalePrice', axis=1)
y_train_outliers = train_data['SalePrice'].copy()

categorical_features_outliers = X_train.select_dtypes(include=['category']).columns
numeric_features_outliers = X_train.select_dtypes(include=['float64', 'int64']).columns

# Build pipeline for reduced train set without Outliers
full_pipe_outliers = build_pipe(X_train_outliers, y_train_outliers, 
                               numeric_features_outliers, 
                               categorical_features_outliers, 
                               elastic_net_simple) # optimized model from above

grid_outliers, grid_results_outliers = fit_pipe(X_train_outliers, y_train_outliers,
                                              full_pipe_outliers, scorer, cv=cv)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    4.2s finished


In [29]:
np.sqrt(grid_outliers.best_score_)

0.16063655957976147

**Result:** Result on data with removed outliers is slightly worse.

#### Explore Multicollinearity-Removal

In [30]:
# Remove Outliers for remaining top_corr_cols in the reduced data set
# GarageYrBlt was a the top_corr_features that was dropped above
cols_multi = set(train_data_reduced.columns).intersection(set(['1stFloor', 'GarageArea', 'FirstFlSF']))
train_data_multi = cleaning.delete_columns(train_data_reduced,  cols_multi)
assert train_data_multi.shape[1] == train_data_reduced.shape[1] - len(cols_multi)

'GarageArea successfully deleted'

In [31]:
X_train_multi = train_data.drop('SalePrice', axis=1)
y_train_multi = train_data['SalePrice'].copy()

categorical_features_multi = X_train.select_dtypes(include=['category']).columns
numeric_features_multi = X_train.select_dtypes(include=['float64', 'int64']).columns

# Build pipeline for reduced train set without multi
full_pipe_multi = build_pipe(X_train_multi, y_train_multi, 
                               numeric_features_multi, 
                               categorical_features_multi, 
                               elastic_net_simple) # optimized model from above

grid_multi, grid_results_multi = fit_pipe(X_train_multi, y_train_multi,
                                              full_pipe_multi, scorer, cv=cv)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    4.3s finished


In [32]:
np.sqrt(grid_multi.best_score_)

0.16063655957976147

**Result:** Result on data with removed multi_col(s) is slightly worse.

### Final Tuning & Evaluation

In [33]:
X, y = train_data_reduced.drop(['SalePrice'], axis = 1), train_data_reduced['SalePrice']
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [34]:
# Assemble pipeline (define function)
def build_pipe(X_train, y_train, numeric_features, categorical_features, clf):
    """Build a pipeline for preprocessing and modelling.
    
    ARGUMENTS:
        X_train: training features (df or array)
        y_train: training labels (df or array)
        numeric_features: list of strings, numeric columns
        categorical_features: list of strings, categorical columns
        clf: classifier (sk-learn model object)
        
    RETURNS:
        full_pipe: pipeline object
    """
    # level 1 - two separate pipes for cat and num features
    numeric_transformer = Pipeline(steps=[
        ('imputer_n', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
            ])

    categorical_transformer = Pipeline(steps=[
        ('imputer_c', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
            ])

    # level 2 - wrap the two level 1 pipes into a ColumnTransformer
    preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features),
                         ])

    # level 3 - pipe it with a classifier
    full_pipe = Pipeline(steps=[
                       ('preprocessor', preprocessor),
                       ('clf', model_simple),
                               ]) 
    
    return full_pipe

In [35]:
elastic_net_final = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], 
                                 eps=1e-3, n_alphas=100, fit_intercept=True, 
                                 normalize=True, precompute='auto', max_iter=2000, 
                                 tol=0.0001, cv=6, copy_X=True, verbose=0, n_jobs=-1, 
                                 positive=False, random_state=0)

full_pipe_final = build_pipe(X_train_reduced, y_train_reduced, 
                               numeric_features_reduced, 
                               categorical_features_reduced, 
                               elastic_net_final) # cv on parameters

full_pipe_final.fit(X_train_reduced, y_train_reduced)
y_pred = full_pipe_final.predict(X_test)


In [36]:
print('Test r2 score: ', r2_score(y_test, y_pred))
test_mse = mean_squared_error(y_pred, y_test)
test_rmse = np.sqrt(test_mse)
print('Test RMSE: %.4f' % test_rmse)

Test r2 score:  0.9028418562579344
Test RMSE: 0.1253


In [37]:
# # OLD PIPE

# cols_to_crop = top_corr_columns[1:]  # 'SalePrice' has to be dropped
# cols_to_del_multicol = ['1stFlrSF', 'GarageArea', 'TotRmsAbvGrd', 'GarageYrBlt']


# first_transformer = Pipeline(steps=[
#     ('crop', OutlierDropperIQR(columns=cols_to_crop)),
# #     ('drop', ColumnDropper(columns=cols_to_del_multicol)),
#     ])

# # level 1 - two separate pipes for cat and num features

# numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())])

# categorical_features = X_train.select_dtypes(include=['category']).columns
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# # level 2 - wrap the two level 1 pipes into a ColumnTransformer
# preprocessor = ColumnTransformer(
#         transformers=[
#             ('num', numeric_transformer, numeric_features),
#             ('cat', categorical_transformer, categorical_features),
#                      ])

# # level 3 - pipe it with a classifier
# clf = Pipeline(steps=[
#                    ('first', first_transformer),
#                    ('preprocessor', preprocessor),
#                    ('regressor', model_simple),
#                      ]) 

# # apply the preprocessor and then pass transformed data to the predictor 
# clf.fit(X_train, y_train)