## Setup and Data Import

In [1]:
import numpy as np

import pandas as pd
pd.options.display.max_rows = None

import sys
sys.path.insert(0,'..')

import Imputation_Module as imp
import FeatureEngineering_Module as fe

from sklearn.linear_model import LinearRegression, HuberRegressor,\
    Ridge, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler,\
    RobustScaler, PowerTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.decomposition import PCA

# import statsmodels.api as sm

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train_raw = pd.read_csv('../train.csv')
test_raw = pd.read_csv('../test.csv')

train = train_raw.copy()
test = test_raw.copy()

## Imputation and Feature Engineering

In [3]:
train, test = imp.impute_data()

In [4]:
train = fe.feat_engineering(train);
test = fe.feat_engineering(test);

In [5]:
train_dummies, test_dummies = fe.one_hot_encoding(train, test)

In [6]:
X = train_dummies.drop('SalePrice', 1)
y = train_dummies.SalePrice

## Modeling

### Linear Modeling
- Figure out how to use pipelines
- Standardize data, because cols have different scales (area, count, quality, etc). If needed, explore MinMaxScaler or MaxAbsScaler.
- Normalize data, because numeric cols have different ranges (single-digit bathrooms, double/triple-digit areas, quadruple-digit years, etc.)
- Notes: Explore different types of CV, outlier-robust regressors, GLMs

In [7]:
reg = LinearRegression().fit(X, y)
reg_scores = cross_val_score(reg, X, y, cv=10)

print('Baseline Linear Regression Accuracy: %0.5f (+/- %0.5f)'
      % (reg_scores.mean(), reg_scores.std() * 2))

Baseline Linear Regression Accuracy: 0.90267 (+/- 0.02661)


In [8]:
def linear_transform(pipe, pipe_name, X=X, y=y):
    pipe.fit(X, y) # taking log(y) here does not change the accuracy
    pipe_scores = cross_val_score(pipe, X, y, cv=10)    
    
    print('%s Accuracy: %0.5f (+/- %0.5f)'
          % (pipe_name, pipe_scores.mean(), pipe_scores.std() * 2))

In [9]:
tt_lin = TransformedTargetRegressor(
        regressor=LinearRegression(), func=np.log, inverse_func=np.exp)
linear_transform(tt_lin, 'Log(y)')

Log(y) Accuracy: 0.91780 (+/- 0.02405)


In [10]:
tt_hub = TransformedTargetRegressor(
        regressor=HuberRegressor(max_iter=1e10), func=np.log, inverse_func=np.exp)

scaled = StandardScaler().fit_transform(X)
minmax = MinMaxScaler().fit_transform(X)
maxabs = MaxAbsScaler().fit_transform(X)
robust = RobustScaler().fit_transform(X)
power = PowerTransformer().fit_transform(X)

linear_transform(tt_hub, 'Log(y)', X=scaled)
linear_transform(tt_hub, 'Log(y)', X=minmax)
linear_transform(tt_hub, 'Log(y)', X=maxabs)
linear_transform(tt_hub, 'Log(y)', X=robust)
linear_transform(tt_hub, 'Log(y)', X=power)

  loglike = -n_samples / 2 * np.log(x_trans.var())


Log(y) Accuracy: 0.87425 (+/- 0.10432)
Log(y) Accuracy: 0.91500 (+/- 0.04124)
Log(y) Accuracy: 0.91502 (+/- 0.04130)
Log(y) Accuracy: 0.91481 (+/- 0.04143)
Log(y) Accuracy: 0.87508 (+/- 0.10915)


In [11]:
tt_ridge = TransformedTargetRegressor(
        regressor=RidgeCV(), func=np.log, inverse_func=np.exp)
linear_transform(tt_ridge, 'Log(y)')

Log(y) Accuracy: 0.91823 (+/- 0.01797)


In [12]:
tt_lasso = TransformedTargetRegressor(
        regressor=LassoCV(normalize=True), func=np.log, inverse_func=np.exp)
linear_transform(tt_lasso, 'Log(y)')

Log(y) Accuracy: 0.91905 (+/- 0.01886)


In [13]:
tt_net = TransformedTargetRegressor(
        regressor=ElasticNetCV(normalize=True), func=np.log, inverse_func=np.exp)
linear_transform(tt_net, 'Log(y)')

Log(y) Accuracy: 0.91735 (+/- 0.01762)


In [14]:
pca = PCA()
# set the tolerance to a large value to make the example faster
pipe = Pipeline(steps=[('pca', pca), ('tt_ridge', tt_ridge)])

param_grid = {'pca__n_components': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]}
search = GridSearchCV(pipe, param_grid, n_jobs=-1)
search.fit(X, y)

linear_transform(search, 'PCA')

PCA Accuracy: 0.87818 (+/- 0.02518)
