## Setup and Data Import

In [1]:
import numpy as np

import pandas as pd
pd.options.display.max_rows = None
pd.options.display.max_columns = None

import sys
sys.path.insert(0,'..')

import Imputation_Module as imp # imports csv files
import FeatureEngineering_Module as fe
import Encoding_Module as ec

from sklearn.metrics import mean_squared_error
import sklearn.linear_model as lm
import sklearn.model_selection as ms
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.ensemble import GradientBoostingRegressor

import statsmodels.api as sm

import random

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('dark_background')

## Imputation and Feature Engineering

In [2]:
train, test = imp.impute_data()

train = fe.FeatureEngineering(train)
test = fe.FeatureEngineering(test)

train_dummies, test_dummies = fe.Dummify(train, test)

train_ec, test_ec = ec.encode(train, test)

In [3]:
X = train_dummies.drop('SalePrice', 1)
y = train_dummies.SalePrice

In [4]:
X_ec = train_ec.drop('SalePrice', 1)
y_ec = train_dummies.SalePrice

## Modeling

### Linear Modeling with Log(y)

In [5]:
X_train, X_test, y_train, y_test \
    = ms.train_test_split(X, np.log(y), test_size=0.2, random_state=0)

In [6]:
def linear_model(model, X=X_train, y=y_train):
    model.fit(X, y)
    score = model.score(X, y)
    y_pred = model.predict(X_test)

    RMSLE = mean_squared_error(y_test, y_pred, squared=False)
    print('%s R^2: %0.5f'
          % (type(model).__name__, score.mean()))
    print('Root Mean Squared Log Error: %0.5f' % RMSLE)

#### Baseline

In [7]:
ols = lm.LinearRegression()
linear_model(ols)

LinearRegression R^2: 0.93146
Root Mean Squared Log Error: 0.11501


#### Ridge

In [8]:
ridgecv = lm.RidgeCV(alphas=np.logspace(-5, 5, 100), normalize=True)
linear_model(ridgecv)


RidgeCV R^2: 0.92983
Root Mean Squared Log Error: 0.11273


#### Lasso

In [9]:
lasso = lm.LassoCV(alphas=np.logspace(-7, -1, 100), max_iter=10000, normalize=True, cv=10)
linear_model(lasso)

LassoCV R^2: 0.92857
Root Mean Squared Log Error: 0.11116


In [10]:
lasso.alpha_

2.310129700083158e-05

#### Elastic Net

In [11]:
net = lm.ElasticNetCV(normalize=True)
linear_model(net)

ElasticNetCV R^2: 0.92889
Root Mean Squared Log Error: 0.11233


In [12]:
lasso_coefs = pd.DataFrame({'feature': np.array(X_train.columns),
                            'coefficient': lasso.coef_}
                          ).sort_values('coefficient', ascending=False)

# lasso_coefs.plot.barh(y='coefficient', x='feature', figsize=(12, 40),
#                       title='LassoCV Coefficients');

In [29]:
X_add_const = sm.add_constant(X_train)
sm_lasso = sm.OLS(y_train, X_add_const).fit_regularized('sqrt_lasso', alpha=2.31)#.summary().tables[1]
# sm_lasso.rename(columns=sm_lasso[0]).drop(sm_lasso.index[0])

sm_lasso.summary

AttributeError: 'function' object has no attribute 'tables'

### Tree Modeling

In [None]:
Xec_train, Xec_test, yec_train, yec_test = ms.train_test_split(X_ec, y_ec, test_size=0.2, random_state=0)

In [None]:
def get_error(model, X_train=Xec_train, y_train=yec_train, X_test=Xec_test, y_test=yec_test):
    model.fit(X_train, y_train)
    train_error = 1 - model.score(X_train, y_train)
    test_error  = 1 - model.score(X_test, y_test)
    print("The training error is: %.5f" %train_error)
    print("The test error is: %.5f" %test_error)

#### Baseline (Log(y))

In [None]:
tree_model = tree.DecisionTreeRegressor()
get_error(tree_model)

In [None]:
grid_para_tree = {'criterion': ["mse", "friedman_mse", "mae"],
                  'max_depth': range(1, 12)}
grid_search_tree = ms.GridSearchCV(tree_model, grid_para_tree, cv=10)

get_error(grid_search_tree)
print('The best score is %.4f' %grid_search_tree.best_score_)
print('The best parameters are %s' %grid_search_tree.best_params_)

#### Gradient Boosting

In [None]:
# gb_model = GradientBoostingRegressor()

# grid_para_gb = {'loss': ['ls', 'lad', 'huber', 'quantile'],
#                 'criterion': ["mse", "friedman_mse", "mae"],
#                 'max_depth': range(1, 12)}
# grid_search_gb = ms.GridSearchCV(gb_model, grid_para_gb, cv=10, n_jobs=5)

# get_error(grid_search_gb)
# print('The best score is %.4f' %grid_search_tree.best_score_)
# print('The best parameters are %s' %grid_search_tree.best_params_)
# # -------------------------------------------------------------
# # TAKES FOREVER TO RUN. RESULTS:

# # The training error is: 0.01315
# # The test error is: 0.09606
# # The best score is 0.8820023636243057
# # The best parameters are {'criterion': 'mse', 'loss': 'lad', 'max_depth': 9}

In [None]:
gb_model = GradientBoostingRegressor(loss='lad', n_estimators=400, subsample=0.8,
                                     criterion='mse', max_depth=9, random_state=0)
get_error(gb_model)

##### Feature Importances

In [None]:
feat_imp = pd.DataFrame({'feature': np.array(Xec_train.columns),
                         'importance': gb_model.feature_importances_})

In [None]:
feat_imp.sort_values('importance').plot.barh(y='importance', x='feature', figsize=(12,12),
                   title='Feature Importances');