## Setup and Data Import

In [None]:
import numpy as np

import pandas as pd
pd.options.display.max_rows = None
pd.options.display.max_columns = None

import sys
sys.path.insert(0,'..')

import Imputation_Module as imp
import FeatureEngineering_Module as fe
import OneHotEncoding_Module as ohe
import Encoding_Module as ec

from sklearn.metrics import mean_squared_error, mean_squared_log_error
import sklearn.linear_model as lm
import sklearn.model_selection as ms
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.ensemble import GradientBoostingRegressor

# import statsmodels.api as sm

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the raw train/test CSVs that live one directory above the notebook.
train_raw, test_raw = (
    pd.read_csv(csv_path) for csv_path in ('../train.csv', '../test.csv')
)

# Work on copies so the raw frames stay untouched for later reference.
train, test = train_raw.copy(), test_raw.copy()

## Imputation and Feature Engineering

In [None]:
# Rebind train/test to the imputed frames returned by the project helper.
# NOTE(review): impute_data() takes no arguments, so it presumably loads the
# CSVs itself rather than using the frames read earlier -- confirm.
train, test = imp.impute_data()

# Single-step pipeline wrapping the project's feature-engineering transformer.
pipe = Pipeline([
    ('selector', fe.FeatureEngineering())
])

# NOTE(review): transform() is called without a prior fit(); presumably
# FeatureEngineering is stateless -- verify against the module.
train = pipe.transform(train)
test = pipe.transform(test)

# Two alternative encodings are built from the same engineered frames:
# *_dummies is used for the linear models, *_ec for the tree models below.
# NOTE(review): Dummify.transform is invoked on the class, not an instance --
# presumably a static/class method; confirm.
train_dummies, test_dummies = ohe.Dummify.transform(train, test)

train_ec, test_ec = ec.encode(train, test)

In [None]:
# Feature matrix and target for the one-hot-encoded (dummified) training frame.
# `columns=` is used instead of a positional axis argument: pandas deprecated
# positional `axis` in DataFrame.drop (1.3) and removed it in 2.0.
X = train_dummies.drop(columns='SalePrice')
y = train_dummies.SalePrice

In [None]:
# Feature matrix and target for the encoded training frame used by the tree
# models. `columns=` replaces the positional axis argument, which pandas 2.0
# no longer accepts.
X_ec = train_ec.drop(columns='SalePrice')
# Take the target from the same frame as the features so rows stay aligned
# even if the two encoders ever order or filter rows differently.
y_ec = train_ec.SalePrice

## Modeling

### Linear Modeling

In [None]:
def linear_model(model, X=X, y=y, cv=10):
    """Fit ``model`` and print its cross-validated score and RMSE.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``; it is fitted on the full ``X``/``y`` here, so
        it stays fitted after the call.
    X, y : array-like
        Features and target. The defaults are the notebook-level ``X`` and
        ``y`` as they existed when this cell was executed (default arguments
        are bound at definition time), so re-run this cell after changing them.
    cv : int or cross-validation splitter, default 10
        Passed unchanged to ``cross_val_score`` and ``cross_val_predict``.

    Prints the mean and two standard deviations of the per-fold scores, and the
    RMSE of the out-of-fold predictions, in the units of ``y``. The score is
    whatever ``cross_val_score`` uses by default for the estimator (R^2 for
    scikit-learn regressors), although the printed label says "Accuracy".
    Returns ``None``.
    """
    model.fit(X, y)
    model_scores = ms.cross_val_score(model, X, y, cv=cv)
    y_pred = ms.cross_val_predict(model, X, y, cv=cv)
    # Take the square root explicitly instead of passing `squared=False`, which
    # newer scikit-learn releases deprecate and then remove.
    RMSE = np.sqrt(mean_squared_error(y, y_pred))
    print('%s Cross-Validation Accuracy: %0.5f (+/- %0.5f)'
          % (type(model).__name__, model_scores.mean(), model_scores.std() * 2))
    print('Root Mean Squared Error: %0.5f' % RMSE)
# What is the difference between RMSE and RMSLE?
# What is the difference between cv=5 and cv=10?

#### Baseline

In [None]:
# Ordinary least squares with linear_model's default X and y (the dummified
# features and the untransformed SalePrice).
ols = lm.LinearRegression()
linear_model(ols)

In [None]:
# Natural log of the target; used as the response for the models below.
log_y = np.log(y)

#### Log(y)

In [None]:
# Same OLS estimator against log(SalePrice); the printed RMSE is therefore in
# log units and not comparable to the baseline RMSE above.
linear_model(ols, y=log_y)

#### Ridge

In [None]:
# Ridge with the penalty chosen from 100 log-spaced alphas in [1e-5, 1e5].
# NOTE(review): the `normalize=` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell needs an older release to run as written.
ridgecv = lm.RidgeCV(alphas=np.logspace(-5, 5, 100), normalize=True)
linear_model(ridgecv, y=log_y)
# Why does Ridge use LOOCV by default instead of k-fold?

#### Lasso

In [None]:
# Lasso with the alpha grid left at LassoCV's defaults.
# NOTE(review): `normalize=` was removed in scikit-learn 1.2; requires an
# older release to run as written.
lasso = lm.LassoCV(normalize=True)
linear_model(lasso, y=log_y)
# Why does Ridge specify alphas but Lasso sets them automatically?

#### Elastic Net

In [None]:
# Elastic net with all tuning parameters left at ElasticNetCV's defaults.
# NOTE(review): `normalize=` was removed in scikit-learn 1.2; requires an
# older release to run as written.
net = lm.ElasticNetCV(normalize=True)
linear_model(net, y=log_y)
# When would we choose Elastic Net?

#### PCA

In [None]:
# Because PCA results were worse, does that mean the manual feature selection done was effective?

In [None]:
# pca = PCA()
# pca_pipe = Pipeline(steps=[('pca', pca), ('lasso', lasso)])

# pca_params = {'pca__n_components': np.arange(5, 55, 5)}
# pca_search = ms.GridSearchCV(pca_pipe, pca_params)
# pca_search.fit(X, log_y)

# linear_model(pca_search, y=log_y)
# -------------------------------------------------------------
# TAKES FOREVER TO RUN. RESULTS:

# Ridge Cross-Validation Accuracy: 0.87426 (+/- 0.02962)
# Root Mean Squared Error: 0.13987

# Lasso Cross-Validation Accuracy: 0.87360 (+/- 0.02834)
# Root Mean Squared Error: 0.14004

In [None]:
# From here, what explains the remaining error? Judging from Ryan's charts,
# it appears to be variance.

### Tree Modeling

In [None]:
# Hold out 20% of the encoded training data for evaluating the tree models;
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = ms.train_test_split(X_ec, y_ec, test_size=0.2, random_state=0)

In [None]:
def get_error(X_train, y_train, X_test, y_test, model, show = True):
    """Fit ``model`` on the training split and report ``1 - score`` per split.

    ``model`` must expose ``fit(X, y)`` and ``score(X, y)``; for scikit-learn
    regressors ``score`` is R^2, so the reported "error" is ``1 - R^2``.

    Returns ``[train_error, test_error]``. When ``show`` is true, both values
    are also printed.
    """
    model.fit(X_train, y_train)
    # Score the training split first, then the held-out split.
    errors = [
        1 - model.score(features, target)
        for features, target in ((X_train, y_train), (X_test, y_test))
    ]
    if show:
        for template, value in zip(("The training error is: %.5f",
                                    "The test     error is: %.5f"), errors):
            print(template % value)
    return errors

#### Baseline

In [None]:
# Decision tree with default hyperparameters as the tree baseline; the trailing
# semicolon keeps the returned error list out of the cell output.
tree_model = tree.DecisionTreeRegressor()
get_error(X_train, y_train, X_test, y_test, tree_model);

#### Log(y) - not useful for decision trees because DTs have no linearity assumption

In [None]:
# Fresh default tree fitted on log-transformed targets; the log is applied to
# both the train and the test target so the two scores are on the same scale.
tree_model = tree.DecisionTreeRegressor()
get_error(X_train, y_train.apply(np.log), X_test, y_test.apply(np.log),
          tree_model);

In [None]:
# Exhaustive search over split criterion and depth for the decision tree:
# 3 criteria x 30 depths, each evaluated with 10-fold cross-validation.
# NOTE(review): the "mse"/"mae" criterion names were removed in scikit-learn
# 1.2; an older release is needed to run this as written.
grid_para_tree = dict(
    criterion=["mse", "friedman_mse", "mae"],
    max_depth=range(1, 31),
)
grid_search_tree = ms.GridSearchCV(
    estimator=tree_model,
    param_grid=grid_para_tree,
    cv=10,
)

# Fitting happens inside get_error, which also prints train/test error.
get_error(X_train, y_train, X_test, y_test, model=grid_search_tree)
print('The best score is %.4f' % grid_search_tree.best_score_)
grid_search_tree.best_params_

#### Gradient Boosting

In [None]:
# Gradient-boosted trees tuned over loss, split criterion and depth:
# 4 losses x 3 criteria x 9 depths, each evaluated with 5-fold CV.
# NOTE(review): 'ls'/'lad' and "mse"/"mae" were removed in later scikit-learn
# releases; an older release is needed to run this as written.
tree_model = GradientBoostingRegressor()

grid_para_tree = dict(
    loss=['ls', 'lad', 'huber', 'quantile'],
    # Further dimensions, currently disabled:
    #   'learning_rate': np.arange(0.001, 0.5, 0.1),
    #   'n_estimators': np.arange(50, 550, 100),
    #   'subsample': [1.0, 0.9, 0.85, 0.8],
    criterion=["mse", "friedman_mse", "mae"],
    max_depth=range(1, 10),
)
grid_search_tree = ms.GridSearchCV(
    estimator=tree_model,
    param_grid=grid_para_tree,
    cv=5,
)

# Fitting happens inside get_error, which also prints train/test error.
get_error(X_train, y_train, X_test, y_test, model=grid_search_tree)
print('The best score is %.4f' % grid_search_tree.best_score_)