# Final project
## Robert Shaheen version

In [1]:
#Last run date
import datetime
print (datetime.datetime.now().strftime("%B %d, %Y %H:%M:%S"))
import warnings
warnings.filterwarnings('ignore')

December 02, 2018 19:46:31


In [2]:
# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os

# to make this notebook's output stable across runs
np.random.seed(42)

create_directory = False

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
if not(os.path.isdir(IMAGES_PATH)) and (create_directory):
    os.makedirs(IMAGES_PATH)

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [3]:
#BK Import section
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion


from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import expon, reciprocal, uniform
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

# Prepare data

In [4]:
#An alternate way to prepare data
df_train = pd.read_csv('datasets/train.csv')
df_test = pd.read_csv('datasets/train.csv')

sale_price=df_train.pop('SalePrice')
df_train.drop(['Id'], inplace=True, axis=1)
df_test.drop(['Id'], inplace=True, axis=1)

numerical_features_indices_tr = np.where(df_train.dtypes != np.object)[0]
categorical_features_indices_tr = np.where(df_train.dtypes == np.object)[0]

numerical_features_indices = np.where(df_train.dtypes != np.object)[0]
categorical_features_indices = np.where(df_train.dtypes == np.object)[0]

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler()),
        #('poly_features', PolynomialFeatures(degree=1, include_bias=False)), # Take much more time to run
    ])

cat_pipeline = Pipeline([
        ('cat_imputer', SimpleImputer(strategy='most_frequent')), 
        ('cat_Encoder', OneHotEncoder(handle_unknown='ignore')), 
    ])

full_pipeline = ColumnTransformer([
        ('num', num_pipeline, numerical_features_indices),
        ('cat1', cat_pipeline, categorical_features_indices),
    ])

df_prep = full_pipeline.fit_transform(df_train)
print(df_prep.shape)

df_test = full_pipeline.transform(df_test)
print(df_test.shape)

(1460, 288)
(1460, 288)


In [5]:
train_X, val_X, train_y, val_y = train_test_split(df_prep, sale_price, random_state = 42, test_size=0.2)

# Baseline estimators

In [6]:
# Compare different models with the same input data
names = ["Linear Regression", "SVR","Ridge","Lasso","ElasticNet","SGDRegressor","KNeighbors Regressor",
         "Decision Tree Regressor", "Random Forest Regressor"]

baseline_regressors = [
    LinearRegression(), 
    SVR(kernel="linear"),
    Ridge(random_state=42),
    Lasso(random_state=42),
    ElasticNet(random_state=42),
    SGDRegressor(random_state=42),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42)
]

# Create a dictionary for the regressor mapped to the rmse of baseline regressor
base_rmse = {}
# dict to store final models
final_models = {}
    
tuned_regressors = [
    LinearRegression(), 
    SVR(kernel="linear"),
    Ridge(random_state=42),
    Lasso(random_state=42),
    ElasticNet(random_state=42),
    SGDRegressor(random_state=42),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42)
]

In [7]:
def print_regressor_RMSE(X_tr, X_ts, y_tr, y_ts,regressors):
    for name, rgs in zip(names, regressors):
        rgs.fit(X_tr, y_tr)
        y_pred =rgs.predict(X_ts)
        rmse = np.sqrt(mean_squared_error(y_ts, y_pred)) 
        base_rmse[name] = rmse
        print('RMSE for {0} model is {1}'.format(name,  rmse))

In [8]:
%%time
print_regressor_RMSE(train_X, val_X, train_y, val_y,baseline_regressors)

RMSE for Linear Regression model is 29511.17445905894
RMSE for SVR model is 82230.28697572199
RMSE for Ridge model is 30081.345746646897




RMSE for Lasso model is 28166.597817133625
RMSE for ElasticNet model is 36274.987007837975
RMSE for SGDRegressor model is 32052.21241784793
RMSE for KNeighbors Regressor model is 38010.12204533282
RMSE for Decision Tree Regressor model is 45286.728826158025
RMSE for Random Forest Regressor model is 33190.903777621184
Wall time: 5.92 s


# Single learning methods

In [9]:
# print the reults from the grid/random searchCV runs
def print_scores(estimator, display_threshold=40000):
    rbf_cvrs = estimator.cv_results_
    for mean_score, params in zip(rbf_cvrs["mean_test_score"], rbf_cvrs["params"]):
        if np.sqrt(-mean_score) < display_threshold:
            print(np.sqrt(-mean_score), params)

In [10]:
# evaluate using validation data
def validate_model(model_name, estimator, original_list=names, X=val_X, y=val_y):
    print('The best parameters for {0} model are:\n{1}\n'.format(model_name, grid_search.best_params_))
    final_models[model_name]=estimator.best_estimator_
    y_pred = final_models[model_name].predict(X)
    rmse= np.sqrt(mean_squared_error(y, y_pred))
    print('The RMSE for model is: {0:.2f}'.format(rmse))
    if model_name in original_list:
        print('The overall improvement with tuned hyper parameters is {0:.2%}'
              .format((base_rmse[model_name]-rmse)/base_rmse[model_name]))

## Linear regression optimization

In [11]:
%%time
#Hyper parameters optimization - Linear Regression
param_grid = {
    'fit_intercept': [True],
}

# oddly enough it seems that the best fit is the unchanged algorthim

grid_search = GridSearchCV(LinearRegression(), param_grid, cv=10,scoring='neg_mean_squared_error',verbose=1,n_jobs=2)
grid_search.fit(train_X, train_y)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    3.3s finished


Wall time: 3.67 s


In [12]:
print_scores(grid_search)

In [13]:
validate_model('Linear Regression', grid_search)

The best parameters for Linear Regression model are:
{'fit_intercept': True}

The RMSE for model is: 29511.18
The overall improvement with tuned hyper parameters is -0.00%


## SVR optimization

In [14]:
%%time
#Hyper parameters optimization - SVR
param_distribs = {
    'kernel': ['rbf'],
    'C': [113564],
    'gamma': [0.0007790692366582295],
}

# oddly enough it seems that the best fit is the unchanged algorthim

rand_search = RandomizedSearchCV(SVR(), param_distributions=param_distribs, n_iter=1, cv=5, 
                                 scoring='neg_mean_squared_error', verbose=1, n_jobs=2, random_state=42)
rand_search.fit(train_X, train_y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    3.1s finished


Wall time: 3.83 s


In [15]:
print_scores(rand_search)

31013.627149321474 {'kernel': 'rbf', 'gamma': 0.0007790692366582295, 'C': 113564}


In [16]:
validate_model('SVR', rand_search)

The best parameters for SVR model are:
{'fit_intercept': True}

The RMSE for model is: 32261.25
The overall improvement with tuned hyper parameters is 60.77%


## Ridge regression optimization

In [17]:
%%time
#Hyper parameters optimization - Ridge
param_grid = {
    'alpha': [1.0],
    'fit_intercept': [True],
    #'solver': ['cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'auto'],
    'solver': ['auto'],
}

# oddly enough it seems that the best fit is the unchanged algorthim

grid_search = GridSearchCV(Ridge(random_state=42), param_grid, cv=3,scoring='neg_mean_squared_error',verbose=1,n_jobs=2)
grid_search.fit(train_X, train_y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    1.5s finished


Wall time: 2.51 s


In [18]:
print_scores(grid_search)

35272.76116039517 {'alpha': 1.0, 'fit_intercept': True, 'solver': 'auto'}


In [19]:
validate_model('Ridge', grid_search)

The best parameters for Ridge model are:
{'alpha': 1.0, 'fit_intercept': True, 'solver': 'auto'}

The RMSE for model is: 30081.35
The overall improvement with tuned hyper parameters is 0.00%


## Lasso regression optimization

In [20]:
%%time
#Hyper parameters optimization - Lasso
param_grid = {
    'alpha': [1.0],
    'max_iter': [1000],
}

# oddly enough it seems that the best fit is the unchanged algorthim

grid_search = GridSearchCV(Lasso(random_state=42), param_grid, cv=10,scoring='neg_mean_squared_error',verbose=1,n_jobs=2)
grid_search.fit(train_X, train_y)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    6.6s finished


Wall time: 8.07 s




In [21]:
print_scores(grid_search)

37032.95471550481 {'alpha': 1.0, 'max_iter': 1000}


In [22]:
validate_model('Lasso', grid_search)

The best parameters for Lasso model are:
{'alpha': 1.0, 'max_iter': 1000}

The RMSE for model is: 28166.60
The overall improvement with tuned hyper parameters is 0.00%


## Elastic net optimization

In [23]:
%%time
#Hyper parameters optimization - Elastic Net
param_grid = {
    'alpha': [0.1],
    'l1_ratio': [0.775],
}

grid_search = GridSearchCV(ElasticNet(random_state=42), param_grid, cv=8,scoring='neg_mean_squared_error', n_jobs=2)
grid_search.fit(train_X, train_y)

Wall time: 2.39 s


In [24]:
print_scores(grid_search)

33628.6959313338 {'alpha': 0.1, 'l1_ratio': 0.775}


In [25]:
validate_model('ElasticNet', grid_search)

The best parameters for ElasticNet model are:
{'alpha': 0.1, 'l1_ratio': 0.775}

The RMSE for model is: 31138.20
The overall improvement with tuned hyper parameters is 14.16%


## SGDRegressor optimization - no regularization

In [26]:
%%time
#Hyper parameters optimization - SGDRegressor
param_grid = {
    'eta0': np.linspace(0.001, 0.1, num=20),
    'loss': ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'max_iter': [50],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'power_t':[0.5],
    'penalty': ['none']
}

grid_search = GridSearchCV(SGDRegressor(random_state=42), param_grid, verbose=1, cv=3,scoring='neg_mean_squared_error', n_jobs=2)
grid_search.fit(train_X, train_y)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 484 tasks      | elapsed:    5.8s


Wall time: 13.1 s


[Parallel(n_jobs=2)]: Done 960 out of 960 | elapsed:   13.0s finished


In [27]:
print_scores(grid_search, display_threshold=35000)

34954.45710065991 {'eta0': 0.01663157894736842, 'learning_rate': 'invscaling', 'loss': 'squared_epsilon_insensitive', 'max_iter': 50, 'penalty': 'none', 'power_t': 0.5}
34961.38728992931 {'eta0': 0.021842105263157895, 'learning_rate': 'invscaling', 'loss': 'squared_epsilon_insensitive', 'max_iter': 50, 'penalty': 'none', 'power_t': 0.5}
34967.959445105684 {'eta0': 0.03226315789473684, 'learning_rate': 'invscaling', 'loss': 'squared_loss', 'max_iter': 50, 'penalty': 'none', 'power_t': 0.5}
34923.57392029842 {'eta0': 0.03747368421052632, 'learning_rate': 'invscaling', 'loss': 'squared_loss', 'max_iter': 50, 'penalty': 'none', 'power_t': 0.5}
34947.950573622315 {'eta0': 0.04268421052631579, 'learning_rate': 'invscaling', 'loss': 'squared_loss', 'max_iter': 50, 'penalty': 'none', 'power_t': 0.5}


In [28]:
validate_model('SGDRegressor', grid_search)

The best parameters for SGDRegressor model are:
{'eta0': 0.03747368421052632, 'learning_rate': 'invscaling', 'loss': 'squared_loss', 'max_iter': 50, 'penalty': 'none', 'power_t': 0.5}

The RMSE for model is: 31602.49
The overall improvement with tuned hyper parameters is 1.40%


## SGDRegressor Optimization - with regularization
commented out because it runs 72000 fits with no performance increase.

In [29]:
# %%time
# #Hyper parameters optimization - SGDRegressor
# param_grid = [
#     {'eta0': np.linspace(0.001, 0.1, num=20),
#     'loss': ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
#     'max_iter': [50],
#     'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
#     'power_t':[0.5],
#     'penalty': ['l2','l1','elasticnet'],
#     'l1_ratio': np.linspace(0, 1, num=5),
#     'alpha': np.linspace(0.0001, 0.1, num=5),
#     }
# ]

# grid_search = GridSearchCV(SGDRegressor(random_state=42), param_grid, verbose=1, cv=3,scoring='neg_mean_squared_error', n_jobs=2)
# grid_search.fit(train_X, train_y)

In [30]:
# print_scores(grid_search)

In [31]:
# validate_model('SGDRegressor', grid_search)

## KNeighborsRegressor optimization

In [32]:
%%time
#Hyper parameters optimization - KNeighbors Regressor
param_grid = {
    'n_neighbors': np.arange(3,8),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto'],
    'p': [1, 2]
}

grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, verbose=1, cv=3,scoring='neg_mean_squared_error', n_jobs=2)
grid_search.fit(train_X, train_y)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    8.0s


Wall time: 10.4 s


[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed:   10.3s finished


In [33]:
print_scores(grid_search)

35851.730793495444 {'algorithm': 'auto', 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
35437.67194871518 {'algorithm': 'auto', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
39996.77996214185 {'algorithm': 'auto', 'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
39970.572570900615 {'algorithm': 'auto', 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
35782.53637057415 {'algorithm': 'auto', 'n_neighbors': 4, 'p': 1, 'weights': 'uniform'}
35334.809806304154 {'algorithm': 'auto', 'n_neighbors': 4, 'p': 1, 'weights': 'distance'}
39138.54812932241 {'algorithm': 'auto', 'n_neighbors': 4, 'p': 2, 'weights': 'uniform'}
39098.05664835088 {'algorithm': 'auto', 'n_neighbors': 4, 'p': 2, 'weights': 'distance'}
36047.41645732144 {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
35580.81488372424 {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
37852.31038590408 {'algorithm': 'auto', 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
37852.34914326504 {'algo

In [34]:
validate_model('KNeighbors Regressor', grid_search)

The best parameters for KNeighbors Regressor model are:
{'algorithm': 'auto', 'n_neighbors': 4, 'p': 1, 'weights': 'distance'}

The RMSE for model is: 33949.72
The overall improvement with tuned hyper parameters is 10.68%


## Decision Tree Regressor optimization

In [35]:
# Print the most important features
def most_important_features(model, rgr_name, numResults=288, sort=True):
    important_features = []
    df = pd.DataFrame(df_prep.toarray()) 
    for name, score in zip(df.columns, model.feature_importances_):
        important_features.append((name, score))

    if sort:
        important_features.sort(key=lambda tup: tup[1], reverse=True)

    print('Top 10 features according to {}.\n'.format(rgr_name))
    print('Column # represents the column location in df_prep (with one_hot_encoding):\n')
    print('{:>12}{:>7}\n'.format('Column #', 'Score'))

    for name, score in important_features[0:numResults]:
        print('{:>12}: {:>8}'.format(name, score))

In [36]:
%%time
#Hyper parameters optimization - Decision Tree Regressor
param_grid = {
    'max_depth': np.append(np.arange(1,30+1,2), None),
    'min_samples_split': np.arange(2,30+1,2),
    'min_samples_leaf': [2],
    'max_features': ['auto'],
    #'max_leaf_nodes': np.append(np.arange(2,25), None),
    'max_leaf_nodes': [None], # none is the best every time
}

grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, verbose=1, cv=3,scoring='neg_mean_squared_error', n_jobs=2)
grid_search.fit(train_X, train_y)

Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 590 tasks      | elapsed:   12.9s


Wall time: 16.3 s


[Parallel(n_jobs=2)]: Done 720 out of 720 | elapsed:   16.2s finished


In [37]:
print_scores(grid_search, display_threshold=38416)

38415.557018841486 {'max_depth': 13, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_samples_leaf': 2, 'min_samples_split': 22}
38415.557018841486 {'max_depth': 15, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_samples_leaf': 2, 'min_samples_split': 22}
38415.557018841486 {'max_depth': 17, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_samples_leaf': 2, 'min_samples_split': 22}
38415.557018841486 {'max_depth': 19, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_samples_leaf': 2, 'min_samples_split': 22}
38415.557018841486 {'max_depth': 21, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_samples_leaf': 2, 'min_samples_split': 22}
38415.557018841486 {'max_depth': 23, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_samples_leaf': 2, 'min_samples_split': 22}
38415.557018841486 {'max_depth': 25, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_samples_leaf': 2, 'min_samples_split': 22}
38415.557018841486 {'max_depth': 27, 'max_features': 'auto', 'max_lea

In [38]:
validate_model('Decision Tree Regressor', grid_search)

The best parameters for Decision Tree Regressor model are:
{'max_depth': 13, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_samples_leaf': 2, 'min_samples_split': 22}

The RMSE for model is: 34931.28
The overall improvement with tuned hyper parameters is 22.87%


In [39]:
# Print the most important features according to decision tree analysis
most_important_features(final_models['Decision Tree Regressor'], 'Decision Tree Regressor', numResults=5, sort=True)

Top 10 features according to Decision Tree Regressor.

Column # represents the column location in df_prep (with one_hot_encoding):

    Column #  Score

           3: 0.685652492153181
          15: 0.11322765628744695
          11: 0.033788062958275966
          13: 0.030759413326429414
           8: 0.01806852160964238


## Random Forest Regressor optimization

### RandomSearchCV

In [40]:
# %%time
# # Hyperparameters optimization - Random Forest Regressor
# # Use RandomSearchCV
# # 38min 58s run time with the following params:

# param_dist = {
#      'n_estimators': np.arange(1500,2000+1),
#      'max_depth': np.arange(15,25+1),
#      'max_features': np.arange(20,100+1),
#      'min_samples_split': np.arange(5, 7+1), 
#      'min_samples_leaf': [2], 
# }

# rand_search = RandomizedSearchCV(RandomForestRegressor(random_state=42), param_distributions=param_dist, scoring='neg_mean_squared_error', 
#                                     n_iter=100, cv=3, verbose=1, random_state=42, n_jobs=2)
# rand_search.fit(train_X, train_y)

In [41]:
# print_scores(rand_search)

In [42]:
# validate_model('Random Forest Regressor', rand_search)

# # --- results ----
# # The best parameters for Random Forest Regressor model are:
# # {'n_estimators': 1762, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 100, 'max_depth': 18}

# # The RMSE for Random Forest Regressor model is: 29175.04
# # The overall improvement with tuned hyper parameters is 12.10%

# # Labeled data mean = 178840, rmse/mean = 16.31%

### GridSearchCV

In [43]:
%%time
#Hyper parameters optimization - Random Forest Regressor
param_grid = {
    'n_estimators': [2000],
    'max_depth': [18],
    'max_features': [120],
    'min_samples_split': [5], 
    'min_samples_leaf': [2], 
    # 'max_leaf_nodes': np.arange(2, 100+1, 10) # does not help rmse_val
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, verbose=1, cv=3,scoring='neg_mean_squared_error', n_jobs=2)
grid_search.fit(train_X, train_y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:   53.4s finished


Wall time: 2min 14s


In [44]:
print_scores(grid_search)

29053.836358164444 {'max_depth': 18, 'max_features': 120, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 2000}


In [45]:
validate_model('Random Forest Regressor', grid_search)

The best parameters for Random Forest Regressor model are:
{'max_depth': 18, 'max_features': 120, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 2000}

The RMSE for model is: 28853.04
The overall improvement with tuned hyper parameters is 13.07%


In [46]:
most_important_features(final_models['Random Forest Regressor'], 'Random Forest Regressor', numResults=5, sort=True)

Top 10 features according to Random Forest Regressor.

Column # represents the column location in df_prep (with one_hot_encoding):

    Column #  Score

           3: 0.3009408618104748
          15: 0.1356567961478179
          25: 0.08838503393349043
         170: 0.047836395154024755
          11: 0.04457602342625789


In [47]:
# ---- best results ----
# The best parameters for Random Forest Regressor model are:
# {'max_depth': 18, 'max_features': 120, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 2000}

# The RMSE for Random Forest Regressor model is: 28853.04
# The overall improvement with tuned hyper parameters is 13.07%

# Labeled data mean = 178840, rmse/mean = 16.13%

# Aggregate learning methods

## BaggingRegressor

In [79]:
%%time
#Hyper parameters optimization - BaggingRegressor
from sklearn.ensemble import BaggingRegressor

param_grid = {
    'max_samples': [1.0],
    'max_features': [1.0],
    'n_estimators': [7],
    'bootstrap': [True],
#     'base_estimator': [final_models['Lasso'], final_models['Linear Regression'], final_models['ElasticNet'],
#                       final_models['SVR'], final_models['Ridge'], final_models['SGDRegressor'], LinearRegression(),
#                       Lasso(random_state=42)],
    'base_estimator': [final_models['SVR']]
}

bag_rgr = BaggingRegressor(n_jobs=2, random_state=42)

grid_search = GridSearchCV(estimator=bag_rgr, param_grid=param_grid, verbose=1, 
                           cv=3,scoring='neg_mean_squared_error', n_jobs=2)
grid_search.fit(train_X, train_y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed:    2.8s finished


Wall time: 3.7 s


In [80]:
print_scores(grid_search, display_threshold=33200)

31363.471514839377 {'base_estimator': SVR(C=113564, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma=0.0007790692366582295, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False), 'bootstrap': True, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 7}


In [81]:
validate_model('BaggingRegressor', grid_search)

The best parameters for BaggingRegressor model are:
{'base_estimator': SVR(C=113564, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma=0.0007790692366582295, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False), 'bootstrap': True, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 7}

The RMSE for model is: 32006.70


## GradientBoostingRegressor

In [None]:
%%time
#Hyper parameters optimization - GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

param_grid = {
    'max_samples': [1.0],
    'max_features': [1.0],
    'n_estimators': [7],
    'bootstrap': [True],
#     'base_estimator': [final_models['Lasso'], final_models['Linear Regression'], final_models['ElasticNet'],
#                       final_models['SVR'], final_models['Ridge'], final_models['SGDRegressor'], LinearRegression(),
#                       Lasso(random_state=42)],
    'base_estimator': [final_models['SVR']]
}

bag_rgr = BaggingRegressor(n_jobs=2, random_state=42)

grid_search = GridSearchCV(estimator=bag_rgr, param_grid=param_grid, verbose=1, 
                           cv=3,scoring='neg_mean_squared_error', n_jobs=2)
grid_search.fit(train_X, train_y)

In [None]:
print_scores(grid_search, display_threshold=33200)

In [None]:
validate_model('GradientBoostingRegressor', grid_search)

## Stacking