In [1]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LassoCV, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, mean_squared_error
from modelling_functions import *

In [2]:
#### Read Data files
housing, housing_features, feat_labels, dict_dictonary = read_and_clean(filepath = "../data/clean_train.csv")

  housing.col, id_dictonary = to_numeric(housing, col, 'SalePrice')


In [3]:
htest_id, htest_features, htest_labels, htest_dictonary = read_and_clean(filepath = "../data/clean_test.csv", test = True, dictonary = dict_dictonary)

  housing.col, id_dictonary = to_numeric_test(housing, col, dictonary)


In [4]:
htrain, htest, ptrain, ptest = train_test_split(housing_features, housing.saleprice, test_size = 0.2)

In [5]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 10000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 200, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20, 30, 40]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8, 16, 32, 64]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [200, 1288, 2377, 3466, 4555, 5644, 6733, 7822, 8911, 10000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 29, 48, 67, 86, 105, 124, 143, 162, 181, 200, None], 'min_samples_split': [2, 5, 10, 15, 20, 30, 40], 'min_samples_leaf': [1, 2, 4, 8, 16, 32, 64], 'bootstrap': [True, False]}


In [8]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 1000, cv = 3, verbose=2, random_state=0, n_jobs = -1)
# Fit the random search model
rf_random.fit(htrain, ptrain)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 353 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 636 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done 1001 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done 1446 tasks      | elapsed: 55.9min
[Parallel(n_jobs=-1)]: Done 1973 tasks      | elapsed: 76.5min
[Parallel(n_jobs=-1)]: Done 2580 tasks      | elapsed: 99.0min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed: 115.0min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=1000, n_jobs=-1,
          param_distributions={'n_estimators': [200, 1288, 2377, 3466, 4555, 5644, 6733, 7822, 8911, 10000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 29, 48, 67, 86, 105, 124, 143, 162, 181, 200, None], 'min_samples_split': [2, 5, 10, 15, 20, 30, 40], 'min_samples_leaf': [1, 2, 4, 8, 16, 32, 64], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [9]:
rf_random.best_params_

{'n_estimators': 4555,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 162,
 'bootstrap': False}

In [10]:
clf = RandomForestRegressor(n_estimators=1800, 
                            random_state=99, 
                            n_jobs=-1, 
                            min_samples_split = 2, 
                            min_samples_leaf = 1, 
                            max_features = 'sqrt', 
                            max_depth = 162,
                            bootstrap = False)
sfm = SelectFromModel(clf, threshold = 0.01)
sfm.fit(htrain, ptrain)

# ## Not Finished
# # Measure Feature Importance
feature_selected = []
for feature_list_index in sfm.get_support(indices=True):
     feature_selected.append(feat_labels[feature_list_index])
proxy = feature_selected
#trimmed = ['x1stflrsf', 'x2ndflrsf', 'garagecars', 'overallcond', 'saleprice', 'Unnamed: 0', 'bsmtfinsf1']
housing_features = housing_features[proxy]
#housing_features = housing_features[trimmed_features]
print(housing_features.columns)

Index(['mssubclass', 'lotfrontage', 'lotarea', 'overallqual', 'yearbuilt',
       'yearremodadd', 'exterqual', 'foundation', 'bsmtqual', 'bsmtfinsf1',
       'totalbsmtsf', 'centralair', 'x1stflrsf', 'x2ndflrsf', 'grlivarea',
       'fullbath', 'kitchenqual', 'totrmsabvgrd', 'fireplaces', 'fireplacequ',
       'garagetype', 'garageyrblt', 'garagefinish', 'garagecars', 'garagearea',
       'openporchsf'],
      dtype='object')


In [75]:

trim = ['lotfrontage', 'lotarea', 'landcontour', 'lotconfig', 'condition1', 'roofmatl']
features = housing_features.drop(trim, axis = 1)

In [76]:
features.columns

Index(['mssubclass', 'mszoning', 'street', 'alley', 'neighborhood', 'bldgtype',
       'housestyle', 'overallqual', 'yearbuilt', 'yearremodadd', 'roofstyle',
       'exterior1st', 'exterior2nd', 'masvnrtype', 'masvnrarea', 'exterqual',
       'bsmtqual'],
      dtype='object')

In [54]:
htest_features.columns

Index(['mssubclass', 'lotarea', 'neighborhood', 'overallqual', 'yearbuilt',
       'yearremodadd', 'masvnrarea', 'exterqual', 'foundation', 'bsmtqual',
       'bsmtfintype1', 'bsmtunfsf', 'totalbsmtsf', 'heatingqc', 'centralair',
       'grlivarea', 'fullbath', 'bedroomabvgr', 'kitchenqual', 'totrmsabvgrd',
       'fireplaces', 'fireplacequ', 'garagetype', 'garageyrblt',
       'garagefinish', 'garagecars', 'garagearea', 'garagequal', 'garagecond',
       'openporchsf'],
      dtype='object')

In [41]:
lm, _ = run_linear_model(housing, 
                         features.columns, 
                         housing.saleprice)

In [77]:
lm = LinearRegression()

In [78]:
htrain, htest, ptrain, ptest = train_test_split(features,
                                                housing.saleprice,
                                                test_size = 0.2)

In [79]:
lm.fit(htrain, ptrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [80]:
lm.score(htest, ptest)

0.6987053601688558

In [81]:
housing.saleprice[1:10]

1    12.109011
2    12.317167
3    11.849398
4    12.429216
5    11.870600
6    12.634603
7    12.206073
8    11.774520
9    11.678440
Name: saleprice, dtype: float64

In [59]:
htest_features = htest_features.loc[:, features.columns]

In [60]:
lm.score(htest, ptest)

0.8036953062490403

In [63]:
pred = lm.predict(htest_features) / 2

In [64]:
pred[1:10]

array([14.84999164, 15.06585999, 15.08500229, 14.83627072, 15.03451169,
       14.95373946, 15.01726031, 14.87148124, 14.84570636])

In [25]:
Submission(htest_id, pred)