In [1]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression, LassoCV, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, mean_squared_error
from modelling_functions import *

## Processing for Models

In [40]:
#### Read Data files
housing, housing_features, feat_labels, dict_dictonary = read_and_clean(filepath = "../data/train_selected_features.csv")

  housing.col, id_dictonary = to_numeric(housing, col, 'SalePrice')


In [23]:
test = pd.read_csv('../data/test_selected_features.csv')

In [43]:
htest_id, htest_features, htest_labels, htest_dictonary = read_and_clean(filepath = "../data/test_selected_features.csv", test = True, dictonary = dict_dictonary)

In [44]:
#housing_features.lotarea = np.sqrt(housing_features.lotarea)
#htest_features.lotarea = np.sqrt(htest_features.lotarea)

print(htest_features.lotarea[1:5], housing_features.lotarea[1:5])

1    119.444548
2    117.601020
3     99.889939
4     70.746025
Name: lotarea, dtype: float64 1     97.979590
2    106.066017
3     97.724101
4    119.415242
Name: lotarea, dtype: float64


In [45]:
htrain, htest, ptrain, ptest = train_test_split(housing_features, housing.saleprice, test_size = 0.33)

## RandomForestRegressor

In [36]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 10000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 200, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20, 30, 40]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8, 16, 32, 64]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [200, 1288, 2377, 3466, 4555, 5644, 6733, 7822, 8911, 10000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 29, 48, 67, 86, 105, 124, 143, 162, 181, 200, None], 'min_samples_split': [2, 5, 10, 15, 20, 30, 40], 'min_samples_leaf': [1, 2, 4, 8, 16, 32, 64], 'bootstrap': [True, False]}


In [37]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=0, n_jobs = -1)
# Fit the random search model
rf_random.fit(htrain, ptrain)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [9]:
rf_random.best_params_

{'n_estimators': 4555,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 162,
 'bootstrap': False}

In [27]:
clf = RandomForestRegressor(n_estimators=4555, 
                            random_state=9, 
                            n_jobs=-1, 
                            min_samples_split = 2, 
                            min_samples_leaf = 1, 
                            max_features = 'sqrt', 
                            max_depth = 162,
                            bootstrap = False)
#sfm = SelectFromModel(clf, threshold = 0.01)
#sfm.fit(htrain, ptrain)
# ## Not Finished
# # Measure Feature Importance
#feature_selected = []
#for feature_list_index in sfm.get_support(indices=True):
#     feature_selected.append(feat_labels[feature_list_index])
#proxy = feature_selected
#trimmed = ['x1stflrsf', 'x2ndflrsf', 'garagecars', 'overallcond', 'saleprice', 'Unnamed: 0', 'bsmtfinsf1']
#testing = testing[proxy]
#housing_features = housing_features[trimmed_features]
#print(feature_selected)

In [28]:
clf.fit(htrain, ptrain)

clf.score(htest, ptest)

0.8581040577145225

In [29]:
nice = clf.predict(htest)
mean_squared_error(nice, ptest)

0.019224582188331164

In [34]:
pred = np.exp(clf.predict(htest_features))
Submission(htest_id, pred)

## Lasso

In [46]:
alp = optimize_penalty(htrain, ptrain, model=Lasso, min_=1e-5, max_=1, step_=1000, random=False, riter=100)

In [47]:
alp.best_params_

{'alpha': 1e-05}

In [51]:
lasso = Lasso()
lasso.set_params(alpha = 1e-5)
lasso.fit(htrain, ptrain)

print(lasso.score(htrain, ptrain))
print(lasso.score(htest, ptest))
#print(selected_coefs)
pred = lasso.predict(htest)

0.9129123753258224
0.8850140943081866


In [52]:
mean_squared_error(pred, ptest)

0.01664395756367181

In [53]:
pred = np.exp(lasso.predict(htest_features))

In [54]:
pred[1:10]

array([159423.58250598, 160661.46119967, 184103.46777951, 205526.73314711,
       161548.60632505, 177708.52830291, 158505.45321712, 196103.79276287,
       111260.77050211])

In [55]:
Submission(htest_id, pred)