In [1]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import global_variables as g_vars
import global_functions as g_funcs

In [2]:
# Raise pandas' display limits so up to 1000 columns and 1000 rows are shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 1000)

In [3]:
# Read the training and test CSVs from the paths configured in global_variables.
raw_df = pd.read_csv(g_vars.train_path)
test_raw_df = pd.read_csv(g_vars.test_path)

# handle_missing_values is called only for its effect on the frame it receives;
# its return value is not used.
# NOTE(review): presumably it fills or drops missing values in place - confirm.
for frame in (raw_df, test_raw_df):
    g_funcs.handle_missing_values(frame)

In [4]:
# Model the natural log of the sale price. The column is overwritten, so
# re-running this cell without reloading raw_df applies the log a second time.
raw_df['SalePrice'] = np.log(raw_df['SalePrice'])

In [5]:
# Add a combined area feature to both the training and the test frame:
# living area + basement + garage + enclosed porch + screen porch.
for frame in (raw_df, test_raw_df):
    frame['TotalSF'] = (
        frame['GrLivArea']
        + frame['TotalBsmtSF']
        + frame['GarageArea']
        + frame['EnclosedPorch']
        + frame['ScreenPorch']
    )

In [6]:
# One-hot encode both frames with pandas' default dummy encoding.
one_hot_encoded_df = pd.get_dummies(data=raw_df)
test_one_hot_encoded_df = pd.get_dummies(data=test_raw_df)

# Columns that exist in the training encoding but not in the test encoding.
# SalePrice is taken out of this list (a KeyError is raised if it is not in it)
# and removed from the features separately below.
test_missing_features = one_hot_encoded_df.columns.difference(test_one_hot_encoded_df.columns)
test_missing_features = test_missing_features.drop('SalePrice')

# Keep only features the test encoding also has, then remove the target column.
one_hot_encoded_df = (
    one_hot_encoded_df
    .drop(columns=test_missing_features)
    .drop(columns='SalePrice')
)

# Hold out a validation split; the target is the SalePrice column of raw_df.
X_train, X_valid, y_train, y_valid = train_test_split(
    one_hot_encoded_df,
    raw_df.SalePrice,
    random_state=50,
)

In [7]:
def fit_and_score_rf(rf):
    """Fit ``rf`` on the training split and print its validation RMSE.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid`` rather than taking them as arguments.

    Parameters
    ----------
    rf : estimator
        Object exposing ``fit(X=..., y=...)`` and ``predict(X)``.

    Returns
    -------
    The same ``rf`` object, after fitting.
    """
    rf.fit(X=X_train, y=y_train)
    valid_mse = metrics.mean_squared_error(y_valid, rf.predict(X_valid))
    valid_rmse = np.sqrt(valid_mse)
    print(valid_rmse)
    return rf

In [8]:
# Baseline: an 80-tree forest with otherwise default settings.
# fit_and_score_rf prints the validation RMSE and returns the fitted model.
rf = fit_and_score_rf(RandomForestRegressor(n_estimators=80))

0.13205599362580575


In [9]:
# One row of validation predictions per tree of the fitted forest.
per_tree_preds = [tree.predict(X_valid) for tree in rf.estimators_]
preds = np.stack(per_tree_preds)

# For the first validation sample: every tree's prediction, their mean, and the true target.
first_sample_preds = preds[:, 0]
first_sample_preds, np.mean(first_sample_preds), y_valid.iloc[0]

In [None]:
# Validation RMSE as a function of the number of trees in the ensemble.
#
# A random forest regressor predicts the mean of its trees' predictions, so the
# score of the first k trees of the already-fitted `rf` is obtained by averaging
# the first k rows of `preds` (one row per tree, built in the previous cell).
# This replaces refitting a separate forest for every size, which trained
# 1 + 2 + ... + 100 trees and rebound `rf` on every iteration, leaving `preds`
# out of sync with `rf`.
#
# preds_score[i] is the RMSE obtained with i + 1 trees.
preds_score = [
    np.sqrt(metrics.mean_squared_error(y_valid, preds[:n_trees].mean(axis=0)))
    for n_trees in range(1, len(preds) + 1)
]

In [11]:
# preds_score[i] is the validation RMSE obtained with i + 1 trees, so plot it
# against 1..N instead of the default 0-based positions.
n_trees_axis = np.arange(1, len(preds_score) + 1)

fig, ax = plt.subplots()
ax.plot(n_trees_axis, preds_score)
ax.set(
    title='Validation RMSE vs. number of trees',
    xlabel='Number of trees',
    ylabel='RMSE of log(SalePrice)',
);

In [12]:
# Smaller forest: 15 trees, with max_features=0.4 (a fraction of the features
# is considered at each split). Prints the validation RMSE.
rf = fit_and_score_rf(
    RandomForestRegressor(
        n_estimators=15,
        max_features=0.4,
    )
)

0.13187876876436608


In [13]:
# Pair every feature column with the importance reported by the fitted forest,
# most important first.
feat_importance = pd.DataFrame({
    'cols': one_hot_encoded_df.columns,
    'imp': rf.feature_importances_,
})
feat_importance = feat_importance.sort_values(by='imp', ascending=False)

In [15]:
# Ten most important features.
feat_importance.head(10)

Unnamed: 0,cols,imp
37,TotalSF,0.312394
4,OverallQual,0.161118
16,GrLivArea,0.083432
161,ExterQual_TA,0.051595
6,YearBuilt,0.046738
7,YearRemodAdd,0.038282
19,FullBath,0.033283
12,TotalBsmtSF,0.023182
26,GarageCars,0.022867
3,LotArea,0.014358
