In [36]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import math
#model import 
from sklearn.ensemble import RandomForestRegressor

#splitting and scaling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#parameter tuning
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn import linear_model, metrics
from sklearn.metrics import (mean_squared_error, r2_score, mean_absolute_error, 
mean_squared_log_error, explained_variance_score, max_error)
from sklearn.model_selection import LeaveOneOut, cross_val_score, cross_val_predict

import warnings
# We silence warnings concerning future version updates
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [37]:
# Load training and testing data
X_train = np.loadtxt("X_train.csv", delimiter=',', skiprows=1)
X_test = np.loadtxt("X_test.csv", delimiter=',', skiprows=1)
y_train = np.loadtxt("y_train.csv", delimiter=',', skiprows=1)[:,1]

In [38]:
#scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [49]:
def saveFile(y_pred,name):
    test_header = "Id,PRP"
    n_points = X_test.shape[0]
    y_pred_pp = np.ones((n_points, 2))
    y_pred_pp[:, 0] = range(n_points)
    y_pred_pp[:, 1] = y_pred
    np.savetxt(name, y_pred_pp, fmt='%d,%f', delimiter=",",
               header=test_header, comments="")

In [40]:
#split the dataset for training
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

#scaling split training dataset
scaler = StandardScaler()
X_train1_scaled = scaler.fit_transform(X_train1)
X_test1_scaled = scaler.fit_transform(X_test1)

In [50]:
# Defining basic untuned random forest regression model

def basicrfr(X_train1, X_test1, y_train1, y_test1):
    rfr = RandomForestRegressor(random_state = 0)
    rfr.fit(X_train1, y_train1)
    y_pred = rfr.predict(X_test1)
    
    print("R2 score: " + str(rfr.score(X_train1, y_train1)))
    print("Explained variance: " + str(explained_variance_score(y_test1, y_pred)))
    print("Max error: " + str(max_error(y_test1, y_pred)))
    print("Mean absolute error: " + str(mean_absolute_error(y_test1, y_pred)))
    print("Root mean squared error: " + str(math.sqrt(mean_squared_error(y_test1, y_pred))))

In [44]:
# using standard training data
basicrfr(X_train1, X_test1, y_train1, y_test1)

R2 score: 0.9616061630082258
Explained variance: 0.8979307222226864
Max error: 112.19999999999999
Mean absolute error: 22.09493464052288
Root mean squared error: 35.677440209798505


In [45]:
# using scaled training data
basicrfr(X_train1_scaled, X_test1_scaled, y_train1, y_test1)

R2 score: 0.960288631845655
Explained variance: 0.7935244048163663
Max error: 192.3
Mean absolute error: 31.881437908496736
Root mean squared error: 50.97707305585569


In [46]:
# Further Tuning with Grid Search
rfr = RandomForestRegressor()
# Look at parameters used by our regression
print('Parameters currently in use:\n')
print(rfr.get_params())


#Creating the parameter grid

param_grid = [
   {'max_depth': [None, 3, 5, 7],
    'oob_score' : [True, False],
    'min_samples_leaf':[1, 2], 
    'min_samples_split': [2, 3, 4],
    'n_estimators': [100, 500, 1000, 2500, 5000],
    'warm_start': [True, False]}
]

Parameters currently in use:

{'bootstrap': True, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 'warn', 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [48]:
#fit the model with the parameter
rfr = GridSearchCV(RandomForestRegressor(), param_grid = param_grid, cv = 10, verbose=True, n_jobs=-1)
# Fit the random search model
best_reg= rfr.fit(X_train1_scaled, y_train1)


best_reg.best_params_

Fitting 10 folds for each of 480 candidates, totalling 4800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   50.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 21.9min
[Parallel(n_jobs=-1)]: Done 4800 out of 4800 | elapsed: 26.4min finished


{'max_depth': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 100,
 'oob_score': True,
 'warm_start': False}

In [51]:
def tunedrfr(X_train1, X_test1, y_train1, y_test1):
    rfr = RandomForestRegressor(max_depth = 5, min_samples_leaf = 2, min_samples_split = 4, n_estimators = 100, oob_score = True, warm_start = False, random_state = 0)
    rfr.fit(X_train1, y_train1)
    y_pred = rfr.predict(X_test1)
    
    print("R2 score: " + str(rfr.score(X_train1, y_train1)))
    print("Explained variance: " + str(explained_variance_score(y_test1, y_pred)))
    print("Max error: " + str(max_error(y_test1, y_pred)))
    print("Mean absolute error: " + str(mean_absolute_error(y_test1, y_pred)))
    print("Root mean squared error: " + str(math.sqrt(mean_squared_error(y_test1, y_pred))))

In [52]:
tunedrfr(X_train1_scaled, X_test1_scaled, y_train1, y_test1)

R2 score: 0.9276144927762131
Explained variance: 0.8671487422024539
Max error: 126.69923370927319
Mean absolute error: 29.62469746651357
Root mean squared error: 42.38705061801293


In [20]:
#run our final prediction
rfr_tuned = RandomForestRegressor(max_depth = 5, min_samples_leaf = 2, min_samples_split = 4, n_estimators = 100, oob_score = True, warm_start = False, random_state = 0)
rfr_tuned.fit(X_train_scaled, y_train)
y_pred = rfr_tuned.predict(X_test_scaled)
saveFile(y_pred, "randomForest_submission.csv")