# Random Forest Regression Model

In [3]:
# Importing libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

In [4]:
# Setting random seed for reproducibility
np.random.seed(42)

In [5]:
# Creating pipeline for Random Forest Regressor

# Scales the data and then applies the Random Forest model
pipeline = Pipeline([('scaler', StandardScaler()), ('rf', RandomForestRegressor(random_state=42))])

In [6]:
# Creating parameter grid for hyperparameter tuning
param_grid ={'rf__n_estimators': [100, 200, 300],
                'rf__max_depth': [None, 10, 20],
                'rf__max_features': ['sqrt', 'log2', 1.0],
                'rf__min_samples_split': [2, 5, 10]}


In [7]:
# Performing Grid Search with Cross-Validation

# Using 5 fold cross-validation to find the best hyperparameters

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, 
                           cv=5, n_jobs=-1, scoring='r2', verbose=3)

# Importing training data
X_train = pd.read_csv('../data/train_set_X.csv')
y_train = pd.read_csv('../data/train_set_y.csv').values.ravel() # Ensuring y is in the correct shape

# Fitting the model
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV 5/5] END rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_split=2, rf__n_estimators=100;, score=-0.104 total time=   0.1s
[CV 4/5] END rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_split=2, rf__n_estimators=100;, score=-0.049 total time=   0.2s
[CV 2/5] END rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_split=2, rf__n_estimators=100;, score=-0.118 total time=   0.2s
[CV 3/5] END rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_split=2, rf__n_estimators=100;, score=-0.082 total time=   0.2s
[CV 1/5] END rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_split=2, rf__n_estimators=100;, score=-0.103 total time=   0.2s
[CV 1/5] END rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_split=2, rf__n_estimators=200;, score=-0.092 total time=   0.3s
[CV 3/5] END rf__max_depth=None, rf__max_features=sqrt, rf__min_samples_split=2, rf__n_estimators=200;, score=-0.090 total

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'rf__max_depth': [None, 10, ...], 'rf__max_features': ['sqrt', 'log2', ...], 'rf__min_samples_split': [2, 5, ...], 'rf__n_estimators': [100, 200, ...]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [8]:
# Finding the best parameters and model
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

optimal_model = grid_search.best_estimator_
print("Optimized model:", grid_search.best_estimator_)


Best Parameters: {'rf__max_depth': 10, 'rf__max_features': 'sqrt', 'rf__min_samples_split': 10, 'rf__n_estimators': 300}
Optimized model: Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=10, max_features='sqrt',
                                       min_samples_split=10, n_estimators=300,
                                       random_state=42))])


In [None]:
# Printing the best score
grid_search.best_score_

np.float64(-0.05016460483482486)