# Random Forest with Hyper Tuning

## Use hyper parameter tuning for Random Forest (RF) and GridSearchCV to try out different combinations of hyper parameters and get the best model/parameter combo and score. 


### Import the necessary packages

In [1]:
import pyodbc 
import pandas as pd
import numpy as np


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

### Fetch data from sql server that will be used for Training

In [2]:
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=LAPTOP-HVRMUNPF;'
                      'PORT=1433;'
                      'Database=RENTERS_STP;'
                      'Trusted_Connection=yes;'
                      )

query = 'SELECT  * from [RENTERS_STP].[dbo].[DR_DetailedRequest_classification]'

    
df = pd.read_sql(query, conn)


### Get the 'features' and 'labels' from the dataset. Split the dataset to training and testing datasets


In [3]:
y=df['PREDICTION_VALUE_Y_BOOL']
X = pd.get_dummies(df.drop(['PREDICTION_VALUE_Y_BOOL'], axis=1)).fillna(0)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state=42, stratify=y)

SEED=1

### Setup the dictionary for Hyper parameters

In [4]:
# Define the dictionary 'params_rf'
params_rf = {'n_estimators':[100, 350, 500],
            'max_features':['log2', 'auto', 'sqrt'],
            'min_samples_leaf':[2, 10, 30]
}



### Instantiate the classifiers

In [5]:
# Instantiate rf
rf = RandomForestRegressor(n_estimators=2,
            random_state=2)

# Instantiate grid_rf
grid_rf = GridSearchCV(estimator=rf,
                       param_grid=params_rf,
                       scoring='neg_mean_squared_error',
                       cv=3,
                       verbose=1,
                       n_jobs=-1)

### Fit and train the model

In [6]:
# Fit sgbr to the training set
grid_rf.fit(X_train,y_train)


Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  7.8min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=2, n_jobs=None,
                                             oob_score=False, random_state=2,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=-1

### Model Evaluation

In [7]:
best_model = grid_rf.best_estimator_

# Predict test set labels
y_pred = best_model.predict(X_test)

# Compute rmse_test
rmse_test = MSE(y_test,y_pred) ** (1/2)

# Print rmse_test
print('Test RMSE of best model: {:.6f}'.format(rmse_test)) 

Test RMSE of best model: 0.086020


### Display best parameters from grid search

In [8]:
best_model = grid_rf.best_estimator_
print('Best parameters: {}'.format(grid_rf.best_params_))

Best parameters: {'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 350}
