# Optimizing Random Forest (Model Selection - 1)

- Read the Best Selected Feature Set
- Optimize the Random Forest with the best Feature Set

In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2. It is kept last
# so that an ImportError here does not prevent the imports above from running.
from sklearn.metrics import plot_confusion_matrix

In [None]:
from datetime import datetime, date
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
import joblib

In [None]:
# Record when the notebook run began; the final cells subtract this from the
# end time to report the total duration.
start_time = datetime.now()
print ('Current Starting Time is : ',start_time)

In [None]:
# function to save classification report
def classification_report_csv(report,classifier_name,ascore):
    """Store the per-class rows of a text classification report in the shared CSV.

    The per-class section of ``report`` (the rows between the header line and
    the blank line that precedes the accuracy/average summary) is parsed into
    one record per class and written to
    ``classification_reports/classification_report.csv``. Rows already saved
    under the same ``classifier_name`` are replaced, so re-running a model
    does not duplicate its entries.

    Parameters
    ----------
    report : str
        Text report as returned by ``sklearn.metrics.classification_report``.
    classifier_name : str
        Value written to the ``classifier`` column.
    ascore : float
        Overall accuracy, repeated on every row in ``accuracy_score``.

    Raises
    ------
    TypeError
        If ``report`` is not a string (e.g. the result of ``print(...)``).
    ValueError
        If no per-class rows are found in ``report``.
    """
    report_path = 'classification_reports/classification_report.csv'

    if not isinstance(report, str):
        raise TypeError(
            'report must be the text returned by classification_report, '
            'got {}'.format(type(report).__name__)
        )

    report_data = []
    # Drop the column-header line; what follows is: blank line, one row per
    # class, blank line, then the accuracy / average summary rows.
    for line in report.strip().split('\n')[1:]:
        if not line.strip():
            if report_data:
                # blank line after the class rows: the summary section starts
                break
            continue

        # Class labels may contain spaces, but the last four fields are always
        # precision, recall, f1-score and support - so split from the right.
        label, precision, recall, f1_score, _support = line.rsplit(None, 4)
        report_data.append({
            'classifier': classifier_name,
            'accuracy_score': ascore,
            'class': label.strip(),
            'precision': float(precision),
            'recall': float(recall),
            'f1_score': float(f1_score),
        })

    if not report_data:
        raise ValueError('no per-class rows found in the classification report')

    dataframe = pd.DataFrame(report_data)

    # The first call would otherwise fail if the output folder is missing.
    os.makedirs(os.path.dirname(report_path), exist_ok=True)

    if os.path.exists(report_path):
        df_cr = pd.read_csv(report_path)
        # replace any rows previously stored for this classifier
        df_cr = df_cr[df_cr['classifier'] != classifier_name]
        dataframe = pd.concat([df_cr, dataframe], ignore_index=True)

    dataframe.to_csv(report_path, index=False)

**reading all the feature set files**

In [None]:
# Load the selected feature set (base + advanced + mean-encoded features)
# from CSV; every model in this notebook is trained on this frame.
df_base_adv_mean = pd.read_csv('input/feature_sets/base_adv_mean.csv')

# 1. Model with Base + Advanced + Mean Features

In [None]:
# Target: the status_group label of every row.
y = df_base_adv_mean['status_group'].values

# Features: everything except the target, the row id and the two
# 'functional needs repair' / 'non functional' columns.
X = df_base_adv_mean.drop(
    columns=['status_group', 'id', 'functional needs repair', 'non functional']
)

# Hold out 21% of the rows for testing (size chosen based on earlier results);
# the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.21,
)

**verifying the results as were during the feature set selection**

In [None]:
# Print the column names, dtypes and non-null counts of the training features.
X_train.info()

**fit the model**

In [None]:
# Baseline: an untuned forest of 101 trees, using all available cores.
rf_base = RandomForestClassifier(n_jobs=-1, n_estimators=101)

# fit() returns the estimator itself, so rf_clf and rf_base refer to the
# same fitted model.
rf_clf = rf_base.fit(X_train, y_train)

# Baseline predictions on the hold-out split.
rf_clf_pred = rf_clf.predict(X_test)

**confusion matrix**

In [None]:
# Plot the confusion matrix of the baseline forest on the hold-out split,
# normalised over the true labels (each row sums to 1).
import matplotlib.pyplot as plt  # `plt` is used below but was not imported anywhere

fig, axes = plt.subplots(figsize=(19,11))

# ConfusionMatrixDisplay.from_estimator is the replacement for
# plot_confusion_matrix, which was removed in scikit-learn 1.2.
matrix = ConfusionMatrixDisplay.from_estimator(
    rf_clf, X_test, y_test,
    cmap=plt.cm.Blues,
    normalize='true',
    ax=axes,
)

# Set the title on the plot's own axes rather than on "the current axes".
axes.set_title('Confusion Matrix')
plt.show()

**classification report**

In [None]:
# Per-class precision / recall / f1 of the baseline forest on the hold-out split.
print(classification_report(y_test, rf_clf_pred))

# Optimizing the Random Forest

### Halving Search CV-1

In [None]:
# Halving search #1: tree depth and split/leaf sizes, scored on accuracy.

# candidate hyper-parameter values
pgrid = dict(
    max_depth=[51, None],
    min_samples_split=[5, 6],
    min_samples_leaf=[1, 2],
)

# stratified 5-fold cross-validation without shuffling
cv_skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# estimator whose hyper-parameters are searched
rfgs = RandomForestClassifier(verbose=1, n_jobs=-1)

# timestamp taken while the search is being configured (printed below)
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# the search object itself; it is fitted in a later cell
cv = HalvingGridSearchCV(
    estimator=rfgs,
    param_grid=pgrid,
    cv=cv_skf,
    scoring='accuracy',
    random_state=0,
    n_jobs=-1,
    verbose=10,
)

# print the date and time
print("date and time =", dt_string)

**executing the halving grid search**

In [None]:
%%time
# Run the search, then persist the fitted search object
# (fit() returns the search object itself, so dumping `cv` is equivalent).
cv.fit(X_train, y_train)
joblib.dump(cv, 'models/HalvingGridSearchCV.pkl')

**display the best params**

In [None]:
# Reload the fitted search from disk and display the winning parameter
# combination (shown as the cell's output value).
loaded_cv = joblib.load('models/HalvingGridSearchCV.pkl')
loaded_cv.best_params_

**get the best estimator model and execute predictions on it, along with the classification score**

In [None]:
# Evaluate the best estimator found by halving search #1 on the hold-out split.
rf_best = loaded_cv.best_estimator_

# get the prediction
rfpred = rf_best.predict(X_test)

# Keep the report text itself in `cr`: print() returns None, so the report
# must be built first and printed afterwards for the CSV export to work.
cr = classification_report(y_test, rfpred)
print(cr)
classification_report_csv(cr,'HalvingGridSearchCV.pkl',accuracy_score(y_test, rfpred))


### Halving Search CV - 2

In [None]:
# Halving search #2: same grid as search #1, but the halving resource is the
# number of trees (up to 1000) and candidates are scored on balanced accuracy.

# candidate hyper-parameter values
pgrid = dict(
    max_depth=[51, None],
    min_samples_split=[5, 6],
    min_samples_leaf=[1, 2],
)

# stratified 5-fold cross-validation without shuffling
cv_skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# estimator whose hyper-parameters are searched
rfgs = RandomForestClassifier(verbose=1, n_jobs=-1)

# timestamp taken while the search is being configured (printed below)
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# the search object itself; it is fitted in a later cell
cv = HalvingGridSearchCV(
    estimator=rfgs,
    param_grid=pgrid,
    cv=cv_skf,
    scoring='balanced_accuracy',
    resource='n_estimators',
    max_resources=1000,
    random_state=0,
    n_jobs=-1,
    verbose=10,
)

# print the date and time
print("date and time =", dt_string)

**executing the halving grid search cv-2**

In [None]:
%%time
# Run the search, then persist the fitted search object
# (fit() returns the search object itself, so dumping `cv` is equivalent).
cv.fit(X_train, y_train)
joblib.dump(cv, 'models/HalvingGridSearchCV_2.pkl')

**display the best params**

In [None]:
# Reload the fitted search from disk and display the winning parameter
# combination (shown as the cell's output value).
loaded_cv = joblib.load('models/HalvingGridSearchCV_2.pkl')
loaded_cv.best_params_

**get the best estimator model and execute predictions on it, along with the classification score**

In [None]:
# Evaluate the best estimator found by halving search #2 on the hold-out split.
rf_best = loaded_cv.best_estimator_
rfpred = rf_best.predict(X_test)

# build the text report once: it is both shown and exported
cr = classification_report(y_test, rfpred)
print(cr)

# store the per-class metrics and the overall accuracy under this search's name
classification_report_csv(
    report=cr,
    classifier_name='HalvingGridSearchCV_2.pkl',
    ascore=accuracy_score(y_test, rfpred),
)

### Halving Search CV - 3

In [None]:
# Halving search #3: the grid covers forest size and split/leaf sizes, while
# max_depth (up to 150) is used as the halving resource; scored on accuracy.

# candidate hyper-parameter values
pgrid = dict(
    n_estimators=[451, 751],
    min_samples_split=[5, 6],
    min_samples_leaf=[1, 2],
)

# stratified 5-fold cross-validation without shuffling
cv_skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# estimator whose hyper-parameters are searched
rfgs = RandomForestClassifier(verbose=1, n_jobs=-1)

# timestamp taken while the search is being configured (printed below)
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# the search object itself; it is fitted in a later cell
cv = HalvingGridSearchCV(
    estimator=rfgs,
    param_grid=pgrid,
    cv=cv_skf,
    scoring='accuracy',
    resource='max_depth',
    max_resources=150,
    random_state=0,
    n_jobs=-1,
    verbose=10,
)

# print the date and time
print("date and time =", dt_string)

**executing the halving grid search cv-3**

In [None]:
%%time
# Run the search, then persist the fitted search object
# (fit() returns the search object itself, so dumping `cv` is equivalent).
cv.fit(X_train, y_train)
joblib.dump(cv, 'models/HalvingGridSearchCV_3.pkl')

**display the best params**

In [None]:
# Reload the fitted search from disk and display the winning parameter
# combination (shown as the cell's output value).
loaded_cv = joblib.load('models/HalvingGridSearchCV_3.pkl')
loaded_cv.best_params_

**get the best estimator model and execute predictions on it, along with the classification score**

In [None]:
# Evaluate the best estimator found by halving search #3 on the hold-out split.
rf_best = loaded_cv.best_estimator_

# get the prediction
rfpred = rf_best.predict(X_test)

# Bind the report to `cr` - not `cv` - so that this search's own report is
# printed and saved, and the search object held in `cv` is not overwritten.
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'HalvingGridSearchCV_3.pkl',accuracy_score(y_test, rfpred))

### Randomized Grid Search -1

In [None]:
# Randomized search #1: samples combinations of depth, feature subset size,
# split/leaf sizes and split criterion; scored on accuracy.

# values each sampled hyper-parameter may take
pgrid = dict(
    max_depth=[51, None],
    max_features=['sqrt', 'log2'],
    min_samples_split=[5, 6],
    min_samples_leaf=[1, 2, 3],
    criterion=['gini', 'entropy'],
)

# stratified 5-fold cross-validation without shuffling
cv_skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# estimator whose hyper-parameters are searched
rfgs = RandomForestClassifier(verbose=1, n_jobs=-1)

# timestamp taken while the search is being configured (printed below)
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# the search object itself; it is fitted in a later cell
cv = RandomizedSearchCV(
    estimator=rfgs,
    param_distributions=pgrid,
    cv=cv_skf,
    scoring='accuracy',
    random_state=0,
    n_jobs=-1,
    verbose=10,
)

# print the date and time
print("date and time =", dt_string)

**executing the randomized grid search cv-1**

In [None]:
%%time
# Run the search, then persist the fitted search object
# (fit() returns the search object itself, so dumping `cv` is equivalent).
cv.fit(X_train, y_train)
joblib.dump(cv, 'models/RandomizedSearchCV.pkl')

**display the best params**

In [None]:
# Reload the fitted search from disk and display the winning parameter
# combination (shown as the cell's output value).
loaded_cv = joblib.load('models/RandomizedSearchCV.pkl')
loaded_cv.best_params_

**get the best estimator model and execute predictions on it, along with the classification score**

In [None]:
# Evaluate the best estimator found by randomized search #1 on the hold-out split.
rf_best = loaded_cv.best_estimator_

# get the prediction
rfpred = rf_best.predict(X_test)

# Bind the report to `cr` - not `cv` - so that this search's own report is
# printed and saved, and the search object held in `cv` is not overwritten.
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'RandomizedSearchCV.pkl',accuracy_score(y_test, rfpred))

### Randomized Grid Search - 2

In [None]:
# Randomized search #2: adds forest size to the sampled parameters and scores
# candidates on balanced accuracy.

# values each sampled hyper-parameter may take
pgrid = dict(
    n_estimators=[451, 751, 1000, 1100],
    max_depth=[51, None],
    min_samples_split=[3, 5],
    min_samples_leaf=[1, 2],
    criterion=['gini', 'entropy'],
)

# stratified 5-fold cross-validation without shuffling
cv_skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# estimator whose hyper-parameters are searched
rfgs = RandomForestClassifier(verbose=1, n_jobs=-1)

# timestamp taken while the search is being configured (printed below)
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# the search object itself; it is fitted in a later cell
cv = RandomizedSearchCV(
    estimator=rfgs,
    param_distributions=pgrid,
    cv=cv_skf,
    scoring='balanced_accuracy',
    random_state=0,
    n_jobs=-1,
    verbose=10,
)

# print the date and time
print("date and time =", dt_string)

**executing the randomized grid search cv-2**

In [None]:
%%time
# Run the search, then persist the fitted search object
# (fit() returns the search object itself, so dumping `cv` is equivalent).
cv.fit(X_train, y_train)
joblib.dump(cv, 'models/RandomizedSearchCV_2.pkl')

**display the best params**

In [None]:
# Reload the fitted search from disk and display the winning parameter
# combination (shown as the cell's output value).
loaded_cv = joblib.load('models/RandomizedSearchCV_2.pkl')
loaded_cv.best_params_

**get the best estimator model and execute predictions on it, along with the classification score**

In [None]:
# Evaluate the best estimator found by randomized search #2 on the hold-out split.
rf_best = loaded_cv.best_estimator_

# get the prediction
rfpred = rf_best.predict(X_test)

# Bind the report to `cr` - not `cv` - so that this search's own report is
# printed and saved, and the search object held in `cv` is not overwritten.
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'RandomizedSearchCV_2.pkl',accuracy_score(y_test, rfpred))

### Grid Search - 1

In [None]:
# Grid search #1: exhaustive search over forest size, bootstrapping, split
# criterion, depth and split/leaf sizes; scored on accuracy.

# candidate hyper-parameter values
pgrid = dict(
    n_estimators=[451, 1001],
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
    max_depth=[21, 51],
    min_samples_split=[2, 3, 5],
    min_samples_leaf=[1, 2, 3],
)

# three stratified random 75% / 25% train/validation splits
cv_ss = StratifiedShuffleSplit(n_splits=3, test_size=.25, train_size=0.75, random_state=0)

# estimator whose hyper-parameters are searched
rfgs = RandomForestClassifier(verbose=1, n_jobs=-1)

# timestamp taken while the search is being configured (printed below)
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# the search object itself; it is fitted in a later cell
cv = GridSearchCV(
    estimator=rfgs,
    param_grid=pgrid,
    cv=cv_ss,
    scoring='accuracy',
    n_jobs=-1,
    verbose=10,
)

# print the date and time
print("date and time =", dt_string)

**execute grid search 1**

In [None]:
%%time
# Run the search, then persist the fitted search object
# (fit() returns the search object itself, so dumping `cv` is equivalent).
cv.fit(X_train, y_train)
joblib.dump(cv, 'models/GridSearchCV.pkl')

**display the best params**

In [None]:
# Reload the fitted search from disk and display the winning parameter
# combination (shown as the cell's output value).
loaded_cv = joblib.load('models/GridSearchCV.pkl')
loaded_cv.best_params_

**get the best estimator model and execute predictions on it, along with the classification score**

In [None]:
# Evaluate the best estimator found by grid search #1 on the hold-out split.
rf_best = loaded_cv.best_estimator_

# get the prediction
rfpred = rf_best.predict(X_test)

# Bind the report to `cr` - not `cv` - so that this search's own report is
# printed and saved, and the search object held in `cv` is not overwritten.
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'GridSearchCV.pkl',accuracy_score(y_test, rfpred))

### Grid Search - 2

In [None]:
# Grid search #2: exhaustive search over forest size, depth, feature subset
# size, split/leaf sizes and bootstrapping; scored on balanced accuracy.

# parameter grid
pgrid = {
    'n_estimators' : [451,751],
    'max_depth' : [51,None],
    # 'auto' is left out: for RandomForestClassifier it is only an alias of
    # 'sqrt' (deprecated in scikit-learn 1.1, rejected from 1.3 on), so it
    # would re-fit a third of the grid for no new information.
    'max_features' : ['sqrt','log2'],
    'min_samples_split' : [5,6],
    'min_samples_leaf' : [1,2,3],
    'bootstrap': [True, False]
}

# specifying the cv
cv_skf = StratifiedKFold(n_splits=3, random_state=None, shuffle=False) # need to change to 3 splits based on results

# specifying the model
rfgs = RandomForestClassifier(n_jobs=-1, verbose=1)

# keep track of the date and time
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# specify the grid search cv; it is fitted in a later cell
cv = GridSearchCV(estimator=rfgs, param_grid=pgrid, cv=cv_skf, n_jobs=-1, verbose=10, scoring='balanced_accuracy')

# print the date and time
print("date and time =", dt_string)

**execute the grid search - 2**

In [None]:
%%time
# Run the search, then persist the fitted search object
# (fit() returns the search object itself, so dumping `cv` is equivalent).
cv.fit(X_train, y_train)
joblib.dump(cv, 'models/GridSearchCV_2.pkl')

**display the best params**

In [None]:
# Reload the fitted search from disk and display the winning parameter
# combination (shown as the cell's output value).
loaded_cv = joblib.load('models/GridSearchCV_2.pkl')
loaded_cv.best_params_

**get the best estimator model and execute predictions on it, along with the classification score**

In [None]:
# Evaluate the best estimator found by grid search #2 on the hold-out split.
rf_best = loaded_cv.best_estimator_

# get the prediction
rfpred = rf_best.predict(X_test)

# Bind the report to `cr` - not `cv` - so that this search's own report is
# printed and saved, and the search object held in `cv` is not overwritten.
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'GridSearchCV_2.pkl',accuracy_score(y_test, rfpred))

# Showing the Results - Sorted by Accuracy 

In [None]:
# Load the accumulated report and show one row per classifier / accuracy pair,
# lowest accuracy first (displayed as the cell's output value).
df_cr = pd.read_csv('classification_reports/classification_report.csv')
(
    df_cr[['classifier', 'accuracy_score']]
    .drop_duplicates()
    .sort_values(by='accuracy_score')
)

In [None]:
# Capture the finish time and the total duration of the notebook run.
end_time = datetime.now()
# this is the end of the run, so label it as such
print ('Current Ending Time is : ',end_time)
c = end_time - start_time

In [None]:
# total_seconds() covers the whole timedelta; `.seconds` alone leaves out
# whole days, which would under-report runs longer than 24 hours.
print('The execution ended after {} minutes'.format(c.total_seconds()/60))