# Optimizing Random Forest 2

based on the result of Optimizing Random Forest - 1, we will re-tune some parameters on the observations of best_param_ property

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
import joblib

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

from sklearn.metrics import plot_confusion_matrix

In [2]:
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
import joblib
import os

In [3]:
# function to save classification report
def classification_report_csv(report,classifier_name,ascore):
    report_data = []
    counter=0
    lines = report.split('\n')
    
    for line in lines[2:-5]:
        row = {}
        row_data = line.split('      ')
        
        
        row['classifier'] = classifier_name
        row['accuracy_score'] = ascore
        
        if counter==0:
            row['class'] = row_data[2].strip()
            row['precision'] = float(row_data[3].strip())
            row['recall'] = float(row_data[4].strip())
            row['f1_score'] = float(row_data[5].strip())
            row['accuracy_score'] = ascore
        elif counter==1:
            row['class'] = row_data[0].strip()
            row['precision'] = float(row_data[1].strip())
            row['recall'] = float(row_data[2].strip())
            row['f1_score'] = float(row_data[3].strip())
            row['accuracy_score'] = ascore
        elif counter==2:
            row['class'] = row_data[1].strip()
            row['precision'] = float(row_data[2].strip())
            row['recall'] = float(row_data[3].strip())
            row['f1_score'] = float(row_data[4].strip())
        
        report_data.append(row)
        
        counter+=1
        
    dataframe = pd.DataFrame.from_dict(report_data)
        
    if os.path.exists('classification_reports/classification_report.csv'):
        df_cr = pd.read_csv('classification_reports/classification_report.csv')
                
        t = df_cr[df_cr['classifier']==classifier_name].index
        if len(t)>0:
            df_cr.drop(t, inplace=True)
        
        df_cr = pd.concat([df_cr,dataframe])
        df_cr.to_csv('classification_reports/classification_report.csv', index = False)
    else:
        dataframe.to_csv('classification_reports/classification_report.csv', index = False)
    

**reading all the feature set files**

In [4]:
# base feature set with advanced and mean encoded features
df_base_adv_mean = pd.read_csv('input/feature_sets/base_adv_mean.csv')

# 1. Model with Base + Advanced + Mean Features

In [5]:
X = df_base_adv_mean.drop(['status_group','id','functional needs repair','non functional'], axis=1)
y = df_base_adv_mean['status_group'].values

# i have changed below to test size .21 based on results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.21, random_state=42)

### Halving Grid Search CV - 1 (re-test)

In [6]:
# parameter grid
pgrid = {    
    'max_depth' : [101,151],    
    'min_samples_split' : [3,4],
    'min_samples_leaf' : [2],    
}

# specifying the cv
cv_skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# specifying the model 
rfgs = RandomForestClassifier(n_jobs=-1, verbose=1)

# keep track of the date and time
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")


# specify the grid search cv
cv = HalvingGridSearchCV(estimator=rfgs,param_grid=pgrid,cv=cv_skf,n_jobs=-1,verbose=10, scoring='accuracy',random_state=0)

# pring the date and time 
print("date and time =", dt_string)

date and time = 03/09/2021 02:10:23


**execution of halving grid search cv**

In [7]:
%%time
joblib.dump(cv.fit(X_train,y_train),'models/halving_grid_search_cv_1_ms2.pkl')

fd is '5'
fd is '5'
fd is '5'
n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 15642
max_resources_: 46926
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 4
n_resources: 15642
Fitting 5 folds for each of 4 candidates, totalling 20 fits
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.0s finished


Wall time: 37.9 s


['models/halving_grid_search_cv_1_ms2.pkl']

**displaying the best param**

In [8]:
loaded_cv = joblib.load('models/halving_grid_search_cv_1_ms2.pkl')
loaded_cv.best_params_

{'max_depth': 151, 'min_samples_leaf': 2, 'min_samples_split': 3}

**get the best estimator and classification report**

In [9]:
rf_best = loaded_cv.best_estimator_
rf_best.fit(X_train,y_train)

# get the prediction
rfpred = rf_best.predict(X_test)

# print classification report
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'halving_grid_search_cv_1_ms2.pkl',accuracy_score(y_test, rfpred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


                         precision    recall  f1-score   support

             functional       0.81      0.90      0.85      6790
functional needs repair       0.57      0.33      0.42       885
         non functional       0.85      0.79      0.82      4799

               accuracy                           0.82     12474
              macro avg       0.75      0.67      0.70     12474
           weighted avg       0.81      0.82      0.81     12474



### Halving Grid Search CV 2

In [10]:
# parameter grid
pgrid = {    
    'max_depth' : [101,111,121],    
    'min_samples_split' : [3,4],
    'min_samples_leaf' : [1,2],    
}

# specifying the cv
cv_skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# specifying the model 
rfgs = RandomForestClassifier(n_jobs=-1, verbose=1)

# keep track of the date and time
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")


# specify the grid search cv
cv = HalvingGridSearchCV(
    estimator=rfgs,param_grid=pgrid,cv=cv_skf,n_jobs=-1,verbose=10,scoring='balanced_accuracy',random_state=0,
    resource='n_estimators',max_resources=1100)

# pring the date and time 
print("date and time =", dt_string)

date and time = 03/09/2021 02:11:05


**execution**

In [11]:
%%time
joblib.dump(cv.fit(X_train,y_train),'models/halving_grid_search_cv_2_ms2.pkl') # based on halving_grid_search_cv_2_2_ms2.pkl

fd is '5'
fd is '5'
fd is '5'
fd is '5'
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 122
max_resources_: 1100
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 122
Fitting 5 folds for each of 12 candidates, totalling 60 fits
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 1098 out of 1098 | elapsed:   23.9s finished


Wall time: 8min 50s


['models/halving_grid_search_cv_2_ms2.pkl']

**displaying the best params**

In [12]:
loaded_cv = joblib.load('models/halving_grid_search_cv_2_ms2.pkl')
loaded_cv.best_params_

{'max_depth': 111,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 1098}

**get the best estimator and classification report**

In [13]:
rf_best = loaded_cv.best_estimator_
rf_best.fit(X_train,y_train)

# get the prediction
rfpred = rf_best.predict(X_test)

# print classification report
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'halving_grid_search_cv_2_ms2.pkl',accuracy_score(y_test, rfpred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 1098 out of 1098 | elapsed:   24.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 1098 out of 1098 | elapsed:    1.4s finished


                         precision    recall  f1-score   support

             functional       0.82      0.88      0.85      6790
functional needs repair       0.51      0.37      0.43       885
         non functional       0.84      0.79      0.82      4799

               accuracy                           0.81     12474
              macro avg       0.72      0.68      0.70     12474
           weighted avg       0.81      0.81      0.81     12474



### Halving Grid Search CV - 3

In [14]:
# parameter grid
pgrid = {    
    'n_estimators':[751,1000,1100],
    'min_samples_split' : [3,4],
    'min_samples_leaf' : [1,2],    
}

# specifying the cv
cv_skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# specifying the model 
rfgs = RandomForestClassifier(n_jobs=-1, verbose=1)

# keep track of the date and time
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")


# specify the grid search cv
cv = HalvingGridSearchCV(
    estimator=rfgs,param_grid=pgrid,cv=cv_skf,n_jobs=-1,verbose=10,scoring='accuracy',random_state=0,
    resource='max_depth',max_resources=300)

# pring the date and time 
print("date and time =", dt_string)

date and time = 03/09/2021 02:20:27


**execution**

In [15]:
%%time
joblib.dump(cv.fit(X_train,y_train),'models/halving_grid_search_cv_3_ms2.pkl') # based on halving_grid_search_cv_2_ms2.pkl

fd is '5'
fd is '5'
fd is '5'
fd is '5'
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 33
max_resources_: 300
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 33
Fitting 5 folds for each of 12 candidates, totalling 60 fits
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   20.2s finished


Wall time: 25min 49s


['models/halving_grid_search_cv_3_ms2.pkl']

**best params**

In [16]:
loaded_cv = joblib.load('models/halving_grid_search_cv_3_ms2.pkl')
loaded_cv.best_params_

{'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 1000,
 'max_depth': 297}

**classification report**

In [17]:
rf_best = loaded_cv.best_estimator_
rf_best.fit(X_train,y_train)

# get the prediction
rfpred = rf_best.predict(X_test)

# print classification report
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'halving_grid_search_cv_3_ms2.pkl',accuracy_score(y_test, rfpred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   19.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.7s finished


                         precision    recall  f1-score   support

             functional       0.81      0.90      0.85      6790
functional needs repair       0.57      0.33      0.42       885
         non functional       0.85      0.78      0.82      4799

               accuracy                           0.81     12474
              macro avg       0.74      0.67      0.70     12474
           weighted avg       0.81      0.81      0.81     12474



### Randomized Grid Search - 1

In [18]:
# parameter grid
pgrid = {    
    'max_depth' : [51,100,200,300],    
    'max_features' : ['sqrt'],
    'min_samples_split' : [3,4],
    'min_samples_leaf' : [1,2,3],
    'criterion' : ['gini','entropy']    
}

# specifying the cv
cv_skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# specifying the model 
rfgs = RandomForestClassifier(n_jobs=-1, verbose=1)

# keep track of the date and time
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# specify the grid search cv
cv = RandomizedSearchCV(estimator=rfgs,param_distributions=pgrid,cv=cv_skf,n_jobs=-1, 
                        verbose=10, scoring='accuracy',random_state=0)

# pring the date and time 
print("date and time =", dt_string)

date and time = 03/09/2021 02:46:43


**execution**

In [19]:
%%time
joblib.dump(cv.fit(X_train,y_train),'models/randomized_grid_search_cv_1_ms2.pkl')

fd is '5'
fd is '5'
fd is '5'
fd is '5'
Fitting 5 folds for each of 10 candidates, totalling 50 fits
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.3s finished


Wall time: 1min 28s


['models/randomized_grid_search_cv_1_ms2.pkl']

**best params**

In [20]:
loaded_cv = joblib.load('models/randomized_grid_search_cv_1_ms2.pkl')
loaded_cv.best_params_

{'min_samples_split': 3,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 51,
 'criterion': 'entropy'}

**classification report**

In [21]:
rf_best = loaded_cv.best_estimator_
rf_best.fit(X_train,y_train)

# get the prediction
rfpred = rf_best.predict(X_test)

# print classification report
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'randomized_grid_search_cv_1_ms2.pkl',accuracy_score(y_test, rfpred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


                         precision    recall  f1-score   support

             functional       0.81      0.90      0.85      6790
functional needs repair       0.57      0.34      0.42       885
         non functional       0.85      0.78      0.82      4799

               accuracy                           0.81     12474
              macro avg       0.74      0.67      0.70     12474
           weighted avg       0.81      0.81      0.81     12474



### Randomized Grid Search - 2

In [22]:
# parameter grid
pgrid = {    
    'n_estimators': [1000,1100],
    'max_depth' : [51,101,151],    
    'min_samples_split' : [3],
    'min_samples_leaf' : [1,2],
    'criterion' : ['gini','entropy']  
}

# specifying the cv
cv_skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# specifying the model 
rfgs = RandomForestClassifier(n_jobs=-1, verbose=1)

# keep track of the date and time
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# specify the grid search cv
cv = RandomizedSearchCV(estimator=rfgs,param_distributions=pgrid,cv=cv_skf,n_jobs=-1,verbose=10,
                        scoring='balanced_accuracy',random_state=0)

# pring the date and time 
print("date and time =", dt_string)

date and time = 03/09/2021 02:48:16


**execution**

In [23]:
%%time
joblib.dump(cv.fit(X_train,y_train),'models/randomized_grid_search_cv_2_ms2.pkl')

fd is '5'
fd is '5'
fd is '5'
fd is '5'
Fitting 5 folds for each of 10 candidates, totalling 50 fits
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   20.9s finished


Wall time: 15min 42s


['models/randomized_grid_search_cv_2_ms2.pkl']

**best params**

In [24]:
loaded_cv = joblib.load('models/randomized_grid_search_cv_2_ms2.pkl')
loaded_cv.best_params_

{'n_estimators': 1000,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_depth': 151,
 'criterion': 'gini'}

**classification report**

In [25]:
rf_best = loaded_cv.best_estimator_
rf_best.fit(X_train,y_train)

# get the prediction
rfpred = rf_best.predict(X_test)

# print classification report
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'randomized_grid_search_cv_2_ms2.pkl',accuracy_score(y_test, rfpred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   21.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    1.2s finished


                         precision    recall  f1-score   support

             functional       0.82      0.88      0.85      6790
functional needs repair       0.52      0.36      0.43       885
         non functional       0.84      0.79      0.82      4799

               accuracy                           0.81     12474
              macro avg       0.73      0.68      0.70     12474
           weighted avg       0.81      0.81      0.81     12474



### Grid Search - 1

In [26]:
# parameter grid
pgrid = {
    'n_estimators'      : [451,1001],
    'bootstrap'         : [True,False],
    'criterion'         : ['gini','entropy'],
    'max_depth'         : [51,101,151],        
    'min_samples_split' : [2,3],
    'min_samples_leaf'  : [1,2]
}

# specifying the cv
cv_ss = StratifiedShuffleSplit(n_splits=3, train_size=0.75, test_size=.25,random_state=0)

# specifying the model 
rfgs = RandomForestClassifier(n_jobs=-1, verbose=1)

# keep track of the date and time
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# specify the grid search cv
cv = GridSearchCV(estimator=rfgs, param_grid=pgrid, cv=cv_ss, n_jobs=-1, verbose=10, scoring='accuracy')

# pring the date and time 
print("date and time =", dt_string)

date and time = 03/09/2021 03:04:27


**execution**

In [27]:
%%time
joblib.dump(cv.fit(X_train,y_train),'models/grid_search_cv_1_ms2.pkl')

fd is '5'
fd is '5'
fd is '5'
fd is '5'
Fitting 3 folds for each of 96 candidates, totalling 288 fits
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5

fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 451 out of 451 | elapsed:   10.1s finished


Wall time: 1h 8min 37s


['models/grid_search_cv_1_ms2.pkl']

**best params**

In [28]:
loaded_cv = joblib.load('models/grid_search_cv_1_ms2.pkl')
loaded_cv.best_params_

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 51,
 'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 451}

**classification report**

In [29]:
rf_best = loaded_cv.best_estimator_
rf_best.fit(X_train,y_train)

# get the prediction
rfpred = rf_best.predict(X_test)

# print classification report
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'grid_search_cv_1_ms2.pkl',accuracy_score(y_test, rfpred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 451 out of 451 | elapsed:    9.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 451 out of 451 | elapsed:    0.3s finished


                         precision    recall  f1-score   support

             functional       0.81      0.90      0.85      6790
functional needs repair       0.57      0.33      0.42       885
         non functional       0.85      0.79      0.82      4799

               accuracy                           0.82     12474
              macro avg       0.75      0.67      0.70     12474
           weighted avg       0.81      0.82      0.81     12474



### Grid Search - 2

In [30]:
# parameter grid 
pgrid = {
    'n_estimators' : [751,1000,1100],
    'max_depth' : [51,101,151],        
    'min_samples_split' : [2,3,6],
    'min_samples_leaf' : [1],
    'bootstrap': [True, False]
}

# specifying the cv
cv_skf = StratifiedKFold(n_splits=3, random_state=None, shuffle=False) # need to change to 3 splits based on results

# specifying the model 
rfgs = RandomForestClassifier(n_jobs=-1, verbose=1)

# keep track of the date and time
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# specify the grid search cv
cv = GridSearchCV(estimator=rfgs, param_grid=pgrid, cv=cv_skf, n_jobs=-1, verbose=10, scoring='balanced_accuracy')

# pring the date and time 
print("date and time =", dt_string)

date and time = 03/09/2021 04:13:19


**execution**

In [31]:
%%time
joblib.dump(cv.fit(X_train,y_train),'models/grid_search_cv_2_ms2.pkl')

fd is '5'
fd is '5'
fd is '5'
fd is '5'
Fitting 3 folds for each of 54 candidates, totalling 162 fits
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   39.1s finished


Wall time: 44min 49s


['models/grid_search_cv_2_ms2.pkl']

**best params**

In [32]:
loaded_cv = joblib.load('models/grid_search_cv_2_ms2.pkl')
loaded_cv.best_params_

{'bootstrap': False,
 'max_depth': 101,
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 1000}

**classification report**

In [33]:
rf_best = loaded_cv.best_estimator_
rf_best.fit(X_train,y_train)

# get the prediction
rfpred = rf_best.predict(X_test)

# print classification report
cr = classification_report(y_test, rfpred)

print(cr)

classification_report_csv(cr,'grid_search_cv_2_ms2.pkl',accuracy_score(y_test, rfpred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   29.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.8s finished


                         precision    recall  f1-score   support

             functional       0.82      0.88      0.85      6790
functional needs repair       0.52      0.37      0.43       885
         non functional       0.83      0.80      0.81      4799

               accuracy                           0.81     12474
              macro avg       0.72      0.68      0.70     12474
           weighted avg       0.80      0.81      0.81     12474



# Showing the Results - Sorted by Accuracy 

In [34]:
df_cr = pd.read_csv('classification_reports/classification_report.csv')
df_cr[['classifier','accuracy_score']].drop_duplicates().sort_values(by='accuracy_score')

Unnamed: 0,classifier,accuracy_score
18,grid_search_cv_2_ms2.pkl,0.809684
3,halving_grid_search_cv_2_ms2.pkl,0.811207
12,randomized_grid_search_cv_2_ms2.pkl,0.812249
9,randomized_grid_search_cv_1_ms2.pkl,0.814494
6,halving_grid_search_cv_3_ms2.pkl,0.814895
0,halving_grid_search_cv_1_ms2.pkl,0.815456
15,grid_search_cv_1_ms2.pkl,0.815456


In [40]:
rf_balanced = RandomForestClassifier(n_jobs=-1, 
                                     verbose=1, 
                                     bootstrap=True, 
                                     max_depth=101,
                                     min_samples_leaf=1,
                                     min_samples_split=3,
                                     n_estimators=1000,
                                     class_weight='balanced'
                                    )

In [41]:
rf_balanced.fit(X_train,y_train)
# get the prediction
rfbpred = rf_balanced.predict(X_test)
print(classification_report(y_test, rfbpred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   21.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.8s finished


                         precision    recall  f1-score   support

             functional       0.83      0.86      0.84      6790
functional needs repair       0.44      0.45      0.44       885
         non functional       0.84      0.79      0.81      4799

               accuracy                           0.80     12474
              macro avg       0.70      0.70      0.70     12474
           weighted avg       0.80      0.80      0.80     12474



In [47]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [45]:
rf_balanced = BalancedRandomForestClassifier(n_jobs=-1, 
                                     verbose=1,                                      
                                     max_depth=101,
                                     min_samples_leaf=1,
                                     min_samples_split=3,
                                     n_estimators=1000,
                                     class_weight='balanced'
                                    )

In [46]:
rf_balanced.fit(X_train,y_train)
# get the prediction
rfbpred = rf_balanced.predict(X_test)
print(classification_report(y_test, rfbpred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   21.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    1.5s finished


                         precision    recall  f1-score   support

             functional       0.87      0.67      0.76      6790
functional needs repair       0.24      0.81      0.37       885
         non functional       0.85      0.75      0.79      4799

               accuracy                           0.71     12474
              macro avg       0.65      0.74      0.64     12474
           weighted avg       0.81      0.71      0.74     12474



In [48]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(rf_balanced, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores))

fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
fd is '5'
