# Data Input and Display Libraries

In [None]:

import pandas as pd
import numpy as np
import pickle

# Render floats in pandas output with five decimal places.
pd.set_option('display.float_format', lambda x: '%.5f' % x)
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Modelling Libraries

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

from xgboost import XGBClassifier
import pickle
from sklearn.model_selection import GridSearchCV


# Metrics Libraries


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import f1_score
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

In [None]:
# Fetch the workshop repository into ./workshop_trees (it holds datasets.rar, extracted below).
!git clone https://github.com/ragamudra/workshop_trees

Cloning into 'workshop_trees'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), done.


In [None]:
# Extract the contents of datasets.rar into the workshop_trees directory.
!unrar x 'workshop_trees/datasets.rar'  'workshop_trees'


UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from workshop_trees/datasets.rar

Extracting  workshop_trees/y_train1                                        0%  OK 
Extracting  workshop_trees/df_all_test2                                    1%  2%  3%  4%  5%  6%  7%  8%  9% 10% 11% 12% 13% 14% 15% 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31% 32% 33%  OK 
Extracting  workshop_trees/df_all_train2                                  34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47% 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63% 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79% 

In [None]:
fl_out = "./workshop_trees"


def pick_in(obj_name, base_dir=None):
    """Load and return a pickled object stored in the data directory.

    Parameters
    ----------
    obj_name : str
        File name of the pickle inside the data directory.
    base_dir : str, optional
        Directory to read from. Defaults to the module-level ``fl_out``.

    Returns
    -------
    object
        Whatever object the pickle file contains.

    Raises
    ------
    OSError
        If the file cannot be opened (propagated from ``open``).
    """
    if base_dir is None:
        base_dir = fl_out
    path = base_dir + "/" + obj_name
    # NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
    # The context manager guarantees the handle is closed even if unpickling fails.
    with open(path, "rb") as pickle_in:
        return pickle.load(pickle_in)

In [None]:
list_objs = ["df_all_train2","y_train1","df_all_test2","y_test1"]

# Bind each loaded object to an explicit name (same order as list_objs) instead of
# injecting names through globals(), so the variables are visible to readers and tools.
df_all_train2, y_train1, df_all_test2, y_test1 = [pick_in(name) for name in list_objs]

In [None]:
def auc1_scr(mod1,test_set,actual1):
    """Return the area under the ROC curve of a fitted model on a labelled data set.

    Parameters
    ----------
    mod1 : estimator or str
        A fitted model exposing ``predict_proba``, or (legacy form) a string
        that evaluates to such a model, e.g. ``"bc"``.
    test_set : array-like
        Feature matrix passed to ``predict_proba``.
    actual1 : array-like
        True labels passed to ``roc_curve``.

    Returns
    -------
    float
        ``auc(fpr, tpr)`` computed from the scores in column 1 of ``predict_proba``.
    """
    # Accept the model object directly; strings are still resolved with eval for
    # backward compatibility. NOTE: eval runs arbitrary code -- never pass untrusted text.
    mod = eval(mod1) if isinstance(mod1, str) else mod1
    pred1 = mod.predict_proba(test_set)[:, 1]
    # The thresholds returned by roc_curve are not needed for the area.
    fpr, tpr, _ = roc_curve(actual1, pred1)
    return auc(fpr, tpr)

In [None]:
### Bagging classifier
# Fixed random_state keeps the fitted ensemble reproducible across runs.
bc = BaggingClassifier(random_state=0)
bc.fit(X=df_all_train2, y=y_train1)

BaggingClassifier(random_state=0)

In [None]:
# ROC AUC of the bagging model on the test split, then on the training split.
auc1_te = auc1_scr("bc", test_set=df_all_test2, actual1=y_test1)
auc1_tr = auc1_scr("bc", test_set=df_all_train2, actual1=y_train1)

In [None]:
# Display the (test, train) AUC pair computed in the previous cell.
auc1_te,auc1_tr

(0.9421740207850983, 0.9819321235084637)

In [None]:
# 3-fold cross-validated ROC AUC for the bagging classifier.
# Cross-validate on the training split so the test split stays untouched
# for the final hold-out evaluation.
kfold = model_selection.KFold(n_splits = 3)

results = model_selection.cross_val_score(bc, df_all_train2, y_train1,
                                          cv = kfold, scoring='roc_auc', n_jobs=-1)
print (results.mean())

0.9379505474051072


In [None]:
### Random forest
# Fixed random_state keeps the fitted forest reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(X=df_all_train2, y=y_train1)

RandomForestClassifier(random_state=0)

In [None]:
# ROC AUC of the random forest on the test split, then on the training split.
auc1_te = auc1_scr("rf", test_set=df_all_test2, actual1=y_test1)
auc1_tr = auc1_scr("rf", test_set=df_all_train2, actual1=y_train1)

In [None]:
# Display the (test, train) AUC pair computed in the previous cell.
auc1_te,auc1_tr

(0.9606192393892811, 0.9823078290676196)

In [None]:
# 3-fold cross-validated ROC AUC for the random forest.
# Cross-validate on the training split so the test split stays untouched
# for the final hold-out evaluation.
kfold = model_selection.KFold(n_splits = 3)

results = model_selection.cross_val_score(rf, df_all_train2, y_train1,
                                          cv = kfold, scoring='roc_auc', n_jobs=-1)
results.mean()

0.9581477079994082

# Grid Search

In [None]:
rf_grid = RandomForestClassifier(random_state=42)

# Hyper-parameter grid: 2 x 2 x 2 x 2 = 16 candidate settings.
params = {
    'n_estimators': [50, 100],
    'criterion': ["gini", "entropy"],
    'max_samples': [0.2, 0.5],
    'max_features': [0.2, 0.3],
}

# Each candidate is scored by 2-fold cross-validated ROC AUC.
grid_search = GridSearchCV(estimator=rf_grid,
                           param_grid=params,
                           cv=2, n_jobs=5, verbose=1, scoring="roc_auc")

# Tune on the training split only: fitting the search on the test split would
# leak test data into model selection and leave no unbiased hold-out set.
grid_search.fit(df_all_train2, y_train1)

Fitting 2 folds for each of 16 candidates, totalling 32 fits


GridSearchCV(cv=2, estimator=RandomForestClassifier(random_state=42), n_jobs=5,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': [0.2, 0.3], 'max_samples': [0.2, 0.5],
                         'n_estimators': [50, 100]},
             scoring='roc_auc', verbose=1)

In [None]:
# Tabulate the grid-search cross-validation results and preview the first rows.
score_df = pd.DataFrame(data=grid_search.cv_results_)
score_df.head(n=5)

In [None]:
# Five best-ranked candidates (rank 1 is the best mean test score).
score_df.sort_values(by="rank_test_score").head(n=5)

In [None]:
# Show the estimator that the grid search selected as best.
grid_search.best_estimator_