## Part II - Train and evaluate the model

####  Required Python libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing an ipynb file from another ipynb file
!pip install ipynb

Collecting ipynb
  Downloading ipynb-0.5.1-py3-none-any.whl (6.9 kB)
Installing collected packages: ipynb
Successfully installed ipynb-0.5.1


In [3]:
# Importing functions from another jupyter notebook
!pip install nbimporter

Collecting nbimporter
  Downloading nbimporter-0.3.4-py3-none-any.whl (4.9 kB)
Installing collected packages: nbimporter
Successfully installed nbimporter-0.3.4


In [4]:
# Execute the shared configuration notebook in this kernel's namespace.
# Later cells use TEST_SIZE, RANDOM_STATE and MAX_ITER without defining them;
# presumably they are set by GlobalConfig.ipynb — TODO confirm. The file must be
# present in the working directory for this cell to succeed.
%run GlobalConfig.ipynb

ERROR:root:File `'GlobalConfig.ipynb.py'` not found.


#### Load the Data

In [5]:
# nbimporter is imported before the *_nb modules; presumably it registers the
# hook that lets notebooks be imported as Python modules — TODO confirm.
import nbimporter
import loader_nb
import model_selection_helper_nb

# Project-specific dataset loader, used by the following cells to load and prepare the data.
loader = loader_nb.UrlDatasetLoader()

ModuleNotFoundError: ignored

In [None]:
# Load the raw dataset through the project's UrlDatasetLoader.
df = loader.load_data()

In [None]:
# Turn the loaded data into the model inputs X and the target y used for the split below.
X, y = loader.prepare_data(df)

#### Split the Data

In [None]:
from sklearn.model_selection import train_test_split    

# Hold out a test split for the final evaluation. TEST_SIZE and RANDOM_STATE are not
# defined in this notebook; presumably they come from GlobalConfig.ipynb — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

#### Train and Optimize models

In [None]:
# assignment number 7 classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

def get_models():
    '''
    (None) --> dict

    Build the candidate classifiers to be tested.

    Returns
    -------
    dict mapping each classifier's class name to a fresh, unfitted instance.
    Every estimator is seeded with RANDOM_STATE; LogisticRegression is
    additionally capped at MAX_ITER iterations.
    '''
    seed = RANDOM_STATE
    return {
        'LogisticRegression': LogisticRegression(random_state=seed, max_iter=MAX_ITER),
        'AdaBoostClassifier': AdaBoostClassifier(random_state=seed),
        'ExtraTreesClassifier': ExtraTreesClassifier(random_state=seed),
        'GradientBoostingClassifier': GradientBoostingClassifier(random_state=seed),
        'RandomForestClassifier': RandomForestClassifier(random_state=seed),
        'DecisionTreeClassifier': DecisionTreeClassifier(random_state=seed),
    }

In [None]:
def get_params(is_best_params=False):
    '''
    (Boolean) --> dict

    This function returns the params to be used for model testing.

    Parameters
    ----------
    is_best_params : True to use the already defined best params from previous runs.
                     False to use all the possible hyperparameters.

    Returns
    -------
    dict mapping each model name (the same keys get_models() uses) to a dict of
    {hyperparameter name: list of candidate values}. A new dict is built on
    every call, so callers may modify the result freely.
    '''
    # Candidate values shared by both LogisticRegression grids.
    c_values = [100, 10, 1.0, 0.1, 0.01]

    if not is_best_params:
        # Small search space shared by the ExtraTrees and RandomForest grids.
        forest_grid = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [2, 3, 4],
            'max_leaf_nodes': [10, 20, 30],
            'min_samples_leaf': [1, 3, 4],
            'min_samples_split': [2, 3, 4],
        }

        params_log = {
            'multi_class': ['multinomial'],
            'solver': ['lbfgs'],
            # lbfgs only supports the l2 penalty (or none). 'l1' and 'elasticnet'
            # are rejected by scikit-learn, so those candidates could never be
            # selected and only produced failed fits.
            'penalty': ['l2'],
            'C': list(c_values),
        }
        params_ada = {
            'learning_rate': [0.01, 0.1, 1.0],
            # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn
            # releases; confirm the installed version still accepts it.
            'algorithm': ['SAMME', 'SAMME.R'],
        }
        params_ext = {name: list(values) for name, values in forest_grid.items()}
        params_gra = {
            'criterion': ['friedman_mse'],
            'max_depth': [2, 3, 4],
            'max_leaf_nodes': [10, 20, 30],
            'min_samples_leaf': [1, 3, 4],
            'min_samples_split': [2, 3, 4],
        }
        params_ran = {name: list(values) for name, values in forest_grid.items()}
        params_tre = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [2, 3, 4],
            'min_samples_split': [2, 3, 4],
            'min_samples_leaf': [1, 3, 4],
            'max_leaf_nodes': [None, 10, 20, 30],
        }
    else:
        # https://www.kaggle.com/code/satishgunjal/multiclass-logistic-regression-using-sklearn/notebook
        # Since we are going to use One Vs Rest algorithm, set > multi_class='ovr'
        # Note: since we are using One Vs Rest algorithm we must use 'liblinear' solver with it.
        params_log = {
            'multi_class': ['ovr'],
            'solver': ['liblinear'],
            # liblinear supports l1 and l2 but not elasticnet (saga only), so
            # the unsupported candidate is left out.
            'penalty': ['l1', 'l2'],
            'C': list(c_values),
        }
        params_ada = {
            'learning_rate': [1.0],
            'algorithm': ['SAMME'],
        }
        params_ext = {
            'criterion': ['entropy'],
            'max_depth': [4],
            'max_leaf_nodes': [20],
            'min_samples_leaf': [3],
            'min_samples_split': [2],
        }
        params_gra = {
            'criterion': ['friedman_mse'],
            'max_depth': [4],
            'max_leaf_nodes': [20],
            'min_samples_leaf': [1],
            'min_samples_split': [3],
        }
        params_ran = {
            'criterion': ['entropy'],
            'max_depth': [4],
            'max_leaf_nodes': [20],
            'min_samples_leaf': [1],
            'min_samples_split': [2],
        }
        params_tre = {
            'criterion': ['gini'],
            'max_depth': [4],
            'max_leaf_nodes': [None],
            'min_samples_leaf': [1],
            'min_samples_split': [2],
        }

    return {
        'LogisticRegression': params_log,
        'AdaBoostClassifier': params_ada,
        'ExtraTreesClassifier': params_ext,
        'GradientBoostingClassifier': params_gra,
        'RandomForestClassifier': params_ran,
        'DecisionTreeClassifier': params_tre,
    }

In [None]:
# Candidate estimators, and for each of them the hyperparameter values selected
# in previous runs (pass is_best_params=False to search the full grids instead).
models_to_train = get_models()
parameters_to_train = get_params(is_best_params=True)

In [None]:
# Hand the models and their parameter grids (two dicts sharing the same keys) to the
# project's ModelSelectionHelper, which is fitted in the next cell.
model_selection_helper = model_selection_helper_nb.ModelSelectionHelper(models_to_train, parameters_to_train)

In [None]:
%%time

# Fit on the training split only, requesting cv=5 and accuracy scoring.
# Presumably the helper runs a cross-validated search per model — see ModelSelectionHelper.
model_selection_helper.fit(X_train, y_train, cv=5, scoring='accuracy', verbose=5)


In [None]:
''' Output recorded from an earlier full search (3-fold CV; the grids have since changed):
---------------------------------------------------------------------------
LogisticRegression
Fitting 3 folds for each of 25 candidates, totalling 75 fits
LogisticRegression :  {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.8297656944509201
---------------------------------------------------------------------------
AdaBoostClassifier
Fitting 3 folds for each of 6 candidates, totalling 18 fits
AdaBoostClassifier :  {'algorithm': 'SAMME', 'learning_rate': 1.0}
0.6958045140061907
---------------------------------------------------------------------------
ExtraTreesClassifier
Fitting 3 folds for each of 162 candidates, totalling 486 fits
ExtraTreesClassifier :  {'criterion': 'entropy', 'max_depth': 4, 'max_leaf_nodes': 20, 'min_samples_leaf': 3, 'min_samples_split': 2}
0.7070910972943757
---------------------------------------------------------------------------
GradientBoostingClassifier
Fitting 3 folds for each of 81 candidates, totalling 243 fits
GradientBoostingClassifier :  {'criterion': 'friedman_mse', 'max_depth': 4, 'max_leaf_nodes': 20, 'min_samples_leaf': 1, 'min_samples_split': 3}
0.9596793312198327
---------------------------------------------------------------------------
RandomForestClassifier
Fitting 3 folds for each of 162 candidates, totalling 486 fits
RandomForestClassifier :  {'criterion': 'entropy', 'max_depth': 4, 'max_leaf_nodes': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.7542616703562496
Wall time: 1h 3min 59s
'''

#### Run the models with test data using the best params

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

def print_predict_scores(model_name, y_train, y_pred):
    '''
    (str, array-like, array-like) --> None

    Print a one-row table with the accuracy and F1 score of a set of predictions.

    Parameters
    ----------
    model_name : label shown in the 'Model' column.
    y_train : the true labels to score against. Despite the name, the labels of
              any split can be passed (the evaluation loop passes y_test).
    y_pred : the predicted labels, aligned with y_train.
    '''
    acc = accuracy_score(y_train, y_pred)

    # f1_score defaults to average='binary', which raises ValueError as soon as
    # more than two classes are present. Keep the binary score for two-class
    # problems and fall back to the support-weighted average otherwise.
    n_classes = len(set(y_train) | set(y_pred))
    average = 'binary' if n_classes <= 2 else 'weighted'
    f1 = f1_score(y_train, y_pred, average=average)

    results = pd.DataFrame([[model_name, acc, f1]], columns=['Model', 'Accuracy', 'F1 Score'])
    print(results)

for key in models_to_train.keys():

    model = model_selection_helper.get_model_best_estimator(key)

    # Fit on the training split only. Fitting on X_test/y_test and then
    # predicting X_test leaks the labels into the model, so the reported
    # scores would not measure generalisation at all.
    model.fit(X_train, y_train)

    # The held-out split is used exclusively for prediction and scoring.
    y_pred = model.predict(X_test)

    print_predict_scores(key, y_test, y_pred)
    