## Part II - Train and evaluate the model

####  Required Python libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Importing an ipynb file from another ipynb file
!pip install ipynb



In [3]:
# Importing functions from another jupyter notebook
!pip install nbimporter



In [4]:
%run GlobalConfig.ipynb

Setting global variables...


#### Load the Data

In [5]:
import nbimporter
import loader_nb
import model_selection_helper_nb

loader = loader_nb.UrlDatasetLoader()

init Loader notebook


In [6]:
df = loader.load_data()

In [7]:
X, y = loader.prepare_data(df)

#### Split the Data

In [8]:
from sklearn.model_selection import train_test_split    

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

#### Train and Optimize models

In [9]:
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# assignment number 7 classifiers
def get_models(): 
    models = dict()
    models['LogisticRegression'] = LogisticRegression(random_state=RANDOM_STATE, max_iter=MAX_ITER)
    models['AdaBoostClassifier'] = AdaBoostClassifier(random_state=RANDOM_STATE)
    models['ExtraTreesClassifier'] = ExtraTreesClassifier(random_state=RANDOM_STATE)
    models['GradientBoostingClassifier'] = GradientBoostingClassifier(random_state=RANDOM_STATE)        
    models['RandomForestClassifier'] = RandomForestClassifier(random_state=RANDOM_STATE, max_depth=MAX_DEPTH)
        
    return models

In [10]:
def get_params(): 
    
    params_log = dict()
    params_log['solver'] = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    #params_log['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
    #params_log['penalty'] = ['l2']
    #params_log['C'] = [100, 10, 1.0, 0.1, 0.01]
       
    params_ada = dict()
    params_ada['learning_rate'] = [0.01, 0.1, 1.0]
    params_ada['algorithm'] = ['SAMME', 'SAMME.R']
    
    params_ext = dict()
    params_ext['criterion'] = ['gini', 'entropy']
    params_ext['max_depth'] = [2, 3, 4]         # default: 3
    #params_ext['max_leaf_nodes'] = [10, 20, 30]
    #params_ext['min_samples_leaf'] = [1, 3, 4]  # default: 1
    #params_ext['min_samples_split'] = [2, 3, 4] # default: 2
    
    params_gra = dict()
    params_gra['criterion'] = ['friedman_mse']
    #params_gra['max_depth'] = [2, 3, 4]         # default: 3
    #params_gra['max_leaf_nodes'] = [10, 20, 30]
    #params_gra['min_samples_leaf'] = [1, 3, 4]  # default: 1
    #params_gra['min_samples_split'] = [2, 3, 4] # default: 2
        
    params_ran = dict()
    params_ran['criterion'] = ['gini', 'entropy']
    #params_ran['max_depth'] = [2, 3, 4]         # default: 3
    #params_ran['max_leaf_nodes'] = [10, 20, 30]
    #params_ran['min_samples_leaf'] = [1, 3, 4]  # default: 1
    #params_ran['min_samples_split'] = [2, 3, 4] # default: 2
        
    params = dict()
    params['LogisticRegression'] = params_log
    params['AdaBoostClassifier'] = params_ada
    params['ExtraTreesClassifier'] = params_ext
    params['GradientBoostingClassifier'] = params_gra
    params['RandomForestClassifier'] = params_ran    
    
    return params    

In [11]:
models_to_train = get_models()
parameters_to_train = get_params()

In [12]:
model_selection_helper = model_selection_helper_nb.ModelSelectionHelper(models_to_train, parameters_to_train)

In [13]:
%%time

model_selection_helper.fit(X_train, y_train, scoring='accuracy', verbose=3)


---------------------------------------------------------------------------
LogisticRegression
Fitting 3 folds for each of 5 candidates, totalling 15 fits
LogisticRegression :  {'solver': 'sag'}
0.7650814775672549
---------------------------------------------------------------------------
AdaBoostClassifier
Fitting 3 folds for each of 6 candidates, totalling 18 fits
AdaBoostClassifier :  {'algorithm': 'SAMME', 'learning_rate': 1.0}
0.6958045140061907
---------------------------------------------------------------------------
ExtraTreesClassifier
Fitting 3 folds for each of 6 candidates, totalling 18 fits
ExtraTreesClassifier :  {'criterion': 'entropy', 'max_depth': 4}
0.7050673209120863
---------------------------------------------------------------------------
GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits
GradientBoostingClassifier :  {'criterion': 'friedman_mse'}
0.9379622532821563
---------------------------------------------------------------

In [None]:
X

#### Run the models with test data using the best params

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

for key in models_to_train.keys():
    
    model = model_selection_helper.get_model_best_estimator(key)
    
    model.fit(X_test, y_test)

    y_pred = model.predict(X_test)
        
    print(key, 'accuracy_score: ', accuracy_score(y_test, y_pred))
    print(key, 'f1_score: ', f1_score(y_test, y_pred, average='weighted'))
    

LogisticRegression accuracy_score:  0.7594660855352765
LogisticRegression f1_score:  0.7561260115308194
AdaBoostClassifier accuracy_score:  0.6874602742213748
AdaBoostClassifier f1_score:  0.6814893406164179
ExtraTreesClassifier accuracy_score:  0.6997185144828839
ExtraTreesClassifier f1_score:  0.6899379042982596
GradientBoostingClassifier accuracy_score:  0.9496958140379551
GradientBoostingClassifier f1_score:  0.9497825725059457
RandomForestClassifier accuracy_score:  1.0
RandomForestClassifier f1_score:  1.0
