## Part II - Train and evaluate the model

####  Required Python libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Importing an ipynb file from another ipynb file
!pip install ipynb



In [3]:
# Importing functions from another jupyter notebook
!pip install nbimporter



In [4]:
%run GlobalConfig.ipynb

Setting global variables...


#### Load the Data

In [5]:
import nbimporter
import loader_nb
import model_selection_helper_nb

loader = loader_nb.UrlDatasetLoader()

init Loader notebook


In [6]:
df = loader.load_data()

In [7]:
X, y = loader.prepare_data(df)

#### Split the Data

In [8]:
from sklearn.model_selection import train_test_split    

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

#### Train and Optimize models

In [9]:
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# assignment number 7 classifiers
def get_models(): 
    models = dict()
    models['LogisticRegression'] = LogisticRegression(random_state=RANDOM_STATE, max_iter=MAX_ITER)
    models['AdaBoostClassifier'] = AdaBoostClassifier(random_state=RANDOM_STATE)
    models['ExtraTreesClassifier'] = ExtraTreesClassifier(random_state=RANDOM_STATE)
    models['GradientBoostingClassifier'] = GradientBoostingClassifier(random_state=RANDOM_STATE)        
    models['RandomForestClassifier'] = RandomForestClassifier(random_state=RANDOM_STATE, max_depth=MAX_DEPTH)
        
    return models

In [10]:
def get_params(): 
    
    params_log = dict()
    params_log['solver'] = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    params_log['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
    params_log['penalty'] = ['l2']
    params_log['C'] = [100, 10, 1.0, 0.1, 0.01]
       
    params_ada = dict()
    params_ada['learning_rate'] = [0.01, 0.1, 1.0]
    params_ada['algorithm'] = ['SAMME', 'SAMME.R']
    
    params_ext = dict()
    params_ext['criterion'] = ['gini', 'entropy']
    params_ext['max_depth'] = [2, 3, 4]         # default: 3
    params_ext['max_leaf_nodes'] = [10, 20, 30]
    params_ext['min_samples_leaf'] = [1, 3, 4]  # default: 1
    params_ext['min_samples_split'] = [2, 3, 4] # default: 2
    
    params_gra = dict()
    params_gra['criterion'] = ['friedman_mse']
    params_gra['max_depth'] = [2, 3, 4]         # default: 3
    params_gra['max_leaf_nodes'] = [10, 20, 30]
    params_gra['min_samples_leaf'] = [1, 3, 4]  # default: 1
    params_gra['min_samples_split'] = [2, 3, 4] # default: 2
        
    params_ran = dict()
    params_ran['criterion'] = ['gini', 'entropy']
    params_ran['max_depth'] = [2, 3, 4]         # default: 3
    params_ran['max_leaf_nodes'] = [10, 20, 30]
    params_ran['min_samples_leaf'] = [1, 3, 4]  # default: 1
    params_ran['min_samples_split'] = [2, 3, 4] # default: 2
        
    params = dict()
    params['LogisticRegression'] = params_log
    params['AdaBoostClassifier'] = params_ada
    params['ExtraTreesClassifier'] = params_ext
    params['GradientBoostingClassifier'] = params_gra
    params['RandomForestClassifier'] = params_ran    
    
    return params    

In [11]:
models_to_train = get_models()
parameters_to_train = get_params()

In [12]:
model_selection_helper = model_selection_helper_nb.ModelSelectionHelper(models_to_train, parameters_to_train)

In [13]:
%%time

model_selection_helper.fit(X_train, y_train, scoring='accuracy', verbose=3)


---------------------------------------------------------------------------
LogisticRegression
Fitting 3 folds for each of 25 candidates, totalling 75 fits
LogisticRegression :  {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.8297656944509201
---------------------------------------------------------------------------
AdaBoostClassifier
Fitting 3 folds for each of 6 candidates, totalling 18 fits
AdaBoostClassifier :  {'algorithm': 'SAMME', 'learning_rate': 1.0}
0.6958045140061907
---------------------------------------------------------------------------
ExtraTreesClassifier
Fitting 3 folds for each of 162 candidates, totalling 486 fits
ExtraTreesClassifier :  {'criterion': 'entropy', 'max_depth': 4, 'max_leaf_nodes': 20, 'min_samples_leaf': 3, 'min_samples_split': 2}
0.7070910972943757
---------------------------------------------------------------------------
GradientBoostingClassifier
Fitting 3 folds for each of 81 candidates, totalling 243 fits
GradientBoostingClassifier :  {'c

In [14]:
X

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,charcompvowels,ldl_domain,ldl_filename,dld_url,...,NumberRate_DirectoryName,NumberRate_FileName,NumberRate_Extension,NumberRate_AfterPath,SymbolCount_URL,SymbolCount_Directoryname,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Extension
0,0.000000,0.117647,0.073529,0.142857,0.196721,0.041905,0.041451,0.0,0.0,0.000000,...,0.516779,0.533333,1.000000,0.000000,0.133333,0.12,0.644621,0.508074,0.965557,0.754106
1,0.000000,0.117647,0.073529,0.142857,0.196721,0.057143,0.062176,0.0,0.0,0.000000,...,0.516779,0.500000,0.518790,0.000000,0.133333,0.16,0.565470,0.508074,0.924711,0.500000
2,0.000000,0.117647,0.073529,0.142857,0.196721,0.055238,0.062176,0.0,0.0,0.000000,...,0.516779,0.500000,0.518790,0.000000,0.133333,0.16,0.578950,0.508074,0.924711,0.500000
3,0.000000,0.117647,0.176471,0.142857,0.196721,0.052381,0.165803,0.0,0.0,0.000000,...,0.516779,0.500000,0.518790,0.000000,0.133333,0.16,0.463537,0.508074,0.924711,0.500000
4,0.000000,0.117647,0.088235,0.142857,0.196721,0.069841,0.093264,0.0,0.0,0.000000,...,0.516779,0.500000,0.518790,0.000000,0.133333,0.16,0.550071,0.508074,0.924711,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36702,0.020939,0.117647,0.205882,0.151786,0.163934,0.034921,0.103627,0.0,0.0,0.000000,...,0.550119,0.764706,0.813953,0.533333,0.377778,0.48,0.569505,0.523532,0.905741,0.828342
36703,0.000000,0.117647,0.191176,0.080357,0.098361,0.080586,0.124352,0.0,0.0,0.000000,...,0.516779,0.593750,0.600000,0.000000,0.466667,0.12,0.516835,0.589147,0.957762,0.837336
36704,0.041877,0.058824,0.397059,0.184524,0.229508,0.032143,0.212435,0.0,0.0,0.315789,...,0.554592,0.688889,0.709016,0.514706,0.533333,0.60,0.498584,0.546069,0.858495,0.858593
36705,0.025271,0.058824,0.191176,0.101190,0.114754,0.034286,0.077720,0.0,0.0,0.105263,...,0.516779,0.642045,0.666667,0.709091,0.266667,0.08,0.643917,0.766295,0.953411,0.879412


#### Run the models with test data using the best params

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

for key in models_to_train.keys():
    
    model = model_selection_helper.get_model_best_estimator(key)
    
    model.fit(X_test, y_test)

    y_pred = model.predict(X_test)
        
    print(key, 'accuracy_score: ', accuracy_score(y_test, y_pred))
    print(key, 'f1_score: ', f1_score(y_test, y_pred, average='weighted'))
    

LogisticRegression accuracy_score:  0.8298374648143103
LogisticRegression f1_score:  0.8287082208704686
AdaBoostClassifier accuracy_score:  0.6874602742213748
AdaBoostClassifier f1_score:  0.6814893406164179
ExtraTreesClassifier accuracy_score:  0.7122491600835377
ExtraTreesClassifier f1_score:  0.7033655023930944
GradientBoostingClassifier accuracy_score:  0.9812948333787342
GradientBoostingClassifier f1_score:  0.981323592073486
RandomForestClassifier accuracy_score:  0.7687278670661944
RandomForestClassifier f1_score:  0.7613463057380457
