## Part II - Train and evaluate the model

####  Required Python libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing an ipynb file from another ipynb file
!pip install ipynb



In [3]:
# Importing functions from another jupyter notebook
!pip install nbimporter



In [4]:
%run GlobalConfig.ipynb

Setting global variables...


#### Load the Data

In [5]:
import nbimporter
import loader_nb
import model_selection_helper_nb

loader = loader_nb.UrlDatasetLoader()

init Loader notebook


In [6]:
df = loader.load_data()

In [7]:
X, y = loader.prepare_data(df)

#### Split the Data

In [9]:
from sklearn.model_selection import train_test_split    

X_train, X_test, y_train, y_test = loader.train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

The X_train, y_train shape
(25694, 51)
(25694,)
The shape after unsupervised anomaly detection:
(25437, 51)
(25437,)
The X_test, y_test shape
(11013, 51)
(11013,)
The shape after unsupervised anomaly detection:
(10902, 51)
(10902,)


In [10]:
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn import metrics
# calculating the accuracy of models with different values of k
#mean_acc = np.zeros(20)
#for i in range(1,21):
#    #Train Model and Predict  
#    knn = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
#    yhat= knn.predict(X_test)
#    mean_acc[i-1] = metrics.accuracy_score(y_test, yhat)

In [11]:
#loc = np.arange(1,21,step=1.0)
#plt.figure(figsize = (10, 6))
#plt.plot(range(1,21), mean_acc)
#plt.xticks(loc)
#plt.xlabel('Number of Neighbors ')
#plt.ylabel('Accuracy')
#plt.show()

#### Train models with Hyperparameter optimization

Load the models and parameters to Train.

In [12]:
models_to_train = loader.get_models_to_train()
parameters_to_train = loader.get_parameters_to_train(True)

Tune Hyperparameters for Classification Machine Learning Algorithms

In [13]:
model_selection_helper = model_selection_helper_nb.ModelSelectionHelper(models_to_train, parameters_to_train)

init model selection helper notebook


In [14]:
%%time

model_selection_helper.fit(X_train, y_train, cv=3, scoring='accuracy', verbose=2)

---------------------------------------------------------------------------
KNeighborsClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits
KNeighborsClassifier :  {'metric': 'manhattan', 'n_neighbors': 2, 'weights': 'distance'}
0.9583284192318277
---------------------------------------------------------------------------
DecisionTreeClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits
DecisionTreeClassifier :  {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 5}
0.8888626803475254
---------------------------------------------------------------------------
RandomForestClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits
RandomForestClassifier :  {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 200}
0.9347407320045603
---------------------------------------------------------------------------
GradientBoostingClassifier
Fitting 3 folds for each of 

#### Run the models with test data using the best estimator

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

def print_predict_scores(model_name, y_train, y_pred):
    #roc = roc_auc_score(y_train, y_pred)
    acc = accuracy_score(y_train, y_pred)
    #prec = precision_score(y_train, y_pred)
    #rec = recall_score(y_train, y_pred)
    f1 = f1_score(y_train, y_pred, average='weighted')
    #results = pd.DataFrame([[model_name, acc, prec, rec, f1,roc]], columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score','AUC'])
    results = pd.DataFrame([[model_name, acc, f1]], columns = ['Model', 'Accuracy', 'F1 Score'])
    print(results)

for key in models_to_train.keys():
    
    model = model_selection_helper.get_model_best_estimator(key)
    
    model.fit(X_test, y_test)

    y_pred = model.predict(X_test)

    print_predict_scores(key, y_test, y_pred)
    

                  Model  Accuracy  F1 Score
0  KNeighborsClassifier       1.0       1.0
                    Model  Accuracy  F1 Score
0  DecisionTreeClassifier  0.919464   0.92022
                    Model  Accuracy  F1 Score
0  RandomForestClassifier   0.95056  0.950964
                        Model  Accuracy  F1 Score
0  GradientBoostingClassifier       1.0       1.0
                Model  Accuracy  F1 Score
0  LogisticRegression  0.816639  0.815096
                Model  Accuracy  F1 Score
0  AdaBoostClassifier  0.698954  0.691719
