#### Part I Experiments

####  Required Python libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Importing an ipynb file from another ipynb file
!pip install ipynb



In [3]:
# Importing functions from another jupyter notebook
!pip install nbimporter



In [4]:
%run GlobalConfig.ipynb

Setting global variables...


In [5]:
import nbimporter
import loader_nb
import model_selection_helper_nb

loader = loader_nb.UrlDatasetLoader()

init Loader notebook


In [6]:
help(loader.load_data)

Help on method load_data in module loader_nb:

load_data(url='https://raw.githubusercontent.com/quickheaven/scs-3253-machine-learning/master/datasets/ISCX-URL2016_All.csv') method of loader_nb.UrlDatasetLoader instance
    (string) --> dataframe
    
    This function returns the dataframe of maliciours url.    
    
    Parameters
    ----------
    url: By default, it fetch the data from github otherwise a local path or url can be provided so the data can be loaded faster.



In [7]:
help(loader.prepare_data)

Help on method prepare_data in module loader_nb:

prepare_data(data, fill_na=True, feature_selection=True) method of loader_nb.UrlDatasetLoader instance
    (DataFrame, boolean, boolean) --> X and y of the dataframe.
    
    This function returns the X and y of the malicious url dataframe.
    
    Parameters
    ----------
    fill_na : True to fill the na records with mean values otherwise drop the features.
    
    feature_selection : True to remove one or more features that have a correlation higher than 0.9 othewise do not perform that kind of feature selection.
    
    anomaly_detection: True to remove outliers using unsupervised anomaly detection via Isolation Forest, otherwise no anomaly detection will be performed.



In [8]:
df = loader.load_data(DATASET_LOCAL_PATH)

In [26]:
def train_and_tune_models( data, fill_na=True, feature_selection=True, anomaly_detection=True):
    print('train_and_tune_models fill_na:', str(fill_na), 'feature_selection:', str(feature_selection), 'anomaly_detection:' + str(anomaly_detection))
    
    X, y = loader.prepare_data(data.copy(), fill_na=fill_na, feature_selection=feature_selection)
    
    from sklearn.model_selection import train_test_split    

    X_train, X_test, y_train, y_test = loader.train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, anomaly_detection=anomaly_detection)
    
    models_to_train = loader.get_models_to_train()
    parameters_to_train = loader.get_parameters_to_train(True)
    
    model_selection_helper = model_selection_helper_nb.ModelSelectionHelper(models_to_train, parameters_to_train)
    
    model_selection_helper.fit(X_train, y_train, cv=3, scoring='accuracy', verbose=2)
    

In [27]:
%%time
# TRUE TRUE TRUE
train_and_tune_models(df, fill_na=True, feature_selection=True, anomaly_detection=True)

train_and_tune_models fill_na:  True feature_selection:  True anomaly_detection: True
The X_train, y_train shape
(25694, 51)
(25694,)
The shape after unsupervised anomaly detection:
(25437, 51)
(25437,)
The X_test, y_test shape
(11013, 51)
(11013,)
The shape after unsupervised anomaly detection:
(10902, 51)
(10902,)
init model selection helper notebook
---------------------------------------------------------------------------
KNeighborsClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits
KNeighborsClassifier :  {'metric': 'manhattan', 'n_neighbors': 2, 'weights': 'distance'}
0.9583284192318277
---------------------------------------------------------------------------
DecisionTreeClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits
DecisionTreeClassifier :  {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 5}
0.8888626803475254
---------------------------------------------------------------------------
RandomForestC

In [28]:
%%time
# TRUE TRUE FALSE
train_and_tune_models(df, fill_na=True, feature_selection=True, anomaly_detection=False)

train_and_tune_models fill_na:  True feature_selection:  True anomaly_detection: False
The X_train, y_train shape
(25694, 51)
(25694,)
The X_test, y_test shape
(11013, 51)
(11013,)
init model selection helper notebook
---------------------------------------------------------------------------
KNeighborsClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits
KNeighborsClassifier :  {'metric': 'manhattan', 'n_neighbors': 2, 'weights': 'distance'}
0.9586284476967669
---------------------------------------------------------------------------
DecisionTreeClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits
DecisionTreeClassifier :  {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 5}
0.8911807655627547
---------------------------------------------------------------------------
RandomForestClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits
RandomForestClassifier :  {'criterion': 'entropy', 'max_depth': 10, 

In [12]:
%%time
# TRUE FALSE FALSE
# train_and_tune_models(df, fill_na=True, feature_selection=False, anomaly_detection=False)

Wall time: 0 ns


In [13]:
%%time
# FALSE FALSE FALSE
# train_and_tune_models(df, fill_na=False, feature_selection=False, anomaly_detection=False)

Wall time: 0 ns


In [14]:
%%time
# FALSE TRUE TRUE
# train_and_tune_models(df, fill_na=False, feature_selection=True, anomaly_detection=True)

Wall time: 0 ns


In [15]:
%%time
# FALSE FALSE TRUE
# train_and_tune_models(df, fill_na=False, feature_selection=False, anomaly_detection=True)

Wall time: 0 ns


#### Experiment: Find which data preparation setup return better result.

In [16]:
from sklearn.model_selection import train_test_split    
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import IsolationForest

def eval_data_prep(X, y, anomaly_detection=False): 
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)
    
    #if (anomaly_detection==True):
        #iforest = IsolationForest(contamination=0.01, random_state=RANDOM_STATE).fit(X_train)

        #y_pred_iforest = iforest.predict(X_train)

        #X_train, y_train = X_train[(y_pred_iforest != -1)], y_train[(y_pred_iforest != -1)]        

    #lr_clf = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)

    #lr_clf.fit(X_train, y_train)

    #y_pred_lr = lr_clf.predict(X_train)

    #return accuracy_score(y_train, y_pred_lr)
    return None

In [17]:
#### 

In [18]:
df = loader.load_data(url=DATASET_LOCAL_PATH)
X, y = loader.prepare_data(df) # default
print(X.shape)
print(y.shape)
print(eval_data_prep(X, y))

(36707, 51)
(36707,)
None


In [19]:
df = loader.load_data(url=DATASET_LOCAL_PATH)
X, y = loader.prepare_data(df, fill_na=True, feature_selection=False)
print(X.shape)
print(y.shape)
print(eval_data_prep(X, y))

(36707, 78)
(36707,)
None


In [20]:
df = loader.load_data(url=DATASET_LOCAL_PATH)
X, y = loader.prepare_data(df, fill_na=False, feature_selection=True)
print(X.shape)
print(y.shape)
print(eval_data_prep(X, y))

(18982, 49)
(18982,)
None


In [21]:
df = loader.load_data(url=DATASET_LOCAL_PATH)
X, y = loader.prepare_data(df, fill_na=False, feature_selection=False)
print(X.shape)
print(y.shape)
print(eval_data_prep(X, y))

(18982, 78)
(18982,)
None


#### Experiment: Unsupervised Anomaly Detection

In [22]:
df = loader.load_data(url=DATASET_LOCAL_PATH)
X, y = loader.prepare_data(df) # default
print(X.shape)
print(y.shape)
print(eval_data_prep(X, y, anomaly_detection=True))

(36707, 51)
(36707,)
None


In [23]:
df = loader.load_data(url=DATASET_LOCAL_PATH)
X, y = loader.prepare_data(df, fill_na=True, feature_selection=False)
print(X.shape)
print(y.shape)
print(eval_data_prep(X, y, anomaly_detection=True))

(36707, 78)
(36707,)
None


In [24]:
df = loader.load_data(url=DATASET_LOCAL_PATH)
X, y = loader.prepare_data(df, fill_na=False, feature_selection=True)
print(X.shape)
print(y.shape)
print(eval_data_prep(X, y, anomaly_detection=True))

(18982, 49)
(18982,)
None


In [25]:
df = loader.load_data(url=DATASET_LOCAL_PATH)
X, y = loader.prepare_data(df, fill_na=False, feature_selection=False)
print(X.shape)
print(y.shape)
print(eval_data_prep(X, y, anomaly_detection=True))

(18982, 78)
(18982,)
None
