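# ABOUT

Prequential evaluation of scikit-multiflow stream classifiers (Hoeffding Tree, Hoeffding Adaptive Tree, Dynamic Weighted Majority, Oza Bagging, Learn++.NSE) on the UNSW-NB15 training set. Each run logs its metrics to a CSV file, and the accuracy, kappa, and F1 curves are saved as PDF figures.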

In [None]:
!pip install -U scikit-multiflow

Requirement already up-to-date: scikit-multiflow in /usr/local/lib/python3.6/dist-packages (0.5.3)


In [None]:
import numpy as np 
import scipy as sp
import pandas as pd 
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pylab as plt
plt.style.use('ggplot')

from sklearn import preprocessing
from skmultiflow.trees import HoeffdingTreeClassifier, HoeffdingAdaptiveTreeClassifier
from skmultiflow.meta import OnlineBoostingClassifier, OzaBaggingClassifier, DynamicWeightedMajorityClassifier
from skmultiflow.meta import LearnPPNSEClassifier
from skmultiflow.data import DataStream
from skmultiflow.evaluation import EvaluatePrequential, EvaluateHoldout

from google.colab import drive, files

# Mount Google Drive at /content/drive; the data and output paths configured
# later in this notebook all live underneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def standardize_df(df):
    """Return a z-scored copy of ``df``, leaving the ``'target'`` column untouched.

    Every column other than ``'target'`` is centred on its mean and divided by
    its standard deviation. Both statistics are computed on the underlying
    NumPy array, so the standard deviation is the population one (``ddof=0``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose non-target columns are numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index. The input frame is not
        modified.

    Notes
    -----
    A constant column has zero standard deviation; dividing by it would fill
    the column with NaN. Such columns are returned centred (all zeros) instead.
    """
    scaled = df.copy()
    for key in scaled.keys():
        if key == 'target':
            continue
        values = scaled[key].values
        centred = values - values.mean()
        spread = values.std()
        # Zero spread means every value equals the mean, so `centred` is
        # already all zeros; skip the 0/0 division that would produce NaN.
        scaled[key] = centred if spread == 0 else centred / spread
    return scaled

In [None]:
# --- Evaluation settings forwarded to every EvaluatePrequential call ---------
metrics = ['accuracy', 'f1', 'kappa']
pretrain_size = 2000         # passed as `pretrain_size`
batch_size = 1000            # passed as `batch_size`
max_samples = 1_000_000      # passed as `max_samples`

# --- Google Drive locations ---------------------------------------------------
# `data_path` holds the UNSW-NB15 CSV files; evaluator logs and figures are
# written under `output_path`, a sub-folder of the project directory.
data_path = '/content/drive/My Drive/Data/UNSWNB15/'
code_path = '/content/drive/My Drive/Colab Notebooks/Projects/TIS-IEEE-2020/'
output_path = code_path + 'outputs/'

In [None]:

# Read both UNSW-NB15 splits from Drive. Only the training split is used to
# build the stream in this cell; the testing split is loaded but not consumed here.
df_tr = pd.read_csv(data_path + 'UNSW_NB15_training-set.csv')
df_te = pd.read_csv(data_path + 'UNSW_NB15_testing-set.csv')

# Columns excluded from the feature set. `label` is deliberately kept: it is
# renamed to `target` and serves as the binary class column.
drop_cols = ['id', 'proto', 'service', 'state', 'attack_cat', 'is_sm_ips_ports']

# Fixed seed for the shuffle below, so the sample order seen by every model
# (and hence every logged metric) is identical across re-runs.
shuffle_seed = 42

# binary task: drop unused columns, rename the label, standardize, shuffle.
df_sub = df_tr.drop(drop_cols, axis=1).rename(columns={"label": "target"})
df_sub = standardize_df(df_sub)
df_sub = df_sub.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

stream = DataStream(df_sub)

In [None]:
# Preview the first rows of the standardized, shuffled frame backing `stream`.
df_sub.head()

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,target
0,-0.209773,-0.133677,-0.172047,-0.049958,-0.103923,0.094951,0.723268,-0.720406,-0.12098,-0.277208,-0.07504,-0.131759,-0.136142,-0.08937,-0.110668,-0.148818,-0.915407,-0.715177,-0.715569,-0.906432,-0.52166,-0.484346,-0.503014,-0.389647,-0.480703,-0.136415,-0.039557,0.625315,0.729064,1.466145,1.319293,2.039181,0.663565,-0.11859,-0.11859,-0.189768,1.086853,0.641377,1
1,-0.209774,-0.133677,-0.172047,-0.049958,-0.103923,0.178922,0.723268,-0.720406,-0.087356,-0.277208,-0.07504,-0.131759,-0.136142,-0.08937,-0.110668,-0.148818,-0.915407,-0.715177,-0.715569,-0.906432,-0.52166,-0.484346,-0.503014,-0.389647,-0.480703,-0.136415,-0.039557,0.718736,0.729064,1.341959,1.443562,2.212085,0.754838,-0.11859,-0.11859,-0.189768,1.207024,0.73434,1
2,1.168926,-0.119066,-0.172047,-0.04958,-0.103923,-0.576817,0.723268,-0.720406,-0.389972,-0.277208,-0.029589,-0.131759,0.27507,-0.08937,-0.017004,-0.148818,1.092456,-0.715177,-0.715569,-0.906432,-0.52166,-0.484346,-0.503014,-0.448276,-0.480703,-0.136415,-0.039557,0.251634,4.920164,0.596845,-0.420468,-0.38147,0.2072,-0.11859,-0.11859,-0.189768,0.485997,0.17656,0
3,-0.209774,-0.133677,-0.172047,-0.050164,-0.103923,0.178922,0.723268,-0.720406,-0.182919,-0.277208,-0.07504,-0.131759,-0.136142,-0.08937,-0.110668,-0.148818,-0.915407,-0.715177,-0.715569,-0.906432,-0.52166,-0.484346,-0.503014,-0.477591,-0.480703,-0.136415,-0.039557,-0.122048,0.729064,-0.645013,-0.296199,-0.554373,-0.614256,-0.11859,-0.11859,-0.189768,-0.475371,-0.567147,1
4,0.130476,0.04165,0.880031,-0.041947,0.894735,-0.576433,-1.141901,1.560002,-0.389945,-0.064087,-0.044739,0.911234,-0.123965,-0.070613,-0.002309,0.355218,1.092456,0.830488,0.464208,1.103244,1.232045,1.244055,1.080718,-0.384762,4.308,-0.136415,-0.039557,-0.775991,-0.318711,-0.396641,-0.544736,-0.554373,-0.43171,-0.11859,-0.11859,-0.189768,-0.595543,-0.753074,1


In [None]:
# Hoeffding tree run: prequential evaluation over `stream`, with the metrics
# written to output_ht.csv under `output_path`.
evaluator = EvaluatePrequential(
    output_file=output_path + 'output_ht.csv',
    metrics=metrics,
    pretrain_size=pretrain_size,
    batch_size=batch_size,
    max_samples=max_samples,
    show_plot=False,
)
mdl = evaluator.evaluate(stream=stream, model=HoeffdingTreeClassifier())

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 2000 sample(s).
Evaluating...
 #################### [100%] [81.68s]
Processed samples: 176000
Mean performance:
M0 - Accuracy     : 0.9185
M0 - Kappa        : 0.8038
M0 - F1 score: 0.9424


In [None]:
# Hoeffding adaptive tree run: same evaluator settings as the other model
# cells, with the metrics written to output_hta.csv under `output_path`.
evaluator = EvaluatePrequential(
    output_file=output_path + 'output_hta.csv',
    metrics=metrics,
    pretrain_size=pretrain_size,
    batch_size=batch_size,
    max_samples=max_samples,
    show_plot=False,
)
mdl = evaluator.evaluate(stream=stream, model=HoeffdingAdaptiveTreeClassifier())

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 2000 sample(s).
Evaluating...
 #################### [100%] [287.85s]
Processed samples: 176000
Mean performance:
M0 - Accuracy     : 0.9295
M0 - Kappa        : 0.8343
M0 - F1 score: 0.9492


In [None]:
# Dynamic Weighted Majority run (library-default configuration): metrics are
# written to output_dwm.csv under `output_path`.
evaluator = EvaluatePrequential(
    output_file=output_path + 'output_dwm.csv',
    metrics=metrics,
    pretrain_size=pretrain_size,
    batch_size=batch_size,
    max_samples=max_samples,
    show_plot=False,
)
mdl = evaluator.evaluate(stream=stream, model=DynamicWeightedMajorityClassifier())

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 2000 sample(s).
Evaluating...
 #################### [100%] [549.10s]
Processed samples: 176000
Mean performance:
M0 - Accuracy     : 0.8113
M0 - Kappa        : 0.5909
M0 - F1 score: 0.8534


In [None]:
# Oza bagging run with a Hoeffding adaptive tree as the base estimator; metrics
# are written to output_obag.csv under `output_path`.
evaluator = EvaluatePrequential(
    output_file=output_path + 'output_obag.csv',
    metrics=metrics,
    pretrain_size=pretrain_size,
    batch_size=batch_size,
    max_samples=max_samples,
    show_plot=False,
)
mdl = evaluator.evaluate(
    stream=stream,
    model=OzaBaggingClassifier(base_estimator=HoeffdingAdaptiveTreeClassifier()),
)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 2000 sample(s).
Evaluating...
 #################### [100%] [3187.26s]
Processed samples: 176000
Mean performance:
M0 - Accuracy     : 0.9354
M0 - Kappa        : 0.8458
M0 - F1 score: 0.9540


In [None]:
# Learn++.NSE run with a Hoeffding adaptive tree as the base estimator; metrics
# are written to output_nse.csv under `output_path`.
# NOTE(review): the output saved with this cell stops at "Evaluating..." and
# shows no summary -- confirm the run finished before relying on output_nse.csv.
evaluator = EvaluatePrequential(
    output_file=output_path + 'output_nse.csv',
    metrics=metrics,
    pretrain_size=pretrain_size,
    batch_size=batch_size,
    max_samples=max_samples,
    show_plot=False,
)
mdl = evaluator.evaluate(
    stream=stream,
    model=LearnPPNSEClassifier(base_estimator=HoeffdingAdaptiveTreeClassifier()),
)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 2000 sample(s).
Evaluating...


In [None]:
# Load the per-model evaluator logs. Lines starting with '#' are skipped as
# comments when parsing each file.
df_ht, df_hta, df_dwm, df_bag, df_nse = (
    pd.read_csv(output_path + log_name, comment='#')
    for log_name in (
        'output_ht.csv',
        'output_hta.csv',
        'output_dwm.csv',
        'output_obag.csv',
        'output_nse.csv',
    )
)

# One accuracy curve per model, read from the `mean_acc_[M0]` column and
# plotted against the `id` column of the same log.
plt.figure()
for log, colour, name in (
    (df_ht, 'r', 'HT'),
    (df_hta, 'b', 'HTA'),
    (df_dwm, 'k', 'DWM'),
    (df_bag, 'm', 'Obag'),
    (df_nse, 'c', 'L++'),
):
    plt.plot(log['id'], log['mean_acc_[M0]'], color=colour, label=name)

plt.xlabel('Sample Number')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig(output_path + 'online_accuracy.pdf')

In [None]:
# One kappa curve per model, read from the `mean_kappa_[M0]` column and plotted
# against the `id` column of the same evaluator log.
plt.figure()
for log, colour, name in (
    (df_ht, 'r', 'HT'),
    (df_hta, 'b', 'HTA'),
    (df_dwm, 'k', 'DWM'),
    (df_bag, 'm', 'Obag'),
    (df_nse, 'c', 'L++'),
):
    plt.plot(log['id'], log['mean_kappa_[M0]'], color=colour, label=name)

plt.xlabel('Sample Number')
plt.ylabel('Kappa')
plt.legend()
plt.savefig(output_path + 'online_kappa.pdf')

In [None]:
# One F1 curve per model, read from the `mean_f1_[M0]` column and plotted
# against the `id` column of the same evaluator log.
plt.figure()
for log, colour, name in (
    (df_ht, 'r', 'HT'),
    (df_hta, 'b', 'HTA'),
    (df_dwm, 'k', 'DWM'),
    (df_bag, 'm', 'Obag'),
    (df_nse, 'c', 'L++'),
):
    plt.plot(log['id'], log['mean_f1_[M0]'], color=colour, label=name)

plt.xlabel('Sample Number')
plt.ylabel('F1-Score')
plt.legend()
# The figure shows the F1 metric, so it is saved as online_f1.pdf to match the
# online_accuracy.pdf / online_kappa.pdf naming (it was previously written to
# online_f2.pdf; update anything that still references the old file name).
plt.savefig(output_path + 'online_f1.pdf')