# Train ML Methods
- Load CIC-IDS2017 improved version, CES-CIC-IDS2018 data
- Generate data only contains selected features
- Apply algorithms

### Load CIC-IDS2017 improved version, CES-CIC-IDS2018 data

In [8]:
# Import Necessary Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score # performance metrics

import time
import joblib

In [6]:
data = pd.read_csv('results/data_2017_original')
data

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded
0,3,2,0,12,0,6,6,6.0,0.00000,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
1,109,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2,52,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
3,34,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
4,3,2,0,12,0,6,6,6.0,0.00000,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2522004,32215,4,2,112,152,28,28,28.0,0.00000,76,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2522005,324,2,2,84,362,42,42,42.0,0.00000,181,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2522006,82,2,1,31,6,31,0,15.5,21.92031,6,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
2522007,1048635,6,2,192,256,32,32,32.0,0.00000,128,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0


In [7]:
data_resampling = pd.read_csv('results/data_2017_original_resampling')
data_resampling

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded
0,858591,2,2,72,124,36,36,36.000000,0.000000,62,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
1,60706,2,2,70,290,35,35,35.000000,0.000000,145,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
2,257846,1,1,46,208,46,46,46.000000,0.000000,208,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
3,3,2,0,0,0,0,0,0.000000,0.000000,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
4,1559160,35,42,2622,7038,408,0,74.914286,105.280961,976,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851745,11512204,8,5,326,11632,326,0,40.750000,115.258405,10184,...,32,892.0,0.0,892,892,6507197.0,0.0,6507197,6507197,1
851746,11513325,5,5,471,3525,471,0,94.200000,210.637604,2077,...,32,918.0,0.0,918,918,6508582.0,0.0,6508582,6508582,1
851747,11509201,7,6,314,11632,314,0,44.857143,118.680845,5792,...,32,899.0,0.0,899,899,6503248.0,0.0,6503248,6503248,1
851748,11509095,8,5,369,11632,369,0,46.125000,130.461201,10184,...,32,914.0,0.0,914,914,6504954.0,0.0,6504954,6504954,1


In [9]:
feature_scores = pd.read_csv('features_scores.csv', names=['feature', 'importance ratio'], header=0)

In [10]:
feature_scores

Unnamed: 0,feature,importance ratio
0,Packet Length Std,0.077944
1,Bwd Packet Length Std,0.075532
2,Packet Length Variance,0.075460
3,Bwd Packet Length Mean,0.069707
4,Average Packet Size,0.051550
...,...,...
71,Bwd Avg Packets/Bulk,0.000000
72,Bwd Avg Bytes/Bulk,0.000000
73,Fwd Avg Bulk Rate,0.000000
74,Fwd Avg Packets/Bulk,0.000000


In [14]:
feature_extract_item = list(feature_scores[feature_scores['importance ratio'] > 0.03]['feature'])
feature_extract_item

['Packet Length Std',
 'Bwd Packet Length Std',
 'Packet Length Variance',
 'Bwd Packet Length Mean',
 'Average Packet Size',
 'Bwd Packet Length Max',
 'Avg Bwd Segment Size',
 'Packet Length Mean',
 'Max Packet Length',
 'Subflow Bwd Bytes',
 'label_encoded']

In [11]:
feature_scores_resampling = pd.read_csv('feature_scores_resampling.csv', names=['feature', 'importance ratio'], header = 0)

In [12]:
feature_scores_resampling

Unnamed: 0,feature,importance ratio
0,Average Packet Size,0.068069
1,Packet Length Variance,0.066387
2,Bwd Packet Length Std,0.065651
3,Packet Length Std,0.064082
4,Bwd Packet Length Max,0.046398
...,...,...
71,Bwd PSH Flags,0.000000
72,Fwd Avg Bytes/Bulk,0.000000
73,Bwd Avg Bulk Rate,0.000000
74,Bwd Avg Packets/Bulk,0.000000


In [13]:
feature_resampling_extract_item = list(feature_scores_resampling[feature_scores_resampling['importance ratio'] > 0.03]['feature'])
feature_resampling_extract_item

['Average Packet Size',
 'Packet Length Variance',
 'Bwd Packet Length Std',
 'Packet Length Std',
 'Bwd Packet Length Max',
 'Packet Length Mean',
 'Bwd Packet Length Mean',
 'Max Packet Length',
 'Avg Bwd Segment Size',
 'Avg Fwd Segment Size',
 'Init_Win_bytes_backward',
 'label_encoded']

### Generate data only contains selected features

In [63]:
data = data[feature_extract_item]

In [64]:
data_resampling = data_resampling[feature_resampling_extract_item]

In [65]:
data.head()

Unnamed: 0,Packet Length Std,Bwd Packet Length Std,Packet Length Variance,Bwd Packet Length Mean,Average Packet Size,Bwd Packet Length Max,Avg Bwd Segment Size,Packet Length Mean,Max Packet Length,Subflow Bwd Bytes,label_encoded
0,0.0,0.0,0.0,0.0,9.0,0,0.0,6.0,6,0,0
1,0.0,0.0,0.0,6.0,9.0,6,6.0,6.0,6,6,0
2,0.0,0.0,0.0,6.0,9.0,6,6.0,6.0,6,6,0
3,0.0,0.0,0.0,6.0,9.0,6,6.0,6.0,6,6,0
4,0.0,0.0,0.0,0.0,9.0,0,0.0,6.0,6,0,0


In [66]:
data_resampling.head()

Unnamed: 0,Average Packet Size,Packet Length Variance,Bwd Packet Length Std,Packet Length Std,Bwd Packet Length Max,Packet Length Mean,Bwd Packet Length Mean,Max Packet Length,Avg Bwd Segment Size,Avg Fwd Segment Size,Init_Win_bytes_backward,label_encoded
0,58.0,202.8,0.0,14.240787,62,46.4,62.0,62,62.0,36.0,-1,0
1,98.75,3630.0,0.0,60.249481,145,79.0,145.0,145,145.0,35.0,-1,0
2,150.0,8748.0,0.0,93.530744,208,100.0,208.0,208,208.0,46.0,-1,0
3,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,-1,0
4,125.454545,60784.57343,317.156355,246.545277,976,123.846154,167.571429,976,167.571429,74.914286,245,0


In [23]:
df2017_corrected_y = df2017_corrected['label_encoded'].values
df2017_corrected_X = df2017_corrected[feature_extract_item]

In [24]:
df2017_corrected_X_r = df2017_corrected[feature_resampling_extract_item]

In [25]:
data_test_2017_corrected = pd.DataFrame(df2017_corrected_X, df2017_corrected_y)
data_test_2017_corrected_r = pd.DataFrame(df2017_corrected_X_r, df2017_corrected_y)
scaler = StandardScaler()
X_test = scaler.fit_transform(df2017_corrected_X)
X_test_r = scaler.fit_transform(df2017_corrected_X_r)


### Apply algorithms

In [29]:
# Define constant value
METHOD_RANDOM_FOREST = 'rf'
METHOD_XGBOOST = 'xgb'
METHOD_KNN = 'knn'


In [30]:
methods = [METHOD_RANDOM_FOREST, METHOD_XGBOOST, METHOD_KNN]

In [26]:
def load_pretrained_model(method, is_resample):
    suffix_txt = ""
    if is_resample:
        suffix_txt = "_resample"
    if method == METHOD_XGBOOST:
        ml_file_name = f"model_{method}{suffix_txt}.json"
        ml = XGBClassifier()
        ml.load_model(ml_file_name)
    else:
        ml_file_name = f"model_{method}{suffix_txt}.joblib"
        ml = joblib.load(ml_file_name)
    
    return ml

In [40]:
def predict_methods(method, is_resample):
    print(f'predict {method}')
    
    
        
    ml = load_pretrained_model(method, is_resample)
    
    suffix_txt = ""
    if is_resample:
        suffix_txt = "_resample"
        ypreds = ml.predict(X_test_r)
        data_test_2017_corrected_r[f'ypreds_{method}{suffix_txt}'] = ypreds
    else:
        ypreds = ml.predict(X_test)
        data_test_2017_corrected[f'ypreds_{method}{suffix_txt}'] = ypreds
        
        
    print("Accuracy",accuracy_score(df2017_corrected_y, ypreds))
    print(classification_report(df2017_corrected_y, ypreds))
    print("Confusion Matrix:")
    print(confusion_matrix(df2017_corrected_y, ypreds))

In [41]:
for method in methods:
    predict_methods(method, False)

predict rf
Accuracy 0.7536048031025117
              precision    recall  f1-score   support

           0       0.75      1.00      0.86   1582566
           1       0.06      0.00      0.00    517410

    accuracy                           0.75   2099976
   macro avg       0.41      0.50      0.43   2099976
weighted avg       0.58      0.75      0.65   2099976

Confusion Matrix:
[[1582551      15]
 [ 517409       1]]
predict xgb
Accuracy 0.746194242219911
              precision    recall  f1-score   support

           0       0.75      0.99      0.85   1582566
           1       0.01      0.00      0.00    517410

    accuracy                           0.75   2099976
   macro avg       0.38      0.50      0.43   2099976
weighted avg       0.57      0.75      0.64   2099976

Confusion Matrix:
[[1566836   15730]
 [ 517256     154]]
predict knn
Accuracy 0.812015470652998
              precision    recall  f1-score   support

           0       0.80      1.00      0.89   1582566
      

In [42]:
for method in methods:
    predict_methods(method, True)

predict rf
Accuracy 0.7699454660434214
              precision    recall  f1-score   support

           0       0.77      1.00      0.87   1582566
           1       1.00      0.07      0.12    517410

    accuracy                           0.77   2099976
   macro avg       0.88      0.53      0.50   2099976
weighted avg       0.82      0.77      0.68   2099976

Confusion Matrix:
[[1582448     118]
 [ 482991   34419]]
predict xgb
Accuracy 0.768215446271767
              precision    recall  f1-score   support

           0       0.77      1.00      0.87   1582566
           1       0.95      0.06      0.12    517410

    accuracy                           0.77   2099976
   macro avg       0.86      0.53      0.49   2099976
weighted avg       0.81      0.77      0.68   2099976

Confusion Matrix:
[[1580809    1757]
 [ 484985   32425]]
predict knn
Accuracy 0.8479044522413589
              precision    recall  f1-score   support

           0       0.86      0.96      0.90   1582566
     

In [45]:
data_test_2017_corrected.head()


Unnamed: 0,Packet Length Std,Bwd Packet Length Std,Packet Length Variance,Bwd Packet Length Mean,Average Packet Size,Bwd Packet Length Max,Avg Bwd Segment Size,Packet Length Mean,Max Packet Length,Subflow Bwd Bytes,ypreds_rf,ypreds_xgb,ypreds_knn
0,177.341758,0.0,31450.099291,72.0,158.333333,72,72.0,158.333333,403,24,0,0,0
0,177.341758,0.0,31450.099291,72.0,158.333333,72,72.0,158.333333,403,24,0,0,0
0,177.341758,0.0,31450.099291,72.0,158.333333,72,72.0,158.333333,403,24,0,0,0
0,177.341758,0.0,31450.099291,72.0,158.333333,72,72.0,158.333333,403,24,0,0,0
0,177.341758,0.0,31450.099291,72.0,158.333333,72,72.0,158.333333,403,24,0,0,0


In [46]:
data_test_2017_corrected_r.head()

Unnamed: 0,Average Packet Size,Packet Length Variance,Bwd Packet Length Std,Packet Length Std,Bwd Packet Length Max,Packet Length Mean,Bwd Packet Length Mean,Max Packet Length,Avg Bwd Segment Size,Avg Fwd Segment Size,Init_Win_bytes_backward,ypreds_rf_resample,ypreds_xgb_resample,ypreds_knn_resample
0,158.333333,31450.099291,0.0,177.341758,72,158.333333,72.0,403,72.0,201.5,2079,0,0,0
0,158.333333,31450.099291,0.0,177.341758,72,158.333333,72.0,403,72.0,201.5,2079,0,0,0
0,158.333333,31450.099291,0.0,177.341758,72,158.333333,72.0,403,72.0,201.5,2079,0,0,0
0,158.333333,31450.099291,0.0,177.341758,72,158.333333,72.0,403,72.0,201.5,2079,0,0,0
0,158.333333,31450.099291,0.0,177.341758,72,158.333333,72.0,403,72.0,201.5,2079,0,0,0


In [47]:
data_test_2017_corrected.to_csv('data_test_2017_corrected.csv')
data_test_2017_corrected_r.to_csv('data_test_2017_corrected_resample.csv')

In [None]:
encode_label(df2018_original)


In [None]:
df2018_original_y = df2018_original['label_encoded'].values
df2018_original_X = df2018_original[['Packet Length Variance', 'Subflow Bwd Bytes', 'Packet Length Std', 'Avg Bwd Segment Size', 'Max Packet Length', 'Total Length of Fwd Packets', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Bwd Packet Length Max', 'Total Length of Bwd Packets', 'Subflow Fwd Bytes', 'Average Packet Size']]
df2018_original_X = fixDataType(df2018_original_X)


In [None]:
encode_label(df2018_corrected)
drop_unnecessary(df2018_corrected)


In [None]:
df2018_corrected_y = df2018_corrected['label_encoded'].values
df2018_corrected_X = df2018_corrected[['Packet Length Variance', 'Subflow Bwd Bytes', 'Packet Length Std', 'Avg Bwd Segment Size', 'Max Packet Length', 'Total Length of Fwd Packets', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Bwd Packet Length Max', 'Total Length of Bwd Packets', 'Subflow Fwd Bytes', 'Average Packet Size']]
df2018_corrected_X = fixDataType(df2018_corrected_X)


In [None]:
data = pd.DataFrame(data=df2018_corrected_X)

data['label_encoded'] = df2018_corrected_y
data['label_encoded'].value_counts()

In [None]:
X_train, X_test, y_train, y_test = split_data(data)

xgb = XGBClassifier()
xgb.load_model("model_sklearn.json")

xgbpreds = xgb.predict(X_test)

print("Accuracy",accuracy_score(y_test,xgbpreds))
print(classification_report(y_test,xgbpreds))
print("Confusion Matrix:")
print(confusion_matrix(y_test,xgbpreds))