# Train ML Methods
- Load CIC-IDS2017 original as upstream data
- Select features using random-forest feature importance
- Generate a dataset that contains only the selected features
- Train-Test split
- Train model

### Load CIC-IDS2017 original as upstream data

In [8]:
# Import Necessary Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score # performance metrics

import time
import joblib

In [6]:
# Load the CIC-IDS2017 "original" dataset from the results/ directory and display it.
# The file name has no extension; it is parsed with read_csv's default settings.
data = pd.read_csv('results/data_2017_original')
data

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded
0,3,2,0,12,0,6,6,6.0,0.00000,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
1,109,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2,52,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
3,34,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
4,3,2,0,12,0,6,6,6.0,0.00000,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2522004,32215,4,2,112,152,28,28,28.0,0.00000,76,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2522005,324,2,2,84,362,42,42,42.0,0.00000,181,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2522006,82,2,1,31,6,31,0,15.5,21.92031,6,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
2522007,1048635,6,2,192,256,32,32,32.0,0.00000,128,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0


In [7]:
# Load the "resampling" variant of the same dataset and display it.
data_resampling = pd.read_csv('results/data_2017_original_resampling')
data_resampling

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded
0,858591,2,2,72,124,36,36,36.000000,0.000000,62,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
1,60706,2,2,70,290,35,35,35.000000,0.000000,145,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
2,257846,1,1,46,208,46,46,46.000000,0.000000,208,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
3,3,2,0,0,0,0,0,0.000000,0.000000,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
4,1559160,35,42,2622,7038,408,0,74.914286,105.280961,976,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851745,11512204,8,5,326,11632,326,0,40.750000,115.258405,10184,...,32,892.0,0.0,892,892,6507197.0,0.0,6507197,6507197,1
851746,11513325,5,5,471,3525,471,0,94.200000,210.637604,2077,...,32,918.0,0.0,918,918,6508582.0,0.0,6508582,6508582,1
851747,11509201,7,6,314,11632,314,0,44.857143,118.680845,5792,...,32,899.0,0.0,899,899,6503248.0,0.0,6503248,6503248,1
851748,11509095,8,5,369,11632,369,0,46.125000,130.461201,10184,...,32,914.0,0.0,914,914,6504954.0,0.0,6504954,6504954,1


### Extract Features - RandomForest

In [25]:
def train_rf(data):
    """Fit a default random forest on ``data`` and return the fitted estimator.

    The frame is split and imputed by ``split_data`` and standardised by
    ``normalise_data``. Both helpers are defined in later cells of this
    notebook, so they must already exist in the kernel when this is called.
    Only the training partition is used for fitting.

    Note: a later cell defines another ``train_rf`` with the signature
    ``(X_train, y_train, is_resample)``; once that cell runs it replaces
    this definition.

    Args:
        data: DataFrame holding the feature columns plus ``label_encoded``.

    Returns:
        The fitted ``RandomForestClassifier`` (``random_state=42``).
    """
    features_train, features_test, labels_train, _ = split_data(data)
    features_train, _ = normalise_data(features_train, features_test)

    forest = RandomForestClassifier(random_state=42)
    fit_started = time.time()
    forest.fit(features_train, labels_train)
    print("Time taken to fit the model:", time.time() - fit_started, "seconds")
    return forest

In [28]:
# Fit the forest on the full feature set and rank the features by importance.
forest = train_rf(data)
# Label the importances with every column except the label. This is the same
# by-name selection split_data() uses to build X, so the names stay aligned with
# the fitted columns even if `label_encoded` is not the last column.
feature_names = data.columns[data.columns != 'label_encoded']
feature_scores = pd.Series(forest.feature_importances_, index=feature_names).sort_values(ascending=False)
print(f'feature_scores: {feature_scores}')

Number of rows of train data: 2017607
Number of columns of train data: 76
Number of rows of test data: 504402
Number of columns of test data: 76
Time taken to fit the model: 926.2445492744446 seconds
feature_scores: Packet Length Std         0.077944
Bwd Packet Length Std     0.075532
Packet Length Variance    0.075460
Bwd Packet Length Mean    0.069707
Average Packet Size       0.051550
                            ...   
Bwd Avg Packets/Bulk      0.000000
Bwd Avg Bytes/Bulk        0.000000
Fwd Avg Bulk Rate         0.000000
Fwd Avg Packets/Bulk      0.000000
Fwd Avg Bytes/Bulk        0.000000
Length: 76, dtype: float64


In [29]:
# Display the importance scores, highest first.
feature_scores

Packet Length Std         0.077944
Bwd Packet Length Std     0.075532
Packet Length Variance    0.075460
Bwd Packet Length Mean    0.069707
Average Packet Size       0.051550
                            ...   
Bwd Avg Packets/Bulk      0.000000
Bwd Avg Bytes/Bulk        0.000000
Fwd Avg Bulk Rate         0.000000
Fwd Avg Packets/Bulk      0.000000
Fwd Avg Bytes/Bulk        0.000000
Length: 76, dtype: float64

In [51]:
# Persist the ranked importances; a later cell reloads them from this file.
feature_scores.to_csv('features_scores.csv')

In [9]:
# Reload the saved scores as a two-column DataFrame. With header=0 the file's first
# line is treated as a header row and replaced by the labels given in `names`.
# Note: this rebinds `feature_scores`, which was a Series when it was computed.
feature_scores = pd.read_csv('features_scores.csv', names=['feature', 'importance ratio'], header=0)

In [10]:
# Display the reloaded feature/importance table.
feature_scores

Unnamed: 0,feature,importance ratio
0,Packet Length Std,0.077944
1,Bwd Packet Length Std,0.075532
2,Packet Length Variance,0.075460
3,Bwd Packet Length Mean,0.069707
4,Average Packet Size,0.051550
...,...,...
71,Bwd Avg Packets/Bulk,0.000000
72,Bwd Avg Bytes/Bulk,0.000000
73,Fwd Avg Bulk Rate,0.000000
74,Fwd Avg Packets/Bulk,0.000000


In [14]:
# Keep the features whose importance exceeds 0.03, then add the label column name
# so the list can be used directly to subset the full frame.
is_important = feature_scores['importance ratio'] > 0.03
feature_extract_item = feature_scores.loc[is_important, 'feature'].tolist()
feature_extract_item.append('label_encoded')
feature_extract_item

['Packet Length Std',
 'Bwd Packet Length Std',
 'Packet Length Variance',
 'Bwd Packet Length Mean',
 'Average Packet Size',
 'Bwd Packet Length Max',
 'Avg Bwd Segment Size',
 'Packet Length Mean',
 'Max Packet Length',
 'Subflow Bwd Bytes',
 'label_encoded']

In [41]:
# Fit the forest on the resampled data and rank the features by importance.
forest_resampling = train_rf(data_resampling)
# Label the importances with every column except the label. This is the same
# by-name selection split_data() uses to build X, so the names stay aligned with
# the fitted columns even if `label_encoded` is not the last column.
feature_names_resampling = data_resampling.columns[data_resampling.columns != 'label_encoded']
feature_scores_resampling = pd.Series(forest_resampling.feature_importances_, index=feature_names_resampling).sort_values(ascending=False)
print(f'feature_scores: {feature_scores_resampling}')

Number of rows of train data: 681400
Number of columns of train data: 76
Number of rows of test data: 170350
Number of columns of test data: 76
Time taken to fit the model: 223.63538432121277 seconds
feature_scores: Average Packet Size       0.068069
Packet Length Variance    0.066387
Bwd Packet Length Std     0.065651
Packet Length Std         0.064082
Bwd Packet Length Max     0.046398
                            ...   
Bwd PSH Flags             0.000000
Fwd Avg Bytes/Bulk        0.000000
Bwd Avg Bulk Rate         0.000000
Bwd Avg Packets/Bulk      0.000000
Bwd Avg Bytes/Bulk        0.000000
Length: 76, dtype: float64


In [55]:
# Persist the resampled-data importances; a later cell reloads them from this file.
# Note the spelling: 'feature_scores_...' here versus 'features_scores.csv' for the original data.
feature_scores_resampling.to_csv('feature_scores_resampling.csv')

In [11]:
# Reload the saved scores as a two-column DataFrame. With header=0 the file's first
# line is treated as a header row and replaced by the labels given in `names`.
# Note: this rebinds `feature_scores_resampling`, which was a Series when it was computed.
feature_scores_resampling = pd.read_csv('feature_scores_resampling.csv', names=['feature', 'importance ratio'], header = 0)

In [12]:
# Display the reloaded feature/importance table for the resampled data.
feature_scores_resampling

Unnamed: 0,feature,importance ratio
0,Average Packet Size,0.068069
1,Packet Length Variance,0.066387
2,Bwd Packet Length Std,0.065651
3,Packet Length Std,0.064082
4,Bwd Packet Length Max,0.046398
...,...,...
71,Bwd PSH Flags,0.000000
72,Fwd Avg Bytes/Bulk,0.000000
73,Bwd Avg Bulk Rate,0.000000
74,Bwd Avg Packets/Bulk,0.000000


In [13]:
# Same selection rule as for the original data: importance above 0.03, plus the
# label column name so the list can subset the resampled frame directly.
above_threshold = feature_scores_resampling['importance ratio'] > 0.03
selected_resampling = feature_scores_resampling.loc[above_threshold, 'feature']
feature_resampling_extract_item = [*selected_resampling, 'label_encoded']
feature_resampling_extract_item

['Average Packet Size',
 'Packet Length Variance',
 'Bwd Packet Length Std',
 'Packet Length Std',
 'Bwd Packet Length Max',
 'Packet Length Mean',
 'Bwd Packet Length Mean',
 'Max Packet Length',
 'Avg Bwd Segment Size',
 'Avg Fwd Segment Size',
 'Init_Win_bytes_backward',
 'label_encoded']

### Preparing data

In [63]:
# Keep only the selected features plus the label.
# This overwrites `data`: the full-width frame is no longer available under this
# name afterwards, so the feature-importance cells above are not re-runnable without reloading.
data = data[feature_extract_item]

In [64]:
# Keep only the selected features plus the label.
# This overwrites `data_resampling`: the full-width frame is no longer available
# under this name afterwards.
data_resampling = data_resampling[feature_resampling_extract_item]

In [65]:
# Preview the reduced original dataset.
data.head()

Unnamed: 0,Packet Length Std,Bwd Packet Length Std,Packet Length Variance,Bwd Packet Length Mean,Average Packet Size,Bwd Packet Length Max,Avg Bwd Segment Size,Packet Length Mean,Max Packet Length,Subflow Bwd Bytes,label_encoded
0,0.0,0.0,0.0,0.0,9.0,0,0.0,6.0,6,0,0
1,0.0,0.0,0.0,6.0,9.0,6,6.0,6.0,6,6,0
2,0.0,0.0,0.0,6.0,9.0,6,6.0,6.0,6,6,0
3,0.0,0.0,0.0,6.0,9.0,6,6.0,6.0,6,6,0
4,0.0,0.0,0.0,0.0,9.0,0,0.0,6.0,6,0,0


In [66]:
# Preview the reduced resampled dataset.
data_resampling.head()

Unnamed: 0,Average Packet Size,Packet Length Variance,Bwd Packet Length Std,Packet Length Std,Bwd Packet Length Max,Packet Length Mean,Bwd Packet Length Mean,Max Packet Length,Avg Bwd Segment Size,Avg Fwd Segment Size,Init_Win_bytes_backward,label_encoded
0,58.0,202.8,0.0,14.240787,62,46.4,62.0,62,62.0,36.0,-1,0
1,98.75,3630.0,0.0,60.249481,145,79.0,145.0,145,145.0,35.0,-1,0
2,150.0,8748.0,0.0,93.530744,208,100.0,208.0,208,208.0,46.0,-1,0
3,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,-1,0
4,125.454545,60784.57343,317.156355,246.545277,976,123.846154,167.571429,976,167.571429,74.914286,245,0


### Train-test split

In [67]:
def split_data(data):
    """Split ``data`` 80/20 into train and test sets and mean-impute missing values.

    Every column except ``label_encoded`` is treated as a feature. The imputer
    is fitted on the training features only and then applied to both
    partitions, after which the shape of each partition is printed.

    Args:
        data: DataFrame containing the feature columns and ``label_encoded``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``: the feature partitions are the
        imputer's outputs, the label partitions are the split ``label_encoded`` values.
    """
    is_feature = data.columns != 'label_encoded'
    X_train, X_test, y_train, y_test = train_test_split(
        data.loc[:, is_feature],
        data['label_encoded'],
        test_size=0.2,
        random_state=42,  # fixed seed keeps the split reproducible
    )

    # Learn the column means from the training partition only, then reuse them
    # for the test partition.
    mean_imputer = SimpleImputer(strategy='mean').fit(X_train)
    X_train = mean_imputer.transform(X_train)
    X_test = mean_imputer.transform(X_test)

    train_rows, train_cols = X_train.shape
    print("Number of rows of train data:", train_rows)
    print("Number of columns of train data:", train_cols)

    test_rows, test_cols = X_test.shape
    print("Number of rows of test data:", test_rows)
    print("Number of columns of test data:", test_cols)

    return X_train, X_test, y_train, y_test

In [14]:
def normalise_data(X_train, X_test):
    """Standardise both partitions using statistics learned from ``X_train`` only.

    Args:
        X_train: Training features; used to fit the ``StandardScaler``.
        X_test: Test features; transformed with the scaler fitted on ``X_train``.

    Returns:
        ``(X_train_scaled, X_test_scaled)``.
    """
    standardiser = StandardScaler().fit(X_train)
    return standardiser.transform(X_train), standardiser.transform(X_test)

In [69]:
# Split and impute the reduced original dataset.
X_train, X_test, y_train, y_test = split_data(data)

# Snapshot the imputed, not-yet-standardised test features; predict_methods() later
# appends one prediction column per model and the frame is written to CSV at the end.
# NOTE(review): y_test is used as the frame's *index* (it is not added as a column)
# and no column names are supplied. Confirm this layout is what downstream readers expect.
data_test_2017_original = pd.DataFrame(X_test, index=y_test)

# Standardise after the snapshot so the exported features keep their original scale.
X_train, X_test = normalise_data(X_train, X_test)

Number of rows of train data: 2017607
Number of columns of train data: 10
Number of rows of test data: 504402
Number of columns of test data: 10


In [70]:
# Split and impute the reduced resampled dataset.
X_train_r, X_test_r, y_train_r, y_test_r = split_data(data_resampling)

# Snapshot the imputed, not-yet-standardised test features; predict_methods() later
# appends one prediction column per model and the frame is written to CSV at the end.
# NOTE(review): y_test_r is used as the frame's *index* (it is not added as a column)
# and no column names are supplied. Confirm this layout is what downstream readers expect.
data_test_2017_original_resample = pd.DataFrame(X_test_r, index=y_test_r)

# Standardise after the snapshot so the exported features keep their original scale.
X_train_r, X_test_r = normalise_data(X_train_r, X_test_r)

Number of rows of train data: 681400
Number of columns of train data: 11
Number of rows of test data: 170350
Number of columns of test data: 11


### Algorithm

In [29]:
# Short identifiers for the supported models. They select which trainer runs and
# are embedded in the saved-model file names and the prediction column names.
METHOD_RANDOM_FOREST = 'rf'
METHOD_XGBOOST = 'xgb'
METHOD_KNN = 'knn'


In [30]:
# Models to train and evaluate, in execution order.
methods = [METHOD_RANDOM_FOREST, METHOD_XGBOOST, METHOD_KNN]

#### Random Forest

In [33]:
def train_rf(X_train, y_train, is_resample):
    """Fit a random forest, report the fit time, and save the model with joblib.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        is_resample: When truthy the model file gets the ``_resample`` suffix.

    Returns:
        The fitted ``RandomForestClassifier`` (``random_state=42``).
    """
    print('~~~~START TRAIN RANDOM FOREST~~~~')
    classifier = RandomForestClassifier(random_state=42)

    fit_started = time.time()
    classifier.fit(X_train, y_train)
    print("Time taken to fit the model:", time.time() - fit_started, "seconds")

    # The path is relative, so the file lands in the current working directory.
    model_path = (
        f"model_{METHOD_RANDOM_FOREST}_resample.joblib"
        if is_resample
        else f"model_{METHOD_RANDOM_FOREST}.joblib"
    )
    joblib.dump(classifier, model_path)
    return classifier

#### XGBoost

In [34]:
def train_xgb(X_train, y_train, is_resample):
    """Fit an XGBoost classifier, report the fit time, and save the model as JSON.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        is_resample: When truthy the model file gets the ``_resample`` suffix.

    Returns:
        The fitted ``XGBClassifier`` (``random_state=42``).
    """
    print('~~~~START TRAIN XGBOOST~~~~')
    booster = XGBClassifier(random_state=42)

    fit_started = time.time()
    booster.fit(X_train, y_train)
    print("Time taken to fit the model:", time.time() - fit_started, "seconds")

    # Saved through XGBoost's own save_model; the path is relative to the working directory.
    model_path = (
        f"model_{METHOD_XGBOOST}_resample.json"
        if is_resample
        else f"model_{METHOD_XGBOOST}.json"
    )
    booster.save_model(model_path)

    return booster

#### KNN

In [35]:
def train_knn(X_train, y_train, is_resample):
    """Fit a default k-nearest-neighbours classifier, report the fit time, and save it.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        is_resample: When truthy the model file gets the ``_resample`` suffix.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    print('~~~~START TRAIN KNN~~~~')
    neighbours = KNeighborsClassifier()

    fit_started = time.time()
    neighbours.fit(X_train, y_train)
    print("Time taken to fit the model:", time.time() - fit_started, "seconds")

    # Serialised with joblib (not JSON); the path is relative to the working directory.
    model_path = (
        f"model_{METHOD_KNN}_resample.joblib"
        if is_resample
        else f"model_{METHOD_KNN}.joblib"
    )
    joblib.dump(neighbours, model_path)
    return neighbours

In [37]:
def predict_methods(ml, method, is_resample):
    """Predict on the held-out test set, record the predictions, and print metrics.

    Relies on notebook globals: ``X_test`` / ``y_test`` /
    ``data_test_2017_original`` for the original data, and ``X_test_r`` /
    ``y_test_r`` / ``data_test_2017_original_resample`` for the resampled data.

    Args:
        ml: Fitted estimator exposing ``predict``.
        method: Short method name; used in the log line and in the name of the
            prediction column (``ypreds_<method>``).
        is_resample: Use the resampled test set instead of the original one.
    """
    print(f'predict {method}')

    if is_resample:
        features, y, export_frame = X_test_r, y_test_r, data_test_2017_original_resample
    else:
        features, y, export_frame = X_test, y_test, data_test_2017_original

    ypreds = ml.predict(features)
    # Column assignment mutates the module-level export frame in place.
    export_frame[f'ypreds_{method}'] = ypreds

    print("Accuracy",accuracy_score(y, ypreds))
    print(classification_report(y, ypreds))
    print("Confusion Matrix:")
    print(confusion_matrix(y, ypreds))



In [83]:
def training_methods(methods, is_resample):
    """Train each requested model and evaluate it on the matching test set.

    Reads the notebook globals ``X_train`` / ``y_train`` (original data) or
    ``X_train_r`` / ``y_train_r`` (resampled data). Each model is trained by
    its ``train_*`` function and then passed to ``predict_methods``.

    Args:
        methods: Iterable of method identifiers (``METHOD_RANDOM_FOREST``,
            ``METHOD_XGBOOST``, ``METHOD_KNN``).
        is_resample: Train and evaluate on the resampled data when truthy.

    Raises:
        ValueError: If ``methods`` contains an identifier with no trainer.
    """
    # Dispatch table of the trainers that actually exist in this notebook. The
    # previous if/elif chain also referenced METHOD_SVM / train_svm, neither of
    # which is defined here, so an unrecognised method ended in a NameError.
    trainers = {
        METHOD_RANDOM_FOREST: train_rf,
        METHOD_XGBOOST: train_xgb,
        METHOD_KNN: train_knn,
    }

    # Validate up front: fits take minutes, so fail before any training starts.
    methods = list(methods)
    unknown = [method for method in methods if method not in trainers]
    if unknown:
        raise ValueError(
            f"Unknown training method(s) {unknown}; expected one of {list(trainers)}"
        )

    X = X_train_r if is_resample else X_train
    y = y_train_r if is_resample else y_train

    for method in methods:
        print(f'start train {method} method')
        ml = trainers[method](X, y, is_resample)
        predict_methods(ml, method, is_resample)


In [86]:
# Train and evaluate every model in `methods` on the original (non-resampled) split.
training_methods(methods, False)

start train rf method
~~~~START TRAIN RANDOM FOREST~~~~
Time taken to fit the model: 459.9014222621918 seconds
predict rf
Accuracy 0.9897026578007224
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    419303
           1       0.99      0.94      0.97     85099

    accuracy                           0.99    504402
   macro avg       0.99      0.97      0.98    504402
weighted avg       0.99      0.99      0.99    504402

Confusion Matrix:
[[418796    507]
 [  4687  80412]]
start train xgb method
~~~~START TRAIN XGBOOST~~~~
Time taken to fit the model: 169.24312615394592 seconds
predict xgb
Accuracy 0.9896927450723827
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    419303
           1       0.99      0.95      0.97     85099

    accuracy                           0.99    504402
   macro avg       0.99      0.97      0.98    504402
weighted avg       0.99      0.99      0.99    5044

In [87]:
# Train and evaluate every model in `methods` on the resampled split.
training_methods(methods, True)

start train rf method
~~~~START TRAIN RANDOM FOREST~~~~
Time taken to fit the model: 153.69545030593872 seconds
predict rf
Accuracy 0.9787085412386264
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     85123
           1       0.99      0.96      0.98     85227

    accuracy                           0.98    170350
   macro avg       0.98      0.98      0.98    170350
weighted avg       0.98      0.98      0.98    170350

Confusion Matrix:
[[84635   488]
 [ 3139 82088]]
start train xgb method
~~~~START TRAIN XGBOOST~~~~
Time taken to fit the model: 41.293028831481934 seconds
predict xgb
Accuracy 0.9787085412386264
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     85123
           1       0.99      0.96      0.98     85227

    accuracy                           0.98    170350
   macro avg       0.98      0.98      0.98    170350
weighted avg       0.98      0.98      0.98    170350


In [88]:
# Write the test-set snapshots, including the per-model prediction columns added by
# predict_methods(), to CSV files in the working directory.
data_test_2017_original.to_csv('data_test_2017_original.csv')
data_test_2017_original_resample.to_csv('data_test_2017_resample.csv')