In [5]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import RFE

In [7]:
def save_dataframe(x_train,y_train,x_test,y_test,features,name):
    # Save the final dataset with the 23 features
    X_train = pd.DataFrame(x_train)
    X_train.columns= features
    Y_train = pd.DataFrame(y_train)
    train_frame = [X_train,Y_train]
    train_final = pd.concat(train_frame,axis=1)
    train_final.to_csv('/content/drive/MyDrive/ITU_Competition_Intrusion_and_Vulnerability_Detection_in_Software_Defined_Networks(SDN)/train_data_' + name +'.csv',index = False)
    print("Train dataset Saved")

    # Save test data set as well
    X_test = pd.DataFrame(x_test)
    X_test.columns= features
    Y_test = pd.DataFrame(y_test)
    test_frame = [X_test,Y_test]
    test_final = pd.concat(test_frame,axis=1)
    test_final.to_csv('/content/drive/MyDrive/ITU_Competition_Intrusion_and_Vulnerability_Detection_in_Software_Defined_Networks(SDN)/test_data_'+ name+'.csv',index = False)
    print("Test dataset Saved")

# Read Files and Basic Pre-processing

In [8]:
train_file_path = '/content/drive/MyDrive/ITU_Competition_Intrusion_and_Vulnerability_Detection_in_Software_Defined_Networks(SDN)/Train_ULAK.csv'
test_file_path = '/content/drive/MyDrive/ITU_Competition_Intrusion_and_Vulnerability_Detection_in_Software_Defined_Networks(SDN)/Test_ULAK.csv'

train_df = pd.read_csv(train_file_path,skipinitialspace=True)
test_df = pd.read_csv(test_file_path,skipinitialspace=True)

if train_df is not None and test_df is not None:
    print("Data loaded successfully!")
    print("Train Data:")
    print(train_df.head())  # Display the first few rows of the train DataFrame
    print("\nTest Data:")
    print(test_df.head())   # Display the first few rows of the test DataFrame

Data loaded successfully!
Train Data:
   Destination Port  Flow Duration  Total Fwd Packets  Total Backward Packets  \
0                80       68855579                 10                       6   
1                53            196                  2                       2   
2               123            118                  2                       2   
3                80         295657                  7                      10   
4                80       15705351                  7                       5   

   Total Length of Fwd Packets  Total Length of Bwd Packets  \
0                         1038                        11595   
1                           70                          174   
2                           96                           96   
3                         1114                        15841   
4                          407                          452   

   Fwd Packet Length Max  Fwd Packet Length Min  Fwd Packet Length Mean  \
0                    

In [9]:
# Print the sizes of train and test dataframes
print("Shape of Training Dataset:", train_df.shape)
print("Shape of Testing Dataset:", test_df.shape)

Shape of Training Dataset: (1783356, 79)
Shape of Testing Dataset: (512077, 79)


In [10]:
#train_df.info()
#test_df.info()

In [11]:
# Summary statistics of the dataset
#train_df.describe()

In [12]:
# number of each label type train df
train_df['Label'].value_counts()

BENIGN                        1432050
DoS Hulk                       145575
PortScan                       100125
DDoS                            80656
DoS GoldenEye                    6484
FTP-Patator                      5000
SSH-Patator                      3714
DoS slowloris                    3651
DoS Slowhttptest                 3464
Bot                              1238
Web Attack � Brute Force          949
Web Attack � XSS                  410
Infiltration                       22
Web Attack � Sql Injection         12
Heartbleed                          6
Name: Label, dtype: int64

In [13]:
# number of each label type test df
test_df['Label'].value_counts()

BENIGN                        411203
DoS Hulk                       41801
PortScan                       28751
DDoS                           23160
DoS GoldenEye                   1861
FTP-Patator                     1436
SSH-Patator                     1067
DoS slowloris                   1048
DoS Slowhttptest                 994
Bot                              355
Web Attack � Brute Force         272
Web Attack � XSS                 117
Infiltration                       6
Web Attack � Sql Injection         4
Heartbleed                         2
Name: Label, dtype: int64

# Pre-processing of Data

Label-Encoding to Replace Categorical Values with Numerical

In [14]:
labels = train_df['Label'].values.reshape(-1, 1)
# Initialize the Label Encoder
encoder = LabelEncoder()
# transform fit labels of train and test daframe
train_labels = encoder.fit_transform(labels.reshape(-1,))
test_labels = encoder.transform(test_df['Label'].values.reshape(-1,))
label_order_classes = encoder.classes_
train_df['Label'] = train_labels
test_df['Label'] = test_labels

In [15]:
print(train_df['Label'].unique())

[ 4  0  2 10  3  7  6  5 11 14 12 13  1  9  8]


Drop Records Containing Infitiy Values, are Null and Nan Values

In [16]:
# Replace infinite number with NaN values
train_df.replace([np.inf, -np.inf], np.nan, inplace=True)
test_df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Drop null and NaN values
print("Records deleted in training dataset:", train_df.isnull().sum().sum())
print("Records deleted in test dataset:", test_df.isnull().sum().sum())

train_df = train_df.dropna().reset_index(drop=True)
test_df = test_df.dropna().reset_index(drop=True)

Records deleted in training dataset: 3622
Records deleted in test dataset: 1074


In [17]:
# Print the new sizes of train and test dataframes
print("Shape of Training Dataset:", train_df.shape)
print("Shape of Testing Dataset:", test_df.shape)

Shape of Training Dataset: (1781545, 79)
Shape of Testing Dataset: (511540, 79)


Split Data

In [18]:
X_train = train_df.drop(train_df.columns[-1], axis=1)
y_train = train_df[train_df.columns[-1]]

X_test = test_df.drop(test_df.columns[-1], axis=1)
y_test = test_df[test_df.columns[-1]]

In [19]:
features = X_train.columns
print(features)

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

In [20]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1781545, 78)
(1781545,)
(511540, 78)
(511540,)


In [21]:
# Delete dafarames to empty memory
del train_df, test_df

Scale Data using Standard Scaler

In [22]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
x_train = scaler.transform(X_train)
x_test = scaler.transform(X_test)

Save the initial dataset with all the features

In [23]:
save_dataframe(x_train,y_train,x_test,y_test,features,name='all_features')

Train dataset Saved
Test dataset Saved


Random Forest for Feature Importance

In [24]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=20, max_depth=None,
                                 bootstrap=False, n_jobs=-1,
                                 random_state=0,verbose=2)
rf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 20building tree 2 of 20

building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  4.5min finished


Test accuracy to find the base performance with all features

In [25]:
y_pred = rf.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


Accuracy: 0.9986824099777144


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    2.4s finished


Find out what features contribute the most

In [26]:
feature_imp = pd.Series(rf.feature_importances_,
                        index=X_train.columns).sort_values(ascending=False)

In [27]:
for i in range(0,len(feature_imp)):
    print(i,end =" ")
    print(feature_imp.index[i],end = ": ")
    print(feature_imp[i])

0 Packet Length Std: 0.07308699675876518
1 Packet Length Variance: 0.0600106012850512
2 Bwd Packet Length Mean: 0.05624262169275905
3 Total Length of Fwd Packets: 0.052241017442509044
4 Destination Port: 0.047620037659482105
5 Average Packet Size: 0.038991809925308395
6 Subflow Bwd Bytes: 0.03857951366743775
7 Total Length of Bwd Packets: 0.03763403762070734
8 Max Packet Length: 0.03677800727315327
9 Fwd IAT Std: 0.033922582540323695
10 Packet Length Mean: 0.03068786381648504
11 Avg Bwd Segment Size: 0.02846005719777543
12 Avg Fwd Segment Size: 0.026486193192670194
13 Bwd Packet Length Std: 0.022550677600253755
14 Bwd Packet Length Min: 0.020794965729979416
15 Bwd Packets/s: 0.019982476829569717
16 Idle Max: 0.019210811090958013
17 Init_Win_bytes_forward: 0.017432342107491196
18 Fwd Header Length.1: 0.01713093882319454
19 Bwd Packet Length Max: 0.017045495126466637
20 ACK Flag Count: 0.01655715012066586
21 Init_Win_bytes_backward: 0.016289245084174155
22 Fwd Packet Length Max: 0.015939

Remove features that contribute zero

In [28]:
new_features = [] # list containing the new features names
for i in range(0,len(feature_imp)): # for for the range of all features
    if feature_imp[i] > 0.0: # check if importance is over the threshold
        new_features.append(feature_imp.index[i])
print(new_features)
# save the new data frame
X_train_0 = X_train[new_features]
X_test_0 = X_test[new_features]

['Packet Length Std', 'Packet Length Variance', 'Bwd Packet Length Mean', 'Total Length of Fwd Packets', 'Destination Port', 'Average Packet Size', 'Subflow Bwd Bytes', 'Total Length of Bwd Packets', 'Max Packet Length', 'Fwd IAT Std', 'Packet Length Mean', 'Avg Bwd Segment Size', 'Avg Fwd Segment Size', 'Bwd Packet Length Std', 'Bwd Packet Length Min', 'Bwd Packets/s', 'Idle Max', 'Init_Win_bytes_forward', 'Fwd Header Length.1', 'Bwd Packet Length Max', 'ACK Flag Count', 'Init_Win_bytes_backward', 'Fwd Packet Length Max', 'Fwd Header Length', 'Flow IAT Max', 'Fwd IAT Max', 'Bwd Header Length', 'Subflow Fwd Bytes', 'Total Fwd Packets', 'min_seg_size_forward', 'PSH Flag Count', 'Flow Duration', 'Idle Mean', 'Fwd Packets/s', 'Fwd Packet Length Mean', 'Fwd IAT Min', 'Fwd Packet Length Std', 'Flow Packets/s', 'Subflow Bwd Packets', 'Flow IAT Mean', 'act_data_pkt_fwd', 'Fwd IAT Total', 'Subflow Fwd Packets', 'Flow IAT Min', 'Total Backward Packets', 'Bwd IAT Max', 'Fwd IAT Mean', 'Flow IAT 

In [29]:
# Scale and Run again the classification training task
scaler.fit(X_train_0)
x_train_0 = scaler.transform(X_train_0)
x_test_0 = scaler.transform(X_test_0)

In [30]:
rf2 = RandomForestClassifier(n_estimators=20, max_depth=None,
                                 bootstrap=False, n_jobs=-1,
                                 random_state=0,verbose=2)
rf2.fit(x_train_0, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 20building tree 2 of 20

building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  5.4min finished


In [31]:
y_pred = rf2.predict(x_test_0) # change to x_test_0
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


Accuracy: 0.9985807561481018


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    3.3s finished


In [32]:
# see from this featues the contributions
feature_imp0 = pd.Series(rf2.feature_importances_,
                        index=X_train_0.columns).sort_values(ascending=False)

In [33]:
for i in range(0,len(feature_imp0)):
    print(i,end =" ")
    print(feature_imp0.index[i],end = ": ")
    print(feature_imp0[i])

0 Packet Length Std: 0.06257237630440414
1 Average Packet Size: 0.057478800672579024
2 Subflow Fwd Bytes: 0.04773682612652792
3 Subflow Bwd Bytes: 0.04645615633075148
4 Bwd Packet Length Mean: 0.04519535259248061
5 Destination Port: 0.04370818546443815
6 Packet Length Variance: 0.038021871358737275
7 Avg Fwd Segment Size: 0.03730788960954086
8 Bwd Packet Length Max: 0.034338553200139704
9 Idle Mean: 0.03057971380827757
10 Bwd Packet Length Std: 0.029350167973574925
11 Avg Bwd Segment Size: 0.029135894907666132
12 Packet Length Mean: 0.02779002644826991
13 Bwd Packets/s: 0.027085258532863858
14 PSH Flag Count: 0.02469325316082499
15 Total Length of Bwd Packets: 0.024271588029751906
16 Idle Max: 0.023837340691685065
17 min_seg_size_forward: 0.02092082589494219
18 Max Packet Length: 0.020482506682045418
19 Init_Win_bytes_forward: 0.020404495473157137
20 Fwd Packet Length Max: 0.019650406113557388
21 Fwd Packet Length Mean: 0.019071002042250516
22 Fwd Header Length.1: 0.016856119876983072


Remove more features that do not contribute much from the new random forest

In [34]:
new_features = [] # list containing the new features names
for i in range(0,len(feature_imp0)): # for for the range of all features
    if feature_imp0[i] > 0.001: # check if importance is over the threshold
        new_features.append(feature_imp0.index[i])
print(new_features)
# save the new data frame
X_train_00 = X_train[new_features]
X_test_00 = X_test[new_features]

['Packet Length Std', 'Average Packet Size', 'Subflow Fwd Bytes', 'Subflow Bwd Bytes', 'Bwd Packet Length Mean', 'Destination Port', 'Packet Length Variance', 'Avg Fwd Segment Size', 'Bwd Packet Length Max', 'Idle Mean', 'Bwd Packet Length Std', 'Avg Bwd Segment Size', 'Packet Length Mean', 'Bwd Packets/s', 'PSH Flag Count', 'Total Length of Bwd Packets', 'Idle Max', 'min_seg_size_forward', 'Max Packet Length', 'Init_Win_bytes_forward', 'Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Header Length.1', 'Flow IAT Mean', 'Fwd Header Length', 'Bwd Header Length', 'Fwd IAT Std', 'Total Backward Packets', 'Fwd Packets/s', 'Subflow Fwd Packets', 'Init_Win_bytes_backward', 'ACK Flag Count', 'Total Fwd Packets', 'Subflow Bwd Packets', 'Total Length of Fwd Packets', 'Bwd Packet Length Min', 'Flow IAT Min', 'Flow IAT Std', 'Fwd IAT Min', 'act_data_pkt_fwd', 'Bwd IAT Min', 'Fwd IAT Total', 'Flow Packets/s', 'Fwd IAT Mean', 'Bwd IAT Max', 'Fwd IAT Max', 'Flow IAT Max', 'Flow Duration', 'Fwd

In [35]:
# Train again and predict
# Scale and Run again the classification training task
scaler.fit(X_train_00)
x_train_00 = scaler.transform(X_train_00)
x_test_00 = scaler.transform(X_test_00)

In [36]:
rf3 = RandomForestClassifier(n_estimators=20, max_depth=None,
                                 bootstrap=False, n_jobs=-1,
                                 random_state=0,verbose=2)
rf3.fit(x_train_00, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  4.7min finished


In [37]:
y_pred = rf3.predict(x_test_00) # change to x_test_0
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


Accuracy: 0.9986159440121984


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    3.3s finished


In [38]:
# Perform again to see feature importance
feature_imp00 = pd.Series(rf3.feature_importances_,
                        index=X_train_00.columns).sort_values(ascending=False)

In [39]:
for i in range(0,len(feature_imp00)):
    print(i,end =" ")
    print(feature_imp00.index[i],end = ": ")
    print(feature_imp00[i])

0 Subflow Bwd Bytes: 0.06851836335178833
1 Avg Bwd Segment Size: 0.054626595680399424
2 Bwd Packet Length Max: 0.05348935976713558
3 Total Length of Bwd Packets: 0.044364260250318206
4 Destination Port: 0.037048428403003264
5 Bwd Header Length: 0.03397936640052606
6 Fwd IAT Std: 0.03302190221428736
7 Init_Win_bytes_forward: 0.02993722734907776
8 Subflow Fwd Bytes: 0.029137087204199972
9 Packet Length Variance: 0.028704823410283235
10 Average Packet Size: 0.028459091347497842
11 Bwd Packet Length Std: 0.026853314329136468
12 Packet Length Mean: 0.025198778438170037
13 Bwd Packet Length Min: 0.02505654504866555
14 Flow Bytes/s: 0.024355125123226348
15 Packet Length Std: 0.023370233327011616
16 Avg Fwd Segment Size: 0.023066651817636823
17 Flow IAT Max: 0.02236814417666197
18 Fwd Packet Length Mean: 0.02017521586665995
19 act_data_pkt_fwd: 0.02010074700755608
20 Total Length of Fwd Packets: 0.018622649232039105
21 min_seg_size_forward: 0.01822026721507496
22 Fwd Packet Length Max: 0.01816

In [40]:
# colect features until comulative importance is 0.9
new_features = [] # list containing the new features names
cum_imp = 0
for i in range(0,len(feature_imp00)): # for for the range of all features
    if cum_imp < 0.9:
        new_features.append(feature_imp00.index[i])
        cum_imp+= feature_imp00[i]
print(new_features)
# save the new data frame
X_train_000 = X_train[new_features]
X_test_000 = X_test[new_features]

['Subflow Bwd Bytes', 'Avg Bwd Segment Size', 'Bwd Packet Length Max', 'Total Length of Bwd Packets', 'Destination Port', 'Bwd Header Length', 'Fwd IAT Std', 'Init_Win_bytes_forward', 'Subflow Fwd Bytes', 'Packet Length Variance', 'Average Packet Size', 'Bwd Packet Length Std', 'Packet Length Mean', 'Bwd Packet Length Min', 'Flow Bytes/s', 'Packet Length Std', 'Avg Fwd Segment Size', 'Flow IAT Max', 'Fwd Packet Length Mean', 'act_data_pkt_fwd', 'Total Length of Fwd Packets', 'min_seg_size_forward', 'Fwd Packet Length Max', 'Flow IAT Mean', 'Fwd Header Length.1', 'Fwd IAT Max', 'Bwd Packet Length Mean', 'Bwd Packets/s', 'Subflow Bwd Packets', 'Fwd Packet Length Std', 'Max Packet Length', 'Init_Win_bytes_backward', 'PSH Flag Count', 'Idle Min', 'Fwd Header Length', 'Subflow Fwd Packets']


In [41]:
# final training
# Train again and predict
# Scale and Run again the classification training task
scaler.fit(X_train_000)
x_train_000 = scaler.transform(X_train_000)
x_test_000 = scaler.transform(X_test_000)

In [42]:
del X_train_00, X_train_0

In [43]:
rf4 = RandomForestClassifier(n_estimators=20, max_depth=None,
                                 bootstrap=False, n_jobs=-1,
                                 random_state=0,verbose=2)
rf4.fit(x_train_000, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 20building tree 2 of 20

building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  4.2min finished


In [44]:
y_pred = rf4.predict(x_test_000) # change to x_test_0
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


Accuracy: 0.9983676740821832


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    1.9s finished


In [47]:
# Perform again to see feature importance
feature_imp000 = pd.Series(rf4.feature_importances_,
                        index=X_train_000.columns).sort_values(ascending=False)

In [48]:
for i in range(0,len(feature_imp000)):
    print(i,end =" ")
    print(feature_imp000.index[i],end = ": ")
    print(feature_imp000[i])

0 Bwd Packet Length Std: 0.08963976103322824
1 Avg Bwd Segment Size: 0.0729369216644516
2 Average Packet Size: 0.05776455433675132
3 Bwd Packet Length Max: 0.05666994164848606
4 Destination Port: 0.05279896393100271
5 Total Length of Fwd Packets: 0.04797733473864768
6 Fwd Packet Length Max: 0.04355394998204584
7 Packet Length Std: 0.04236556400934843
8 Bwd Packet Length Mean: 0.038407518042727434
9 Packet Length Variance: 0.03734803745918708
10 Subflow Fwd Bytes: 0.03675378640680891
11 Packet Length Mean: 0.0297794611980539
12 Flow Bytes/s: 0.02752709082775817
13 Init_Win_bytes_forward: 0.027070140755383987
14 Bwd Header Length: 0.02646052845785997
15 Bwd Packet Length Min: 0.02610349383892003
16 Avg Fwd Segment Size: 0.022451458060263114
17 Bwd Packets/s: 0.020654980689370248
18 Fwd IAT Max: 0.020286318034404578
19 Total Length of Bwd Packets: 0.01886191677374133
20 Fwd Header Length: 0.018500736777287405
21 Subflow Bwd Bytes: 0.0177053266888564
22 Fwd Packet Length Std: 0.01635971513

In [49]:
# colect features until comulative importance is 0.9
new_features = [] # list containing the new features names
cum_imp = 0
for i in range(0,len(feature_imp000)): # for for the range of all features
    if cum_imp < 0.9:
        new_features.append(feature_imp000.index[i])
        cum_imp+= feature_imp000[i]
print(new_features)
# save the new data frame
X_train_0000 = X_train[new_features]
X_test_0000 = X_test[new_features]

['Bwd Packet Length Std', 'Avg Bwd Segment Size', 'Average Packet Size', 'Bwd Packet Length Max', 'Destination Port', 'Total Length of Fwd Packets', 'Fwd Packet Length Max', 'Packet Length Std', 'Bwd Packet Length Mean', 'Packet Length Variance', 'Subflow Fwd Bytes', 'Packet Length Mean', 'Flow Bytes/s', 'Init_Win_bytes_forward', 'Bwd Header Length', 'Bwd Packet Length Min', 'Avg Fwd Segment Size', 'Bwd Packets/s', 'Fwd IAT Max', 'Total Length of Bwd Packets', 'Fwd Header Length', 'Subflow Bwd Bytes', 'Fwd Packet Length Std', 'Fwd IAT Std', 'Fwd Header Length.1', 'Subflow Fwd Packets', 'Subflow Bwd Packets']


In [51]:
# final training
# Train again and predict
# Scale and Run again the classification training task
scaler.fit(X_train_0000)
x_train_0000 = scaler.transform(X_train_0000)
x_test_0000 = scaler.transform(X_test_0000)

In [52]:
rf5 = RandomForestClassifier(n_estimators=20, max_depth=None,
                                 bootstrap=False, n_jobs=-1,
                                 random_state=0,verbose=2)
rf5.fit(x_train_0000, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  3.6min finished


In [53]:
y_pred = rf5.predict(x_test_0000) # change to x_test_0
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


Accuracy: 0.9981663213042968


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    2.0s finished


In [54]:
# Perform again to see feature importance
feature_imp0000 = pd.Series(rf5.feature_importances_,
                        index=X_train_0000.columns).sort_values(ascending=False)

In [55]:
for i in range(0,len(feature_imp0000)):
    print(i,end =" ")
    print(feature_imp0000.index[i],end = ": ")
    print(feature_imp0000[i])

0 Bwd Packet Length Max: 0.07913958457128996
1 Packet Length Std: 0.06935822767087331
2 Subflow Bwd Bytes: 0.06222855847524918
3 Subflow Fwd Bytes: 0.05899660200586665
4 Packet Length Mean: 0.05843891036937836
5 Total Length of Bwd Packets: 0.05594711132924102
6 Destination Port: 0.055352975755227476
7 Average Packet Size: 0.052994391265022
8 Bwd Packet Length Mean: 0.05280241199418727
9 Packet Length Variance: 0.048565514715314034
10 Init_Win_bytes_forward: 0.04537107850008429
11 Bwd Packet Length Std: 0.04026569832384771
12 Fwd IAT Std: 0.037927328320231914
13 Avg Bwd Segment Size: 0.03448080543552824
14 Fwd Header Length: 0.02471814278388237
15 Total Length of Fwd Packets: 0.023233288563514032
16 Fwd Packet Length Max: 0.02311436270769824
17 Avg Fwd Segment Size: 0.021924036008802644
18 Fwd Header Length.1: 0.02104393351382617
19 Subflow Fwd Packets: 0.02099468243740843
20 Bwd Packets/s: 0.019837703806519225
21 Bwd Packet Length Min: 0.01915097659540342
22 Fwd Packet Length Std: 0.0

In [56]:
del X_train_000, X_train_0000

In [58]:
del X_test_0,X_test_00,X_test_000,X_test_0000

In [57]:
# colect features until comulative importance is 0.9
new_features = [] # list containing the new features names
cum_imp = 0
for i in range(0,len(feature_imp0000)): # for for the range of all features
    if cum_imp < 0.9:
        new_features.append(feature_imp0000.index[i])
        cum_imp+= feature_imp0000[i]
print(new_features)
# save the new data frame
X_train_00000 = X_train[new_features]
X_test_00000 = X_test[new_features]

['Bwd Packet Length Max', 'Packet Length Std', 'Subflow Bwd Bytes', 'Subflow Fwd Bytes', 'Packet Length Mean', 'Total Length of Bwd Packets', 'Destination Port', 'Average Packet Size', 'Bwd Packet Length Mean', 'Packet Length Variance', 'Init_Win_bytes_forward', 'Bwd Packet Length Std', 'Fwd IAT Std', 'Avg Bwd Segment Size', 'Fwd Header Length', 'Total Length of Fwd Packets', 'Fwd Packet Length Max', 'Avg Fwd Segment Size', 'Fwd Header Length.1', 'Subflow Fwd Packets', 'Bwd Packets/s']


In [59]:
# Train again and predict
# Scale and Run again the classification training task
scaler.fit(X_train_00000)
x_train_00000 = scaler.transform(X_train_00000)
x_test_00000 = scaler.transform(X_test_00000)

In [60]:
del x_train_0,x_train_00,x_train_000,x_train_0000

In [63]:
rf6 = RandomForestClassifier(n_estimators=20, max_depth=None,
                                 bootstrap=False, n_jobs=-1,
                                 random_state=0,verbose=2)
rf6.fit(x_train_00000, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 20building tree 2 of 20

building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  3.0min finished


In [64]:
y_pred = rf6.predict(x_test_00000) # change to x_test_0
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


Accuracy: 0.9979317355436526


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    2.4s finished


In [65]:
# Perform again to see feature importance
feature_imp00000 = pd.Series(rf6.feature_importances_,
                        index=X_train_00000.columns).sort_values(ascending=False)

In [66]:
for i in range(0,len(feature_imp00000)):
    print(i,end =" ")
    print(feature_imp00000.index[i],end = ": ")
    print(feature_imp00000[i])

0 Packet Length Variance: 0.10392745199250686
1 Total Length of Fwd Packets: 0.08093599570275384
2 Packet Length Std: 0.07245995491228212
3 Subflow Fwd Bytes: 0.0698456542037804
4 Average Packet Size: 0.06294193011632401
5 Avg Bwd Segment Size: 0.06146021164043971
6 Fwd Packet Length Max: 0.059937621506363205
7 Init_Win_bytes_forward: 0.05674415553256137
8 Bwd Packet Length Mean: 0.054927803723317306
9 Destination Port: 0.054342964176528896
10 Subflow Bwd Bytes: 0.04822314586800956
11 Packet Length Mean: 0.04542863429219502
12 Bwd Packet Length Std: 0.03939930700083817
13 Bwd Packet Length Max: 0.029036810244033345
14 Subflow Fwd Packets: 0.028898057917442647
15 Total Length of Bwd Packets: 0.028229189699460016
16 Fwd Header Length: 0.025011241712762745
17 Bwd Packets/s: 0.024750951185579857
18 Fwd Header Length.1: 0.022840858507839852
19 Avg Fwd Segment Size: 0.016612576656145708
20 Fwd IAT Std: 0.014045483408835352


In [81]:
# colect features until comulative importance is 0.9
new_features = [] # list containing the new features names
cum_imp = 0
for i in range(0,len(feature_imp00000)): # for for the range of all features
    if cum_imp < 0.9:
        new_features.append(feature_imp00000.index[i])
        cum_imp+= feature_imp00000[i]
print(new_features)
# save the new data frame
X_train_000000 = X_train[new_features]
X_test_000000 = X_test[new_features]

['Packet Length Variance', 'Total Length of Fwd Packets', 'Packet Length Std', 'Subflow Fwd Bytes', 'Average Packet Size', 'Avg Bwd Segment Size', 'Fwd Packet Length Max', 'Init_Win_bytes_forward', 'Bwd Packet Length Mean', 'Destination Port', 'Subflow Bwd Bytes', 'Packet Length Mean', 'Bwd Packet Length Std', 'Bwd Packet Length Max', 'Subflow Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Header Length']


In [82]:
# Train again and predict
# Scale and Run again the classification training task
scaler.fit(X_train_000000)
x_train_000000 = scaler.transform(X_train_000000)
x_test_000000 = scaler.transform(X_test_000000)

In [69]:
rf7 = RandomForestClassifier(n_estimators=20, max_depth=None,
                                 bootstrap=False, n_jobs=-1,
                                 random_state=0,verbose=2)
rf7.fit(x_train_000000, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.8min finished


In [70]:
y_pred = rf7.predict(x_test_000000) # change to x_test_0
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


Accuracy: 0.9980803065253939


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:    1.7s finished


In [83]:
save_dataframe(x_train_000000,y_train,x_test_000000,y_test,new_features,"17_features")

Train dataset Saved
Test dataset Saved


In [72]:
# Perform again to see feature importance
feature_imp000000 = pd.Series(rf7.feature_importances_,
                        index=X_train_000000.columns).sort_values(ascending=False)

In [73]:
for i in range(0,len(feature_imp000000)):
    print(i,end =" ")
    print(feature_imp000000.index[i],end = ": ")
    print(feature_imp000000[i])

0 Packet Length Std: 0.0919402958288149
1 Average Packet Size: 0.08788152265314116
2 Bwd Packet Length Mean: 0.08120767757539181
3 Bwd Packet Length Std: 0.07794444141001207
4 Avg Bwd Segment Size: 0.07573977420783362
5 Fwd Packet Length Max: 0.07469786193470355
6 Destination Port: 0.07452388262703372
7 Total Length of Fwd Packets: 0.05617642864451986
8 Init_Win_bytes_forward: 0.05550652377979314
9 Subflow Fwd Bytes: 0.05544749152571902
10 Subflow Fwd Packets: 0.04715078436290879
11 Fwd Header Length: 0.04509582848337449
12 Packet Length Mean: 0.04269620571621673
13 Total Length of Bwd Packets: 0.042693629160767514
14 Subflow Bwd Bytes: 0.03814966620330023
15 Bwd Packet Length Max: 0.027664702151449273
16 Packet Length Variance: 0.02548328373502027


In [76]:
new_features = [] # list containing the new features names
for i in range(0,len(feature_imp000000)): # for for the range of all features
    if feature_imp000000[i] > 0.05: # check if importance is over the threshold
        new_features.append(feature_imp000000.index[i])
print(new_features)
# save the new data frame
X_train_0000000 = X_train[new_features]
X_test_0000000 = X_test[new_features]

['Packet Length Std', 'Average Packet Size', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Avg Bwd Segment Size', 'Fwd Packet Length Max', 'Destination Port', 'Total Length of Fwd Packets', 'Init_Win_bytes_forward', 'Subflow Fwd Bytes']


In [77]:
# Train again and predict
# Scale and Run again the classification training task
scaler.fit(X_train_0000000)
x_train_0000000 = scaler.transform(X_train_0000000)
x_test_0000000 = scaler.transform(X_test_0000000)

In [80]:
save_dataframe(x_train_0000000,y_train,x_test_0000000,y_test,new_features,"10_features")

Train dataset Saved
Test dataset Saved
