In [1]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import RFE

# Read Files and Basic Pre-processing

In [3]:
train_file_path = '/content/drive/MyDrive/ITU_Competition_Intrusion_and_Vulnerability_Detection_in_Software_Defined_Networks(SDN)/Train_ULAK.csv'
test_file_path = '/content/drive/MyDrive/ITU_Competition_Intrusion_and_Vulnerability_Detection_in_Software_Defined_Networks(SDN)/Test_ULAK.csv'

train_df = pd.read_csv(train_file_path,skipinitialspace=True)
test_df = pd.read_csv(test_file_path,skipinitialspace=True)

if train_df is not None and test_df is not None:
    print("Data loaded successfully!")
    print("Train Data:")
    print(train_df.head())  # Display the first few rows of the train DataFrame
    print("\nTest Data:")
    print(test_df.head())   # Display the first few rows of the test DataFrame

Data loaded successfully!
Train Data:
   Destination Port  Flow Duration  Total Fwd Packets  Total Backward Packets  \
0                80       68855579                 10                       6   
1                53            196                  2                       2   
2               123            118                  2                       2   
3                80         295657                  7                      10   
4                80       15705351                  7                       5   

   Total Length of Fwd Packets  Total Length of Bwd Packets  \
0                         1038                        11595   
1                           70                          174   
2                           96                           96   
3                         1114                        15841   
4                          407                          452   

   Fwd Packet Length Max  Fwd Packet Length Min  Fwd Packet Length Mean  \
0                    

In [4]:
# Print the sizes of train and test dataframes
print("Shape of Training Dataset:", train_df.shape)
print("Shape of Testing Dataset:", test_df.shape)

Shape of Training Dataset: (1783356, 79)
Shape of Testing Dataset: (512077, 79)


In [5]:
#train_df.info()
#test_df.info()

In [6]:
# Summary statistics of the dataset
#train_df.describe()

In [7]:
# number of each label type train df
train_df['Label'].value_counts()

BENIGN                        1432050
DoS Hulk                       145575
PortScan                       100125
DDoS                            80656
DoS GoldenEye                    6484
FTP-Patator                      5000
SSH-Patator                      3714
DoS slowloris                    3651
DoS Slowhttptest                 3464
Bot                              1238
Web Attack � Brute Force          949
Web Attack � XSS                  410
Infiltration                       22
Web Attack � Sql Injection         12
Heartbleed                          6
Name: Label, dtype: int64

In [8]:
# number of each label type test df
test_df['Label'].value_counts()

BENIGN                        411203
DoS Hulk                       41801
PortScan                       28751
DDoS                           23160
DoS GoldenEye                   1861
FTP-Patator                     1436
SSH-Patator                     1067
DoS slowloris                   1048
DoS Slowhttptest                 994
Bot                              355
Web Attack � Brute Force         272
Web Attack � XSS                 117
Infiltration                       6
Web Attack � Sql Injection         4
Heartbleed                         2
Name: Label, dtype: int64

# Pre-processing of Data

One-Hot-Encoding to Replace Label Values with Columns

In [9]:
labels = train_df['Label'].values.reshape(-1, 1)
# Initialize the OneHotEncoder
encoder = OneHotEncoder()
# transform fit labels of train and test daframe
one_hot_labels = encoder.fit_transform(labels).toarray()
test_labels = encoder.transform(test_df['Label'].values.reshape(-1,1)).toarray()

feature_names = encoder.get_feature_names_out(input_features=['Label'])
one_hot_df = pd.DataFrame(one_hot_labels, columns=feature_names)
one_hot_df_test = pd.DataFrame(test_labels,columns=feature_names)
# apend one hot encoding to dataset for both train and test dataframes
df_encoded_train = pd.concat([train_df, one_hot_df], axis=1)
df_encoded_train = df_encoded_train.drop('Label', axis=1)

df_encoded_test = pd.concat([test_df, one_hot_df_test], axis=1)
df_encoded_test = df_encoded_test.drop('Label', axis=1)

In [10]:
print(train_df['Label'].unique())

['DoS Hulk' 'BENIGN' 'DDoS' 'PortScan' 'DoS GoldenEye' 'FTP-Patator'
 'DoS slowloris' 'DoS Slowhttptest' 'SSH-Patator' 'Web Attack � XSS'
 'Web Attack � Brute Force' 'Web Attack � Sql Injection' 'Bot'
 'Infiltration' 'Heartbleed']


Drop Records Containing Infitiy Values, are Null and Nan Values

In [11]:
# Replace infinite number with NaN values
df_encoded_train.replace([np.inf, -np.inf], np.nan, inplace=True)
df_encoded_test.replace([np.inf, -np.inf], np.nan, inplace=True)
# Drop null and NaN values
print("Records deleted in training dataset:", df_encoded_train.isnull().sum().sum())
print("Records deleted in test dataset:", df_encoded_test.isnull().sum().sum())

df_encoded_train = df_encoded_train.dropna().reset_index(drop=True)
df_encoded_test = df_encoded_test.dropna().reset_index(drop=True)

Records deleted in training dataset: 3622
Records deleted in test dataset: 1074


In [12]:
# Print the new sizes of train and test dataframes
print("Shape of Training Dataset:", df_encoded_train.shape)
print("Shape of Testing Dataset:", df_encoded_test.shape)

Shape of Training Dataset: (1781545, 93)
Shape of Testing Dataset: (511540, 93)


Split Data

In [13]:
X_train = df_encoded_train.drop(df_encoded_train.columns[-15:], axis=1)
y_train = df_encoded_train[df_encoded_train.columns[-15:]]

X_test = df_encoded_test.drop(df_encoded_test.columns[-15:], axis=1)
y_test = df_encoded_test[df_encoded_test.columns[-15:]]

In [14]:
print(X_train.shape)
print(y_train.shape)

(1781545, 78)
(1781545, 15)


In [15]:
print(X_test.shape)
print(y_test.shape)

(511540, 78)
(511540, 15)


In [16]:
# Delete dafarames to empty memory
del df_encoded_train,df_encoded_test
del train_df, test_df

Scale Data using Standard Scaler

In [17]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
x_train = scaler.transform(X_train)
x_test = scaler.transform(X_test)

Random Forest for Feature Importance

In [18]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=3,
                                 bootstrap=True, n_jobs=-1,
                                 random_state=0,verbose=2)
rf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 100building tree 2 of 100

building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.3min


building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 11.4min finished


Test accuracy to find the base performance with all features

In [19]:
y_pred = rf.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   40.7s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  1.8min finished


Accuracy: 0.9384564256949604


Find out what features contribute the most

In [20]:
feature_imp = pd.Series(rf.feature_importances_,
                        index=X_train.columns).sort_values(ascending=False)

In [21]:
for i in range(0,len(feature_imp)):
    print(i,end =" ")
    print(feature_imp.index[i],end = ": ")
    print(feature_imp[i])

0 Packet Length Std: 0.074884529793549
1 Bwd Packet Length Mean: 0.07473778196936971
2 Avg Bwd Segment Size: 0.07290222918291389
3 Packet Length Variance: 0.06329335921033032
4 Bwd Packet Length Std: 0.05867090054191021
5 Max Packet Length: 0.05429898764940145
6 Average Packet Size: 0.05017962538224517
7 Total Length of Fwd Packets: 0.04254183566994731
8 Total Length of Bwd Packets: 0.03361219714764282
9 Subflow Bwd Bytes: 0.03358920403056025
10 Subflow Fwd Bytes: 0.03208298791570483
11 Packet Length Mean: 0.03066633208722605
12 Fwd Packet Length Mean: 0.029559121080121528
13 Fwd IAT Std: 0.029056269105987024
14 Fwd Packet Length Max: 0.02662544436295232
15 Bwd Packet Length Max: 0.025943116382113295
16 Idle Max: 0.021688456616956297
17 Avg Fwd Segment Size: 0.0210966771592294
18 Idle Min: 0.021059519014760518
19 PSH Flag Count: 0.01869602024412106
20 min_seg_size_forward: 0.016224068983548667
21 Bwd Header Length: 0.01375159809327814
22 Bwd Packet Length Min: 0.013732868127142766
23 F

Remove features that contribute zero

In [22]:
new_features = [] # list containing the new features names
for i in range(0,len(feature_imp)): # for for the range of all features
    if feature_imp[i] > 0.0: # check if importance is over the threshold
        new_features.append(feature_imp.index[i])
print(new_features)
# save the new data frame
X_train_0 = X_train[new_features]
X_test_0 = X_test[new_features]

['Packet Length Std', 'Bwd Packet Length Mean', 'Avg Bwd Segment Size', 'Packet Length Variance', 'Bwd Packet Length Std', 'Max Packet Length', 'Average Packet Size', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Subflow Bwd Bytes', 'Subflow Fwd Bytes', 'Packet Length Mean', 'Fwd Packet Length Mean', 'Fwd IAT Std', 'Fwd Packet Length Max', 'Bwd Packet Length Max', 'Idle Max', 'Avg Fwd Segment Size', 'Idle Min', 'PSH Flag Count', 'min_seg_size_forward', 'Bwd Header Length', 'Bwd Packet Length Min', 'Flow IAT Max', 'Fwd Packet Length Std', 'Bwd Packets/s', 'Init_Win_bytes_backward', 'Idle Mean', 'Fwd Header Length', 'Flow IAT Mean', 'Total Backward Packets', 'Fwd IAT Max', 'Flow IAT Std', 'Flow Bytes/s', 'Total Fwd Packets', 'Fwd Header Length.1', 'Fwd Packets/s', 'Subflow Fwd Packets', 'ACK Flag Count', 'Destination Port', 'Init_Win_bytes_forward', 'Fwd IAT Mean', 'Min Packet Length', 'Bwd IAT Min', 'Flow Packets/s', 'Subflow Bwd Packets', 'act_data_pkt_fwd', 'FIN Flag 

In [23]:
# Scale and Run again the classification training task
scaler.fit(X_train_0)
x_train_0 = scaler.transform(X_train_0)
x_test_0 = scaler.transform(X_test_0)

In [24]:
rf.fit(x_train_0, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 100building tree 2 of 100

building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.7min


building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 12.3min finished


In [25]:
y_pred = rf.predict(x_test_0) # change to x_test_0
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   40.7s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  1.9min finished


Accuracy: 0.9404230363216953


In [26]:
# see from this featues the contributions
feature_imp0 = pd.Series(rf.feature_importances_,
                        index=X_train_0.columns).sort_values(ascending=False)

In [27]:
for i in range(0,len(feature_imp0)):
    print(i,end =" ")
    print(feature_imp0.index[i],end = ": ")
    print(feature_imp0[i])

0 Avg Bwd Segment Size: 0.08739773888301977
1 Packet Length Std: 0.08399304017569273
2 Packet Length Variance: 0.07042187838513471
3 Bwd Packet Length Mean: 0.06929313483132214
4 Bwd Packet Length Std: 0.06379464388670093
5 Packet Length Mean: 0.05205333853586908
6 Average Packet Size: 0.04598675892562356
7 Max Packet Length: 0.042542652605308096
8 Subflow Fwd Bytes: 0.04049332350295166
9 Total Length of Bwd Packets: 0.03878840294501267
10 Bwd Packet Length Max: 0.034463953921190116
11 Total Length of Fwd Packets: 0.03357195065056664
12 Fwd Packet Length Max: 0.032450338788451515
13 Avg Fwd Segment Size: 0.030634315439729276
14 Subflow Bwd Bytes: 0.03061322296403141
15 Fwd IAT Std: 0.024756864666294662
16 Fwd Packet Length Mean: 0.022754124155730263
17 Idle Min: 0.018563303616511692
18 Idle Max: 0.01836467826128943
19 PSH Flag Count: 0.017007102228948683
20 Fwd Packet Length Std: 0.015499831676384078
21 Bwd Packet Length Min: 0.014479110695710997
22 Bwd Header Length: 0.013639362458033

Remove more features that do not contribute much from the new random forest

In [28]:
new_features = [] # list containing the new features names
for i in range(0,len(feature_imp0)): # for for the range of all features
    if feature_imp0[i] > 0.001: # check if importance is over the threshold
        new_features.append(feature_imp0.index[i])
print(new_features)
# save the new data frame
X_train_00 = X_train[new_features]
X_test_00 = X_test[new_features]

['Avg Bwd Segment Size', 'Packet Length Std', 'Packet Length Variance', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Packet Length Mean', 'Average Packet Size', 'Max Packet Length', 'Subflow Fwd Bytes', 'Total Length of Bwd Packets', 'Bwd Packet Length Max', 'Total Length of Fwd Packets', 'Fwd Packet Length Max', 'Avg Fwd Segment Size', 'Subflow Bwd Bytes', 'Fwd IAT Std', 'Fwd Packet Length Mean', 'Idle Min', 'Idle Max', 'PSH Flag Count', 'Fwd Packet Length Std', 'Bwd Packet Length Min', 'Bwd Header Length', 'Fwd IAT Max', 'Flow Bytes/s', 'Bwd Packets/s', 'min_seg_size_forward', 'Flow IAT Max', 'Init_Win_bytes_forward', 'Idle Mean', 'Destination Port', 'Init_Win_bytes_backward', 'Subflow Fwd Packets', 'Total Backward Packets', 'Fwd Header Length', 'ACK Flag Count', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd Header Length.1', 'Min Packet Length', 'Flow IAT Std', 'Flow Packets/s', 'Flow Duration', 'Subflow Bwd Packets', 'Total Fwd Packets', 'act_data_pkt_fwd']


In [29]:
# Train again and predict
# Scale and Run again the classification training task
scaler.fit(X_train_00)
x_train_00 = scaler.transform(X_train_00)
x_test_00 = scaler.transform(X_test_00)

In [30]:
rf.fit(x_train_00, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100building tree 4 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.8min


building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 11.8min finished


In [31]:
y_pred = rf.predict(x_test_00) # change to x_test_0
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   42.5s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  1.8min finished


Accuracy: 0.9386773272862338


In [32]:
# Perform again to see feature importance
feature_imp00 = pd.Series(rf.feature_importances_,
                        index=X_train_00.columns).sort_values(ascending=False)

In [33]:
for i in range(0,len(feature_imp00)):
    print(i,end =" ")
    print(feature_imp00.index[i],end = ": ")
    print(feature_imp00[i])

0 Packet Length Std: 0.10836768977179784
1 Bwd Packet Length Std: 0.09284567307574548
2 Bwd Packet Length Max: 0.06371919864478232
3 Bwd Packet Length Mean: 0.05953998284138854
4 Average Packet Size: 0.05871767648479514
5 Packet Length Variance: 0.05635571517622319
6 Avg Bwd Segment Size: 0.04908434857481249
7 Subflow Bwd Bytes: 0.04852060509661591
8 Total Length of Fwd Packets: 0.046791236424598794
9 Fwd Packet Length Max: 0.04170315022544639
10 Subflow Fwd Bytes: 0.0341779736618819
11 Packet Length Mean: 0.029577534653097464
12 Avg Fwd Segment Size: 0.028723389585185503
13 Max Packet Length: 0.02680046171228531
14 Fwd Packet Length Mean: 0.025058546894064473
15 Total Length of Bwd Packets: 0.025019008620498922
16 Fwd IAT Std: 0.024726712953173968
17 Idle Max: 0.01972506773478399
18 Fwd Packet Length Std: 0.017053698855276744
19 Init_Win_bytes_backward: 0.015897437526586716
20 Bwd Header Length: 0.01404455554835598
21 Flow IAT Max: 0.011862510510740373
22 Idle Min: 0.01107475813659482

In [34]:
# colect features until comulative importance is 0.9
new_features = [] # list containing the new features names
cum_imp = 0
for i in range(0,len(feature_imp00)): # for for the range of all features
    if cum_imp < 0.9:
        new_features.append(feature_imp00.index[i])
        cum_imp+= feature_imp00[i]
print(new_features)
# save the new data frame
X_train_000 = X_train[new_features]
X_test_000 = X_test[new_features]

['Packet Length Std', 'Bwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Average Packet Size', 'Packet Length Variance', 'Avg Bwd Segment Size', 'Subflow Bwd Bytes', 'Total Length of Fwd Packets', 'Fwd Packet Length Max', 'Subflow Fwd Bytes', 'Packet Length Mean', 'Avg Fwd Segment Size', 'Max Packet Length', 'Fwd Packet Length Mean', 'Total Length of Bwd Packets', 'Fwd IAT Std', 'Idle Max', 'Fwd Packet Length Std', 'Init_Win_bytes_backward', 'Bwd Header Length', 'Flow IAT Max', 'Idle Min']


In [35]:
# final training
# Train again and predict
# Scale and Run again the classification training task
scaler.fit(X_train_000)
x_train_000 = scaler.transform(X_train_000)
x_test_000 = scaler.transform(X_test_000)

In [36]:
del X_train_00, X_train_0

In [None]:
rf.fit(x_train_000, y_train)

In [None]:
y_pred = rf.predict(x_test_000) # change to x_test_0
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
rf2 = RandomForestClassifier(n_estimators=100, max_depth=4,
                                 bootstrap=True, n_jobs=-1,
                                 random_state=0,verbose=3)
rf2.fit(x_train_000, y_train)

In [None]:
y_pred = rf2.predict(x_test_000) # change to x_test_0
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [38]:
# Save the final dataset with the 23 features
x_train_new = pd.DataFrame(x_train_000)
x_train_new.columns= new_features
y_train_new = pd.DataFrame(y_train)
train_frame = [x_train_new,y_train_new]
train_final = pd.concat(train_frame,axis=1)
train_final.to_csv('/content/drive/MyDrive/ITU_Competition_Intrusion_and_Vulnerability_Detection_in_Software_Defined_Networks(SDN)/train_data.csv',index = False)

In [39]:
# Save test data set as well
x_test_new = pd.DataFrame(x_test_000)
x_test_new.columns= new_features
y_test_new = pd.DataFrame(y_test)
test_frame = [x_test_new,y_test_new]
test_final = pd.concat(test_frame,axis=1)
test_final.to_csv('/content/drive/MyDrive/ITU_Competition_Intrusion_and_Vulnerability_Detection_in_Software_Defined_Networks(SDN)/test_data.csv',index = False)