In [None]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

In [None]:
# Reference: https://github.com/Saurabh2805/kdd_cup_99/blob/master/KDD_CUP_99_dataset_1.ipynb
# Label = 'intrusion_type'
# 41 feature names followed by the label column 'intrusion_type' (42 names in total).
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'intrusion_type',
]

In [None]:
# Read the CSV with header=None so the first line is treated as data;
# column labels are taken from `columns` instead.
df = pd.read_csv(
    'sample_data/kddcup.data_10_percent_testing.csv',
    header=None,
    names=columns,
)
print(f"Total rows and columns: {df.shape}")

Total rows and columns: (494021, 42)


In [None]:
# Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder

# String-valued columns (three features plus the label) that are replaced by numeric codes.
columns_encode = ['protocol_type','service','flag','intrusion_type']

# Keep the fitted encoder instead of discarding it: the numeric codes that show
# up later (e.g. class 11.0 in the classification reports) can then be mapped
# back to the original strings via `ordinal_encoder.categories_` or
# `ordinal_encoder.inverse_transform(...)`.
ordinal_encoder = OrdinalEncoder()
df[columns_encode] = ordinal_encoder.fit_transform(df[columns_encode])

In [None]:
# Count missing cells, drop every row that contains at least one, then recount
# to confirm nothing is left.
NANs_before = df.isna().to_numpy().sum()
print(f"Number of NANs before removal: {NANs_before}")

df.dropna(axis=0, how='any', inplace=True)

NANs_after = df.isna().to_numpy().sum()
print(f"Number of NaNs after removal: {NANs_after}")

Number of NANs before removal: 0
Number of NaNs after removal: 0


In [None]:
# Count rows that repeat an earlier row, keep only the first occurrence of each,
# then recount to confirm the frame is duplicate-free.
duplicates_before = df.duplicated(keep='first').sum()
print(f"Number of duplicate rows before removal: {duplicates_before}")

df.drop_duplicates(keep='first', inplace=True)

duplicates_after = df.duplicated(keep='first').sum()
print(f"Number of duplicate rows after removal: {duplicates_after}")

Number of duplicate rows before removal: 348435
Number of duplicate rows after removal: 0


In [None]:
# NOTE: the two statements below are commented out, yet the output saved under
# this cell shows a class distribution, so that output dates from a run in which
# they were enabled. Uncomment them to regenerate it.
# print("Class Distribution")
# print(df['intrusion_type'].value_counts())

Class Distribution
intrusion_type
11.0    87832
9.0     51820
0.0       968
20.0      918
17.0      906
21.0      893
5.0       651
18.0      641
15.0      416
14.0      206
10.0      158
3.0        53
1.0        30
22.0       20
6.0        19
4.0        12
16.0       10
7.0         9
2.0         8
8.0         7
13.0        4
12.0        3
19.0        2
Name: count, dtype: int64


In [None]:
# Split into Train and Test
from sklearn.model_selection import train_test_split

# Target is the label column; features are every remaining column.
y = df['intrusion_type']
x = df.drop('intrusion_type', axis=1)

# 80:20 split, stratified on the label, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

def RandomForest(x_train, y_train, x_test, y_test):
  """Fit a random forest on the training data and evaluate it on the test data.

  A ``RandomForestClassifier(random_state=42)`` is trained on
  ``(x_train, y_train)`` and used to predict ``x_test``. The per-class
  classification report, the accuracy and the macro-averaged F1 score are
  printed.

  Args:
    x_train: Training features, passed to ``fit``.
    y_train: Training labels, passed to ``fit``.
    x_test: Test features, passed to ``predict``.
    y_test: True test labels the predictions are scored against.

  Returns:
    tuple: ``(accuracy, f1)`` where ``f1`` is the macro-averaged F1 score.
  """
  rf = RandomForestClassifier(random_state=42)
  rf.fit(x_train, y_train)

  y_pred = rf.predict(x_test)
  accuracy = accuracy_score(y_test, y_pred)
  # zero_division=0: a class with an undefined metric (e.g. no predicted
  # samples) is scored 0, exactly as with the default, but without the
  # undefined-metric warnings that the default setting emits.
  f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

  print("Classification Report")
  print(classification_report(y_test, y_pred, zero_division=0))

  print(f"Accuracy: {accuracy}")
  print(f"F1 Score: {f1}")

  return accuracy, f1

In [None]:
# Reference: https://github.com/atulpatelDS/Youtube/blob/main/Machine_Learning/Imbalanced_Dataset_Handling/Different%20Techniques%20to%20deal%20with%20Imbalanced%20Dataset%20(Imbalanced%20Classes)%20in%20Machine%20Learning.ipynb
# Random Oversampling (no smoothing/shrinkage is configured on the sampler)
from imblearn.over_sampling import RandomOverSampler

def random_oversampling(x_train, y_train, random_state=42):
  """Resample the training data with ``RandomOverSampler``.

  Args:
    x_train: Training features, passed to ``fit_resample``.
    y_train: Training labels, passed to ``fit_resample``.
    random_state: Seed handed to ``RandomOverSampler`` so that repeated runs
      draw the same samples. Defaults to 42, the seed used for the other
      stochastic steps in this notebook.

  Returns:
    tuple: ``(x_train_ros, y_train_ros)``, the resampled features and labels.
  """
  ros = RandomOverSampler(random_state=random_state)
  x_train_ros, y_train_ros = ros.fit_resample(x_train, y_train)
  return x_train_ros, y_train_ros

In [None]:
# Baseline: Random Forest trained on the original (non-augmented) training split.
acc_non_aug, f1_non_aug = RandomForest(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

Classification Report
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00       194
         1.0       0.67      0.67      0.67         6
         2.0       0.00      0.00      0.00         2
         3.0       1.00      1.00      1.00        11
         4.0       1.00      0.50      0.67         2
         5.0       1.00      0.98      0.99       130
         6.0       1.00      0.75      0.86         4
         7.0       0.00      0.00      0.00         2
         8.0       0.00      0.00      0.00         1
         9.0       1.00      1.00      1.00     10364
        10.0       0.97      0.97      0.97        32
        11.0       1.00      1.00      1.00     17567
        13.0       1.00      1.00      1.00         1
        14.0       1.00      1.00      1.00        41
        15.0       0.98      1.00      0.99        83
        16.0       0.00      0.00      0.00         2
        17.0       0.99      0.99      0.99       181
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Augmented run: oversample the training split only, then train on the resampled
# data and evaluate on the same untouched test split.
x_train_ros, y_train_ros = random_oversampling(x_train=x_train, y_train=y_train)
acc_aug, f1_aug = RandomForest(
    x_train=x_train_ros,
    y_train=y_train_ros,
    x_test=x_test,
    y_test=y_test,
)

Classification Report
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99       194
         1.0       1.00      0.67      0.80         6
         2.0       1.00      0.50      0.67         2
         3.0       1.00      1.00      1.00        11
         4.0       1.00      0.50      0.67         2
         5.0       1.00      0.99      1.00       130
         6.0       1.00      1.00      1.00         4
         7.0       1.00      0.50      0.67         2
         8.0       0.00      0.00      0.00         1
         9.0       1.00      1.00      1.00     10364
        10.0       1.00      0.97      0.98        32
        11.0       1.00      1.00      1.00     17567
        13.0       1.00      1.00      1.00         1
        14.0       1.00      1.00      1.00        41
        15.0       0.98      0.99      0.98        83
        16.0       0.00      0.00      0.00         2
        17.0       0.99      0.99      0.99       181
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Side-by-side summary of both runs: one line per metric.
print(
    f"Non-Augmented Accuracy: {acc_non_aug}",
    f"Non-Augmented F1 Score: {f1_non_aug}",
    f"Augmented Accuracy: {acc_aug}",
    f"Augmented F1 Score: {f1_aug}",
    sep="\n",
)

Non-Augmented Accuracy: 0.9990727385122604
Non-Augmented F1 Score: 0.7551293198861161
Augmented Accuracy: 0.9991757675664538
Augmented F1 Score: 0.8375445353913546


In [None]:
# TODO: SMOTE-NC (for augmented dataset) - not implemented yet

In [None]:
# TODO: SMOTE-NC (for nonaugmented dataset) - not implemented yet