In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Reference: https://github.com/Saurabh2805/kdd_cup_99/blob/master/KDD_CUP_99_dataset_1.ipynb
# Column names, in file order, for the CSV that is loaded without a header row.
# There are 42 entries; the last one, 'intrusion_type', is the label.
columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'intrusion_type',  # label
]

In [3]:
# Load the CSV; it has no header row, so the names come from `columns`.
df = pd.read_csv(
    'sample_data/kddcup.data_10_percent_testing.csv',
    header=None,
    names=columns,
)
print(f"Total rows and columns: {df.shape}")

Total rows and columns: (83747, 42)


In [4]:
# Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder

# Columns that are replaced by numeric codes (three features plus the label).
columns_encode = ['protocol_type','service','flag','intrusion_type']

# Keep the fitted encoder rather than discarding it: after fitting,
# ordinal_encoder.categories_[i] lists the original values of
# columns_encode[i] in code order, so encoded values (e.g. predicted labels)
# can be mapped back to their names later.
ordinal_encoder = OrdinalEncoder()
df[columns_encode] = ordinal_encoder.fit_transform(df[columns_encode])

In [5]:
# Missing values: report how many cells are NaN, drop every row that
# contains one, then confirm that none remain.
NANs_before = df.isna().to_numpy().sum()
print(f"Number of NANs before removal: {NANs_before}")
df = df.dropna()
NANs_after = df.isna().to_numpy().sum()
print(f"Number of NaNs after removal: {NANs_after}")

Number of NANs before removal: 14
Number of NaNs after removal: 0


In [6]:
# Duplicate rows: report the count, keep only the first occurrence of each
# row, then confirm that no duplicates remain.
duplicates_before = df.duplicated(keep='first').sum()
print(f"Number of duplicate rows before removal: {duplicates_before}")
df = df.drop_duplicates(keep='first')
duplicates_after = df.duplicated(keep='first').sum()
print(f"Number of duplicate rows after removal: {duplicates_after}")

Number of duplicate rows before removal: 21964
Number of duplicate rows after removal: 0


In [7]:
# print(df.shape)

In [8]:
# Split into Train and Test
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is 'intrusion_type'.
x = df.drop('intrusion_type', axis=1)
y = df['intrusion_type']

# 80/20 hold-out split. The fixed random_state makes the split, and therefore
# every metric reported on x_test / y_test, reproducible across kernel
# restarts. train_size is implied by test_size, so it is not repeated.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

def RandomForest(x_train, y_train, x_test, y_test):
  """Fit a random forest on the training data and score it on the test data.

  A 100-tree RandomForestClassifier with a fixed seed (random_state=42) is
  fitted on (x_train, y_train) and used to predict x_test. Accuracy and the
  weighted-average F1 score against y_test are printed and also returned so
  that different runs can be compared in code.

  Args:
    x_train: Training features.
    y_train: Training labels.
    x_test: Test features.
    y_test: True labels for x_test.

  Returns:
    dict with keys 'accuracy' and 'f1' holding the two scores.
  """
  rf = RandomForestClassifier(n_estimators=100, random_state=42)
  rf.fit(x_train, y_train)

  y_predict = rf.predict(x_test)
  accuracy = accuracy_score(y_test, y_predict)
  # 'weighted' averages the per-class F1 scores weighted by class support.
  f1 = f1_score(y_test, y_predict, average='weighted')

  print(f"Accuracy: {accuracy}")
  print(f"F1 Score: {f1}")

  return {'accuracy': accuracy, 'f1': f1}

In [16]:
# Reference: https://github.com/atulpatelDS/Youtube/blob/main/Machine_Learning/Imbalanced_Dataset_Handling/Different%20Techniques%20to%20deal%20with%20Imbalanced%20Dataset%20(Imbalanced%20Classes)%20in%20Machine%20Learning.ipynb
# Random Oversampling (plain duplication by default; smoothing only when `shrinkage` is set)
from imblearn.over_sampling import RandomOverSampler

def random_oversampling(x_train, y_train, random_state=42, shrinkage=None):
  """Oversample the training data with imblearn's RandomOverSampler.

  Args:
    x_train: Training features.
    y_train: Training labels.
    random_state: Seed passed to the sampler so the resampled set is the same
      on every run. Pass None for a different random draw each time.
    shrinkage: Passed through to RandomOverSampler. None (the default) keeps
      plain random oversampling; a numeric value enables the sampler's
      smoothed bootstrap. NOTE(review): smoothing perturbs every feature,
      including the ordinal-encoded categorical ones -- confirm that is
      acceptable before enabling it.

  Returns:
    Tuple (x_train_ros, y_train_ros) with the resampled features and labels.
  """
  ros = RandomOverSampler(random_state=random_state, shrinkage=shrinkage)
  x_train_ros, y_train_ros = ros.fit_resample(x_train, y_train)
  return x_train_ros, y_train_ros

In [17]:
# Baseline: random forest on the original (non-augmented) training split,
# i.e. without any oversampling.
# The trailing ';' keeps the cell from echoing a return value; the function
# prints its own report.
RandomForest(x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test);

Accuracy: 0.9995144452537024


In [18]:
# Random oversampling (augmented dataset): only the training split is
# resampled; the model is still scored on the untouched test split.
# The trailing ';' keeps the cell from echoing a return value; the function
# prints its own report.
x_train_ros, y_train_ros = random_oversampling(x_train=x_train, y_train=y_train)
RandomForest(x_train=x_train_ros, y_train=y_train_ros, x_test=x_test, y_test=y_test);

Accuracy: 0.9993525936716031


In [None]:
# SMOTE-NC (for augmented dataset)

In [None]:
# SMOTE-NC (for nonaugmented dataset)