In [22]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

In [23]:
# Reference: https://github.com/Saurabh2805/kdd_cup_99/blob/master/KDD_CUP_99_dataset_1.ipynb
# Label = 'intrusion_type'
columns = ['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot','num_failed_logins','logged_in','num_compromised','root_shell','su_attempted',
'num_root','num_file_creations','num_shells','num_access_files','num_outbound_cmds',
'is_host_login',
'is_guest_login',
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate',
'intrusion_type']

In [24]:
df = pd.read_csv('sample_data/kddcup.data_10_percent_testing.csv', names=columns, header=None)
print(f"Total rows and columns: {df.shape}")

Total rows and columns: (494021, 42)


In [25]:
# Ordinal Encoding
columns_encode = ['protocol_type','service','flag','intrusion_type']
df[columns_encode] = OrdinalEncoder().fit_transform(df[columns_encode])

In [26]:
# Check for NANs
print(f"Number of NANs: {df.isna().sum().sum()}")
df.dropna(inplace=True)
print(f"Number of NaNs after drop: {df.isna().sum().sum()}")

Number of NANs: 0
Number of NaNs after drop: 0


In [27]:
duplicates_before = df.duplicated().sum()
print(f"Number of duplicate rows before removal: {duplicates_before}")
df.drop_duplicates(inplace=True)
duplicates_after = df.duplicated().sum()
print(f"Number of duplicate rows after removal: {duplicates_after}")

Number of duplicate rows before removal: 348435
Number of duplicate rows after removal: 0


In [28]:
print(df.shape)

(145586, 42)


In [29]:
print("Class distribution:")
print(df['intrusion_type'].value_counts())

Class distribution:
intrusion_type
11.0    87832
9.0     51820
0.0       968
20.0      918
17.0      906
21.0      893
5.0       651
18.0      641
15.0      416
14.0      206
10.0      158
3.0        53
1.0        30
22.0       20
6.0        19
4.0        12
16.0       10
7.0         9
2.0         8
8.0         7
13.0        4
12.0        3
19.0        2
Name: count, dtype: int64


In [30]:
#Random forest on non-augmented dataset.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1. Split features and target
X = df.drop(columns=['intrusion_type'])
y = df['intrusion_type']

# 2. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 3. Train Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# 4. Evaluate on test set
y_pred = rf.predict(X_test)
print("Classification Report (Non-Augmented):")
print(classification_report(y_test, y_pred))


Classification Report (Non-Augmented):
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00       194
         1.0       0.67      0.67      0.67         6
         2.0       0.00      0.00      0.00         2
         3.0       1.00      1.00      1.00        11
         4.0       1.00      0.50      0.67         2
         5.0       1.00      0.98      0.99       130
         6.0       1.00      0.75      0.86         4
         7.0       0.00      0.00      0.00         2
         8.0       0.00      0.00      0.00         1
         9.0       1.00      1.00      1.00     10364
        10.0       0.97      0.97      0.97        32
        11.0       1.00      1.00      1.00     17567
        13.0       1.00      1.00      1.00         1
        14.0       1.00      1.00      1.00        41
        15.0       0.98      1.00      0.99        83
        16.0       0.00      0.00      0.00         2
        17.0       0.99      0.99      0.9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#Random Oversampling with Smoothing for nonaugmented dataset

In [31]:
#SMOTENC for nonaugmented dataset
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import OrdinalEncoder

# Drop classes with fewer than 6 samples
vc = df['intrusion_type'].value_counts()
valid_classes = vc[vc >= 6].index
df = df[df['intrusion_type'].isin(valid_classes)]

#Categorical column names
categorical_cols = ['protocol_type', 'service', 'flag']
cat_indices = [df.columns.get_loc(col) for col in categorical_cols]

# Split features and target
X = df.drop(columns=['intrusion_type'])
y = df['intrusion_type']

#Apply SMOTENC
smote_nc = SMOTENC(categorical_features=cat_indices, random_state=42, sampling_strategy='auto')
X_resampled, y_resampled = smote_nc.fit_resample(X, y)

print("Original dataset class distribution:")
print(y.value_counts())

print("\nAugmented dataset class distribution:")
print(y_resampled.value_counts())
# Create augmented DataFrame
df_augmented = pd.DataFrame(X_resampled, columns=X.columns)
df_augmented['intrusion_type'] = y_resampled

print("Original dataset class distribution:")
print(y.value_counts())

print("\nAugmented dataset class distribution:")
print(y_resampled.value_counts())


Original dataset class distribution:
intrusion_type
11.0    87832
9.0     51820
0.0       968
20.0      918
17.0      906
21.0      893
5.0       651
18.0      641
15.0      416
14.0      206
10.0      158
3.0        53
1.0        30
22.0       20
6.0        19
4.0        12
16.0       10
7.0         9
2.0         8
8.0         7
Name: count, dtype: int64

Augmented dataset class distribution:
intrusion_type
11.0    87832
1.0     87832
7.0     87832
9.0     87832
18.0    87832
3.0     87832
14.0    87832
20.0    87832
15.0    87832
5.0     87832
6.0     87832
2.0     87832
0.0     87832
4.0     87832
17.0    87832
10.0    87832
8.0     87832
22.0    87832
21.0    87832
16.0    87832
Name: count, dtype: int64
Original dataset class distribution:
intrusion_type
11.0    87832
9.0     51820
0.0       968
20.0      918
17.0      906
21.0      893
5.0       651
18.0      641
15.0      416
14.0      206
10.0      158
3.0        53
1.0        30
22.0       20
6.0        19
4.0        12
16.0  

In [33]:
# Use your augmented DataFrame
X_aug = df_augmented.drop(columns=['intrusion_type'])
y_aug = df_augmented['intrusion_type']

# Train-test split
X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(
    X_aug, y_aug, test_size=0.2, stratify=y_aug, random_state=42
)

# Train and evaluate
rf_aug = RandomForestClassifier(random_state=42)
rf_aug.fit(X_train_aug, y_train_aug)
y_pred_aug = rf_aug.predict(X_test_aug)

print("Classification Report (SMOTENC-Augmented):")
print(classification_report(y_test_aug, y_pred_aug))


Classification Report (SMOTENC-Augmented):
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17566
         1.0       1.00      1.00      1.00     17566
         2.0       1.00      1.00      1.00     17566
         3.0       1.00      1.00      1.00     17566
         4.0       1.00      1.00      1.00     17567
         5.0       1.00      1.00      1.00     17566
         6.0       1.00      1.00      1.00     17567
         7.0       1.00      1.00      1.00     17567
         8.0       1.00      1.00      1.00     17566
         9.0       1.00      1.00      1.00     17567
        10.0       1.00      1.00      1.00     17567
        11.0       1.00      1.00      1.00     17566
        14.0       1.00      1.00      1.00     17567
        15.0       1.00      1.00      1.00     17566
        16.0       1.00      1.00      1.00     17567
        17.0       1.00      1.00      1.00     17566
        18.0       1.00      1.00     