In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

In [2]:

data_path = "../data/cleaned/cicids2017_cleaned_nids.csv"
data = pd.read_csv(data_path)

print("Loaded dataset shape:", data.shape)

Loaded dataset shape: (2522009, 71)


In [3]:
print(data.columns)

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Fwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length',
       ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance',
       'FIN Flag Count', ' SYN Flag Count', 

In [4]:
data.columns = data.columns.str.strip()

In [5]:
print(data.columns)

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags',
       'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
       'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length',
       'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
       'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count',

In [6]:

X = data.drop('Label', axis=1)
y = data['Label']

In [7]:
# Encode labels (Binary: BENIGN = 0, Attacks = 1)
y = y.replace({
    'BENIGN': 0,
    'DoS Hulk': 1, 'DoS GoldenEye': 1, 'DoS Slowhttptest': 1, 'DoS slowloris': 1,
    'DDoS': 1, 'Heartbleed': 1, 'Bot': 1,
    'Web Attack � XSS': 1, 'Web Attack � Brute Force': 1, 'Web Attack � Sql Injection': 1,
    'Infiltration': 1, 'PortScan': 1, 'FTP-Patator': 1, 'SSH-Patator': 1
})
print("Unique labels after encoding:", y.unique())

Unique labels after encoding: [0 1]


  y = y.replace({


In [8]:

# One-hot encode categorical features

categorical_cols = X.select_dtypes(include=['object', 'category']).columns
if len(categorical_cols) > 0:
    print("Categorical columns detected:", categorical_cols)
    X = pd.get_dummies(X, columns=categorical_cols)
else:
    print("No categorical columns detected.")

No categorical columns detected.


In [9]:

# Remove zero-variance columns

zero_var_cols = [col for col in X.columns if X[col].nunique() <= 1]
print("Zero variance columns removed:", zero_var_cols)
X = X.drop(columns=zero_var_cols)

Zero variance columns removed: []


In [10]:

# Remove highly correlated features (correlation > 0.95)

corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
corr_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print("Highly correlated columns removed:", corr_drop)
X = X.drop(columns=corr_drop)

Highly correlated columns removed: ['Total Backward Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Std', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Fwd IAT Total', 'Fwd IAT Max', 'Fwd Packets/s', 'Packet Length Std', 'SYN Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Average Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Fwd Header Length.1', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Idle Mean', 'Idle Max', 'Idle Min']


In [11]:

# Replace infinities & fill NaN

X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(X.mean(), inplace=True)

In [12]:

#  Outlier removal (Isolation Forest)

from sklearn.ensemble import IsolationForest

iso = IsolationForest(contamination=0.02, random_state=42)
yhat = iso.fit_predict(X)

mask = yhat != -1
X = X[mask]
y = y[mask]
print("After outlier removal:", X.shape)

After outlier removal: (2471568, 47)


In [13]:

# Scaling

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [14]:

#  Manual oversampling for balanced dataset

data_combined = pd.DataFrame(X_scaled, columns=X.columns)
data_combined['Label'] = y.values

majority_class = data_combined[data_combined['Label'] == 0]
minority_class = data_combined[data_combined['Label'] == 1]

minority_upsampled = resample(
    minority_class,
    replace=True,
    n_samples=len(majority_class),
    random_state=42
)

data_resampled = pd.concat([majority_class, minority_upsampled])
data_resampled = data_resampled.sample(frac=1, random_state=42).reset_index(drop=True)

X_resampled = data_resampled.drop('Label', axis=1).values
y_resampled = data_resampled['Label'].values

print("Resampled dataset shape:", X_resampled.shape)

Resampled dataset shape: (4111262, 47)


In [15]:

#  Train-test split

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

Training set: (3289009, 47)
Test set: (822253, 47)


In [16]:

# Save final preprocessed arrays

import os
output_dir = '../data/processed'
os.makedirs(output_dir, exist_ok=True)
np.save(os.path.join(output_dir, 'X_train.npy'), X_train)
np.save(os.path.join(output_dir, 'X_test.npy'), X_test)
np.save(os.path.join(output_dir, 'y_train.npy'), y_train)
np.save(os.path.join(output_dir, 'y_test.npy'), y_test)

print('All preprocessing completed — Data saved to', output_dir)

All preprocessing completed — Data saved to ../data/processed
