In [1]:
import pandas as pd
import os
import numpy as np

### Preprocessing

In [None]:
# List to store DataFrames
dfs = []

# Load the datasets
for i in os.listdir('./data/'):
    if i.endswith('.csv'):
        dfs.append(pd.read_csv(f'./data/{i}'))

In [9]:
# Concatenate all DataFrames into a single DataFrame
data = pd.concat(dfs, axis=0, ignore_index=True)

# Deleting DataFrames after merging
for df in dfs: del df

In [10]:
# Remove trailing spaces from column names
data.columns = data.columns.str.strip()

In [11]:
# replace infinite values with NaN
data.replace([float('inf'), float('-inf')], np.nan, inplace=True)

In [None]:
    # filling missing values with mean
missing_values = data.isna().sum()
for column, count in missing_values.items():
    if count != 0:
        data.fillna({column: data[column].mean()}, inplace=True)
        print(f"Filled missing values in '{column}' with mean: {data[column].mean()}")

Filled missing values in 'Flow Bytes/s' with mean: 1491719.0643420683
Filled missing values in 'Flow Packets/s' with mean: 70854.23306262739


In [13]:
# dropping columns based on correlation
columns_to_drop = [
    "Fwd PSH Flags",
    "Fwd URG Flags",
    "Avg Fwd Segment Size",
    "Avg Bwd Segment Size",
    "Fwd Header Length.1",
    "Subflow Fwd Packets",
    "Subflow Fwd Bytes",
    "Subflow Bwd Packets",
    "Subflow Bwd Bytes",
]

data.drop(columns=columns_to_drop, inplace=True)

print("No.of columns after dropping highly correlated columns: ", data.shape[1])

No.of columns after dropping highly correlated columns:  70


In [14]:
#Converting The Label Column into Normal and Abnormal representated by 0 and 1 respectively.
data['Label'] = data['Label'].map({'BENIGN': 0})
data['Label'] = data['Label'].fillna(1)

# convert Label column to int
data['Label'] = data['Label'].astype(int)

In [15]:
# Removal of duplicates
data = data.drop_duplicates(keep='first')

In [16]:
# Identify columns with identical data
identical_columns = {}
columns = data.columns
list_control = columns.copy().tolist()

# Compare each pair of columns
for col1 in columns:
    for col2 in columns:
        if col1 != col2:
            if data[col1].equals(data[col2]):
                if (col1 not in identical_columns) and (col1 in list_control):
                    identical_columns[col1] = [col2]
                    list_control.remove(col2)
                elif (col1 in identical_columns) and (col1 in list_control):
                    identical_columns[col1].append(col2)
                    list_control.remove(col2)
                    
for key, value in identical_columns.items():
    data.drop(columns=value, inplace=True)

In [17]:
# Dropping the columns with only one unique value
only_unique_cols = []
for col in data.columns:
    if len(data[col].unique()) == 1:
        only_unique_cols.append(col)

data.drop(only_unique_cols, axis=1, inplace=True)
del only_unique_cols

In [18]:
print("Final data shape: ", data.shape)
print("class distribution: ", data['Label'].value_counts())

Final data shape:  (2522362, 62)
class distribution:  Label
0    2096484
1     425878
Name: count, dtype: int64


In [19]:
# Save the cleaned data to a new CSV file
data.to_csv('./data/cleaned_data.csv', index=False)
print("Cleaned data saved to './data/cleaned_data.csv'")

Cleaned data saved to './data/cleaned_data.csv'


### Feature Scaling and SMOTE

In [9]:
import pandas as pd

data = pd.read_csv('./data/cleaned_data.csv')

In [11]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

In [12]:
# Separate features and target
X = data.drop('Label', axis=1)
y = data['Label']

In [13]:
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#save the scaler
joblib.dump(scaler, './scaler.pkl')

['./scaler.pkl']

In [15]:
# Split into train and test (SMOTE is applied only to training data)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, test_size=0.2, random_state=42)

In [17]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [20]:
# save the resampled data and test data
X_train_resampled = pd.DataFrame(X_train_resampled, columns=X.columns)
y_train_resampled = pd.Series(y_train_resampled)
X_test = pd.DataFrame(X_test, columns=X.columns)
y_test = pd.Series(y_test)

train = pd.concat([X_train_resampled, y_train_resampled], axis=1)
test = pd.concat([X_test, y_test], axis=1)

train.to_csv('./data/train.csv', index=False)
test.to_csv('./data/test.csv', index=False)

In [8]:
# Distribution before and after SMOTE
print("Distribution before SMOTE:")
print(y.value_counts())
print("Distribution after SMOTE:")
print(y_train_resampled.value_counts())

Distribution before SMOTE:
Label
0    2096484
1     425878
Name: count, dtype: int64
Distribution after SMOTE:
Label
0    1677187
1    1677187
Name: count, dtype: int64
