# IDS Preprocessing

In [1]:
# =========================
# 02_preprocessing.ipynb
# =========================

# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# 2. Path to the data: build the CSV location first, then load it into df
data_path = "../data/cleaned/"
dataset_file = os.path.join(data_path, "dataset_explored.csv")
df = pd.read_csv(dataset_file)

# Séparation des colonnes Target 

In [2]:
# Targets (never modified): three label series read straight from df
y_lvl1, y_lvl2, y_lvl3 = (
    df[label_col]
    for label_col in ("Binary_Label", "Attack_Family", "Label_Original")
)
# Features: every column of df except the four label columns
X = df.drop(
    ["Label", "Label_Original", "Attack_Family", "Binary_Label"],
    axis="columns",
)


In [3]:
# Display the names of the feature columns held in X
feature_names = X.columns
print(feature_names)

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

# Nettoyer les valeurs manquantes et infinies

In [4]:
import numpy as np

# 1. Clean X: turn +/-inf into NaN, then drop every row containing a NaN
X = X.replace([np.inf, -np.inf], np.nan).dropna()

# 2. Align the labels: keep only the rows that are still present in X
kept_rows = X.index
y_lvl1, y_lvl2, y_lvl3 = (
    labels.loc[kept_rows] for labels in (y_lvl1, y_lvl2, y_lvl3)
)


In [5]:
# Print the shape of X and of the three label series so their row counts can be compared
shape_report = (
    ("Shape de X :", X),
    ("Shape de y_lvl1 :", y_lvl1),
    ("Shape de y_lvl2 :", y_lvl2),
    ("Shape de y_lvl3 :", y_lvl3),
)
for caption, table in shape_report:
    print(caption, table.shape)


Shape de X : (2827876, 78)
Shape de y_lvl1 : (2827876,)
Shape de y_lvl2 : (2827876,)
Shape de y_lvl3 : (2827876,)


# Réduction des features corrélées

In [6]:
import pandas as pd
import numpy as np

# 1. Columns removed as part of the correlated-feature reduction.
# Every name must exist in X: DataFrame.drop raises KeyError on an unknown label.
to_drop = [
    'Total Backward Packets', 'Fwd Packet Length Std', 'Bwd Packet Length Std',
    'Fwd IAT Max', 'Fwd Packets/s', 'Packet Length Std',
    'CWE Flag Count', 'ECE Flag Count',
    'Avg Fwd Segment Size', 'Avg Bwd Segment Size',
    'Fwd Header Length.1',
    'Subflow Fwd Packets', 'Subflow Bwd Packets', 'Subflow Fwd Bytes',
    'Idle Max', 'Idle Min',
]

# X itself is left untouched; the reduced frame is a new object
X_reduced = X.drop(to_drop, axis="columns")
print("Nombre de features après suppression :", len(X_reduced.columns))


Nombre de features après suppression : 62


# 2. PRÉPARATION DES DONNÉES (PAS DE SCALING ICI)


In [7]:
# 4. Data preparation (WITHOUT global standardisation)
# No fit_transform is done here, to avoid data leakage: standardisation is
# applied separately for each model (L1, L2, L3) AFTER the split.
X_unscaled = X_reduced.copy()
print("Shape X_unscaled :", X_unscaled.shape)

# Cast every column to float32 to save space
X_unscaled_f32 = X_unscaled.astype(np.float32)


Shape X_unscaled : (2827876, 62)


In [8]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the Attack_Family labels and turn them into integer codes;
# the fitted encoder is kept in `le` so the codes can be mapped back to names.
le = LabelEncoder()
y_lvl2_encoded = le.fit_transform(y_lvl2)
encoded_classes = [*le.classes_]
print("Classes encodées :", encoded_classes)


Classes encodées : ['BENIGN', 'Bot', 'BruteForce', 'DDoS', 'DoS', 'PortScan', 'RareAttack', 'WebAttack']


# 3. SAUVEGARDE DES DONNÉES BRUTES


In [9]:
import joblib
import os

# Destination folder for the saved artefacts; created if it does not exist yet.
processed_path = "../data/processed/"
os.makedirs(processed_path, exist_ok=True)

# Guard against out-of-order cell execution: refuse to save features and labels
# whose row counts differ.
assert len(X_unscaled_f32) == len(y_lvl1) == len(y_lvl2_encoded), (
    "X_unscaled_f32, y_lvl1 and y_lvl2_encoded have different lengths"
)

# Unscaled float32 features and the level-1 / level-2 targets (compressed).
joblib.dump(X_unscaled_f32, os.path.join(processed_path, "X_unscaled.joblib"), compress=3)
joblib.dump(y_lvl1, os.path.join(processed_path, "y_lvl1.joblib"), compress=3)
joblib.dump(y_lvl2_encoded, os.path.join(processed_path, "y_lvl2_encoded.joblib"), compress=3)
# The fitted encoder is stored next to the encoded labels so codes can be decoded later.
joblib.dump(le, os.path.join(processed_path, "label_encoder_lvl2.joblib"))

# NOTE(review): y_lvl3 was first built from "Label_Original"; it is rebound here to the
# "Label" column, so the saved level-3 target is "Label" - confirm this is the intended one.
if "Label" in df.columns:
    y_lvl3 = df["Label"].loc[X.index]
    joblib.dump(y_lvl3, os.path.join(processed_path, "y_lvl3.joblib"), compress=3)
    print("Sauvegarde terminée avec succès.")
else:
    # Do not claim success when one of the expected files was not written.
    print("Sauvegarde partielle : colonne 'Label' absente, y_lvl3.joblib non écrit.")


Sauvegarde terminée avec succès.
