# Classification Level 2 

## Data Loading

In [45]:
import joblib
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
# Folder that contains the joblib artifacts read below.
processed_path = "../data/processed/"

# Read the feature matrix and both label vectors, in that order.
X_full, y_bin, y_multi = (
    joblib.load(os.path.join(processed_path, artifact))
    for artifact in ("X_scaled.joblib", "y_lvl1.joblib", "y_lvl2_encoded.joblib")
)

# --- Level 2 filtering: keep only the rows flagged as attacks (y_bin == 1) ---
# The boolean mask is built once and applied to both the features and the labels
# so the two selections cannot drift apart.
is_attack = y_bin == 1
X_attack = X_full[is_attack]
y_attack_raw = y_multi[is_attack]

# XGBoost needs class ids that start at 0, so the surviving labels are
# re-encoded; `le` is kept to translate predictions back to the original ids.
le = LabelEncoder()
y_attack = le.fit_transform(y_attack_raw)

# Encoded id -> original class, e.g. 0 -> original class 1, 1 -> original class 2, ...
mapping = dict(enumerate(le.classes_))
print(f"Mapping detected: {mapping}")
print(f"Attack data loaded: {X_attack.shape}")

Mapping detected: {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7}
Attack data loaded: (556556, 62)


## Data Splitting

In [46]:
# Stage 1: stratified hold-out of 30% of the attack rows; the other 70% is the
# training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_attack,
    y_attack,
    stratify=y_attack,
    test_size=0.3,
    random_state=42,
)

# Stage 2: cut the hold-out in half, again stratified, giving a validation set
# and a test set of roughly 15% of the attack rows each.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.5,
    random_state=42,
)

# Row counts of the three partitions.
print(f"Train size: {X_train.shape[0]}")
print(f"Val size: {X_val.shape[0]}")
print(f"Test size: {X_test.shape[0]}")

Train size: 389589
Val size: 83483
Test size: 83484


## Training: Random Forest Model

In [47]:
from sklearn.ensemble import RandomForestClassifier

print("Training Random Forest...")

# Hyper-parameters gathered in one mapping so they are easy to review and tune.
rf_params = {
    "n_estimators": 50,
    "max_depth": 15,
    "class_weight": "balanced",  # re-weight classes to offset the class imbalance
    "n_jobs": -1,                # use every available core
    "random_state": 42,
}

# Build the forest and fit it on the training partition (fit returns the estimator).
rf_multi = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Per-class precision / recall / F1 on the validation partition.
y_val_pred = rf_multi.predict(X_val)
print("\n--- Validation Report (RF) ---")
print(classification_report(y_val, y_val_pred))

Training Random Forest...

--- Validation Report (RF) ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       294
           1       0.99      0.93      0.96      2301
           2       1.00      1.00      1.00     19203
           3       1.00      1.00      1.00     37757
           4       1.00      1.00      1.00     23820
           5       1.00      0.71      0.83         7
           6       0.36      0.86      0.51       101

    accuracy                           1.00     83483
   macro avg       0.91      0.93      0.90     83483
weighted avg       1.00      1.00      1.00     83483



## Training: XGBoost Model

In [None]:
from xgboost import XGBClassifier

print("Training XGBoost with Early Stopping...")

# `early_stopping_rounds` is an estimator parameter, not a `fit()` argument:
# passing it to `fit()` was deprecated in XGBoost 1.6 and removed in 2.0, where
# it raises a TypeError. Configuring it on the constructor works on 1.6+.
xgb_multi = XGBClassifier(
    n_estimators=500,           # upper bound; early stopping may use fewer rounds
    learning_rate=0.1,
    max_depth=6,
    objective='multi:softprob',
    early_stopping_rounds=10,   # stop after 10 rounds without validation improvement
    n_jobs=-1,
    random_state=42
)

# The validation partition is the early-stopping monitor.
xgb_multi.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Validation check.
# NOTE(review): X_val also drove early stopping, so this report is likely
# optimistic; rely on the held-out test set for the final figures.
y_val_pred_xgb = xgb_multi.predict(X_val)
print("\n--- Validation Report (XGBoost) ---")
print(classification_report(y_val, y_val_pred_xgb))

Training XGBoost with Early Stopping...


  num_parallel_tree: Optional[int] = None,


## Final Test and Saving the Best Model

In [None]:
# Final predictions on the held-out test set.
# NOTE(review): the Random Forest is hard-coded as the model to evaluate and
# save; this cell does not compare it against any other trained model.
y_test_pred = rf_multi.predict(X_test)
print("\n--- FINAL TEST REPORT (Level 2) ---")
# Per-class precision, recall and F1, plus overall accuracy. Class ids are the
# 0-based encoded labels; use `le.classes_` to map them back to the originals.
print(classification_report(y_test, y_test_pred))

# Confusion matrix (rows = actual class, columns = predicted class).
plt.figure(figsize=(10,8))
sns.heatmap(confusion_matrix(y_test, y_test_pred), annot=True, fmt='d', cmap='Reds')
plt.title("Confusion Matrix - Level 2 (Test Set)")
plt.xlabel("Predicted Class")
plt.ylabel("Actual Class")
plt.show()

# Persist the multi-class model AND the label encoder needed to decode its
# predictions back to the original class ids.
model_path = "../data/trained_models/"
# Create the output folder when it is missing; joblib.dump does not create
# parent directories and would otherwise fail with FileNotFoundError.
os.makedirs(model_path, exist_ok=True)
joblib.dump(rf_multi, os.path.join(model_path, "model_lvl2_multiclass.joblib"))
joblib.dump(le, os.path.join(model_path, "label_encoder_lvl2_final.joblib"))
print("Model and LabelEncoder saved successfully.")