In [3]:
import os
import sys

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Import functions from main.py
# main.py is looked up one directory above the current working directory, so
# that directory must be on sys.path. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

from main import (
    load_data,
    preprocess_data,
    handle_imbalance,
    train_random_forest,
    train_logistic_regression,
    evaluate_model,
)

print("üöÄ Starting Model Comparison Pipeline...")

üöÄ Starting Model Comparison Pipeline...


In [4]:
# 2. Load and Prepare Data (Common Step)
# Tunable settings for this stage, named so they are easy to find and change.
MIN_SAMPLES_PER_CLASS = 2  # a stratified split needs at least 2 rows per class
TEST_SIZE = 0.2            # standard 80/20 split
RANDOM_STATE = 42          # fixed seed so the split is reproducible

data_path = "../data/raw/network_data.csv"
df = load_data(data_path)

# Preprocess (scaling + encoders)
# NOTE(review): fit=True runs on the full dataset *before* the train/test
# split. If preprocess_data fits its scaler here, test-set statistics leak
# into training -- confirm against main.py.
X, y, artifacts = preprocess_data(df, fit=True)

# Remove rare classes (Fix for stratification): classes with fewer than
# MIN_SAMPLES_PER_CLASS rows cannot be placed in both train and test.
class_counts = y.value_counts()
valid_classes = class_counts[class_counts >= MIN_SAMPLES_PER_CLASS].index
mask = y.isin(valid_classes)
X = X[mask]
y = y[mask]

# Split, stratified on the label so every remaining class is represented
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Apply SMOTE to Training Data ONLY (the test split keeps its original
# class distribution).
X_train_res, y_train_res = handle_imbalance(X_train, y_train)

print(f"\n‚úÖ Data Ready. Training Set: {X_train_res.shape}, Test Set: {X_test.shape}")

2025-12-05 03:47:05,397 - INFO - Loading data from ../data/raw/network_data.csv
2025-12-05 03:47:05,664 - INFO - Loaded 100655 records with 42 features
2025-12-05 03:47:05,667 - INFO - Preprocessing data
2025-12-05 03:47:05,696 - INFO - Encoded 11 target classes
2025-12-05 03:47:05,833 - INFO - Preprocessed 100655 samples with 41 features
2025-12-05 03:47:05,906 - INFO - Handling class imbalance with SMOTE
2025-12-05 03:47:05,908 - INFO - Original class distribution: {5: 77822, 8: 1926, 3: 722, 6: 10, 7: 10, 2: 9, 10: 6, 0: 6, 9: 6, 4: 6}
2025-12-05 03:47:05,908 - INFO - Adjusting SMOTE k_neighbors to 5 due to rare classes
2025-12-05 03:47:07,606 - INFO - Resampled class distribution: {5: 77822, 8: 77822, 3: 77822, 10: 77822, 0: 77822, 2: 77822, 6: 77822, 9: 77822, 7: 77822, 4: 77822}



‚úÖ Data Ready. Training Set: (778220, 41), Test Set: (20131, 41)


In [5]:
# ---------------------------------------------------------
# MODEL 1: RANDOM FOREST
# ---------------------------------------------------------
# Fit on the resampled training data, then score on the test split, which
# was not passed through handle_imbalance.
print("\nüå≤ Training Random Forest...")
rf_model = train_random_forest(X_train_res, y_train_res)
print("Evaluating Random Forest...")
rf_metrics = evaluate_model(rf_model, X_test, y_test)
# Bare expression so the notebook renders the metrics dict as the cell output.
# Several classes have only 1-3 test rows in the classification report, so
# the macro averages swing heavily on a single misclassified sample.
rf_metrics

2025-12-05 03:47:17,047 - INFO - Training Random Forest with 100 estimators



üå≤ Training Random Forest...


2025-12-05 03:47:23,736 - INFO - Random Forest training completed
2025-12-05 03:47:23,738 - INFO - Evaluating model performance
2025-12-05 03:47:23,835 - INFO - Precision (weighted): 0.9997
2025-12-05 03:47:23,836 - INFO - Recall (weighted): 0.9999
2025-12-05 03:47:23,836 - INFO - Precision (macro): 0.8000
2025-12-05 03:47:23,837 - INFO - Recall (macro): 0.8000
2025-12-05 03:47:23,855 - INFO - 
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00       180
           4       0.00      0.00      0.00         1
           5       1.00      1.00      1.00     19456
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         3
           8       1.00      1.00      1.00       481
           9       1.00      1.00      1.00         1
          10       0.00      0.00     

Evaluating Random Forest...


{'precision_weighted': 0.9997019751880643,
 'recall_weighted': 0.9998509761065024,
 'precision_macro': 0.7999845829693201,
 'recall_macro': 0.8}

In [6]:
# ---------------------------------------------------------
# MODEL 2: LOGISTIC REGRESSION
# ---------------------------------------------------------
# Same inputs as the Random Forest cell: fit on the resampled training data,
# score on the test split that was not passed through handle_imbalance.
print("\nüìà Training Logistic Regression...")
lr_model = train_logistic_regression(X_train_res, y_train_res)
print("Evaluating Logistic Regression...")
lr_metrics = evaluate_model(lr_model, X_test, y_test)
# Bare expression so the notebook renders the metrics dict as the cell output.
# Several classes have only 1-3 test rows in the classification report, so
# the macro averages swing heavily on a single misclassified sample.
lr_metrics

2025-12-05 03:47:28,607 - INFO - Training Logistic Regression (max_iter=1000)



üìà Training Logistic Regression...


2025-12-05 03:48:13,472 - INFO - Logistic Regression training completed
2025-12-05 03:48:13,473 - INFO - Evaluating model performance
2025-12-05 03:48:13,524 - INFO - Precision (weighted): 0.9993
2025-12-05 03:48:13,526 - INFO - Recall (weighted): 0.9972
2025-12-05 03:48:13,526 - INFO - Precision (macro): 0.5995
2025-12-05 03:48:13,527 - INFO - Recall (macro): 0.8664
2025-12-05 03:48:13,552 - INFO - 
Classification Report:
              precision    recall  f1-score   support

           0       0.40      1.00      0.57         2
           2       1.00      1.00      1.00         2
           3       0.98      1.00      0.99       180
           4       0.00      0.00      0.00         1
           5       1.00      1.00      1.00     19456
           6       0.40      0.67      0.50         3
           7       0.15      1.00      0.26         3
           8       1.00      1.00      1.00       481
           9       1.00      1.00      1.00         1
          10       0.07      1.0

Evaluating Logistic Regression...


{'precision_weighted': 0.9993377133096643,
 'recall_weighted': 0.9971685460235458,
 'precision_macro': 0.5994875995153645,
 'recall_macro': 0.8663839775219297}

In [7]:
# ---------------------------------------------------------
# COMPARISON TABLE
# ---------------------------------------------------------
print("\nüèÜ FINAL RESULTS üèÜ")

# Both metric dicts must describe the same metrics; otherwise the table would
# compare unlike quantities.
if set(rf_metrics) != set(lr_metrics):
    raise ValueError(
        "Metric names differ between models: "
        f"{sorted(set(rf_metrics) ^ set(lr_metrics))}"
    )

# Look each metric up by name in both dicts rather than pairing the two
# .values() lists positionally: if the dicts ever differ in key order, every
# row still carries the value that belongs to its label.
metric_names = list(rf_metrics)
results_df = pd.DataFrame(
    {
        "Random Forest": [rf_metrics[name] for name in metric_names],
        "Logistic Regression": [lr_metrics[name] for name in metric_names],
    },
    index=pd.Index(metric_names, name="Metric"),
)
display(results_df)


üèÜ FINAL RESULTS üèÜ


Unnamed: 0_level_0,Random Forest,Logistic Regression
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
precision_weighted,0.999702,0.999338
recall_weighted,0.999851,0.997169
precision_macro,0.799985,0.599488
recall_macro,0.8,0.866384


In [8]:
# Persist the Random Forest under the "best model" filename.
# NOTE(review): the choice of Random Forest is hard-coded ("usually wins"),
# not derived from the metrics computed earlier in the notebook -- confirm
# that is intended.
best_model_path = "../models/best_model_rf.pkl"

# Create the target directory first; exist_ok makes re-runs harmless.
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

joblib.dump(rf_model, best_model_path)
print(f"\n‚úÖ Best model saved to {best_model_path}")


‚úÖ Best model saved to ../models/best_model_rf.pkl
