In [8]:
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [11]:
# 1. Configuration
# Relative paths: they resolve against the kernel's current working directory.
MODEL_DIR = "../models/"
DATA_PATH = "../data/processed/subset_ids2017.csv"
MODEL_PATH = os.path.join(MODEL_DIR, "deeptrace_rf_v1.pkl")

# Name of the label column in the CSV.
TARGET_COL = "Attack Type"

# Top-20 input features. The ORDER is significant: train() selects the columns
# in this order and writes this same list to model_features.json.
SELECTED_FEATURES = [
    "Bwd Packet Length Std",
    "Packet Length Variance",
    "Packet Length Std",
    "Total Length of Fwd Packets",
    "Average Packet Size",
    "Max Packet Length",
    "Bwd Packet Length Mean",
    "Packet Length Mean",
    "Subflow Fwd Bytes",
    "Fwd Packet Length Max",
    "Fwd Packet Length Mean",
    "Destination Port",
    "Fwd IAT Std",
    "Flow IAT Max",
    "Fwd Packet Length Std",
    "Total Fwd Packets",
    "act_data_pkt_fwd",
    "Init_Win_bytes_backward",
    "Bwd Packet Length Max",
    "Fwd Header Length",
]

def train():
    """Train, evaluate, and persist the Random Forest traffic classifier.

    Pipeline:
      1. Load the CSV at ``DATA_PATH``.
      2. Keep ``SELECTED_FEATURES`` as inputs and ``TARGET_COL`` as the label.
      3. Stratified 80/20 train/test split (``random_state=42``).
      4. Fit a class-weighted ``RandomForestClassifier``.
      5. Print a per-class classification report on the held-out 20%.
      6. Write the model to ``MODEL_PATH`` and the ordered feature list to
         ``model_features.json`` inside ``MODEL_DIR``.

    Returns:
        None. Results are printed and written to disk.

    Raises:
        KeyError: if the CSV lacks any selected feature or the target column.
    """
    print("⏳ Loading Dataset...")
    df = pd.read_csv(DATA_PATH)

    # Fail early with an explicit message instead of an opaque pandas KeyError.
    required_columns = [*SELECTED_FEATURES, TARGET_COL]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"{DATA_PATH} is missing required column(s): {missing_columns}"
        )

    # 2. Filter Columns
    print(f"✂️  Filtering to Top {len(SELECTED_FEATURES)} features...")
    X = df[SELECTED_FEATURES]
    y = df[TARGET_COL]

    # 3. Train/Test Split (80% Train, 20% Test)
    # stratify=y keeps each class's proportion the same in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # 4. Initialize Model
    # n_jobs=-1 uses all CPU cores; class_weight='balanced' reweights classes
    # inversely to their frequency to counter the class imbalance.
    print("🧠 Training Random Forest (this will take a few minutes)...")
    clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=20,          # Limit depth to prevent overfitting
        class_weight='balanced',
        n_jobs=-1,
        random_state=42
    )

    clf.fit(X_train, y_train)

    # 5. Evaluate
    # Read the per-class rows, not just overall accuracy: with imbalanced
    # labels the accuracy figure is dominated by the largest class.
    print("✅ Training Complete. Evaluating...")
    y_pred = clf.predict(X_test)

    print("\n📊 Classification Report:")
    print(classification_report(y_test, y_pred))

    # 6. Save Artifacts
    # exist_ok=True replaces the check-then-create pattern and is safe if the
    # directory already exists or is created concurrently.
    os.makedirs(MODEL_DIR, exist_ok=True)

    joblib.dump(clf, MODEL_PATH)
    print(f"\n💾 Model saved to: {MODEL_PATH}")

    # IMPORTANT: Save the feature list too!
    # The C++ engine MUST output features in this EXACT order.
    feature_list_path = os.path.join(MODEL_DIR, "model_features.json")
    with open(feature_list_path, "w", encoding="utf-8") as f:
        json.dump(SELECTED_FEATURES, f)
    print(f"📝 Feature list saved to: {feature_list_path}")

# Entry point: run the full training pipeline when this code executes as the
# main program (which includes running this cell in a notebook kernel).
if __name__ == "__main__":
    train()

⏳ Loading Dataset...
✂️  Filtering to Top 20 features...
🧠 Training Random Forest (this will take a few minutes)...
✅ Training Complete. Evaluating...

📊 Classification Report:
                precision    recall  f1-score   support

          Bots       0.46      0.61      0.52        18
   Brute Force       1.00      0.97      0.99        75
          DDoS       1.00      1.00      1.00      1023
           DoS       0.99      0.99      0.99      1535
Normal Traffic       1.00      1.00      1.00     16616
 Port Scanning       0.99      1.00      1.00       715
   Web Attacks       0.52      0.61      0.56        18

      accuracy                           1.00     20000
     macro avg       0.85      0.88      0.87     20000
  weighted avg       1.00      1.00      1.00     20000


💾 Model saved to: ../models/deeptrace_rf_v1.pkl
📝 Feature list saved to: ../models/model_features.json
