# Train Final Aegis Zero Model

This notebook trains the **Production XGBoost Model** for the Aegis Zero AI Engine.

**Dataset:** `dhoogla/cicids2017`

**Instructions:**
1. **Run All Cells**.
2. The script will automatically download the dataset from Kaggle.
   - *Fallback:* If auto-download fails (e.g., 403 Forbidden), please download the zip manually from Kaggle and upload it to the Files area.
3. It will process the data and train the model (`ROC AUC > 0.99` expected).
4. The final model `xgboost_final.joblib` is saved to the working directory and, when running in Google Colab, downloaded automatically.

In [None]:
# Install the packages this notebook needs into the kernel's environment (-q keeps pip quiet).
# NOTE(review): versions are unpinned, so a fresh run may pick up different releases — consider pinning.
%pip install -q xgboost fastparquet pyarrow kaggle

In [None]:
import glob
import os
import subprocess
import zipfile

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Configuration: file name of the exported model and the folder that holds the dataset
MODEL_NAME = "xgboost_final.joblib"
DATA_DIR = "./data"

# Make sure the data folder exists before any cell tries to write into it
os.makedirs(name=DATA_DIR, exist_ok=True)

In [None]:
# 1. Data Acquisition (Auto-Magic)
KAGGLE_DATASET = 'dhoogla/cicids2017'
KAGGLE_TOKEN_PATH = os.path.expanduser('~/.kaggle/kaggle.json')


def find_data_files(data_dir):
    """Return the sorted .parquet/.csv files found in ``data_dir``.

    The top level of ``data_dir`` is checked first; sub-folders are searched
    only when the top level holds no data file. The result is sorted because
    glob order depends on the filesystem, and a stable order keeps later
    sampling and splitting reproducible.
    """
    def _collect(*sub_parts):
        hits = []
        for ext in ('parquet', 'csv'):
            pattern = os.path.join(data_dir, *sub_parts, f'*.{ext}')
            hits.extend(glob.glob(pattern, recursive=True))
        return sorted(hits)

    return _collect() or _collect('**')


def download_from_kaggle(dataset, data_dir):
    """Download and unzip ``dataset`` into ``data_dir`` using the Kaggle CLI.

    Raises:
        RuntimeError: if the CLI exits with a non-zero status.
        OSError: if the ``kaggle`` executable cannot be started.
    """
    # subprocess is used instead of the `!kaggle ...` shell escape because the
    # escape does not raise on failure, so a failed download went unnoticed.
    result = subprocess.run(
        ['kaggle', 'datasets', 'download', '-d', dataset, '--unzip', '-p', data_dir],
        capture_output=True,
        text=True,
    )
    if result.stdout.strip():
        print(result.stdout.strip())
    if result.returncode != 0:
        details = result.stderr.strip() or result.stdout.strip()
        raise RuntimeError(details or f"kaggle exited with status {result.returncode}")


print("Checking for data...")

# Credentials are deliberately NOT embedded in the notebook. The Kaggle CLI picks
# them up from the KAGGLE_USERNAME / KAGGLE_KEY environment variables or from
# ~/.kaggle/kaggle.json.
# NOTE(review): the API key that used to be hard-coded in this cell remains in
# the notebook's history and should be revoked in the Kaggle account settings.
if os.path.isfile(KAGGLE_TOKEN_PATH):
    # Keep an existing token file private to the current user.
    os.chmod(KAGGLE_TOKEN_PATH, 0o600)

data_files = find_data_files(DATA_DIR)

if not data_files:
    has_env_credentials = bool(os.environ.get('KAGGLE_USERNAME') and os.environ.get('KAGGLE_KEY'))
    if not (has_env_credentials or os.path.isfile(KAGGLE_TOKEN_PATH)):
        print("No Kaggle credentials found: set KAGGLE_USERNAME / KAGGLE_KEY "
              "or provide ~/.kaggle/kaggle.json.")
    print(f"Attempting Kaggle download ({KAGGLE_DATASET})...")
    try:
        download_from_kaggle(KAGGLE_DATASET, DATA_DIR)
    except Exception as e:
        # Best effort: a failed download falls through to the manual-upload path.
        print(f"Kaggle download warning: {e}")
    data_files = find_data_files(DATA_DIR)

# Manual Fallback: only unpack uploaded zips when no data is available yet,
# so existing data is not re-extracted on every run.
if not data_files:
    for zf in sorted(glob.glob('*.zip')):
        print(f"Unzipping manual upload: {zf}...")
        try:
            with zipfile.ZipFile(zf, 'r') as zip_ref:
                zip_ref.extractall(DATA_DIR)
        except zipfile.BadZipFile as e:
            print(f"Skipping {zf}: {e}")
    data_files = find_data_files(DATA_DIR)

# Verify
if not data_files:
    raise RuntimeError("CRITICAL: No data files found! Please upload the dataset zip file manualy.")
else:
    print(f"Found {len(data_files)} data files. Ready.")

In [None]:
# 2. Load & Explore
SAMPLE_FRAC = 0.3   # fraction of each file that is kept, to speed up training
SAMPLE_SEED = 42    # fixed seed so the same rows are sampled on every run

dfs = []
skipped_files = []
for path in data_files:
    try:
        if path.endswith('.parquet'):
            df = pd.read_parquet(path)
        else:
            df = pd.read_csv(path)
        dfs.append(df.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED))
    except Exception as e:
        # Best effort: an unreadable file is reported and left out.
        skipped_files.append(path)
        print(f"Skipping {path}: {e}")

# pd.concat([]) fails with an unhelpful "No objects to concatenate"; stop with a clear message instead.
if not dfs:
    raise RuntimeError(
        f"None of the {len(data_files)} data files could be loaded; see the 'Skipping' messages above."
    )

full_df = pd.concat(dfs, ignore_index=True)
if skipped_files:
    print(f"Warning: {len(skipped_files)} of {len(data_files)} files were skipped.")
print(f"Loaded {len(full_df)} samples.")

In [None]:
# 3. Preprocessing
# Columns the model is trained on (canonical names).
SELECTED_FEATURES = [
    'Bwd Packet Length Std',
    'Bwd Packet Length Mean',
    'Average Packet Size',
    'Flow Bytes/s',
    'Flow Packets/s',
    'Fwd IAT Mean',
    'Fwd IAT Max',
    'Fwd IAT Min',
    'Fwd IAT Total',
    'Total Fwd Packets',
    'Subflow Fwd Packets',
    'Avg Bwd Segment Size'
]
LABEL_COLUMN = 'Label'

# Alternative column spellings mapped to the canonical names used above.
COLUMN_ALIASES = {
    'Avg Packet Size': 'Average Packet Size',
    'Subflow Fwd Pkts': 'Subflow Fwd Packets',
    'Tot Fwd Pkts': 'Total Fwd Packets'
}

# Clean Column Names (new frames are built instead of mutating in place, so the cell can be re-run safely)
stripped_df = full_df.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)
# Rename an alias only when the canonical column is absent, otherwise the rename would create duplicate columns.
col_map = {
    alias: canonical
    for alias, canonical in COLUMN_ALIASES.items()
    if alias in stripped_df.columns and canonical not in stripped_df.columns
}
renamed_df = stripped_df.rename(columns=col_map)

required_columns = SELECTED_FEATURES + [LABEL_COLUMN]
missing_columns = [c for c in required_columns if c not in renamed_df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Clean Data: treat +/-inf as missing, then keep rows that are complete in the
# columns actually used (features + label). Gaps in unused columns no longer discard rows.
feature_values = renamed_df[SELECTED_FEATURES].replace([np.inf, -np.inf], np.nan)
row_is_usable = feature_values.notna().all(axis=1) & renamed_df[LABEL_COLUMN].notna()
full_df = renamed_df.loc[row_is_usable].copy()

if full_df.empty:
    raise RuntimeError("No usable rows remain after removing missing/infinite feature values.")

# Binary target: 0 for 'benign' (case- and whitespace-insensitive), 1 for any other label.
full_df['is_attack'] = (
    full_df[LABEL_COLUMN].astype(str).str.strip().str.lower().ne('benign').astype(int)
)

X = full_df[SELECTED_FEATURES]
y = full_df['is_attack']
print(f"Features Shape: {X.shape}")

In [None]:
# 4. Train Model
# Hold out 20% of the rows for evaluation, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Positive-class weight = negatives / positives in the training split (1.0 when there are no positives)
n_pos = y_train.sum()
n_neg = len(y_train) - n_pos
if n_pos > 0:
    scale_pos = n_neg / n_pos
else:
    scale_pos = 1.0

print(f"Training XGBoost (Pos Weight: {scale_pos:.2f})...")

# Hyper-parameters collected in one place so they are easy to read and tweak
xgb_params = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos,
    n_jobs=-1,
    random_state=42,
    tree_method='hist',
)
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Score the held-out split using the predicted probability of the positive class
y_prob = model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_prob)
print(f"ROC AUC: {test_auc:.4f}")

In [None]:
# 5. Export
# Persist the trained model to MODEL_NAME in the working directory.
joblib.dump(model, MODEL_NAME)
print(f"Model saved: {MODEL_NAME}")

# Offer a browser download when running in Google Colab. Only a missing
# google.colab package means "not in Colab"; a real download failure is
# reported as such instead of being hidden by a bare except.
try:
    from google.colab import files
except ImportError:
    print("Auto-download skipped (not in Colab).")
else:
    try:
        files.download(MODEL_NAME)
    except Exception as e:
        # Best effort: the model is already saved, so a failed download is not fatal.
        print(f"Auto-download failed ({e}); the model is still saved as {MODEL_NAME}.")