In [1]:
import pandas as pd
import os
from sklearn.datasets import fetch_kddcup99

print("📡 Starting KDD Cup 99 dataset download (light version)...")

# 1. Download data.
#    - percent10=True fetches only the 10% sample for quick testing.
#    - as_frame=True exposes features and target together as `kdd_data.frame`.
#    - With subset='SA' scikit-learn selects the abnormal (attack) samples at
#      random, so the seed is pinned; otherwise every run would export a
#      different CSV and the downstream results would not be reproducible.
RANDOM_STATE = 42
kdd_data = fetch_kddcup99(
    subset='SA',
    percent10=True,
    as_frame=True,
    random_state=RANDOM_STATE,
)

# 2. Take the combined DataFrame (features + target).
# Scikit-learn names the target column 'labels', but our main.py expects 'label'.
df = kdd_data.frame.rename(columns={'labels': 'label'})
# rename() silently does nothing when the source column is missing, so fail
# here with a clear message instead of later inside main.py.
assert 'label' in df.columns, "fetch_kddcup99 frame has no 'labels' column to rename"

# 3. Quick cleanup (convert bytes to strings to avoid format issues)
# The original dataset comes in binary format (e.g., b'tcp'); this cleans it up.
# Non-bytes values in object columns are passed through unchanged.
for col in df.columns:
    if df[col].dtype == object:
        df[col] = df[col].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

# 4. Save to the correct folder (path is relative to the notebook's working directory)
output_dir = "../data/raw"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "network_data.csv")

# index=False: the row index carries no information and main.py reads plain columns.
df.to_csv(output_path, index=False)

print(f"✅ Success! Data saved to: {output_path}")
print(f"📊 Dimensions: {df.shape} (Rows, Columns)")
print("🔍 Column preview:", list(df.columns[:5]), "...", list(df.columns[-1:]))

📡 Starting KDD Cup 99 dataset download (light version)...
✅ Success! Data saved to: ../data/raw/network_data.csv
📊 Dimensions: (100655, 42) (Rows, Columns)
🔍 Column preview: ['duration', 'protocol_type', 'service', 'flag', 'src_bytes'] ... ['label']


In [2]:
# Run the intrusion-detection pipeline script on the CSV exported to
# ../data/raw/network_data.csv. Both the script path and the data path are
# relative, so they resolve against the notebook kernel's working directory;
# the data path is handed to main.py as its command-line argument.
%run ../main.py ../data/raw/network_data.csv

2025-12-04 09:31:49,662 - INFO - Starting Network Intrusion Detection pipeline
2025-12-04 09:31:49,663 - INFO - Loading data from ../data/raw/network_data.csv
2025-12-04 09:31:49,950 - INFO - Loaded 100655 records with 42 features
2025-12-04 09:31:49,951 - INFO - Preprocessing data
2025-12-04 09:31:49,980 - INFO - Encoded 11 target classes
2025-12-04 09:31:50,118 - INFO - Preprocessed 100655 samples with 41 features
2025-12-04 09:31:50,190 - INFO - Train set: 80523 samples, Test set: 20131 samples
2025-12-04 09:31:50,190 - INFO - Handling class imbalance with SMOTE
2025-12-04 09:31:50,192 - INFO - Original class distribution: {5: 77822, 8: 1926, 3: 722, 6: 10, 7: 10, 2: 9, 10: 6, 0: 6, 9: 6, 4: 6}
2025-12-04 09:31:50,193 - INFO - Adjusting SMOTE k_neighbors to 5 due to rare classes
2025-12-04 09:31:51,940 - INFO - Resampled class distribution: {5: 77822, 8: 77822, 3: 77822, 10: 77822, 0: 77822, 2: 77822, 6: 77822, 9: 77822, 7: 77822, 4: 77822}
2025-12-04 09:31:51,941 - INFO - Training 