## 1. Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_kddcup99
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    auc
)
import warnings
# Silence all library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Plot defaults: seaborn grid theme and a 10x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

print("Libraries imported successfully!")

## 2. Load and Explore Dataset

In [None]:
# Fetch the 10% sample of KDD Cup 1999 (smaller, so faster to work with);
# rows are shuffled with a fixed seed so reruns see the same order.
print("Loading KDD Cup 1999 dataset (10% subset)...")
data = fetch_kddcup99(percent10=True, shuffle=True, random_state=42)

# Wrap the feature matrix and the label vector in pandas containers
X_raw = pd.DataFrame(data["data"])
y_raw = pd.Series(data["target"])

n_samples, n_features = X_raw.shape
print(f"Dataset shape: {X_raw.shape}")
print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_features}")

In [None]:
# Give the positional columns the names f0, f1, ... (KDD has 41 features)
X_raw.columns = ["f" + str(position) for position in range(len(X_raw.columns))]

# Preview the top of the table (rendered as the cell's output)
print("First 5 rows of raw data:")
X_raw.head()

## 3. Data Preprocessing

In [None]:
# Object-dtype columns may hold byte strings: turn those cells into text and
# pass every other value through unchanged.
object_columns = X_raw.select_dtypes([object]).columns
for name in object_columns:
    X_raw[name] = X_raw[name].apply(
        lambda cell: cell.decode() if isinstance(cell, bytes) else cell
    )

# Summarise how many columns ended up with each dtype
print("Data types after decoding:")
dtype_summary = X_raw.dtypes.value_counts()
print(dtype_summary)

In [None]:
# Process target variable: decode byte labels, then map to binary
# (the 'normal.' label -> 0, every other label -> 1)
y = y_raw.apply(lambda b: b.decode() if isinstance(b, bytes) else b)
y_binary = (y != 'normal.').astype(int)

# Tally each class once, in the fixed label order [0, 1].
# value_counts() on its own orders by frequency, which would attach the bar
# colours below to whichever class is larger instead of to a specific label
# (and would put the bars in an order that contradicts the x-axis caption).
class_counts = y_binary.value_counts().reindex([0, 1], fill_value=0)
n_normal = class_counts.loc[0]
n_attack = class_counts.loc[1]
n_total = len(y_binary)

# Check class distribution
print("\nClass distribution:")
print(f"Normal (0): {n_normal} ({n_normal / n_total * 100:.2f}%)")
print(f"Attack (1): {n_attack} ({n_attack / n_total * 100:.2f}%)")

# Visualize class distribution: green bar = class 0 (normal), red bar = class 1 (attack)
plt.figure(figsize=(8, 5))
class_counts.plot(kind='bar', color=['#2ecc71', '#e74c3c'])
plt.title('Class Distribution: Normal vs Attack', fontsize=14, fontweight='bold')
plt.xlabel('Class (0=Normal, 1=Attack)', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Column groups for preprocessing.
# In KDD Cup 1999: f1=protocol_type, f2=service, f3=flag are categorical;
# every remaining column is treated as numeric.
categorical_cols = ['f1', 'f2', 'f3']
categorical_lookup = set(categorical_cols)
numeric_cols = [name for name in X_raw.columns if name not in categorical_lookup]

print(f"Categorical columns: {len(categorical_cols)}")
print(f"Numeric columns: {len(numeric_cols)}")
print(f"\nCategorical features: {categorical_cols}")

# Report the cardinality of each categorical column plus up to ten example values
for col in categorical_cols:
    column_values = X_raw[col]
    distinct_count = column_values.nunique()
    examples = column_values.unique()[:10]
    print(f"\n{col}: {distinct_count} unique values")
    print(f"Sample values: {examples}")

## 4. Build Preprocessing and Model Pipeline

In [None]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones; columns listed in neither group are dropped.
categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
numeric_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_encoder, categorical_cols),
        ("num", numeric_scaler, numeric_cols),
    ],
    remainder='drop',
)

# Random forest settings, collected in one place
forest_settings = {
    "n_estimators": 200,
    "max_depth": 20,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "n_jobs": -1,
    "random_state": 42,
    "verbose": 1,
}
classifier = RandomForestClassifier(**forest_settings)

# Full pipeline: preprocessing step "pre" followed by classifier step "clf"
pipeline = Pipeline(steps=[("pre", preprocessor), ("clf", classifier)])

print("Pipeline created successfully!")
print("\nPipeline steps:")
print(pipeline)

## 5. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; stratify on the binary label so both
# partitions keep the same normal/attack ratio. Fixed seed for repeatability.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_binary}
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_binary, **split_options)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Per-partition class counts
for heading, labels in (("Training", y_train), ("Test", y_test)):
    normal_count = (labels == 0).sum()
    attack_count = (labels == 1).sum()
    print(f"\n{heading} set class distribution:")
    print(f"Normal: {normal_count} | Attack: {attack_count}")

## 6. Train the Model

In [None]:
# Fit the preprocessing steps and the classifier on the training partition
print("Training Random Forest model...", end="\n\n")
pipeline.fit(X_train, y_train)
print()
print("Model training completed!")

## 7. Model Evaluation

In [None]:
# Hard class predictions on the held-out set, plus the predicted probability
# of class 1 (attack), taken from column 1 of predict_proba
y_pred = pipeline.predict(X_test)
attack_column = 1
y_proba = pipeline.predict_proba(X_test)[:, attack_column]

# Accuracy on both partitions
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

for partition, accuracy in (("Training", train_score), ("Test", test_score)):
    print(f"{partition} Accuracy: {accuracy:.4f}")

In [None]:
# Per-class precision / recall / F1 report (key SOC metrics)
banner = "=" * 60
print("\n" + banner)
print("CLASSIFICATION REPORT (Binary: Normal vs Attack)")
print(banner)
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['Normal', 'Attack'],
    digits=4,
)
print(report_text)

# Area under the ROC curve, computed from the attack-class probabilities
roc_auc = roc_auc_score(y_test, y_proba)
print(f"\nROC AUC Score: {roc_auc:.4f}")

In [None]:
# Confusion matrix of true labels vs predicted labels on the test set
cm = confusion_matrix(y_test, y_pred)
class_labels = ['Normal', 'Attack']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    cbar_kws={'label': 'Count'},
)
plt.title('Confusion Matrix - Network Intrusion Detection', fontsize=14, fontweight='bold')
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
plt.show()

# Flatten the 2x2 matrix row by row into its four cells
tn, fp, fn, tp = cm.ravel()
breakdown_lines = [
    "\nConfusion Matrix Breakdown:",
    f"True Negatives (TN): {tn}",
    f"False Positives (FP): {fp} ← False alarms (normal classified as attack)",
    f"False Negatives (FN): {fn} ← Missed attacks (CRITICAL for SOC!)",
    f"True Positives (TP): {tp}",
]
for line in breakdown_lines:
    print(line)

In [None]:
# ROC curve: true positive rate against false positive rate across thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

plt.figure(figsize=(10, 6))
roc_label = f'ROC Curve (AUC = {roc_auc:.4f})'
plt.plot(fpr, tpr, color='#3498db', lw=2, label=roc_label)
# Diagonal reference line, labelled as the random-classifier baseline
diagonal = [0, 1]
plt.plot(diagonal, diagonal, color='#95a5a6', lw=2, linestyle='--', label='Random Classifier')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate (Recall)', fontsize=12)
plt.title('ROC Curve - Network Intrusion Detection', fontsize=14, fontweight='bold')
plt.legend(loc='lower right', fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Precision-recall curve (important for imbalanced datasets) and the area under it
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)

plt.figure(figsize=(10, 6))
pr_label = f'PR Curve (AUC = {pr_auc:.4f})'
plt.plot(recall, precision, color='#e74c3c', lw=2, label=pr_label)
plt.xlabel('Recall (Sensitivity)', fontsize=12)
plt.ylabel('Precision', fontsize=12)
plt.title(
    'Precision-Recall Curve - Critical for SOC Operations',
    fontsize=14,
    fontweight='bold',
)
plt.legend(loc='lower left', fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Precision-Recall AUC: {pr_auc:.4f}")

## 8. Feature Importance Analysis

In [None]:
# Pull the fitted steps out of the pipeline by name
preprocessor_fitted = pipeline['pre']
rf_model = pipeline['clf']
feature_importances = rf_model.feature_importances_

# Rebuild the post-preprocessing column names in transformer order:
# the one-hot encoded categorical columns first, then the numeric columns.
cat_features = preprocessor_fitted.named_transformers_['cat'].get_feature_names_out(categorical_cols)
all_feature_names = [*cat_features, *numeric_cols]

# Pair every name with its importance and rank from most to least important
feature_importance_df = pd.DataFrame(
    {'feature': all_feature_names, 'importance': feature_importances}
)
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

print("Top 20 Most Important Features:")
print(feature_importance_df.head(20))

In [None]:
# Visualize the highest-ranked feature importances (at most top_n of them)
top_n = 20
top_features = feature_importance_df.head(top_n)

# head() returns fewer than top_n rows when fewer features exist, so size the
# bar positions from the rows actually returned; passing range(top_n) in that
# case would give barh/yticks mismatched lengths.
n_shown = len(top_features)
bar_positions = range(n_shown)

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_features['importance'].values, color='#3498db')
plt.yticks(bar_positions, top_features['feature'].values)
plt.xlabel('Feature Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title(f'Top {n_shown} Most Important Features - Random Forest', fontsize=14, fontweight='bold')
# Flip the axis so the most important feature is drawn at the top
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 9. Cross-Validation Score

In [None]:
# 5-fold cross-validated accuracy of the full pipeline on the training partition
print("Performing 5-fold cross-validation...\n")
cv_scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Report the per-fold scores, their mean, and twice their standard deviation
mean_accuracy = cv_scores.mean()
spread = cv_scores.std() * 2
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {mean_accuracy:.4f} (+/- {spread:.4f})")

## 10. Save Model and Pipeline

In [None]:
import os

# Save the complete pipeline (preprocessing + model) to a single file
model_filename = 'kdd_rf_pipeline.joblib'
joblib.dump(pipeline, model_filename)

print(f"✓ Model pipeline saved to: {model_filename}")
# Query the size through os.path directly: `joblib.os` only works because the
# joblib package happens to import os internally, which is not public API.
size_mb = os.path.getsize(model_filename) / (1024 * 1024)
print(f"\nModel file size: {size_mb:.2f} MB")

# Round-trip check: reload the file and predict on the first five test rows.
# NOTE: joblib.load unpickles arbitrary objects - only load files you trust.
loaded_pipeline = joblib.load(model_filename)
test_prediction = loaded_pipeline.predict(X_test.iloc[:5])
print("\n✓ Model loaded successfully!")
print(f"Sample predictions from loaded model: {test_prediction}")

## 11. Model Summary and SOC Integration Notes

In [None]:
# Final summary of the run. Hyperparameters and the feature count are read
# from the pipeline and the data rather than typed in as literals, so the
# report cannot drift out of sync with the pipeline configuration.
summary_clf = pipeline.named_steps['clf']
n_input_features = X_raw.shape[1]
rule = "=" * 80

print(rule)
print("MODEL SUMMARY - NETWORK INTRUSION DETECTION")
print(rule)
print("\nDataset: KDD Cup 1999 (10% subset)")
print(f"Total samples: {len(X_raw):,}")
print(f"Training samples: {len(X_train):,}")
print(f"Test samples: {len(X_test):,}")
print("\nModel: Random Forest Classifier")
print(f"Number of trees: {summary_clf.n_estimators}")
print(f"Max depth: {summary_clf.max_depth}")
print("\nPerformance Metrics (Test Set):")
print(f"  • Accuracy: {test_score:.4f}")
print(f"  • ROC AUC: {roc_auc:.4f}")
print(f"  • Precision-Recall AUC: {pr_auc:.4f}")
print("\nCritical SOC Metrics:")
print(f"  • False Negatives (Missed Attacks): {fn}")
print(f"  • False Positives (False Alarms): {fp}")
print(f"\nModel saved to: {model_filename}")
print("\n" + rule)
print("\nSOC/SIEM INTEGRATION NOTES:")
print(rule)
print(f"""
1. Model Input: {n_input_features} network traffic features (protocol, service, connection stats)
2. Model Output: Binary classification (0=Normal, 1=Attack) + probability score
3. Alert Threshold: Use probability > 0.5 for high-confidence alerts
4. For SIEM integration:
   - Feed real-time network logs through preprocessing pipeline
   - Generate alerts for predictions with class=1
   - Include probability score in alert metadata for prioritization
   - Monitor false positive rate and adjust threshold if needed
5. Recommended retraining: Weekly with new labeled attack data
6. Key features to monitor: Top 20 features from importance analysis
""")
print(rule)

## 12. Example: Making Predictions on New Data

In [None]:
# Example: score the first ten held-out rows and compare with their true labels
sample_data = X_test.head(10)
sample_true = y_test.head(10)

sample_predictions = pipeline.predict(sample_data)
sample_probabilities = pipeline.predict_proba(sample_data)[:, 1]


def _label_name(value):
    # Class 0 is reported as 'Normal'; anything else as 'Attack'
    if value == 0:
        return 'Normal'
    return 'Attack'


# Side-by-side table of truth, prediction, attack probability and correctness
results_df = pd.DataFrame({
    'True_Label': [_label_name(value) for value in sample_true],
    'Predicted_Label': [_label_name(value) for value in sample_predictions],
    'Attack_Probability': sample_probabilities,
    'Correct': sample_true.values == sample_predictions,
})

print("Sample Predictions:")
print(results_df)
n_correct = results_df['Correct'].sum()
n_rows = len(results_df)
print(f"\nAccuracy on sample: {n_correct / n_rows * 100:.2f}%")

## Conclusion

This notebook demonstrates a complete ML pipeline for network intrusion detection:

✓ **Data Loading**: KDD Cup 1999 dataset (10% subset, 494,021 samples)

✓ **Preprocessing**: OneHotEncoding for categorical features, StandardScaler for numeric features

✓ **Model**: Random Forest with 200 trees (highly accurate and interpretable)

✓ **Evaluation**: Comprehensive metrics including confusion matrix, ROC curve, precision-recall

✓ **Explainability**: Feature importance analysis for understanding key attack indicators

✓ **Production-Ready**: Saved pipeline for deployment, ready for SIEM integration

### Key Takeaways for SOC Operations:
- **High Recall** is critical to minimize missed attacks (false negatives)
- **Precision** matters to reduce alert fatigue from false positives
- **Feature importance** helps SOC analysts understand which network characteristics indicate threats
- **Probability scores** enable alert prioritization and risk-based response