# Lab 03: Network Anomaly Detection

Build an anomaly detection system for network traffic using Isolation Forest, One-Class SVM, and Local Outlier Factor.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/depalmar/ai_for_the_win/blob/main/notebooks/lab03_anomaly_detection.ipynb)

## Learning Objectives
- Network flow feature engineering
- Isolation Forest for anomaly detection
- One-Class SVM and Local Outlier Factor
- Precision, recall, and F1 evaluation for imbalanced data

In [None]:
# Install dependencies (uncomment for Colab)
# !pip install scikit-learn pandas numpy matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Apply a Matplotlib style sheet to every figure created in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
# Seed NumPy's global RNG so the synthetic data drawn with np.random.* below
# is the same when the notebook is run top to bottom.
np.random.seed(42)

## 1. Generate Network Flow Data

In [None]:
# Build a labelled synthetic flow dataset: 900 benign flows plus 100
# attack-like flows (label 0 = normal, 1 = anomaly), shuffled together.
# NOTE: every column is drawn from NumPy's global RNG, so the order of the
# entries below determines the generated values -- do not reorder them.
n_normal = 900
n_anomaly = 100

# Benign traffic: moderate volumes, common service ports, mostly TCP
normal_data = dict(
    bytes_sent=np.random.lognormal(mean=8, sigma=1, size=n_normal),
    bytes_recv=np.random.lognormal(mean=9, sigma=1.2, size=n_normal),
    packets_sent=np.random.poisson(lam=50, size=n_normal),
    packets_recv=np.random.poisson(lam=80, size=n_normal),
    duration=np.random.exponential(scale=5, size=n_normal),
    dst_port=np.random.choice(
        [80, 443, 22, 53, 8080], size=n_normal, p=[0.3, 0.4, 0.1, 0.1, 0.1]
    ),
    protocol=np.random.choice(['TCP', 'UDP'], size=n_normal, p=[0.85, 0.15]),
    label=0,
)

# Attack-like traffic (C2, exfiltration, scanning)
anomaly_data = dict(
    # Much larger outbound transfers ...
    bytes_sent=np.random.lognormal(mean=12, sigma=2, size=n_anomaly),
    # ... answered by small responses
    bytes_recv=np.random.lognormal(mean=6, sigma=0.5, size=n_anomaly),
    # Many packets out, few back
    packets_sent=np.random.poisson(lam=500, size=n_anomaly),
    packets_recv=np.random.poisson(lam=20, size=n_anomaly),
    # Short bursts
    duration=np.random.uniform(low=0.001, high=0.5, size=n_anomaly),
    # Ports picked uniformly from a suspicious set
    dst_port=np.random.choice([4444, 8888, 31337, 6667], size=n_anomaly),
    protocol=np.random.choice(['TCP', 'UDP'], size=n_anomaly, p=[0.5, 0.5]),
    label=1,
)

df_normal = pd.DataFrame(normal_data)
df_anomaly = pd.DataFrame(anomaly_data)

# Stack both classes, then shuffle all rows with a fixed seed so the
# row order is reproducible.
combined = pd.concat([df_normal, df_anomaly], ignore_index=True)
df = combined.sample(frac=1, random_state=42)

print(f"Dataset shape: {df.shape}")
print("\nLabel distribution:")
print(df['label'].value_counts())

## 2. Feature Engineering

In [None]:
# Derive rate, ratio and port features from the raw per-flow counters.
# Durations are floored at 0.001 so the per-second rates below never
# divide by zero.
df['duration'] = df['duration'].clip(lower=0.001)
duration = df['duration']

# Volume totals (both directions combined)
total_bytes = df['bytes_sent'] + df['bytes_recv']
total_packets = df['packets_sent'] + df['packets_recv']
df['total_bytes'] = total_bytes
df['total_packets'] = total_packets

# Throughput
df['bytes_per_second'] = total_bytes / duration
df['packets_per_second'] = total_packets / duration

# Outbound share of the traffic; the +1 keeps the denominator non-zero
df['bytes_ratio'] = df['bytes_sent'] / (total_bytes + 1)
df['packets_ratio'] = df['packets_sent'] / (total_packets + 1)

# Average payload size per packet (same +1 guard)
df['bytes_per_packet'] = total_bytes / (total_packets + 1)

# Destination-port indicators, stored as 0/1 integers
suspicious_ports = [4444, 8888, 31337, 6667, 1337]
dst_port = df['dst_port']
df['is_well_known_port'] = (dst_port < 1024).astype(int)
df['is_suspicious_port'] = dst_port.isin(suspicious_ports).astype(int)

print("Engineered features:")
summary_cols = ['bytes_per_second', 'packets_per_second', 'bytes_ratio', 'bytes_per_packet']
print(df[summary_cols].describe())

In [None]:
# Overlay per-class density histograms for four features on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

features_to_plot = ['bytes_per_second', 'packets_per_second', 'bytes_ratio', 'duration']
class_styles = [(0, 'Normal', 'green'), (1, 'Anomaly', 'red')]

for ax, feature in zip(axes.flatten(), features_to_plot):
    for class_id, class_name, color in class_styles:
        values = df.loc[df['label'] == class_id, feature]
        # density=True normalises each class separately, so the 9:1 class
        # imbalance does not hide the anomaly histogram.
        ax.hist(values, bins=30, density=True, alpha=0.5,
                color=color, label=class_name)
    ax.set(xlabel=feature, ylabel='Density', title=f'{feature} Distribution')
    ax.legend()

plt.tight_layout()
plt.show()

## 3. Prepare Features for Anomaly Detection

In [None]:
# Columns fed to the detectors: rates, ratios, payload size, duration and
# the two port indicators.
feature_cols = [
    'bytes_per_second',
    'packets_per_second',
    'bytes_ratio',
    'packets_ratio',
    'bytes_per_packet',
    'duration',
    'is_well_known_port',
    'is_suspicious_port',
]

# Ground-truth labels are kept aside for evaluation only; the detectors
# below are fitted without them.
y = df['label'].values
feature_frame = df[feature_cols]
X = feature_frame.values

# RobustScaler centres on the median and scales by the IQR, so the extreme
# flows do not dominate the scaling.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

print(f"Feature matrix shape: {X_scaled.shape}")

## 4. Isolation Forest

In [None]:
# Fit an Isolation Forest on the scaled features.
# contamination is the expected share of anomalies; it is set to 0.1 here.
iso_forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)

# fit_predict labels each row -1 (anomaly) or 1 (normal)
iso_pred = iso_forest.fit_predict(X_scaled)

# Re-encode to the dataset's convention: 1 = anomaly, 0 = normal
iso_pred_binary = np.where(iso_pred == -1, 1, 0)

print("Isolation Forest Results:")
print(f"Predicted anomalies: {iso_pred_binary.sum()}")
print(f"Actual anomalies: {y.sum()}")

In [None]:
# Score the Isolation Forest predictions against the ground-truth labels.
# All three metrics treat label 1 (anomaly) as the positive class.
precision, recall, f1 = (
    metric(y, iso_pred_binary)
    for metric in (precision_score, recall_score, f1_score)
)

print("Isolation Forest Metrics:")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

# Confusion matrix heatmap: rows are actual classes, columns are predictions
class_names = ['Normal', 'Anomaly']
cm = confusion_matrix(y, iso_pred_binary)

plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Isolation Forest Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 5. One-Class SVM

In [None]:
# Fit a One-Class SVM with an RBF kernel.
# NOTE: for comparability with the other detectors, this cell fits on the
# full mixed dataset (normal and anomalous flows together). In practice,
# you'd train only on normal traffic and then score new flows.
ocsvm = OneClassSVM(
    nu=0.1,  # Upper bound on fraction of outliers
    kernel='rbf',
    gamma='scale',
)

# fit_predict labels each row -1 (outlier) or 1 (inlier);
# re-encode to 1 = anomaly, 0 = normal
ocsvm_pred = ocsvm.fit_predict(X_scaled)
ocsvm_pred_binary = np.where(ocsvm_pred == -1, 1, 0)

print("One-Class SVM Results:")
print(f"Predicted anomalies: {ocsvm_pred_binary.sum()}")

## 6. Local Outlier Factor

In [None]:
# Fit Local Outlier Factor on the scaled features, using each point's
# 20 nearest neighbours and an expected anomaly share of 0.1.
lof = LocalOutlierFactor(contamination=0.1, n_neighbors=20)

# fit_predict labels each row -1 (outlier) or 1 (inlier);
# re-encode to 1 = anomaly, 0 = normal
lof_pred = lof.fit_predict(X_scaled)
lof_pred_binary = np.where(lof_pred == -1, 1, 0)

print("LOF Results:")
print(f"Predicted anomalies: {lof_pred_binary.sum()}")

## 7. Compare All Models

In [None]:
# Side-by-side comparison of the three detectors on the same labels.
models = {
    'Isolation Forest': iso_pred_binary,
    'One-Class SVM': ocsvm_pred_binary,
    'Local Outlier Factor': lof_pred_binary
}

# One row of metrics per detector (positive class = anomaly)
results = [
    {
        'Model': model_name,
        'Precision': precision_score(y, predictions),
        'Recall': recall_score(y, predictions),
        'F1': f1_score(y, predictions),
    }
    for model_name, predictions in models.items()
]

results_df = pd.DataFrame(results)
print("Model Comparison:")
print(results_df.to_string(index=False))

# Grouped bar chart: one group per model, one bar per metric
metric_cols = ['Precision', 'Recall', 'F1']
scores_by_model = results_df.set_index('Model')[metric_cols]
scores_by_model.plot(kind='bar', figsize=(10, 5))
plt.title('Anomaly Detection Model Comparison')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

## 8. Anomaly Score Analysis

In [None]:
# Isolation Forest scores, negated so that a larger value means a more
# anomalous flow.
anomaly_scores = -iso_forest.score_samples(X_scaled)
df['anomaly_score'] = anomaly_scores

# Cut-off at the 90th percentile of all scores
score_threshold = np.percentile(anomaly_scores, 90)

# Per-class density histograms of the score, with the cut-off marked
fig, ax = plt.subplots(figsize=(10, 5))

class_styles = [(0, 'green', 'Normal'), (1, 'red', 'Anomaly')]
for class_id, color, class_name in class_styles:
    class_scores = df.loc[df['label'] == class_id, 'anomaly_score']
    ax.hist(class_scores, bins=30, density=True, alpha=0.5,
            color=color, label=class_name)

ax.axvline(x=score_threshold, color='black', linestyle='--',
           label='90th percentile threshold')
ax.set(xlabel='Anomaly Score', ylabel='Density',
       title='Anomaly Score Distribution')
ax.legend()
plt.show()

In [None]:
# List the ten flows with the highest Isolation Forest anomaly score,
# alongside their raw counters and true label for a quick sanity check.
print("Top 10 Most Anomalous Flows:")
display_cols = ['bytes_sent', 'bytes_recv', 'dst_port', 'duration', 'anomaly_score', 'label']
highest_scoring = df.nlargest(10, 'anomaly_score')
top_anomalies = highest_scoring[display_cols]
print(top_anomalies.to_string())

## Summary

In this lab, we built a network anomaly detection system using:
- **Feature engineering** for network flows (rates, ratios, port analysis)
- **Isolation Forest** for efficient anomaly detection
- **One-Class SVM** for boundary-based detection
- **Local Outlier Factor** for density-based detection

### Key Takeaways:
- Isolation Forest is fast and effective for large datasets
- Feature engineering is crucial (bytes/second, packets/second)
- Contamination parameter should match expected anomaly rate
- Combine multiple detectors for robustness

### Next Steps:
1. Add time-based features (hour of day, day of week)
2. Implement sliding window aggregation
3. Build real-time detection pipeline