# Proactive Network Threat Detection with a Deep Learning Autoencoder

**Copyright (c) 2026 Shrikara Kaudambady. All rights reserved.**

This notebook implements an advanced threat detection system using an **Autoencoder**, a type of neural network. The model is trained on 'normal' network traffic to learn its underlying patterns. It then identifies potential threats by flagging traffic that it cannot reconstruct accurately, indicating a deviation from the norm.

### 1. Setup and Library Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras import layers, Model
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" theme for the figures drawn in this notebook.
sns.set_theme(style="whitegrid")

### 2. Data Simulation

We generate a synthetic dataset of network connections. Most of the data represents normal traffic, but we will inject specific anomalies like a port scan and data exfiltration.

In [None]:
# Seed NumPy's global generator so the simulated traffic (and the pandas
# shuffle below, which draws from the same global state) is reproducible.
np.random.seed(42)
n_samples = 5000   # normal connections
n_port_scan = 50   # injected port-scan connections
n_exfil = 10       # injected data-exfiltration connections

# NOTE: the random draws below happen in a fixed order; reordering the
# columns or the three groups would change the generated values.

# Normal traffic: short connections, mostly tcp/http, moderate byte counts.
normal_data = pd.DataFrame({
    'duration': np.random.uniform(low=0, high=5, size=n_samples),
    'protocol_type': np.random.choice(a=['tcp', 'udp'], size=n_samples, p=[0.8, 0.2]),
    'service': np.random.choice(a=['http', 'smtp', 'private'], size=n_samples, p=[0.7, 0.1, 0.2]),
    'src_bytes': np.random.randint(low=100, high=5000, size=n_samples),
    'dst_bytes': np.random.randint(low=200, high=15000, size=n_samples),
}).assign(is_anomaly=0)

# Anomaly 1 - port scan: near-instant tcp connections to services that never
# occur in normal traffic, with no payload in either direction.
port_scan_data = pd.DataFrame({
    'duration': np.random.uniform(low=0, high=0.1, size=n_port_scan),
    'protocol_type': 'tcp',
    'service': np.random.choice(a=['telnet', 'gopher', 'shell', 'login', 'finger'], size=n_port_scan),
    'src_bytes': 0,
    'dst_bytes': 0,
}).assign(is_anomaly=1)

# Anomaly 2 - data exfiltration: long-lived connections that send an
# unusually large number of source bytes and receive very little back.
exfil_data = pd.DataFrame({
    'duration': np.random.uniform(low=10, high=20, size=n_exfil),
    'protocol_type': 'tcp',
    'service': 'private',
    'src_bytes': np.random.randint(low=1_000_000, high=5_000_000, size=n_exfil),
    'dst_bytes': np.random.randint(low=100, high=500, size=n_exfil),
}).assign(is_anomaly=1)

# Stack the three groups, shuffle the rows, and rebuild a clean 0..N-1 index.
traffic_parts = [normal_data, port_scan_data, exfil_data]
df = (
    pd.concat(traffic_parts, ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)

print(f"Generated dataset with {len(df)} records.")
print(f"Number of anomalies: {df['is_anomaly'].sum()}")

### 3. Feature Engineering and Preprocessing
We convert the categorical features to numeric indicator columns (one-hot encoding) and min-max scale every feature so that normal traffic lies in the [0, 1] range. Keeping all inputs on a comparable scale helps the neural network train stably.

In [None]:
# One-Hot Encode categorical features
df_processed = pd.get_dummies(df, columns=['protocol_type', 'service'], dummy_na=False)

# Separate original labels before dropping them for training
labels = df_processed['is_anomaly']
df_processed = df_processed.drop('is_anomaly', axis=1)

# Scale all features to the [0, 1] range
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(df_processed)

# Split data: train ONLY on normal data, test on the full mixed dataset
X_train_normal = X_scaled[labels == 0]
X_test_mixed = X_scaled
y_test_labels = labels

print(f"Training data shape (normal traffic only): {X_train_normal.shape}")
print(f"Test data shape (mixed traffic): {X_test_mixed.shape}")

### 4. Build and Train the Autoencoder Model
The model has an encoder that compresses the data and a decoder that reconstructs it. It is trained to minimize the reconstruction error on normal data.

In [None]:
# Seed Python, NumPy and TensorFlow together. The earlier np.random.seed call
# does not cover Keras weight initialisation or batch shuffling, so without
# this the trained model (and every number derived from it) changes on each
# Restart & Run All.
tf.keras.utils.set_random_seed(42)

input_dim = X_train_normal.shape[1]  # number of features after one-hot encoding
encoding_dim = 8                     # width of the bottleneck layer

input_layer = layers.Input(shape=(input_dim,))
# Encoder: progressively compress the input down to `encoding_dim` units.
encoder = layers.Dense(32, activation='relu')(input_layer)
encoder = layers.Dense(16, activation='relu')(encoder)
encoder = layers.Dense(encoding_dim, activation='relu')(encoder)
# Decoder: mirror of the encoder. The sigmoid output is bounded to (0, 1),
# matching the range of min-max scaled normal traffic.
decoder = layers.Dense(16, activation='relu')(encoder)
decoder = layers.Dense(32, activation='relu')(decoder)
decoder = layers.Dense(input_dim, activation='sigmoid')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
# MAE is used as the loss so that the training objective matches the
# per-record reconstruction error used for detection.
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
autoencoder.summary()

# Train the model on normal data ONLY: input and target are the same matrix,
# so the network learns to reproduce normal traffic. 10% of the normal
# records are held out to monitor the validation loss.
history = autoencoder.fit(
    X_train_normal, X_train_normal,
    epochs=20,
    batch_size=32,
    shuffle=True,
    validation_split=0.1,
    verbose=1
)

### 5. Anomaly Detection and Visualization
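We calculate the reconstruction error (mean absolute error per record) for all data points. A high error indicates a potential anomaly: records whose error exceeds the 95th percentile of the errors on normal traffic are flagged, so roughly 5% of normal traffic is flagged by construction.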

In [None]:
# Get model's predictions on the full test set
predictions = autoencoder.predict(X_test_mixed)

# Calculate Mean Absolute Error (MAE) as the reconstruction error
mae = np.mean(np.abs(X_test_mixed - predictions), axis=1)
df['reconstruction_error'] = mae

# Plot the distribution of errors
plt.figure(figsize=(12, 6))
sns.histplot(df[df['is_anomaly'] == 0]['reconstruction_error'], bins=50, label='Normal', color='blue', kde=True)
sns.histplot(df[df['is_anomaly'] == 1]['reconstruction_error'], bins=50, label='Anomaly', color='red', kde=True)
plt.title('Distribution of Reconstruction Errors')
plt.legend()
plt.show()

# Set a threshold for anomaly detection (e.g., 95th percentile of normal errors)
threshold = np.percentile(df[df['is_anomaly'] == 0]['reconstruction_error'], 95)
print(f"\nAnomaly detection threshold set at: {threshold:.4f}")

# Identify anomalies
detected_anomalies = df[df['reconstruction_error'] > threshold]

print(f"\nFound {len(detected_anomalies)} potential threats.")
print("--- Detected Anomalies --- ")
display(detected_anomalies.head(20))