# Appendix: Self-Supervised Training

Train TabularResNet on OCSF data using self-supervised contrastive learning.

**What you'll learn:**
1. Contrastive learning for tabular data
2. Data augmentation strategies for OCSF events
3. Training loop implementation
4. Extracting embeddings for downstream tasks

**Prerequisites:**
- Processed features from [03-feature-engineering.ipynb](03-feature-engineering.ipynb)
- PyTorch installed

In [None]:
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import warnings
warnings.filterwarnings('ignore')

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

## 1. Load Processed Features

Load the numerical and categorical feature arrays from the feature engineering notebook.

In [None]:
# Load feature arrays
# Paths are relative to the notebook's working directory; both arrays are the
# outputs of the feature engineering notebook listed in the prerequisites.
numerical = np.load('../data/numerical_features.npy')
categorical = np.load('../data/categorical_features.npy')

# Load artifacts (encoders, scaler, cardinalities)
# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load artifacts that were produced by a trusted run of the pipeline.
with open('../data/feature_artifacts.pkl', 'rb') as f:
    artifacts = pickle.load(f)

# Only the 'cardinalities' entry is read here; it sizes the embedding tables
# and bounds the random replacement values used during augmentation.
cardinalities = artifacts['cardinalities']

# Sanity check of what was loaded.
print(f"Numerical features: {numerical.shape}")
print(f"Categorical features: {categorical.shape}")
print(f"Cardinalities: {cardinalities}")

In [None]:
# Convert to PyTorch tensors
# float32 for the numerical features; long (int64) for the categorical columns,
# which the model uses as embedding indices.
numerical_tensor = torch.tensor(numerical, dtype=torch.float32)
categorical_tensor = torch.tensor(categorical, dtype=torch.long)

# Create dataset and dataloader
# TensorDataset pairs row i of both tensors, so each batch is a
# (numerical, categorical) tuple.
dataset = TensorDataset(numerical_tensor, categorical_tensor)
batch_size = 256  # Large batches are important for contrastive learning
# shuffle=True reshuffles the rows every epoch. drop_last=True discards a final
# partial batch, so a dataset with fewer than batch_size rows yields no batches.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

print(f"Dataset size: {len(dataset)}")
print(f"Batches per epoch: {len(dataloader)}")

## 2. Define TabularResNet Model

A simplified TabularResNet that creates embeddings from mixed numerical/categorical data.

In [None]:
class ResidualBlock(nn.Module):
    """Pre-norm residual unit: two LayerNorm -> Linear stages plus a skip connection.

    Input and output both have ``d_model`` features, so blocks can be stacked.
    """

    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        # Attribute names double as state_dict keys, so they are part of the
        # checkpoint format.
        self.linear1 = nn.Linear(d_model, d_model)
        self.linear2 = nn.Linear(d_model, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # A single Dropout module is shared by both stages.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``x`` plus the two-stage transform of ``x``."""
        # Stage 1: normalize, project, GELU, dropout.
        hidden = self.dropout(F.gelu(self.linear1(self.norm1(x))))
        # Stage 2: normalize, project (no activation), dropout.
        hidden = self.dropout(self.linear2(self.norm2(hidden)))
        # Skip connection back to the untouched input.
        return hidden + x


class TabularResNet(nn.Module):
    """
    Residual MLP encoder for rows that mix numerical and categorical columns.

    Pipeline:
    - one ``nn.Embedding`` lookup per categorical column
    - concatenation with the numerical features, then a linear projection
      to ``d_model``
    - ``num_blocks`` stacked ``ResidualBlock`` layers
    - a closing ``LayerNorm``
    """

    def __init__(self, num_numerical, cardinalities, d_model=128, 
                 num_blocks=4, embedding_dim=32, dropout=0.1):
        super().__init__()

        self.d_model = d_model

        # One lookup table per categorical column, sized by that column's
        # cardinality; every table emits ``embedding_dim`` features.
        embedding_tables = [
            nn.Embedding(size, embedding_dim) for size in cardinalities
        ]
        self.embeddings = nn.ModuleList(embedding_tables)

        # Width of the concatenated [numerical | embedded categorical] vector.
        concat_width = num_numerical + embedding_dim * len(cardinalities)
        self.input_projection = nn.Linear(concat_width, d_model)

        # Stack of identical residual units operating at width d_model.
        residual_stack = [
            ResidualBlock(d_model, dropout) for _ in range(num_blocks)
        ]
        self.blocks = nn.ModuleList(residual_stack)

        # Normalization applied to the output of the last block.
        self.final_norm = nn.LayerNorm(d_model)

    def forward(self, numerical, categorical, return_embedding=True):
        """
        Encode a batch of rows.

        Args:
            numerical: (batch, num_numerical) tensor
            categorical: (batch, num_categorical) tensor of indices
            return_embedding: accepted but never read by this method; the
                normalized embedding is returned whatever its value

        Returns:
            (batch, d_model) tensor produced by the final LayerNorm
        """
        # Column i of `categorical` is looked up in embedding table i.
        looked_up = [
            table(categorical[:, column])
            for column, table in enumerate(self.embeddings)
        ]

        # Without categorical columns the numerical tensor is used as-is.
        if looked_up:
            features = torch.cat(
                [numerical, torch.cat(looked_up, dim=1)], dim=1
            )
        else:
            features = numerical

        hidden = self.input_projection(features)
        for block in self.blocks:
            hidden = block(hidden)

        return self.final_norm(hidden)

# Create model
# num_numerical is the column count of the loaded numerical array and
# cardinalities comes from the pickled artifacts; the remaining arguments
# repeat the constructor defaults explicitly.
model = TabularResNet(
    num_numerical=numerical.shape[1],
    cardinalities=cardinalities,
    d_model=128,
    num_blocks=4,
    embedding_dim=32,
    dropout=0.1
).to(device)

# Total element count over all parameters of the model.
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

## 3. Define Contrastive Learning Components

**Contrastive learning** trains the model so that:
- Similar records (augmented versions of the same event) have similar embeddings
- Different records are pushed apart, so their embeddings are dissimilar

In [None]:
class TabularAugmentation:
    """
    Stochastic augmentation for batches of tabular features.

    - Numerical: additive zero-mean Gaussian noise whose standard deviation
      is ``noise_level`` (an absolute amount in the features' own units).
    - Categorical: each cell is selected independently with probability
      ``dropout_prob`` and replaced by a value drawn uniformly from
      ``[0, cardinality)`` for its column. The draw may coincide with the
      original value.

    By default EVERY categorical column is eligible for replacement. To leave
    security-critical fields (e.g. status, severity, activity_id) untouched,
    pass their column indices via ``protected_columns``.

    Args:
        noise_level: standard deviation of the noise added to numerical
            features.
        dropout_prob: per-cell probability of replacing a categorical value.
        protected_columns: optional iterable of non-negative categorical
            column indices that must never be replaced. ``None`` (default)
            protects nothing.
    """

    def __init__(self, noise_level=0.1, dropout_prob=0.15, protected_columns=None):
        self.noise_level = noise_level
        self.dropout_prob = dropout_prob
        # frozenset gives O(1) membership tests and guards against the caller
        # mutating the collection afterwards.
        self.protected_columns = frozenset(protected_columns or ())

    def augment_numerical(self, numerical):
        """Return ``numerical`` plus Gaussian noise; the input is not modified."""
        noise = torch.randn_like(numerical) * self.noise_level
        return numerical + noise

    def augment_categorical(self, categorical, cardinalities):
        """
        Return a copy of ``categorical`` with some cells randomly replaced.

        Args:
            categorical: (batch, num_categorical) integer tensor of indices.
            cardinalities: one cardinality per column; replacements for
                column ``i`` are drawn from ``[0, cardinalities[i])``.

        Returns:
            A new tensor with the same shape and dtype as ``categorical``.
        """
        augmented = categorical.clone()
        # One uniform draw per cell decides whether that cell is replaced.
        replace_mask = (
            torch.rand(categorical.shape, device=categorical.device)
            < self.dropout_prob
        )

        for column, cardinality in enumerate(cardinalities):
            if column in self.protected_columns:
                # Protected fields pass through unchanged.
                continue
            random_cats = torch.randint(
                0, cardinality, (categorical.size(0),),
                device=categorical.device
            )
            augmented[:, column] = torch.where(
                replace_mask[:, column], random_cats, categorical[:, column]
            )

        return augmented


def contrastive_loss(model, numerical, categorical, cardinalities, 
                     temperature=0.07, augmenter=None):
    """
    SimCLR-style (NT-Xent) contrastive loss for tabular data.

    For each record:
    1. Create two augmented views
    2. Compute embeddings for both views
    3. Pull embeddings of same record together (positive pairs)
    4. Push embeddings of different records apart (negative pairs)

    Args:
        model: callable ``model(numerical, categorical)`` returning a
            (batch, dim) embedding tensor.
        numerical: (batch, num_numerical) tensor.
        categorical: (batch, num_categorical) tensor of indices.
        cardinalities: per-column cardinalities, forwarded to the augmenter.
        temperature: positive softmax temperature applied to the cosine
            similarities.
        augmenter: object exposing ``augment_numerical`` and
            ``augment_categorical``; a default ``TabularAugmentation`` is
            created when omitted.

    Returns:
        Scalar loss tensor, averaged over the ``2 * batch`` views.

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    # Zero would divide by zero; a negative value would flip the objective.
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    if augmenter is None:
        augmenter = TabularAugmentation()

    batch_size = numerical.size(0)
    device = numerical.device

    # Two independently augmented views of the same batch. The call order is
    # kept (augment, encode, augment, encode) so random draws are consumed in
    # a stable sequence.
    num_aug1 = augmenter.augment_numerical(numerical)
    cat_aug1 = augmenter.augment_categorical(categorical, cardinalities)
    emb1 = model(num_aug1, cat_aug1)

    num_aug2 = augmenter.augment_numerical(numerical)
    cat_aug2 = augmenter.augment_categorical(categorical, cardinalities)
    emb2 = model(num_aug2, cat_aug2)

    # Rows [0, B) are view 1 and rows [B, 2B) are view 2. L2-normalizing makes
    # the dot products below cosine similarities.
    embeddings = F.normalize(torch.cat([emb1, emb2], dim=0), dim=1)

    # (2B, 2B) temperature-scaled similarity matrix used as logits.
    similarity = torch.matmul(embeddings, embeddings.T) / temperature

    # Row i's positive is the other view of the same record: i + B for the
    # first half, i - B for the second. Built directly on the target device
    # to avoid a host-to-device copy on every step.
    labels = torch.cat([
        torch.arange(batch_size, 2 * batch_size, device=device),
        torch.arange(0, batch_size, device=device)
    ], dim=0)

    # A view must not count itself as a candidate; -inf removes the diagonal
    # from the softmax.
    self_mask = torch.eye(2 * batch_size, dtype=torch.bool, device=device)
    similarity = similarity.masked_fill(self_mask, float('-inf'))

    # Each row is a (2B)-way classification whose correct class is its positive.
    return F.cross_entropy(similarity, labels)

# Test the loss function
# Smoke test on a single batch; no optimizer step is taken in this cell.
augmenter = TabularAugmentation(noise_level=0.1, dropout_prob=0.15)

# Get a batch
# next(iter(...)) builds a fresh iterator, so this draws one shuffled batch.
num_batch, cat_batch = next(iter(dataloader))
num_batch = num_batch.to(device)
cat_batch = cat_batch.to(device)

# Compute loss
# temperature is left at the function's default (0.07).
loss = contrastive_loss(model, num_batch, cat_batch, cardinalities, augmenter=augmenter)
print(f"Initial contrastive loss: {loss.item():.4f}")

## 4. Training Loop

Train the model using contrastive learning.

In [None]:
def train_epoch(model, dataloader, optimizer, cardinalities, augmenter, device):
    """
    Train for one epoch and return the mean per-batch contrastive loss.

    Args:
        model: network being trained; switched to training mode here.
        dataloader: iterable yielding ``(numerical, categorical)`` batches.
        optimizer: optimizer that updates ``model``'s parameters.
        cardinalities: per-column cardinalities forwarded to the loss.
        augmenter: augmentation object forwarded to the loss.
        device: device each batch is moved to before the forward pass.

    Returns:
        Average of the per-batch loss values as a Python float.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for numerical, categorical in dataloader:
        numerical = numerical.to(device)
        categorical = categorical.to(device)

        optimizer.zero_grad()
        loss = contrastive_loss(
            model, numerical, categorical, cardinalities,
            augmenter=augmenter
        )
        loss.backward()
        optimizer.step()

        # .item() detaches the scalar so the graph is not kept alive.
        total_loss += loss.item()
        num_batches += 1

    # Counting batches (instead of len(dataloader)) also works for iterables
    # without __len__ and lets the empty case fail with a clear message.
    if num_batches == 0:
        raise ValueError(
            "dataloader yielded no batches (is the dataset empty, or smaller "
            "than batch_size with drop_last=True?)"
        )

    return total_loss / num_batches

# Training configuration
num_epochs = 20
learning_rate = 1e-3

# AdamW with weight decay over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
# T_max equals num_epochs because the training loop steps the scheduler once per epoch.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
# Rebinds `augmenter` with the same settings used for the loss smoke test.
augmenter = TabularAugmentation(noise_level=0.1, dropout_prob=0.15)

print("Starting training...")
print(f"Epochs: {num_epochs}, Batch size: {batch_size}, LR: {learning_rate}")
print("-" * 40)

In [None]:
# Training loop
# `losses` collects the mean loss of every epoch; it is plotted afterwards.
losses = []

for epoch in range(num_epochs):
    # Read the learning rate BEFORE stepping the scheduler so the value logged
    # next to the loss is the one actually used during this epoch.
    lr = scheduler.get_last_lr()[0]

    loss = train_epoch(model, dataloader, optimizer, cardinalities, augmenter, device)
    # One scheduler step per epoch, matching T_max=num_epochs.
    scheduler.step()
    losses.append(loss)

    # Log the first epoch and then every fifth one.
    if (epoch + 1) % 5 == 0 or epoch == 0:
        print(f"Epoch {epoch+1:3d}/{num_epochs} | Loss: {loss:.4f} | LR: {lr:.6f}")

print("-" * 40)
print(f"Final loss: {losses[-1]:.4f}")

In [None]:
# Plot training loss
# matplotlib is imported here, at first use, rather than with the other imports.
import matplotlib.pyplot as plt

# One point per epoch: `losses` holds the mean loss returned by train_epoch.
# The x-axis is the zero-based list index, so epoch 1 is drawn at x=0.
plt.figure(figsize=(10, 4))
plt.plot(losses, marker='o', markersize=4)
plt.xlabel('Epoch')
plt.ylabel('Contrastive Loss')
plt.title('Training Loss Over Time')
plt.grid(True, alpha=0.3)
plt.show()

## 5. Extract Embeddings

Use the trained model to create embeddings for all records.

In [None]:
@torch.no_grad()
def extract_embeddings(model, numerical, categorical, batch_size=512):
    """
    Extract embeddings for all records.

    The model is switched to eval mode and left in that mode on return.
    Batches are moved to the device that holds the model's parameters, so the
    function does not depend on any module-level ``device`` variable.

    Args:
        model: trained ``nn.Module`` called as ``model(numerical, categorical)``.
        numerical: array-like of shape (N, num_numerical); converted to float32.
        categorical: array-like of shape (N, num_categorical); converted to int64.
        batch_size: number of rows encoded per forward pass.

    Returns:
        numpy array of embeddings (N, d_model), in input row order

    Raises:
        ValueError: raised by ``np.vstack`` when the inputs contain no rows.
    """
    model.eval()

    # Follow the model's own placement; a parameter-free model falls back to CPU.
    first_param = next(model.parameters(), None)
    model_device = (
        first_param.device if first_param is not None else torch.device('cpu')
    )

    # as_tensor avoids copying inputs that already have the requested dtype.
    dataset = TensorDataset(
        torch.as_tensor(numerical, dtype=torch.float32),
        torch.as_tensor(categorical, dtype=torch.long),
    )
    # shuffle=False keeps output rows aligned with input rows.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    embeddings = [
        model(num_batch.to(model_device), cat_batch.to(model_device)).cpu().numpy()
        for num_batch, cat_batch in loader
    ]

    return np.vstack(embeddings)

# Extract embeddings
# Uses the raw NumPy arrays loaded earlier and the function's default batch size (512).
embeddings = extract_embeddings(model, numerical, categorical)
print(f"Embeddings shape: {embeddings.shape}")

In [None]:
# Save embeddings and model
# Only the state_dict (weights) is saved, not the module object; loading it
# later requires constructing TabularResNet with the same arguments first.
np.save('../data/embeddings.npy', embeddings)
torch.save(model.state_dict(), '../data/tabular_resnet.pt')

print("Saved:")
print("  - ../data/embeddings.npy")
print("  - ../data/tabular_resnet.pt")

## 6. Quick Embedding Visualization

Use t-SNE to visualize the learned embedding space.

In [None]:
from sklearn.manifold import TSNE

# Sample for visualization (t-SNE is slow on large datasets)
# A seeded generator keeps the sampled subset, and therefore the plot,
# reproducible, in line with the fixed random_state passed to t-SNE below.
sample_size = min(2000, len(embeddings))
rng = np.random.default_rng(42)
indices = rng.choice(len(embeddings), size=sample_size, replace=False)
emb_sample = embeddings[indices]

# scikit-learn requires perplexity < n_samples; clamp it so that small
# datasets (30 rows or fewer) still run instead of raising.
perplexity = min(30, max(1, sample_size - 1))

# Run t-SNE
print("Running t-SNE (this may take a minute)...")
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
emb_2d = tsne.fit_transform(emb_sample)
print("Done!")

In [None]:
# Plot t-SNE
# Each point is one sampled record placed at its 2-D t-SNE coordinates.
# Points are not coloured by any label; alpha and a small marker size keep
# overlapping points readable.
plt.figure(figsize=(10, 8))
plt.scatter(emb_2d[:, 0], emb_2d[:, 1], alpha=0.5, s=10)
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('OCSF Event Embeddings (t-SNE)')
plt.tight_layout()
plt.show()

## Summary

In this notebook, we:

1. **Loaded processed features** from the feature engineering notebook
2. **Built TabularResNet** - categorical embeddings + residual blocks
3. **Implemented contrastive learning** - SimCLR-style with augmentation
4. **Trained the model** on unlabeled OCSF data
5. **Extracted embeddings** for all records

**Key insight**: We learned meaningful representations from unlabeled data by training the model to recognize that augmented versions of the same event should have similar embeddings.

**Next**: Use these embeddings in:
- [05-embedding-analysis.ipynb](05-embedding-analysis.ipynb) - Analyze embedding quality
- [06-anomaly-detection.ipynb](06-anomaly-detection.ipynb) - Detect anomalies using embeddings