# 16. Architecture Comparison

Compare different model architectures for G-code prediction from sensor data.

## Contents
1. [Setup](#1-setup)
2. [Baseline Architectures](#2-baseline-architectures)
3. [LSTM Variants](#3-lstm-variants)
4. [Transformer Variants](#4-transformer-variants)
5. [CNN-Based Models](#5-cnn-based-models)
6. [Hybrid Architectures](#6-hybrid-architectures)
7. [Comparison Analysis](#7-comparison-analysis)

---

## 1. Setup

In [None]:
import sys
from pathlib import Path

# Add src to path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root / 'src'))

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, Optional
import json
import time
from collections import OrderedDict

# Report interpreter and framework versions so benchmark numbers can be traced
print(f"Python: {sys.version}")
print(f"PyTorch: {torch.__version__}")

# Backend preference: Apple MPS first, then CUDA, otherwise fall back to CPU
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

# Seed the PyTorch and NumPy generators so weights and dummy data repeat
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Shared look for every figure in the notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Common configuration shared by every architecture in this notebook
CONFIG = {
    'continuous_dim': 155,                  # width of the continuous feature vector per time step
    'categorical_dims': [10, 10, 50, 50],   # vocabulary size of each categorical feature
    'd_model': 256,                         # output width every model projects to
    'seq_length': 64,                       # time steps per benchmark sequence
    # NOTE(review): 'vocab_sizes' is not read by any model defined in this notebook
    'vocab_sizes': {'type': 10, 'command': 50, 'param_type': 30, 'param_value': 100}
}

# Create dummy data for benchmarking
batch_size = 8
dummy_continuous = torch.randn(batch_size, CONFIG['seq_length'], CONFIG['continuous_dim']).to(device)
# The index range and the number of columns are derived from CONFIG so the
# dummy indices stay valid for every embedding table if the config changes
# (the smallest vocabulary bounds the indices; one column per feature).
dummy_categorical = torch.randint(
    0,
    min(CONFIG['categorical_dims']),
    (batch_size, CONFIG['seq_length'], len(CONFIG['categorical_dims'])),
).to(device)

print(f"Benchmark data shape: {dummy_continuous.shape}")

## 2. Baseline Architectures

Define baseline models for comparison.

In [None]:
class BaselineLinear(nn.Module):
    """Simple linear baseline.

    Every categorical feature is embedded into 8 dimensions, the embeddings
    are concatenated with the continuous features, and one linear layer maps
    the result to ``config['d_model']``. ``nn.Linear`` acts on the last axis
    only, so each time step is projected independently of the others.

    Args:
        config: Mapping providing ``'continuous_dim'`` (int),
            ``'categorical_dims'`` (one vocabulary size per categorical
            feature) and ``'d_model'`` (int).
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = 8
        self.embeddings = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in config['categorical_dims']
        ])
        # Width follows the actual number of categorical features rather than
        # assuming there are exactly four of them.
        input_dim = config['continuous_dim'] + len(self.embeddings) * embed_dim
        self.fc = nn.Linear(input_dim, config['d_model'])

    def forward(self, continuous, categorical):
        """Project the concatenated features of every time step.

        Args:
            continuous: Float tensor ``(batch, seq, continuous_dim)``.
            categorical: Integer index tensor ``(batch, seq, n_categorical)``;
                column ``i`` is looked up in embedding table ``i``.

        Returns:
            Tensor ``(batch, seq, d_model)``.
        """
        cat_embeds = [emb(categorical[:, :, i]) for i, emb in enumerate(self.embeddings)]
        cat_concat = torch.cat(cat_embeds, dim=-1)
        x = torch.cat([continuous, cat_concat], dim=-1)
        return self.fc(x)


class BaselineMLP(nn.Module):
    """Multi-layer perceptron baseline.

    Categorical features are embedded (16 dims each), concatenated with the
    continuous features, and passed through a three-layer MLP
    (512 -> 256 -> ``config['d_model']``) with ReLU and dropout between
    layers. The linear layers act on the last axis only, so time steps are
    processed independently.

    Args:
        config: Mapping providing ``'continuous_dim'``, ``'categorical_dims'``
            (one vocabulary size per categorical feature) and ``'d_model'``.
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = 16
        self.embeddings = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in config['categorical_dims']
        ])
        # Derive the width from the number of embedding tables instead of
        # assuming exactly four categorical features.
        input_dim = config['continuous_dim'] + len(self.embeddings) * embed_dim

        self.layers = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, config['d_model'])
        )

    def forward(self, continuous, categorical):
        """Run the MLP on every time step.

        Args:
            continuous: Float tensor ``(batch, seq, continuous_dim)``.
            categorical: Integer index tensor ``(batch, seq, n_categorical)``;
                column ``i`` is looked up in embedding table ``i``.

        Returns:
            Tensor ``(batch, seq, d_model)``.
        """
        cat_embeds = [emb(categorical[:, :, i]) for i, emb in enumerate(self.embeddings)]
        cat_concat = torch.cat(cat_embeds, dim=-1)
        x = torch.cat([continuous, cat_concat], dim=-1)
        return self.layers(x)


# Smoke-test the baselines: build each one on the target device and push the
# dummy batch through it once with autograd disabled.
baseline_linear, baseline_mlp = (
    model_cls(CONFIG).to(device) for model_cls in (BaselineLinear, BaselineMLP)
)

with torch.no_grad():
    out_linear, out_mlp = (
        net(dummy_continuous, dummy_categorical)
        for net in (baseline_linear, baseline_mlp)
    )

print(f"Linear output: {out_linear.shape}")
print(f"MLP output: {out_mlp.shape}")

## 3. LSTM Variants

Different LSTM-based architectures.

In [None]:
class SimpleLSTM(nn.Module):
    """Basic LSTM encoder.

    Categorical features are embedded (16 dims each), concatenated with the
    continuous features and fed to an ``nn.LSTM`` whose hidden size is
    ``config['d_model']``. In bidirectional mode the concatenated
    forward/backward states are projected back down to ``d_model``.

    Args:
        config: Mapping providing ``'continuous_dim'``, ``'categorical_dims'``
            (one vocabulary size per categorical feature) and ``'d_model'``.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether to run the LSTM in both directions.
    """

    def __init__(self, config, num_layers=2, bidirectional=False):
        super().__init__()
        embed_dim = 16
        self.embeddings = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in config['categorical_dims']
        ])
        # Width follows the number of embedding tables instead of assuming
        # exactly four categorical features.
        input_dim = config['continuous_dim'] + len(self.embeddings) * embed_dim

        self.lstm = nn.LSTM(
            input_dim, config['d_model'],
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            # nn.LSTM only applies dropout between stacked layers, so it is
            # disabled for a single layer.
            dropout=0.1 if num_layers > 1 else 0
        )

        self.bidirectional = bidirectional
        if bidirectional:
            # Both directions are concatenated on the feature axis (2 * d_model).
            self.proj = nn.Linear(config['d_model'] * 2, config['d_model'])

    def forward(self, continuous, categorical):
        """Encode the sequence.

        Args:
            continuous: Float tensor ``(batch, seq, continuous_dim)``.
            categorical: Integer index tensor ``(batch, seq, n_categorical)``;
                column ``i`` is looked up in embedding table ``i``.

        Returns:
            Tensor ``(batch, seq, d_model)`` with the per-step LSTM outputs.
        """
        cat_embeds = [emb(categorical[:, :, i]) for i, emb in enumerate(self.embeddings)]
        cat_concat = torch.cat(cat_embeds, dim=-1)
        x = torch.cat([continuous, cat_concat], dim=-1)

        output, _ = self.lstm(x)

        if self.bidirectional:
            output = self.proj(output)

        return output


class StackedLSTM(nn.Module):
    """Stacked LSTM with residual connections.

    The concatenated input features are projected to ``config['d_model']``
    and passed through ``num_layers`` single-layer LSTMs. Each block computes
    ``LayerNorm(dropout(LSTM(x)) + x)``.

    Args:
        config: Mapping providing ``'continuous_dim'``, ``'categorical_dims'``
            (one vocabulary size per categorical feature) and ``'d_model'``.
        num_layers: Number of residual LSTM blocks.
    """

    def __init__(self, config, num_layers=4):
        super().__init__()
        embed_dim = 16
        self.embeddings = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in config['categorical_dims']
        ])
        # Width follows the number of embedding tables instead of assuming
        # exactly four categorical features.
        input_dim = config['continuous_dim'] + len(self.embeddings) * embed_dim

        # Project once so every residual block works at width d_model.
        self.input_proj = nn.Linear(input_dim, config['d_model'])

        self.lstm_layers = nn.ModuleList([
            nn.LSTM(config['d_model'], config['d_model'], batch_first=True)
            for _ in range(num_layers)
        ])
        self.layer_norms = nn.ModuleList([
            nn.LayerNorm(config['d_model']) for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(0.1)

    def forward(self, continuous, categorical):
        """Encode the sequence through the residual LSTM stack.

        Args:
            continuous: Float tensor ``(batch, seq, continuous_dim)``.
            categorical: Integer index tensor ``(batch, seq, n_categorical)``;
                column ``i`` is looked up in embedding table ``i``.

        Returns:
            Tensor ``(batch, seq, d_model)``.
        """
        cat_embeds = [emb(categorical[:, :, i]) for i, emb in enumerate(self.embeddings)]
        cat_concat = torch.cat(cat_embeds, dim=-1)
        x = torch.cat([continuous, cat_concat], dim=-1)
        x = self.input_proj(x)

        for lstm, ln in zip(self.lstm_layers, self.layer_norms):
            residual = x
            x, _ = lstm(x)
            x = self.dropout(x)
            # Normalise after adding the skip connection.
            x = ln(x + residual)

        return x


# Smoke-test the LSTM family: instantiate each variant on the target device,
# then run the dummy batch through all of them with autograd disabled.
simple_lstm = SimpleLSTM(CONFIG, num_layers=2).to(device)
bidirectional_lstm = SimpleLSTM(CONFIG, num_layers=2, bidirectional=True).to(device)
stacked_lstm = StackedLSTM(CONFIG, num_layers=4).to(device)

with torch.no_grad():
    out1, out2, out3 = (
        net(dummy_continuous, dummy_categorical)
        for net in (simple_lstm, bidirectional_lstm, stacked_lstm)
    )

print(f"Simple LSTM: {out1.shape}")
print(f"Bidirectional LSTM: {out2.shape}")
print(f"Stacked LSTM: {out3.shape}")

## 4. Transformer Variants

Transformer-based architectures.

In [None]:
class SimpleTransformer(nn.Module):
    """Basic transformer encoder.

    Input features are projected to ``config['d_model']``, a learned absolute
    positional table is added, and the result goes through a stack of
    ``nn.TransformerEncoderLayer`` blocks.

    The positional table holds ``config['seq_length']`` positions, so inputs
    longer than that are not supported; shorter inputs use a prefix of the
    table.

    Args:
        config: Mapping providing ``'continuous_dim'``, ``'categorical_dims'``
            (one vocabulary size per categorical feature), ``'d_model'`` and
            ``'seq_length'``.
        num_layers: Number of encoder layers.
        num_heads: Attention heads per layer.
    """

    def __init__(self, config, num_layers=4, num_heads=8):
        super().__init__()
        embed_dim = 16
        self.embeddings = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in config['categorical_dims']
        ])
        # Width follows the number of embedding tables instead of assuming
        # exactly four categorical features.
        input_dim = config['continuous_dim'] + len(self.embeddings) * embed_dim

        self.input_proj = nn.Linear(input_dim, config['d_model'])
        # Learned positional table with a small random initialisation (std 0.02).
        self.pos_encoding = nn.Parameter(torch.randn(1, config['seq_length'], config['d_model']) * 0.02)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config['d_model'],
            nhead=num_heads,
            dim_feedforward=config['d_model'] * 4,
            dropout=0.1,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, continuous, categorical):
        """Encode the sequence with self-attention.

        Args:
            continuous: Float tensor ``(batch, seq, continuous_dim)`` with
                ``seq <= config['seq_length']``.
            categorical: Integer index tensor ``(batch, seq, n_categorical)``;
                column ``i`` is looked up in embedding table ``i``.

        Returns:
            Tensor ``(batch, seq, d_model)``.
        """
        cat_embeds = [emb(categorical[:, :, i]) for i, emb in enumerate(self.embeddings)]
        cat_concat = torch.cat(cat_embeds, dim=-1)
        x = torch.cat([continuous, cat_concat], dim=-1)
        x = self.input_proj(x)
        # Use only as many positions as the input has time steps.
        x = x + self.pos_encoding[:, :x.size(1), :]

        return self.transformer(x)


class RelativeTransformer(nn.Module):
    """Transformer intended to use relative positional encoding.

    Input features are projected to ``config['d_model']`` and passed through
    ``num_layers`` independent ``nn.TransformerEncoderLayer`` blocks.

    NOTE(review): ``relative_positions`` is allocated in ``__init__`` but
    ``forward`` never reads it, and no other positional signal is added, so
    the encoder layers currently receive no position information. The table
    is kept so the parameter count is unchanged; applying it as a per-head
    attention bias is still to be done.

    Args:
        config: Mapping providing ``'continuous_dim'``, ``'categorical_dims'``
            (one vocabulary size per categorical feature), ``'d_model'`` and
            ``'seq_length'``.
        num_layers: Number of encoder layers.
        num_heads: Attention heads per layer (also the width of the relative
            position table).
    """

    def __init__(self, config, num_layers=4, num_heads=8):
        super().__init__()
        embed_dim = 16
        self.embeddings = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in config['categorical_dims']
        ])
        # Width follows the number of embedding tables instead of assuming
        # exactly four categorical features.
        input_dim = config['continuous_dim'] + len(self.embeddings) * embed_dim

        self.input_proj = nn.Linear(input_dim, config['d_model'])

        # Relative position embeddings: one row per offset in
        # [-max_relative_position, +max_relative_position], one column per head.
        self.max_relative_position = config['seq_length']
        self.relative_positions = nn.Embedding(
            2 * self.max_relative_position + 1, num_heads
        )

        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=config['d_model'],
                nhead=num_heads,
                dim_feedforward=config['d_model'] * 4,
                dropout=0.1,
                batch_first=True
            ) for _ in range(num_layers)
        ])

    def forward(self, continuous, categorical):
        """Encode the sequence with the stacked encoder layers.

        Args:
            continuous: Float tensor ``(batch, seq, continuous_dim)``.
            categorical: Integer index tensor ``(batch, seq, n_categorical)``;
                column ``i`` is looked up in embedding table ``i``.

        Returns:
            Tensor ``(batch, seq, d_model)``.
        """
        cat_embeds = [emb(categorical[:, :, i]) for i, emb in enumerate(self.embeddings)]
        cat_concat = torch.cat(cat_embeds, dim=-1)
        x = torch.cat([continuous, cat_concat], dim=-1)
        x = self.input_proj(x)

        for layer in self.layers:
            x = layer(x)

        return x


# Smoke-test the transformer family: build both variants with four layers on
# the target device and run the dummy batch through them without autograd.
simple_transformer, relative_transformer = (
    model_cls(CONFIG, num_layers=4).to(device)
    for model_cls in (SimpleTransformer, RelativeTransformer)
)

with torch.no_grad():
    out1, out2 = (
        net(dummy_continuous, dummy_categorical)
        for net in (simple_transformer, relative_transformer)
    )

print(f"Simple Transformer: {out1.shape}")
print(f"Relative Transformer: {out2.shape}")

## 5. CNN-Based Models

Convolutional approaches for time series.

In [None]:
class TemporalConvNet(nn.Module):
    """Temporal Convolutional Network (TCN).

    A stack of dilated ``Conv1d -> BatchNorm1d -> ReLU -> Dropout`` levels
    whose dilation doubles at each level, followed by a linear projection to
    ``config['d_model']``.

    Args:
        config: Mapping providing ``'continuous_dim'``, ``'categorical_dims'``
            (one vocabulary size per categorical feature) and ``'d_model'``.
        num_channels: Output channels of each convolutional level; its length
            sets the number of levels.
        kernel_size: Kernel size shared by all levels.
    """

    # The default is a tuple, not a list, so it cannot be mutated across instances.
    def __init__(self, config, num_channels=(128, 256, 256), kernel_size=3):
        super().__init__()
        embed_dim = 16
        self.embeddings = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in config['categorical_dims']
        ])
        # Width follows the number of embedding tables instead of assuming
        # exactly four categorical features.
        input_dim = config['continuous_dim'] + len(self.embeddings) * embed_dim

        layers = []
        in_channels = input_dim
        for level, out_channels in enumerate(num_channels):
            dilation = 2 ** level
            # Conv1d pads both ends, so each level lengthens the sequence by
            # (kernel_size - 1) * dilation steps; forward() trims the surplus.
            padding = (kernel_size - 1) * dilation

            layers.append(nn.Conv1d(in_channels, out_channels, kernel_size,
                                   padding=padding, dilation=dilation))
            layers.append(nn.BatchNorm1d(out_channels))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.1))
            in_channels = out_channels

        self.network = nn.Sequential(*layers)
        self.output_proj = nn.Linear(num_channels[-1], config['d_model'])

    def forward(self, continuous, categorical):
        """Apply the dilated convolution stack.

        Args:
            continuous: Float tensor ``(batch, seq, continuous_dim)``.
            categorical: Integer index tensor ``(batch, seq, n_categorical)``;
                column ``i`` is looked up in embedding table ``i``.

        Returns:
            Tensor ``(batch, seq, d_model)`` with the same ``seq`` as the input.
        """
        cat_embeds = [emb(categorical[:, :, i]) for i, emb in enumerate(self.embeddings)]
        cat_concat = torch.cat(cat_embeds, dim=-1)
        x = torch.cat([continuous, cat_concat], dim=-1)
        seq_len = x.size(1)

        # Conv expects [B, C, T]
        x = x.transpose(1, 2)
        x = self.network(x)
        x = x.transpose(1, 2)  # Back to [B, T, C]

        # Crop to the length of *this* input, not a global configured length,
        # so batches of any sequence length come back with matching length.
        # Keeping the leading steps means the convolutions at each retained
        # step only cover the current and earlier inputs.
        x = x[:, :seq_len, :]

        return self.output_proj(x)


class WaveNet(nn.Module):
    """WaveNet-style dilated convolutions.

    A 1x1 input convolution is followed by ``num_layers`` gated dilated
    convolutions with residual connections. Each layer also contributes a
    1x1 "skip" projection; the sum of all skips is passed through the output
    head to produce ``config['d_model']`` channels.

    The dilated convolutions use symmetric padding equal to the dilation, so
    the sequence length is preserved and each step sees both neighbours.

    Args:
        config: Mapping providing ``'continuous_dim'``, ``'categorical_dims'``
            (one vocabulary size per categorical feature) and ``'d_model'``.
        num_layers: Number of gated dilated layers.
        channels: Channel width inside the residual stack.
    """

    def __init__(self, config, num_layers=8, channels=128):
        super().__init__()
        embed_dim = 16
        self.embeddings = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in config['categorical_dims']
        ])
        # Width follows the number of embedding tables instead of assuming
        # exactly four categorical features.
        input_dim = config['continuous_dim'] + len(self.embeddings) * embed_dim

        self.input_conv = nn.Conv1d(input_dim, channels, 1)

        self.dilated_convs = nn.ModuleList()
        self.skip_convs = nn.ModuleList()

        for i in range(num_layers):
            dilation = 2 ** (i % 4)  # Reset dilation every 4 layers
            # Twice the channels: one half becomes the gate, the other the filter.
            self.dilated_convs.append(
                nn.Conv1d(channels, channels * 2, 3, padding=dilation, dilation=dilation)
            )
            self.skip_convs.append(nn.Conv1d(channels, channels, 1))

        self.output_conv = nn.Sequential(
            nn.ReLU(),
            nn.Conv1d(channels, channels, 1),
            nn.ReLU(),
            nn.Conv1d(channels, config['d_model'], 1)
        )

    def forward(self, continuous, categorical):
        """Apply the gated dilated stack and the skip-sum output head.

        Args:
            continuous: Float tensor ``(batch, seq, continuous_dim)``.
            categorical: Integer index tensor ``(batch, seq, n_categorical)``;
                column ``i`` is looked up in embedding table ``i``.

        Returns:
            Tensor ``(batch, seq, d_model)``.
        """
        cat_embeds = [emb(categorical[:, :, i]) for i, emb in enumerate(self.embeddings)]
        cat_concat = torch.cat(cat_embeds, dim=-1)
        x = torch.cat([continuous, cat_concat], dim=-1)
        x = x.transpose(1, 2)  # Conv1d expects [B, C, T]

        x = self.input_conv(x)
        skip_sum = 0

        for dilated, skip in zip(self.dilated_convs, self.skip_convs):
            residual = x
            x = dilated(x)
            # Split the doubled channels into gate and filter halves.
            gate, filter_out = x.chunk(2, dim=1)
            x = torch.tanh(filter_out) * torch.sigmoid(gate)

            skip_sum = skip_sum + skip(x)
            x = x + residual

        x = self.output_conv(skip_sum)
        return x.transpose(1, 2)  # Back to [B, T, C]


# Smoke-test the convolutional models: build each with default settings on
# the target device and run the dummy batch through them without autograd.
tcn, wavenet = (
    model_cls(CONFIG).to(device) for model_cls in (TemporalConvNet, WaveNet)
)

with torch.no_grad():
    out1, out2 = (
        net(dummy_continuous, dummy_categorical) for net in (tcn, wavenet)
    )

print(f"TCN: {out1.shape}")
print(f"WaveNet: {out2.shape}")

## 6. Hybrid Architectures

Combine multiple approaches.

In [None]:
class ConvLSTM(nn.Module):
    """CNN feature extraction + LSTM sequence modeling.

    Two length-preserving ``Conv1d`` layers (kernel 3, padding 1) extract
    local features, a two-layer bidirectional LSTM models the sequence, and a
    linear layer projects the concatenated directions to ``config['d_model']``.

    Args:
        config: Mapping providing ``'continuous_dim'``, ``'categorical_dims'``
            (one vocabulary size per categorical feature) and ``'d_model'``.
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = 16
        self.embeddings = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in config['categorical_dims']
        ])
        # Width follows the number of embedding tables instead of assuming
        # exactly four categorical features.
        input_dim = config['continuous_dim'] + len(self.embeddings) * embed_dim

        # CNN for local feature extraction
        self.cnn = nn.Sequential(
            nn.Conv1d(input_dim, 128, 3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(128, 256, 3, padding=1),
            nn.BatchNorm1d(256),
            nn.ReLU(),
        )

        # LSTM for sequential modeling
        self.lstm = nn.LSTM(256, config['d_model'], num_layers=2,
                           batch_first=True, bidirectional=True)
        # Both directions are concatenated on the feature axis (2 * d_model).
        self.proj = nn.Linear(config['d_model'] * 2, config['d_model'])

    def forward(self, continuous, categorical):
        """Run the convolutional front-end followed by the BiLSTM.

        Args:
            continuous: Float tensor ``(batch, seq, continuous_dim)``.
            categorical: Integer index tensor ``(batch, seq, n_categorical)``;
                column ``i`` is looked up in embedding table ``i``.

        Returns:
            Tensor ``(batch, seq, d_model)``.
        """
        cat_embeds = [emb(categorical[:, :, i]) for i, emb in enumerate(self.embeddings)]
        cat_concat = torch.cat(cat_embeds, dim=-1)
        x = torch.cat([continuous, cat_concat], dim=-1)

        # CNN (Conv1d expects [B, C, T], the LSTM expects [B, T, C])
        x = x.transpose(1, 2)
        x = self.cnn(x)
        x = x.transpose(1, 2)

        # LSTM
        x, _ = self.lstm(x)
        x = self.proj(x)

        return x


class TransformerLSTM(nn.Module):
    """Transformer attention + LSTM for hybrid modeling.

    Input features are projected to ``config['d_model']``, passed through a
    two-layer transformer encoder (8 heads) and then through a two-layer
    unidirectional LSTM. No positional encoding is added before the
    transformer.

    Args:
        config: Mapping providing ``'continuous_dim'``, ``'categorical_dims'``
            (one vocabulary size per categorical feature) and ``'d_model'``
            (must be divisible by the 8 attention heads).
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = 16
        self.embeddings = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in config['categorical_dims']
        ])
        # Width follows the number of embedding tables instead of assuming
        # exactly four categorical features.
        input_dim = config['continuous_dim'] + len(self.embeddings) * embed_dim

        self.input_proj = nn.Linear(input_dim, config['d_model'])

        # Transformer for global context
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config['d_model'],
            nhead=8,
            dim_feedforward=config['d_model'] * 2,
            dropout=0.1,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)

        # LSTM for sequential refinement
        self.lstm = nn.LSTM(config['d_model'], config['d_model'],
                           num_layers=2, batch_first=True)

    def forward(self, continuous, categorical):
        """Run self-attention over the sequence, then the LSTM.

        Args:
            continuous: Float tensor ``(batch, seq, continuous_dim)``.
            categorical: Integer index tensor ``(batch, seq, n_categorical)``;
                column ``i`` is looked up in embedding table ``i``.

        Returns:
            Tensor ``(batch, seq, d_model)`` with the per-step LSTM outputs.
        """
        cat_embeds = [emb(categorical[:, :, i]) for i, emb in enumerate(self.embeddings)]
        cat_concat = torch.cat(cat_embeds, dim=-1)
        x = torch.cat([continuous, cat_concat], dim=-1)
        x = self.input_proj(x)

        # Transformer
        x = self.transformer(x)

        # LSTM
        x, _ = self.lstm(x)

        return x


# Smoke-test the hybrid models: build both on the target device and run the
# dummy batch through them with autograd disabled.
conv_lstm, transformer_lstm = (
    model_cls(CONFIG).to(device) for model_cls in (ConvLSTM, TransformerLSTM)
)

with torch.no_grad():
    out1, out2 = (
        net(dummy_continuous, dummy_categorical)
        for net in (conv_lstm, transformer_lstm)
    )

print(f"ConvLSTM: {out1.shape}")
print(f"TransformerLSTM: {out2.shape}")

## 7. Comparison Analysis

Compare all architectures on parameter count and inference latency (accuracy is not evaluated in this notebook).

In [None]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters with ``requires_grad`` set to ``False`` are excluded.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def benchmark_model(model, continuous, categorical, num_runs=50):
    """Benchmark inference speed.

    The model is switched to eval mode (and left there), warmed up with five
    untimed forward passes, then timed for ``num_runs`` forward passes with
    autograd disabled.

    CUDA and MPS execute kernels asynchronously, so the device is
    synchronized before each timer is read; otherwise only the kernel launch
    would be measured. ``time.perf_counter`` is used because it is a
    monotonic, high-resolution clock intended for measuring intervals.

    Args:
        model: Module called as ``model(continuous, categorical)``.
        continuous: First input; its ``device`` decides which backend to
            synchronize.
        categorical: Second input, passed through unchanged.
        num_runs: Number of timed forward passes.

    Returns:
        Tuple ``(mean_ms, std_ms)`` of per-call latency in milliseconds.
    """
    model.eval()

    # Inputs without a .device attribute are treated as CPU data.
    device_type = getattr(getattr(continuous, 'device', None), 'type', 'cpu')

    def _wait_for_device():
        """Block until queued accelerator work has finished (no-op on CPU)."""
        if device_type == 'cuda':
            torch.cuda.synchronize()
        elif device_type == 'mps' and hasattr(torch, 'mps'):
            torch.mps.synchronize()

    times = []
    with torch.no_grad():
        # Warmup: absorbs one-off costs such as lazy initialisation.
        for _ in range(5):
            model(continuous, categorical)
        _wait_for_device()

        # Benchmark
        for _ in range(num_runs):
            start = time.perf_counter()
            model(continuous, categorical)
            _wait_for_device()
            times.append((time.perf_counter() - start) * 1000)

    return np.mean(times), np.std(times)


# Registry of every architecture under comparison, in display order
models = OrderedDict(
    Linear=baseline_linear,
    MLP=baseline_mlp,
    SimpleLSTM=simple_lstm,
    BiLSTM=bidirectional_lstm,
    StackedLSTM=stacked_lstm,
    Transformer=simple_transformer,
    RelativeTransformer=relative_transformer,
    TCN=tcn,
    WaveNet=wavenet,
    ConvLSTM=conv_lstm,
    TransformerLSTM=transformer_lstm,
)

# Gather parameter count and latency statistics for each registered model
results = []
for name, model in models.items():
    params = count_parameters(model)
    mean_time, std_time = benchmark_model(model, dummy_continuous, dummy_categorical)

    results.append(dict(
        name=name,
        parameters=params,
        mean_latency_ms=mean_time,
        std_latency_ms=std_time,
    ))

# Tabulate the collected metrics
print(f"{'Model':<20} {'Parameters':<15} {'Latency (ms)':<15}")
print("-" * 50)
for r in results:
    print(f"{r['name']:<20} {r['parameters']:>12,} {r['mean_latency_ms']:>10.2f} ± {r['std_latency_ms']:.2f}")

In [None]:
# Visualize comparison: model size (left) and inference latency (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

names = [r['name'] for r in results]
params = [r['parameters'] / 1e6 for r in results]  # convert to millions
latencies = [r['mean_latency_ms'] for r in results]
latency_stds = [r['std_latency_ms'] for r in results]

# Parameter count (one colour per model, reused by both panels)
colors = plt.cm.Set3(np.linspace(0, 1, len(names)))
bars1 = axes[0].barh(names, params, color=colors)
axes[0].set_xlabel('Parameters (Millions)')
axes[0].set_title('Model Size Comparison')
axes[0].invert_yaxis()  # first model at the top

# Latency, with the standard deviation as error bars
bars2 = axes[1].barh(names, latencies, xerr=latency_stds, color=colors, capsize=3)
axes[1].set_xlabel('Inference Latency (ms)')
axes[1].set_title('Inference Speed Comparison')
axes[1].invert_yaxis()

plt.tight_layout()
# savefig does not create missing directories, so make sure reports/ exists.
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'architecture_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Efficiency analysis (params vs latency tradeoff)
fig, ax = plt.subplots(figsize=(10, 6))

# NOTE: `colors` comes from the comparison-plot cell, which must be run first.
for i, r in enumerate(results):
    ax.scatter(r['parameters'] / 1e6, r['mean_latency_ms'],
              s=100, c=[colors[i]], label=r['name'])

ax.set_xlabel('Parameters (Millions)')
ax.set_ylabel('Latency (ms)')
ax.set_title('Efficiency: Parameters vs Latency')
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')  # legend outside the axes

plt.tight_layout()
# Neither savefig nor open() creates missing directories, so make sure
# reports/ exists before writing the figure and the JSON file.
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'efficiency_tradeoff.png', dpi=150, bbox_inches='tight')
plt.show()

# Save results
results_path = reports_dir / 'architecture_comparison.json'
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)
print(f"Results saved to: {results_path}")

---

## Summary

This notebook compares various architectures:

1. **Baselines**: Linear, MLP
2. **LSTM**: Simple, Bidirectional, Stacked with residuals
3. **Transformer**: Standard (learned absolute positions), Relative positional encoding (embedding table defined but not yet applied in the forward pass)
4. **CNN**: TCN, WaveNet
5. **Hybrid**: ConvLSTM, TransformerLSTM

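Hypotheses to verify in follow-up experiments (this notebook measures only parameter count and inference latency, not accuracy):
- Transformer models offer good accuracy but higher latency
- LSTM models balance accuracy and efficiency
- Hybrid models may offer the best of both worlds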

---

**Navigation:**
← [Previous: 15_data_augmentation](15_data_augmentation.ipynb) |
[Next: 17_uncertainty_quantification](17_uncertainty_quantification.ipynb) →