# Snowcore Anomaly Detection - Cross-Asset GNN Model

This notebook implements a Graph Neural Network (GNN) for anomaly propagation prediction across the Avalanche X1 production line.

## Objective
Predict which downstream assets will show anomalies given upstream sensor drift, using a GraphSAGE architecture followed by multi-head self-attention across the asset nodes.

## Graph Structure
```
LAYUP_ROOM (ENV) ────────────────────────────────────────→
       │
LAYUP_BOT_01 → AUTOCLAVE_01 → CNC_MILL_01 → QC_STATION_01
LAYUP_BOT_02 → AUTOCLAVE_02 → CNC_MILL_02 → QC_STATION_02
```

In [None]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported below — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')
import subprocess
import sys

# Install the deep-learning stack into the interpreter that runs this kernel
# (sys.executable), so the imports that follow resolve in the same environment.
# check=True makes a failed install raise CalledProcessError instead of
# continuing silently. No versions are pinned, so each run may pull different
# releases.
print("Installing PyTorch and torch_geometric...")
subprocess.run([sys.executable, "-m", "pip", "install", "torch", "torch_geometric"], check=True)
print("Installation complete!")

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, global_mean_pool
from torch_geometric.data import Data, DataLoader
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime, timedelta

# Fix both global RNGs so the synthetic data and weight initialisation are
# reproducible across Restart & Run All.
torch.manual_seed(42)
np.random.seed(42)

# Report the runtime so readers know which backend produced the results.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 1. Define Graph Structure

In [None]:
# Node order defines the row index of every feature/label/probability array
# used later in the notebook.
ASSETS = [
    'LAYUP_ROOM',
    'LAYUP_BOT_01',
    'LAYUP_BOT_02',
    'AUTOCLAVE_01',
    'AUTOCLAVE_02',
    'CNC_MILL_01',
    'CNC_MILL_02',
    'QC_STATION_01',
    'QC_STATION_02',
]

# Asset name -> node index lookup.
ASSET_TO_IDX = dict(zip(ASSETS, range(len(ASSETS))))

# Directed edges as (source, target, edge_type, weight, lag).
# NOTE(review): the unit of `lag` is not stated anywhere in this notebook.
EDGES = [
    # Environmental influence of the layup room on both layup robots.
    ('LAYUP_ROOM', 'LAYUP_BOT_01', 'ENV_INFLUENCE', 0.8, 0),
    ('LAYUP_ROOM', 'LAYUP_BOT_02', 'ENV_INFLUENCE', 0.8, 0),
    # Material flow along production line 1 and line 2.
    ('LAYUP_BOT_01', 'AUTOCLAVE_01', 'MATERIAL_FLOW', 1.0, 2),
    ('LAYUP_BOT_02', 'AUTOCLAVE_02', 'MATERIAL_FLOW', 1.0, 2),
    ('AUTOCLAVE_01', 'CNC_MILL_01', 'MATERIAL_FLOW', 1.0, 4),
    ('AUTOCLAVE_02', 'CNC_MILL_02', 'MATERIAL_FLOW', 1.0, 4),
    ('CNC_MILL_01', 'QC_STATION_01', 'MATERIAL_FLOW', 1.0, 1),
    ('CNC_MILL_02', 'QC_STATION_02', 'MATERIAL_FLOW', 1.0, 1),
]

# 2 x num_edges index tensor: row 0 holds source node ids, row 1 target node ids.
source_ids = [ASSET_TO_IDX[edge[0]] for edge in EDGES]
target_ids = [ASSET_TO_IDX[edge[1]] for edge in EDGES]
edge_index = torch.tensor([source_ids, target_ids], dtype=torch.long)

print(f"Nodes: {len(ASSETS)}, Edges: {edge_index.shape[1]}")

## 2. Generate Synthetic Training Data

In [None]:
def generate_node_features(num_samples=1000, feature_dim=8):
    """Generate synthetic per-asset features and anomaly labels.

    Each sample is one snapshot of the whole graph. For every asset, column 0
    is drawn from U(0.6, 1.0), column 1 is always 0, and the remaining
    ``feature_dim - 2`` columns are drawn from N(0.5, 0.15) clipped to [0, 1].
    Column 2 of the LAYUP_ROOM row is treated as humidity: when it exceeds
    0.65, each autoclave is independently labelled anomalous with
    probability 0.48. All other labels stay 0.

    Args:
        num_samples: Number of graph snapshots to generate.
        feature_dim: Number of feature columns per asset. Must be at least 3
            because column 2 carries the humidity signal.

    Returns:
        Tuple ``(X, y)`` of float arrays with shapes
        ``(num_samples, len(ASSETS), feature_dim)`` and
        ``(num_samples, len(ASSETS))``.

    Raises:
        ValueError: If ``feature_dim`` is smaller than 3.
    """
    if feature_dim < 3:
        raise ValueError(
            f"feature_dim must be >= 3 (column 2 holds humidity), got {feature_dim}"
        )

    n_assets = len(ASSETS)
    # Preallocating keeps the output shape well-defined even for num_samples == 0.
    X = np.zeros((num_samples, n_assets, feature_dim))
    y = np.zeros((num_samples, n_assets))

    room_idx = ASSET_TO_IDX['LAYUP_ROOM']
    autoclave_idxs = [ASSET_TO_IDX['AUTOCLAVE_01'], ASSET_TO_IDX['AUTOCLAVE_02']]

    for sample in range(num_samples):
        for asset in range(n_assets):
            # Draw order (uniform, then normals) matches the original so that
            # seeded runs with the default feature_dim reproduce the same data.
            X[sample, asset, 0] = np.random.uniform(0.6, 1.0)
            # Column 1 is intentionally left at zero.
            X[sample, asset, 2:] = np.random.normal(0.5, 0.15, feature_dim - 2).clip(0, 1)

        # Humidity-scrap correlation: high humidity leads to downstream anomalies
        if X[sample, room_idx, 2] > 0.65:
            for idx in autoclave_idxs:
                y[sample, idx] = 1 if np.random.random() < 0.48 else 0

    return X, y

# Draw the train / validation / test sets in that order (800 / 100 / 100
# snapshots) so the global RNG stream is consumed deterministically.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    generate_node_features(split_size) for split_size in (800, 100, 100)
]

n_train, n_val, n_test = X_train.shape[0], X_val.shape[0], X_test.shape[0]
print(f"Training: {n_train}, Validation: {n_val}, Test: {n_test}")
# Mean over every (sample, asset) label, i.e. the fraction of positive node labels.
print(f"Anomaly rate: {y_train.mean():.2%}")

## 3. Define GNN Model (GraphSAGE with Attention)

In [None]:
class AssetGNN(nn.Module):
    """Two-layer GraphSAGE encoder with node-level self-attention and an MLP head.

    The forward pass returns per-node anomaly *probabilities* (a sigmoid is
    applied internally), so the output must not be passed to a loss that
    expects raw logits.

    Args:
        node_features: Number of input features per node.
        hidden_dim: Width of both SAGE layers and of the attention embedding.
        output_dim: Number of outputs per node.
        dropout: Dropout probability used after each SAGE layer, inside the
            attention module and inside the classifier head.
    """

    def __init__(self, node_features=8, hidden_dim=64, output_dim=1, dropout=0.3):
        super().__init__()
        half_dim = hidden_dim // 2

        # Message passing over the asset graph.
        self.conv1 = SAGEConv(node_features, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)

        # Self-attention across all nodes of one graph.
        self.attention = nn.MultiheadAttention(
            hidden_dim,
            num_heads=4,
            dropout=dropout,
            batch_first=True,
        )

        # Per-node classification head.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, half_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(half_dim, output_dim),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, edge_index):
        """Return per-node probabilities of shape ``(num_nodes, output_dim)``.

        Args:
            x: Node feature matrix, one row per node.
            edge_index: ``2 x num_edges`` tensor of (source, target) node ids.
        """
        hidden = self.conv1(x, edge_index)
        hidden = self.dropout(F.relu(hidden))
        hidden = self.conv2(hidden, edge_index)
        hidden = self.dropout(F.relu(hidden))

        # Treat the graph as a single batch whose sequence is the list of nodes.
        node_seq = hidden.unsqueeze(0)
        attended, _attn_weights = self.attention(node_seq, node_seq, node_seq)

        logits = self.classifier(attended.squeeze(0))
        return torch.sigmoid(logits)

# Build the network with its default hyper-parameters and report its size.
model = AssetGNN()
param_count = sum(param.numel() for param in model.parameters())
print(f"Parameters: {param_count:,}")

## 4. Training

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
edge_index = edge_index.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Positive (anomalous) labels are rare, so they are up-weighted in the loss.
POS_WEIGHT = 5.0


def criterion(probs, target):
    """Positively-weighted binary cross-entropy on probabilities.

    AssetGNN.forward already applies a sigmoid, so the loss must operate on
    probabilities. Feeding them to BCEWithLogitsLoss would apply a second
    sigmoid and squash every prediction into roughly [0.5, 0.73]. For 0/1
    targets, weighting positive elements by POS_WEIGHT and negative elements
    by 1 reproduces the `pos_weight` semantics of BCEWithLogitsLoss.
    """
    weight = 1.0 + (POS_WEIGHT - 1.0) * target
    return F.binary_cross_entropy(probs, target, weight=weight)


# Convert the numpy training set once instead of on every optimisation step.
X_train_t = torch.as_tensor(X_train, dtype=torch.float, device=device)
y_train_t = torch.as_tensor(y_train, dtype=torch.float, device=device)

train_losses = []
for epoch in range(50):
    model.train()
    epoch_loss = 0.0
    # One optimisation step per graph snapshot (batch size 1).
    for x, target in zip(X_train_t, y_train_t):
        optimizer.zero_grad()
        # squeeze(-1): (num_nodes, 1) -> (num_nodes,) to match the label vector.
        loss = criterion(model(x, edge_index).squeeze(-1), target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    train_losses.append(epoch_loss / len(X_train_t))
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}: Loss = {train_losses[-1]:.4f}")

## 5. Anomaly Propagation Visualization

In [None]:
# Test with high humidity scenario
# Every asset gets a nominal profile: column 0 = 0.9, column 1 = 0 and six
# mid-range values in the remaining columns.
test_scenario = np.zeros((len(ASSETS), 8))
for asset_row in range(len(ASSETS)):
    nominal_tail = list(np.random.uniform(0.4, 0.6, 6))
    test_scenario[asset_row] = [0.9, 0] + nominal_tail
test_scenario[0, 2] = 0.72  # High humidity in Layup Room

# Inference only: disable dropout and gradient tracking.
model.eval()
with torch.no_grad():
    scenario_tensor = torch.tensor(test_scenario, dtype=torch.float).to(device)
    probs = model(scenario_tensor, edge_index).squeeze().cpu().numpy()

print("Anomaly Propagation Prediction (High Humidity):")
for asset, prob in zip(ASSETS, probs):
    # Bucket the probability into a coarse risk label.
    if prob > 0.5:
        risk = "HIGH"
    elif prob > 0.3:
        risk = "MEDIUM"
    else:
        risk = "LOW"
    print(f"  {asset:20s}: {prob:.1%} ({risk})")

In [None]:
# Visualize graph with propagation probabilities
# Fixed (x, y) layout: x follows the production stage, y separates the lines.
POSITIONS = {
    'LAYUP_ROOM': (0, 1),
    'LAYUP_BOT_01': (1, 0),
    'LAYUP_BOT_02': (1, 2),
    'AUTOCLAVE_01': (2, 0),
    'AUTOCLAVE_02': (2, 2),
    'CNC_MILL_01': (3, 0),
    'CNC_MILL_02': (3, 2),
    'QC_STATION_01': (4, 0),
    'QC_STATION_02': (4, 2),
}

fig = go.Figure()

# Edges first so the node markers are drawn on top of them. An edge is
# coloured and sized by the higher anomaly probability of its two endpoints.
for src, tgt, *_ in EDGES:
    (x0, y0), (x1, y1) = POSITIONS[src], POSITIONS[tgt]
    strength = max(probs[ASSET_TO_IDX[src]], probs[ASSET_TO_IDX[tgt]])
    green = int(255 * (1 - strength))
    alpha = 0.3 + strength * 0.7
    edge_line = dict(color=f'rgba(255, {green}, 100, {alpha})', width=2 + strength * 6)
    edge_trace = go.Scatter(
        x=[x0, x1],
        y=[y0, y1],
        mode='lines',
        line=edge_line,
        showlegend=False,
    )
    fig.add_trace(edge_trace)

# Nodes: marker size and redness grow with the predicted probability.
for asset, (node_x, node_y) in POSITIONS.items():
    prob = probs[ASSET_TO_IDX[asset]]
    red = int(255 * prob)
    green = int(200 * (1 - prob))
    label = f"{asset.replace('_', '<br>')}<br>{prob:.0%}"
    node_trace = go.Scatter(
        x=[node_x],
        y=[node_y],
        mode='markers+text',
        marker=dict(size=50 + prob * 30, color=f'rgb({red}, {green}, 100)'),
        text=label,
        textfont=dict(size=9, color='white'),
        showlegend=False,
    )
    fig.add_trace(node_trace)

hidden_axis = dict(showgrid=False, showticklabels=False)
fig.update_layout(
    title='GNN Anomaly Propagation (High Humidity Scenario)',
    template='plotly_dark',
    height=500,
    xaxis=hidden_axis,
    yaxis=hidden_axis,
)
fig.show()

## 6. Write Propagation Scores to Snowflake

In [None]:
from snowflake.snowpark.context import get_active_session
import pandas as pd
from datetime import datetime

session = get_active_session()
run_timestamp = datetime.now()

# (source, target) -> edge type for direct graph edges; the first occurrence wins.
direct_edge_types = {}
for src, tgt, etype, _weight, _lag in EDGES:
    direct_edge_types.setdefault((src, tgt), etype)

# Score every ordered pair of distinct assets.
# NOTE(review): pairs are not restricted to the graph's flow direction, so an
# upstream target can still be written with PROPAGATION_TYPE 'DOWNSTREAM'.
propagation_records = []
for source_idx, source_asset in enumerate(ASSETS):
    source_prob = float(probs[source_idx])
    if source_asset == 'LAYUP_ROOM':
        propagation_type = 'HUMIDITY_CASCADE'
    else:
        propagation_type = 'DOWNSTREAM'

    for target_idx, target_asset in enumerate(ASSETS):
        if source_idx == target_idx:
            continue

        pair = (source_asset, target_asset)
        is_direct = pair in direct_edge_types
        edge_type = direct_edge_types.get(pair)

        # Only sources above the 0.3 probability threshold carry a score.
        target_prob = float(probs[target_idx])
        if source_prob > 0.3:
            prop_score = abs(target_prob - source_prob)
        else:
            prop_score = 0.0

        # Keep direct edges unconditionally, other pairs only when the score is notable.
        if not (prop_score > 0.05 or edge_type):
            continue

        propagation_records.append({
            'SOURCE_ASSET': source_asset,
            'TARGET_ASSET': target_asset,
            'PROPAGATION_SCORE': round(prop_score, 4),
            'PROPAGATION_TYPE': propagation_type,
            'EDGE_TYPE': edge_type,
            # -1 marks pairs that are not joined by a direct edge.
            'HOP_DISTANCE': 1 if is_direct else -1,
            'CONFIDENCE': round(source_prob, 4)
        })

print(f'Writing {len(propagation_records)} propagation scores to Snowflake...')
prop_df = pd.DataFrame(propagation_records)
# overwrite=False: rows are appended to the existing table.
session.write_pandas(
    prop_df,
    'GNN_PROPAGATION_SCORES',
    database='SNOWCORE_PDM',
    schema='PDM',
    overwrite=False,
)
print(f'  PDM.GNN_PROPAGATION_SCORES: {len(prop_df)} rows written')
print('\n[OK] GNN propagation scores saved to Snowflake')