# Graph Neural Networks for Stock Relationship Modeling

This notebook demonstrates how to use Graph Neural Networks (GNNs) to model relationships between stocks and predict returns based on network effects.

## Key Concepts
- **Graph Construction**: Build a stock network — synthetic here (sector membership plus random links); with real data, from correlations or supply-chain relationships
- **Graph Convolutional Network (GCN)**: Aggregate information from connected stocks
- **Node Classification/Regression**: Predict individual stock returns using network context

## Requirements
```bash
pip install torch torch-geometric numpy pandas matplotlib networkx scikit-learn
```

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import warnings
warnings.filterwarnings('ignore')

# Try to import torch_geometric, provide fallback message if not available
try:
    from torch_geometric.nn import GCNConv, GATConv
    from torch_geometric.data import Data
    TORCH_GEOMETRIC_AVAILABLE = True
except ImportError:
    print("torch_geometric not installed. Install with:")
    print("pip install torch-geometric")
    TORCH_GEOMETRIC_AVAILABLE = False

# Seed both random number generators (PyTorch and NumPy) with the same value
# so repeated runs of the notebook draw the same numbers.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

## 1. Generate Synthetic Stock Network Data

In [None]:
def generate_stock_network(n_stocks=50, n_days=500, n_sectors=5, spillover_strength=0.3):
    """
    Generate synthetic stock data with network effects.

    Each day's return is the sum of a market factor, a sector factor and
    idiosyncratic noise. From the second day on, a spillover term is added:
    the previous day's returns averaged over each stock's neighbours
    (weighted by edge weight) and scaled by ``spillover_strength``.

    All randomness comes from NumPy's global random state, so call
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        n_stocks: Number of stocks (graph nodes).
        n_days: Number of daily observations to simulate.
        n_sectors: Number of sectors. Stocks are split into contiguous groups
            of ``n_stocks // n_sectors``; any remainder goes to the last sector.
        spillover_strength: Multiplier applied to the neighbour-averaged
            previous-day returns.

    Returns:
        returns: Stock returns matrix (n_days, n_stocks)
        adjacency: Symmetric weighted adjacency matrix (n_stocks, n_stocks)
            with a zero diagonal
        sectors: Sector assignments for each stock

    Raises:
        ValueError: If ``n_sectors`` is smaller than 1.
    """
    if n_sectors < 1:
        raise ValueError("n_sectors must be at least 1")

    # Create sector structure
    stocks_per_sector = n_stocks // n_sectors
    sectors = np.repeat(range(n_sectors), stocks_per_sector)
    if len(sectors) < n_stocks:
        # Stocks left over after the even split all join the last sector.
        sectors = np.concatenate([sectors, [n_sectors - 1] * (n_stocks - len(sectors))])

    # Generate sector factors
    sector_factors = np.random.randn(n_days, n_sectors) * 0.02

    # Generate market factor
    market_factor = np.random.randn(n_days) * 0.015

    # Adjacency matrix based on sector membership + random connections.
    # The pairwise loop is kept (rather than vectorised) so the sequence of
    # random draws, and therefore the output for a given seed, stays stable.
    adjacency = np.zeros((n_stocks, n_stocks))
    for i in range(n_stocks):
        for j in range(i + 1, n_stocks):
            # Same sector = higher connection probability
            prob = 0.6 if sectors[i] == sectors[j] else 0.1
            if np.random.random() < prob:
                weight = np.random.uniform(0.3, 1.0)
                adjacency[i, j] = weight
                adjacency[j, i] = weight

    # Normalize adjacency so each row sums to ~1; the epsilon keeps rows of
    # isolated stocks (no neighbours) at zero instead of dividing by zero.
    adj_norm = adjacency / (adjacency.sum(axis=1, keepdims=True) + 1e-10)

    # Generate returns with network spillover
    returns = np.zeros((n_days, n_stocks))
    for t in range(n_days):
        # Base returns: market + sector + idiosyncratic
        day_returns = (
            market_factor[t] +
            sector_factors[t, sectors] +
            np.random.randn(n_stocks) * 0.025
        )

        # Add network spillover from previous day
        if t > 0:
            day_returns += adj_norm @ returns[t - 1] * spillover_strength

        returns[t] = day_returns

    return returns, adjacency, sectors

# Generate data: universe size and history length used by the rest of the notebook
N_STOCKS, N_DAYS = 50, 500

returns, adjacency, sectors = generate_stock_network(n_stocks=N_STOCKS, n_days=N_DAYS)

# The adjacency matrix stores each undirected edge twice (i->j and j->i),
# so the count of positive entries is halved.
n_edges = (adjacency > 0).sum() // 2
sector_sizes = np.bincount(sectors)

print(f"Returns shape: {returns.shape}")
print(f"Adjacency shape: {adjacency.shape}")
print(f"Number of edges: {n_edges}")
print(f"Sectors: {sector_sizes}")

## 2. Visualize Stock Network

In [None]:
# Build a NetworkX graph from the weighted adjacency matrix (for drawing and statistics)
G = nx.from_numpy_array(adjacency)

# One Set1 colour per distinct sector; each node takes its sector's colour
n_unique_sectors = len(np.unique(sectors))
sector_colors = plt.cm.Set1(np.linspace(0, 1, n_unique_sectors))
node_colors = [sector_colors[sector_id] for sector_id in sectors]

# Force-directed layout with a fixed seed so the picture is reproducible
pos = nx.spring_layout(G, k=2, iterations=50, seed=42)

plt.figure(figsize=(12, 10))
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=200, alpha=0.8)
nx.draw_networkx_edges(G, pos, alpha=0.2, width=0.5)
nx.draw_networkx_labels(G, pos, font_size=6)

# Legend: empty scatter series act as colour swatches, one per sector
for sector_id, sector_color in enumerate(sector_colors):
    plt.scatter([], [], c=[sector_color], label=f'Sector {sector_id}', s=100)
plt.legend(loc='upper right')

edge_count = (adjacency > 0).sum() // 2
plt.title(f'Stock Network ({N_STOCKS} stocks, {edge_count} edges)')
plt.axis('off')
plt.tight_layout()
plt.show()

# Network statistics
node_degrees = [degree for _, degree in G.degree()]
print("\nNetwork Statistics:")
print(f"  Average degree: {np.mean(node_degrees):.2f}")
print(f"  Clustering coefficient: {nx.average_clustering(G):.4f}")
print(f"  Density: {nx.density(G):.4f}")

## 3. Prepare Graph Data for GNN

In [None]:
def create_graph_dataset(returns, adjacency, lookback=20, horizon=1):
    """
    Create graph dataset for GNN training.

    One sample is produced per day ``t``. Its feature window is the
    ``lookback`` days ending at day ``t - 1`` (the last observed day).

    Features per node (in this column order):
    - Most recent return (last day of the window)
    - Volatility (standard deviation of returns over the window)
    - Momentum (sum of returns over the window)

    Target: the return ``horizon`` days after the last day of the window,
    i.e. ``returns[t - 1 + horizon]``. With ``horizon=1`` this is the
    next-day return.

    Args:
        returns: Array of shape (n_days, n_stocks).
        adjacency: Array of shape (n_stocks, n_stocks); entries > 0 are edges.
        lookback: Length of the feature window in days (>= 1).
        horizon: Days between the last observed day and the target day (>= 1).

    Returns:
        X: Node features, shape (n_samples, n_stocks, 3)
        y: Targets, shape (n_samples, n_stocks)
        edge_index: Long tensor of shape (2, n_edges); one column (i, j) per
            positive ``adjacency[i, j]``, in row-major order
        edge_weights: Float32 tensor of shape (n_edges,) with the matching weights

    Raises:
        ValueError: If ``lookback`` or ``horizon`` is smaller than 1.
    """
    if lookback < 1 or horizon < 1:
        raise ValueError("lookback and horizon must both be >= 1")

    returns = np.asarray(returns)
    adjacency = np.asarray(adjacency)
    n_days, n_stocks = returns.shape

    # Create edge index from adjacency. np.nonzero walks the matrix in
    # row-major order (same order as a nested i/j loop), and vstack keeps the
    # (2, n_edges) shape even when the graph has no edges.
    src, dst = np.nonzero(adjacency > 0)
    edge_index = torch.tensor(np.vstack((src, dst)), dtype=torch.long)
    edge_weights = torch.tensor(adjacency[src, dst], dtype=torch.float32)

    # Create features and targets for each time step
    X_list = []
    y_list = []

    # The window covers days [t - lookback, t - 1]; the target day is
    # t - 1 + horizon, which must not run past the last available day.
    for t in range(lookback, n_days - horizon + 1):
        hist_returns = returns[t - lookback:t, :]  # (lookback, n_stocks)
        volatility = hist_returns.std(axis=0)
        momentum = hist_returns.sum(axis=0)
        recent_return = hist_returns[-1, :]

        # Stack features
        features = np.column_stack([
            recent_return,
            volatility,
            momentum
        ])

        X_list.append(features)
        y_list.append(returns[t + horizon - 1, :])

    if X_list:
        X = np.array(X_list)  # (n_samples, n_stocks, n_features)
        y = np.array(y_list)  # (n_samples, n_stocks)
    else:
        # Not enough history for a single sample: return correctly shaped empties.
        X = np.empty((0, n_stocks, 3))
        y = np.empty((0, n_stocks))

    return X, y, edge_index, edge_weights

# Create dataset
LOOKBACK = 20
X, y, edge_index, edge_weights = create_graph_dataset(
    returns, adjacency, lookback=LOOKBACK
)

# Train/test split: the first 80% of samples (in time order) train the models,
# the remaining 20% are held out.
split_idx = int(0.8 * len(X))
X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

n_node_features = X_train.shape[2]
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"Edge index shape: {edge_index.shape}")
print(f"Number of features per node: {n_node_features}")

## 4. Graph Convolutional Network (GCN) Model

In [None]:
if TORCH_GEOMETRIC_AVAILABLE:
    class StockGCN(nn.Module):
        """
        Graph Convolutional Network for stock return prediction.

        A stack of GCNConv layers (at least one) followed by a linear head
        that emits one scalar per node.
        """
        def __init__(self, input_dim, hidden_dim=32, n_layers=2, dropout=0.2):
            super().__init__()

            # The first layer maps input_dim -> hidden_dim; each of the
            # remaining (n_layers - 1) layers maps hidden_dim -> hidden_dim.
            in_dims = [input_dim] + [hidden_dim] * (n_layers - 1)
            self.convs = nn.ModuleList([GCNConv(in_dim, hidden_dim) for in_dim in in_dims])

            self.fc = nn.Linear(hidden_dim, 1)
            self.dropout = nn.Dropout(dropout)

        def forward(self, x, edge_index, edge_weight=None):
            # x: (n_nodes, input_dim)
            # edge_index: (2, n_edges)
            *inner_convs, last_conv = self.convs

            h = x
            # All layers but the last: convolution -> ReLU -> dropout
            for conv in inner_convs:
                h = self.dropout(F.relu(conv(h, edge_index, edge_weight)))

            # Last layer: convolution -> ReLU, no dropout before the head
            h = F.relu(last_conv(h, edge_index, edge_weight))

            # Output prediction for each node
            return self.fc(h).squeeze(-1)

    # Initialize model and move the graph tensors to the same device
    input_dim = X_train.shape[2]
    model = StockGCN(input_dim, hidden_dim=32, n_layers=2).to(device)
    edge_index = edge_index.to(device)
    edge_weights = edge_weights.to(device)

    n_gcn_params = sum(p.numel() for p in model.parameters())
    print(f"GCN parameters: {n_gcn_params:,}")
else:
    print("Skipping GCN model (torch_geometric not available)")

## 5. Train GCN Model

In [None]:
if TORCH_GEOMETRIC_AVAILABLE:
    def train_gcn(model, X_train, y_train, X_test, y_test, edge_index, edge_weights,
                  epochs=100, lr=0.01):
        """
        Train GCN model, one optimizer step per time step (graph snapshot).

        After every epoch the model is scored on the test snapshots; that
        average test loss is recorded and also fed to the ReduceLROnPlateau
        scheduler. Progress is printed every 20 epochs.

        Returns:
            train_losses: Average training MSE per epoch
            test_losses: Average test MSE per epoch
        """
        def to_device_tensor(array):
            # float32 copy of the array on the notebook's compute device
            return torch.tensor(array, dtype=torch.float32).to(device)

        X_train_t, y_train_t = to_device_tensor(X_train), to_device_tensor(y_train)
        X_test_t, y_test_t = to_device_tensor(X_test), to_device_tensor(y_test)

        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, factor=0.5)

        train_losses, test_losses = [], []

        for epoch_num in range(1, epochs + 1):
            # --- training pass: snapshots are visited in time order ---
            model.train()
            running_loss = 0
            for node_features, node_targets in zip(X_train_t, y_train_t):
                optimizer.zero_grad()
                loss = criterion(model(node_features, edge_index, edge_weights), node_targets)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

            avg_train_loss = running_loss / len(X_train_t)
            train_losses.append(avg_train_loss)

            # --- evaluation pass (no gradients, dropout disabled) ---
            model.eval()
            with torch.no_grad():
                snapshot_losses = [
                    criterion(model(node_features, edge_index, edge_weights), node_targets).item()
                    for node_features, node_targets in zip(X_test_t, y_test_t)
                ]
            avg_test_loss = sum(snapshot_losses) / len(X_test_t)
            test_losses.append(avg_test_loss)

            scheduler.step(avg_test_loss)

            if epoch_num % 20 == 0:
                print(f"Epoch {epoch_num}: Train Loss = {avg_train_loss:.6f}, Test Loss = {avg_test_loss:.6f}")

        return train_losses, test_losses

    # Train
    print("Training GCN...")
    train_losses, test_losses = train_gcn(
        model, X_train, y_train, X_test, y_test,
        edge_index, edge_weights, epochs=100
    )

    # Plot the per-epoch train and test loss curves
    plt.figure(figsize=(10, 4))
    for curve, curve_label in ((train_losses, 'Train'), (test_losses, 'Test')):
        plt.plot(curve, label=curve_label)
    plt.xlabel('Epoch')
    plt.ylabel('MSE Loss')
    plt.title('GCN Training Loss')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
else:
    print("Skipping training (torch_geometric not available)")

## 6. Compare with Baseline (No Network Information)

In [None]:
class MLPBaseline(nn.Module):
    """
    Simple MLP without graph structure.

    Three linear layers applied to each row of the input independently:
    input_dim -> hidden_dim -> hidden_dim -> 1, with ReLU after the first two
    layers and dropout after the first.

    Args:
        input_dim: Number of input features per row.
        hidden_dim: Width of both hidden layers.
        dropout: Dropout probability applied after the first hidden layer
            (exposed so it can be matched to StockGCN's ``dropout`` argument).
    """
    def __init__(self, input_dim, hidden_dim=32, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (..., input_dim) to predictions of shape (...)."""
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        # Drop the trailing size-1 dimension so the output is one scalar per row.
        x = self.fc3(x).squeeze(-1)
        return x

# Train baseline MLP: same per-time-step loop as the GCN, but every stock is
# scored from its own features only (no edges are passed to the model).
mlp = MLPBaseline(X_train.shape[2], hidden_dim=32).to(device)
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.01)
criterion = nn.MSELoss()

X_train_t, y_train_t, X_test_t, y_test_t = (
    torch.tensor(split, dtype=torch.float32).to(device)
    for split in (X_train, y_train, X_test, y_test)
)

mlp_train_losses, mlp_test_losses = [], []

print("Training MLP baseline...")
for epoch in range(1, 101):
    # One optimizer step per training time step
    mlp.train()
    total_loss = 0
    for node_features, node_targets in zip(X_train_t, y_train_t):
        optimizer.zero_grad()
        loss = criterion(mlp(node_features), node_targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    mlp_train_losses.append(total_loss / len(X_train_t))

    # Average MSE over the held-out time steps
    mlp.eval()
    with torch.no_grad():
        test_loss = sum(
            criterion(mlp(node_features), node_targets).item()
            for node_features, node_targets in zip(X_test_t, y_test_t)
        )
        mlp_test_losses.append(test_loss / len(X_test_t))

    if epoch % 20 == 0:
        print(f"Epoch {epoch}: Test Loss = {mlp_test_losses[-1]:.6f}")

final_mlp_mse = mlp_test_losses[-1]
print(f"\nFinal Test MSE (MLP): {final_mlp_mse:.6f}")
if TORCH_GEOMETRIC_AVAILABLE:
    final_gcn_mse = test_losses[-1]
    print(f"Final Test MSE (GCN): {final_gcn_mse:.6f}")
    # Relative reduction in test MSE versus the MLP, in percent
    improvement = (final_mlp_mse - final_gcn_mse) / final_mlp_mse * 100
    print(f"GCN improvement: {improvement:.2f}%")

## 7. Analyze Learned Node Embeddings

In [None]:
if TORCH_GEOMETRIC_AVAILABLE:
    from sklearn.decomposition import PCA
    from sklearn.metrics import silhouette_score

    def get_node_embeddings(model, x, edge_index, edge_weight):
        """
        Extract node embeddings from the model's convolution stack.

        Runs every conv layer (ReLU after all but the last, no dropout) and
        returns the raw output of the last conv layer as a NumPy array.
        """
        model.eval()
        *inner_convs, last_conv = model.convs
        with torch.no_grad():
            h = x
            for conv in inner_convs:
                h = F.relu(conv(h, edge_index, edge_weight))
            h = last_conv(h, edge_index, edge_weight)
        return h.cpu().numpy()

    # Get embeddings for a sample time step (the first test snapshot)
    sample_x = torch.tensor(X_test[0], dtype=torch.float32).to(device)
    embeddings = get_node_embeddings(model, sample_x, edge_index, edge_weights)

    # Reduce to 2D for visualization
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(embeddings)
    pc1, pc2 = embeddings_2d[:, 0], embeddings_2d[:, 1]

    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(pc1, pc2, c=sectors, cmap='Set1', s=100, alpha=0.7)
    plt.colorbar(scatter, label='Sector')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('GCN Node Embeddings (PCA)')
    plt.grid(True, alpha=0.3)
    plt.show()

    # Check if stocks in same sector cluster together (computed on the
    # full-dimensional embeddings, not the 2D projection)
    sil_score = silhouette_score(embeddings, sectors)
    print(f"\nSector clustering quality (silhouette score): {sil_score:.4f}")
else:
    print("Skipping embedding analysis (torch_geometric not available)")

## 8. Prediction Analysis

In [None]:
def evaluate_predictions(model, X_test, y_test, edge_index, edge_weights, use_gcn=True):
    """
    Evaluate model predictions on every test time step.

    Inputs are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable. A model
    without parameters is evaluated on the CPU.

    Args:
        model: Module called as ``model(x, edge_index, edge_weights)`` on the
            graph path, or ``model(x)`` otherwise.
        X_test: Array of shape (n_samples, n_stocks, n_features).
        y_test: Array of targets, indexed by sample.
        edge_index: Graph connectivity tensor; only used on the graph path.
        edge_weights: Edge weight tensor or None; only used on the graph path.
        use_gcn: Request the graph path. It is taken only when torch_geometric
            is also available (``TORCH_GEOMETRIC_AVAILABLE``).

    Returns:
        (predictions, targets) as NumPy arrays with one row per time step.
    """
    model.eval()

    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')
    X_test_t = torch.tensor(X_test, dtype=torch.float32).to(model_device)

    # Short-circuits when use_gcn is False, so the flag is only consulted
    # when the graph path was actually requested.
    use_graph = use_gcn and TORCH_GEOMETRIC_AVAILABLE
    if use_graph:
        # Keep the graph tensors on the same device as the model and inputs.
        edge_index = edge_index.to(model_device)
        if edge_weights is not None:
            edge_weights = edge_weights.to(model_device)

    all_preds = []
    all_targets = []

    with torch.no_grad():
        for t in range(len(X_test_t)):
            if use_graph:
                preds = model(X_test_t[t], edge_index, edge_weights)
            else:
                preds = model(X_test_t[t])
            all_preds.append(preds.cpu().numpy())
            all_targets.append(y_test[t])

    return np.array(all_preds), np.array(all_targets)

# Get predictions for every test time step. The MLP ignores the graph
# arguments because use_gcn=False routes it through model(x).
mlp_preds, targets = evaluate_predictions(
    mlp, X_test, y_test, edge_index, edge_weights,
    use_gcn=False,
)

if TORCH_GEOMETRIC_AVAILABLE:
    # The targets are the same for both models, so the second return value is discarded.
    gcn_preds, _ = evaluate_predictions(
        model, X_test, y_test, edge_index, edge_weights,
        use_gcn=True,
    )

# Calculate metrics
def calc_metrics(preds, targets):
    """
    Return (mse, mae, direction_acc) for predictions against targets.

    direction_acc is the fraction of entries where the prediction and the
    target agree on being strictly positive or not.
    """
    errors = preds - targets
    mse = np.mean(np.square(errors))
    mae = np.mean(np.abs(errors))
    # Direction accuracy: compare the "is positive" flags element-wise
    pred_is_up = preds > 0
    target_is_up = targets > 0
    direction_acc = np.mean(pred_is_up == target_is_up)
    return mse, mae, direction_acc

# Metrics are computed over all (time step, stock) pairs at once
flat_targets = targets.flatten()

mlp_mse, mlp_mae, mlp_dir = calc_metrics(mlp_preds.flatten(), flat_targets)
print("\nMLP Results:")
print(f"  MSE: {mlp_mse:.6f}")
print(f"  MAE: {mlp_mae:.6f}")
print(f"  Direction Accuracy: {mlp_dir:.2%}")

if TORCH_GEOMETRIC_AVAILABLE:
    gcn_mse, gcn_mae, gcn_dir = calc_metrics(gcn_preds.flatten(), flat_targets)
    print("\nGCN Results:")
    print(f"  MSE: {gcn_mse:.6f}")
    print(f"  MAE: {gcn_mae:.6f}")
    print(f"  Direction Accuracy: {gcn_dir:.2%}")

# Visualize predictions vs actual
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Scatter plot of a random subsample of (time step, stock) pairs. Sampling is
# without replacement, so the sample size is capped at the number of available
# points; asking for 1000 from a smaller test set would raise a ValueError.
flat_targets = targets.flatten()
flat_mlp_preds = mlp_preds.flatten()
sample_size = min(1000, flat_targets.size)
sample = np.random.choice(flat_targets.size, sample_size, replace=False)

axes[0].scatter(flat_targets[sample], flat_mlp_preds[sample], alpha=0.3, s=10, label='MLP')
if TORCH_GEOMETRIC_AVAILABLE:
    axes[0].scatter(flat_targets[sample], gcn_preds.flatten()[sample], alpha=0.3, s=10, label='GCN')
# Diagonal reference line: points on it are perfectly predicted
axes[0].plot([-0.1, 0.1], [-0.1, 0.1], 'r--', label='Perfect')
axes[0].set_xlabel('Actual Return')
axes[0].set_ylabel('Predicted Return')
axes[0].set_title('Predictions vs Actual')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Time series for one stock (first 50 test time steps)
stock_idx = 0
axes[1].plot(targets[:50, stock_idx], label='Actual', alpha=0.8)
axes[1].plot(mlp_preds[:50, stock_idx], label='MLP', alpha=0.8)
if TORCH_GEOMETRIC_AVAILABLE:
    axes[1].plot(gcn_preds[:50, stock_idx], label='GCN', alpha=0.8)
axes[1].set_xlabel('Time Step')
axes[1].set_ylabel('Return')
axes[1].set_title(f'Stock {stock_idx} Return Predictions')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Summary

This notebook demonstrated:

1. **Graph Construction**: Built stock network from sector membership and random connections
2. **Graph Convolutional Network**: GCN model that aggregates information from connected stocks
3. **Network Effects**: Compared a GCN against an MLP baseline to test whether neighbor information improves return predictions
4. **Node Embeddings**: GCN learns representations that capture sector structure

### Key Insights:
- GCN can leverage network structure to improve predictions
- Learned embeddings cluster by sector (stocks with similar connections have similar representations)
- Network effects in returns can be captured by message passing

### Extensions to Try:
- Use real stock data and build network from correlations or supply chain relationships
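- Try Graph Attention Networks (GAT), which learn attention weights over each stock's neighbors instead of using fixed edge weights (`GATConv` is already imported above)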
- Add temporal dynamics with T-GCN (Temporal Graph Convolutional Network)
- Experiment with different graph construction methods (correlation threshold, kNN, etc.)