In [2]:
pip install torch torch-geometric numpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, SAGEConv, global_mean_pool
from torch_geometric.data import Data, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Constants
N, M, K, F = 362, 9, 1000, 4  # N samples, M algorithms, K boxes, F features
class_labels = [
    "initial", "SCALE", "PFS", "PFS'", "FTA",
    "VPSC", "PRISM", "GTREE", "RWordle-L"
]

# ----------------- Data Loading and Processing -----------------
def load_and_process_data(json_path, csv_path):
    # Load data
    with open(json_path, "r") as f:
        sets_data = json.load(f)
    scores_df = pd.read_csv(csv_path)
    
    # Initialize tensors
    X = np.zeros((N, M, K, F), dtype=np.float32)
    y = np.zeros((N, M), dtype=np.float32)
    
    # Create filename to index mapping
    filename_to_idx = {entry['base_name']: idx for idx, entry in enumerate(sets_data)}
    
    # Process each entry
    for entry in sets_data:
        base_name = entry['base_name']
        idx = filename_to_idx.get(base_name, -1)
        if idx == -1:
            continue
        
        boxes = entry['boxes']  # This is already a list of boxes per algorithm
        file_scores = scores_df[scores_df['filename'] == base_name]
        
        for algo_idx, algo in enumerate(class_labels):
            # Get score
            score_row = file_scores[file_scores['algorithm'] == algo]
            if not score_row.empty:
                y[idx, algo_idx] = score_row['score'].values[0]
            
            # Get boxes - handle cases where boxes might be missing
            if algo_idx < len(boxes):
                # Check if current algorithm's boxes exist
                algo_boxes = boxes[algo_idx] if isinstance(boxes[algo_idx], list) else []
                
                # Handle single box case (convert to list of boxes)
                if algo_boxes and isinstance(algo_boxes[0], (int, float)):
                    algo_boxes = [algo_boxes]  # Make it a list containing one box
                
                # Store boxes
                for box_idx, box in enumerate(algo_boxes[:K]):
                    if isinstance(box, list) and len(box) == F:
                        X[idx, algo_idx, box_idx] = box
    
    return X, y

# Convert to PyG Data format
def create_graph_data(X, y):
    graph_data_list = []
    for i in range(X.shape[0]):
        # Node features (M*K, F)
        node_features = X[i].reshape(-1, F)
        
        # Create edges (fully connected within each algorithm)
        edge_index = []
        for algo_idx in range(M):
            start = algo_idx * K
            end = start + K
            # Fully connect boxes within same algorithm
            for u in range(start, end):
                for v in range(start, end):
                    if u != v:
                        edge_index.append([u, v])
        
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        
        graph_data_list.append(Data(
            x=torch.tensor(node_features, dtype=torch.float),
            edge_index=edge_index,
            y=torch.tensor(y[i], dtype=torch.float)
        ))
    return graph_data_list

# ----------------- GNN Models -----------------
class GATModel(nn.Module):
    def __init__(self, node_dim=F, hidden_dim=64, output_dim=M, heads=4):
        super().__init__()
        self.conv1 = GATConv(node_dim, hidden_dim, heads=heads)
        self.conv2 = GATConv(hidden_dim*heads, hidden_dim, heads=1)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.elu(self.conv1(x, edge_index))
        x = F.elu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch=torch.zeros(data.num_nodes, dtype=torch.long, device=x.device))
        return self.fc(x)

class SAGEModel(nn.Module):
    def __init__(self, node_dim=F, hidden_dim=64, output_dim=M):
        super().__init__()
        self.conv1 = SAGEConv(node_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.elu(self.conv1(x, edge_index))
        x = F.elu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch=torch.zeros(data.num_nodes, dtype=torch.long, device=x.device))
        return self.fc(x)

# ----------------- Training -----------------
def train(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for data in loader:
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def test(model, loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data in loader:
            out = model(data)
            total_loss += criterion(out, data.y).item()
    return total_loss / len(loader)

# ----------------- Main -----------------
if __name__ == "__main__":
    # Load and process data
    X, y = load_and_process_data("merge.json", "scoresB2.csv")
    print(f"Loaded data with shapes - X: {X.shape}, y: {y.shape}")
    
    # Convert to graph data
    graph_data = create_graph_data(X, y)
    print(f"Created {len(graph_data)} graph samples")
    
    # Split data
    train_data, test_data = train_test_split(graph_data, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=32)
    
    # Initialize models
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gat = GATModel().to(device)
    sage = SAGEModel().to(device)
    
    # Training setup
    criterion = nn.MSELoss()
    gat_optim = optim.Adam(gat.parameters(), lr=0.001)
    sage_optim = optim.Adam(sage.parameters(), lr=0.001)
    
    # Train and compare
    print("\nTraining GAT Model:")
    for epoch in range(100):
        loss = train(gat, train_loader, gat_optim, criterion)
        test_loss = test(gat, test_loader, criterion)
        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1:03d}, Train Loss: {loss:.4f}, Test Loss: {test_loss:.4f}")
    
    print("\nTraining GraphSAGE Model:")
    for epoch in range(100):
        loss = train(sage, train_loader, sage_optim, criterion)
        test_loss = test(sage, test_loader, criterion)
        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1:03d}, Train Loss: {loss:.4f}, Test Loss: {test_loss:.4f}")
    
    # Final comparison
    gat_test_loss = test(gat, test_loader, criterion)
    sage_test_loss = test(sage, test_loader, criterion)
    print(f"\nFinal Comparison:")
    print(f"GAT Test MSE: {gat_test_loss:.4f}")
    print(f"GraphSAGE Test MSE: {sage_test_loss:.4f}")

Loaded data with shapes - X: (362, 9, 1000, 4), y: (362, 9)


RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 143856000 bytes.