In [2]:
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv, Linear, SAGEConv, GATv2Conv, GATConv
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
import torch.nn.functional as F

import networkx as nx
import numpy as np
from pathlib import Path
import scipy as sp
rng = np.random.default_rng()

In [39]:
def create_dataset(G: nx.DiGraph):
  edge_index = torch.tensor(list(G.edges)).t().contiguous()
  return edge_index

n = 100
p = 0.1
gname = f"er_{n}_{str(p).replace('.', '')}"
path = Path(f"datasets/synthetic/{gname}")

with open(path / f"graph.mtx", "rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

cascades = []
for i in range(250):
  with open(path / f"diffusions/timestamps/{i}.txt", "r") as fh:
    cascade = []
    for line in fh:
      cascade.append(list(map(int, line.strip().split())))
    cascades.append(cascade)

#for k in (50, 100, 150, 200, 250):
k = 100
k_cascades = cascades[:k]

n = G.number_of_nodes()
m = G.number_of_edges()
edge_index = create_dataset(G)
print(edge_index)

#model = GNNIndependentCascade(64, 16, n, m, num_layers=2)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
#train_model(model, optimizer, data, k_cascades, 30, len(k_cascades) // 4)

#model.eval()
#print("")

tensor([[ 0,  0,  0,  ..., 99, 99, 99],
        [ 9, 17, 38,  ..., 86, 88, 98]])


In [14]:
class CascadeGNN(nn.Module):
    def __init__(self, num_nodes, hidden_dim=64, num_layers=3):
        super(CascadeGNN, self).__init__()
        self.num_nodes = num_nodes
        
        # Initial node embedding layer
        self.node_embedding = nn.Embedding(num_nodes, hidden_dim)
        
        # GNN layers
        self.convs = nn.ModuleList()
        for _ in range(num_layers):
            self.convs.append(GCNConv(hidden_dim, hidden_dim))
            
        # Edge probability prediction layer
        self.edge_predictor = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
        
    def forward(self, edge_index):
        # Get initial node embeddings
        x = self.node_embedding(torch.arange(self.num_nodes).to(edge_index.device))
        
        # Apply GNN layers
        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=0.1, training=self.training)
            
        # Compute edge probabilities for all edges
        edge_probabilities = {}
        for i in range(edge_index.size(1)):
            source, target = edge_index[:, i]
            combined_features = torch.cat([x[source], x[target]], dim=0)
            prob = self.edge_predictor(combined_features)
            edge_probabilities[(source.item(), target.item())] = prob
        
        return edge_probabilities



In [91]:
def compute_cascade_likelihood(num_nodes, edge_probs, cascade, eps=1e-6):
    """
    Compute the negative log likelihood of observing a cascade given edge probabilities
    
    Args:
        num_nodes: Number of nodes in the graph
        edge_probs: Dictionary mapping (source, target) tuples to probabilities
        cascade: List of lists, where cascade[i] contains nodes activated at time i
        eps: Small value to prevent log(0)
    
    Returns:
        Negative log likelihood of the cascade
    """
    log_likelihood = 0.0
    activated_nodes = set()
    
    # Process each time step
    for t in range(len(cascade)):
        prev_activated = cascade[t-1] if t-1 >= 0 else []
        curr_activated = cascade[t]
        next_activated = cascade[t+1] if t+1 < len(cascade) else []
        activated_nodes.update(curr_activated)

        #print(t)
        #print(prev_activated)
        #print(curr_activated)
        #print(next_activated)

        for v in curr_activated:
            # Probability of activation from parents
            if prev_activated:
                parents = set([u for u in range(num_nodes) if (u, v) in edge_probs and u in prev_activated])
                prob = [1 - edge_probs[(u, v)] for u in parents]
                prob = torch.cat(prob)
                prob_not_activated = torch.prod(prob)
                log_likelihood += torch.log(1 - prob_not_activated + eps)
            if next_activated:
                children = set([w for w in range(num_nodes) if (v, w) in edge_probs and w not in activated_nodes and w not in set(next_activated)])
                if not children:
                    continue
                prob = [1 - edge_probs[(v, w)] for w in children]
                prob = torch.cat(prob)
                #print(prob)
                prob_not_activated = torch.prod(prob)
                log_likelihood += torch.log(prob_not_activated + eps)
    
    return log_likelihood

def compute_loss(num_nodes, edge_probs, edge_index, cascades):
  """
  Compute the negative log-likelihood loss for multiple cascades.
  
  Args:
    num_nodes: Number of nodes in the
    edge_probs: Tensor of predicted edge probabilities
    edge_index: Tensor of shape [2, num_edges] containing edge indices
    cascades: List of cascades, where each cascade is a list of lists of activated nodes
  
  Returns:
    loss: Negative log-likelihood loss
  """
  total_log_likelihood = 0.0
  for cascade in cascades:
    total_log_likelihood += compute_cascade_likelihood(num_nodes, edge_probs, cascade)
  
  # Return negative log-likelihood as the loss
  #print(-total_log_likelihood)
  return -total_log_likelihood

In [82]:
def train_cascade_gnn(model, num_nodes, edge_index, cascades, num_epochs=100, lr=0.001):
    """
    Train the GNN model using the observed cascades
    
    Args:
        model: CascadeGNN model
        num_nodes: Number of nodes in the graph
        edge_index: Tensor of shape [2, num_edges] containing edge indices
        cascades: List of cascades, where each cascade is a list of lists
        num_epochs: Number of training epochs
        lr: Learning rate
    """
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        
        # Get edge probabilities
        edge_probs = model.forward(edge_index)
        
        # Compute total negative log likelihood across all cascades
        total_loss = compute_loss(num_nodes, edge_probs, edge_index, cascades)
            
        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()
        
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss.item():.4f}")
            
    return model

In [98]:
model = CascadeGNN(n, hidden_dim=64, num_layers=3)
trained_model = train_cascade_gnn(model, n, edge_index, k_cascades, num_epochs=50, lr=0.001)

Epoch 10/50, Loss: 12770.5693
Epoch 20/50, Loss: 11774.3955
Epoch 30/50, Loss: 11555.9971
Epoch 40/50, Loss: 11541.6787
Epoch 50/50, Loss: 11544.8721


In [99]:
edge_probs = trained_model(edge_index)
l1_errors = []
#print(edge_probs)

residuals = []
for i, e in enumerate(G.edges()):
  u, v = e
  p = G[u][v]['weight']
  residuals.append(abs(p - edge_probs[e].item()))

l1_errors.append(sum(residuals) / len(residuals))
print(l1_errors)

[0.11307469791069746]
