In [2]:
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv, Linear, SAGEConv, GATv2Conv, GATConv
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
import torch.nn.functional as F

import networkx as nx
import numpy as np
from pathlib import Path
import scipy as sp
rng = np.random.default_rng()

  Referenced from: <E87A820F-D734-3F45-AFBE-9D80043A97C0> /Users/sbharg/homework/ut_austin/ece381k_mlnetworks/gnn_influence_probs/.venv/lib/python3.12/site-packages/libpyg.so
  Reason: tried: '/Library/Frameworks/Python.framework/Versions/3.12/Python' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Library/Frameworks/Python.framework/Versions/3.12/Python' (no such file), '/Library/Frameworks/Python.framework/Versions/3.12/Python' (no such file)


In [3]:
def compute_cascade_likelihood(edge_probs, edge_index, cascade, epsilon=1e-8):
  """
  Compute the likelihood of observing a single cascade given edge probabilities.
  
  Args:
  - edge_probs: Tensor of predicted edge probabilities
  - edge_index: Tensor of shape [2, num_edges] containing edge indices
  - cascade: List of lists, where each inner list contains nodes activated at that time step
  - epsilon: Small value to avoid log(0)
  
  Returns:
  - log_likelihood: Log-likelihood of the cascade
  """
  device = edge_probs.device
  num_nodes = edge_index.max().item() + 1
  activated = torch.zeros(num_nodes, dtype=torch.bool, device=device)
  log_likelihood = 0.0

  src, dst = edge_index

  for t in range(len(cascade)):
    prev_activated = torch.tensor(cascade[t-1] if t-1 >= 0 else [], device=device)
    curr_activated = torch.tensor(cascade[t], device=device)
    next_activated = torch.tensor(cascade[t+1] if t+1 < len(cascade) else [], device=device)
    activated[curr_activated] = True
    
    # Probability of activation from parents
    for v in curr_activated:
      parents = src[(dst == v) & activated[src]]
      activated_parents = parents[torch.isin(parents, prev_activated)]
      if len(activated_parents) > 0:
        prob_v_activated = 1 - torch.prod(1 - edge_probs[torch.isin(src, activated_parents) & (dst == v)])
        log_likelihood += torch.log(prob_v_activated + epsilon)

    
    # Probability of non-activation of children
    for v in curr_activated:
      children = dst[(src == v) & ~activated[dst]]
      non_activated_children = children[~torch.isin(children, next_activated)]
      if len(non_activated_children) > 0:
        prob_children_not_activated = torch.prod(1 - edge_probs[(src == v) & torch.isin(dst, non_activated_children)])
        log_likelihood += torch.log(prob_children_not_activated + epsilon)
    

  return log_likelihood

  '''
  device = edge_probs.device
  num_nodes = edge_index.max().item() + 1
  activated = torch.zeros(num_nodes, dtype=torch.bool, device=device)
  log_likelihood = 0.0

  for t, activated_nodes in enumerate(cascade):
    if t == 0:
      activated[activated_nodes] = True
      continue

    # Compute activation probabilities for this step
    src, dst = edge_index
    mask = activated[src] & ~activated[dst]
    relevant_probs = edge_probs[mask]
    relevant_dst = dst[mask]

    # Compute likelihood of activations and non-activations
    new_activations = torch.tensor(activated_nodes, device=device)
    activated_probs = relevant_probs[torch.isin(relevant_dst, new_activations)]
    non_activated_probs = relevant_probs[~torch.isin(relevant_dst, new_activations)]

    log_likelihood += torch.sum(torch.log(activated_probs + epsilon))
    log_likelihood += torch.sum(torch.log(1 - non_activated_probs + epsilon))

    # Update activated nodes
    activated[activated_nodes] = True

  return log_likelihood
  '''

def compute_loss(edge_probs, edge_index, cascades):
  """
  Compute the negative log-likelihood loss for multiple cascades.
  
  Args:
  - edge_probs: Tensor of predicted edge probabilities
  - edge_index: Tensor of shape [2, num_edges] containing edge indices
  - cascades: List of cascades, where each cascade is a list of lists of activated nodes
  
  Returns:
  - loss: Negative log-likelihood loss
  """
  total_log_likelihood = 0.0
  for cascade in cascades:
    total_log_likelihood += compute_cascade_likelihood(edge_probs, edge_index, cascade)
  
  # Return negative log-likelihood as the loss
  return -total_log_likelihood

In [23]:
def create_dataset(G: nx.DiGraph, features):
  # Create a PyG Data object from the networkx graph
  edge_index = torch.tensor(list(G.edges)).t().contiguous()
  #x = torch.tensor(features, dtype=torch.float)
  data = Data(x=features, edge_index=edge_index)
  return data

n = 100
p = 0.1
gname = f"er_{n}_{str(p).replace('.', '')}"
path = Path(f"datasets/synthetic/{gname}")

with open(path / f"{gname}.mtx", "rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)
with open(path / "feats.npy", "rb") as fh:
  features_npy = torch.tensor(np.load(fh), dtype=torch.float)

cascades = []
idxes = rng.choice(500, 100, replace=False)
for i in idxes:
  with open(path / f"diffusions/timestamps/{i}.txt", "r") as fh:
    cascade = []
    for line in fh:
      cascade.append(list(map(int, line.strip().split())))
    cascades.append(cascade)

In [8]:
class GNNIndependentCascade(torch.nn.Module):
  def __init__(self, num_node_features, hidden_dim, num_layers=2):
    super(GNNIndependentCascade, self).__init__()
    self.num_layers = num_layers
    self.convs = nn.ModuleList([
      GATConv(num_node_features if i == 0 else hidden_dim, hidden_dim) 
      for i in range(num_layers)]
    )

    self.edge_predictor = nn.Sequential(
      nn.Linear(2 * hidden_dim, hidden_dim),
      nn.ELU(),
      nn.Linear(hidden_dim, 1),
      nn.ELU()
    )

  def forward(self, data):
    x, edge_index = data.x, data.edge_index

    # Node embedding
    for i in range(self.num_layers):
      x = self.convs[i](x, edge_index)
      x = torch.relu(x)
      x = torch.dropout(x, p=0.1, train=self.training)

    # Edge probability prediction
    row, col = edge_index
    edge_features = torch.cat([x[row], x[col]], dim=1)
    edge_probs = torch.sigmoid(self.edge_predictor(edge_features).squeeze())
    #edge_probs = torch.sigmoid(torch.sum(x[row] * x[col], dim=1))

    return edge_probs

def train_model(model, optimizer, data, cascades, num_epochs, batch_size = 50):
  model.train()
  #batches = DataLoader(cascades, batch_size=10, shuffle=True)
  #print(batches)

  for epoch in range(num_epochs):
    loss = 0.0
    rng.shuffle(cascades)
    batches = [cascades[i:i+batch_size] for i in range(0, len(cascades), batch_size)]

    for batch in batches:
      optimizer.zero_grad()
      edge_probs = model.forward(data)

      batch_loss = compute_loss(edge_probs, data.edge_index, batch)
      batch_loss.backward()
      optimizer.step()
      loss += batch_loss.item()
    
    if epoch % 10 == 0 or epoch == num_epochs - 1:
      print(f"Epoch {epoch+1}/{num_epochs}, Total Loss: {loss:.4f}")
      #print(edge_probs[0])
      #print(edge_probs[1])
      #print('\n')

features_eye = torch.eye(G.number_of_nodes())
data = create_dataset(G, features_eye)
model = GNNIndependentCascade(data.num_features, 64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
train_model(model, optimizer, data, cascades, 100)

Epoch 1/100, Total Loss: 13852.4717
Epoch 11/100, Total Loss: 12411.3013
Epoch 21/100, Total Loss: 12263.2939
Epoch 31/100, Total Loss: 12240.7729
Epoch 41/100, Total Loss: 12237.9585
Epoch 51/100, Total Loss: 12233.4639
Epoch 61/100, Total Loss: 12230.7671
Epoch 71/100, Total Loss: 12230.2842
Epoch 81/100, Total Loss: 12227.9946
Epoch 91/100, Total Loss: 12219.9272
Epoch 100/100, Total Loss: 12208.3076


In [312]:
model.eval()
l1_error = 0
l2_error = 0
edge_probs = model(data)

for i, e in enumerate(G.edges()):
  u, v = e
  p = G[u][v]['weight']
  l1_error += abs(p - edge_probs[i].item())
  l2_error += (p - edge_probs[i].item())**2

print(l1_error)
print(l1_error / G.number_of_edges())
print(edge_probs)

678.8665602754405
0.7034886634978659
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
   

In [6]:
class GNNIndependentCascade(torch.nn.Module):
  def __init__(self, in_dim, hidden_dim, n_nodes, n_edges, num_layers=2):
    super(GNNIndependentCascade, self).__init__()
    self.num_layers = num_layers

    self.node_embed = nn.Embedding(n_nodes, in_dim)
    self.edge_embed = nn.Parameter(torch.rand((n_edges, hidden_dim)))

    self.convs = nn.ModuleList([
      SAGEConv(in_dim if i == 0 else hidden_dim, hidden_dim) 
      for i in range(num_layers)
    ])

    self.edge_predictor = nn.Sequential(
      nn.Linear(hidden_dim, hidden_dim),
      nn.ELU(),
      nn.Linear(hidden_dim, 1),
    )
    #nn.Linear(hidden_dim, 1)

  def forward(self, data):
    x, edge_index = data.x, data.edge_index

    # Node embedding
    #for i in range(self.num_layers):
    #  x = self.convs[i](x, edge_index)
    #  x = torch.relu(x)
    #  x = torch.dropout(x, p=0.1, train=self.training)

    # Edge probability prediction
    edge_probs = torch.sigmoid(self.edge_predictor(self.edge_embed))

    #edge_probs = torch.sigmoid(torch.sum(x[row] * x[col], dim=1))

    return edge_probs

def train_model(model, optimizer, data, cascades, num_epochs, batch_size = 50):
  model.train()
  #batches = DataLoader(cascades, batch_size=10, shuffle=True)
  #print(batches)

  for epoch in range(num_epochs):
    loss = 0.0
    rng.shuffle(cascades)
    batches = [cascades[i:i+batch_size] for i in range(0, len(cascades), batch_size)]

    for batch in batches:
      optimizer.zero_grad()
      edge_probs = model.forward(data)

      batch_loss = compute_loss(edge_probs, data.edge_index, batch)
      batch_loss.backward()
      optimizer.step()
      loss += batch_loss.item()
    
    if epoch % 10 == 0 or epoch == num_epochs - 1:
      print(f"Epoch {epoch+1}/{num_epochs}, Total Loss: {loss:.4f}")
      #print(edge_probs[0])
      #print(edge_probs[1])
      #print('\n')

n = G.number_of_nodes()
features_embed = nn.Embedding(n, n)
data = create_dataset(G, features_embed(torch.arange(n)))
model = GNNIndependentCascade(data.num_features, 64, len(data.x), len(data.edge_index[0]), num_layers=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
train_model(model, optimizer, data, cascades, 100)

Epoch 1/100, Total Loss: 13271.6597
Epoch 11/100, Total Loss: 10534.9819
Epoch 21/100, Total Loss: 9552.8940
Epoch 31/100, Total Loss: 9453.4146
Epoch 41/100, Total Loss: 9430.0796
Epoch 51/100, Total Loss: 9422.7651
Epoch 61/100, Total Loss: 9424.0000
Epoch 71/100, Total Loss: 9418.0762
Epoch 81/100, Total Loss: 9417.8579
Epoch 91/100, Total Loss: 9419.9067
Epoch 100/100, Total Loss: 9419.1880


In [7]:
model.eval()
l1_error = 0
l2_error = 0
edge_probs = model(data)

for i, e in enumerate(G.edges()):
  u, v = e
  p = G[u][v]['weight']
  l1_error += abs(p - edge_probs[i].item())
  l2_error += (p - edge_probs[i].item())**2

print(l1_error)
print(l1_error / G.number_of_edges())
print(edge_probs)

95.2113658612902
0.0986646278355339
tensor([[6.1738e-02],
        [7.9006e-01],
        [4.0442e-01],
        [4.4102e-01],
        [8.2723e-01],
        [6.6342e-01],
        [3.3259e-01],
        [4.0768e-01],
        [1.7554e-01],
        [5.3576e-01],
        [5.7342e-01],
        [2.8838e-01],
        [2.6197e-01],
        [9.4849e-02],
        [2.8793e-01],
        [2.8896e-02],
        [2.3368e-01],
        [3.3511e-01],
        [1.2920e-03],
        [4.6355e-01],
        [2.6946e-01],
        [3.1625e-01],
        [1.6136e-01],
        [8.2319e-02],
        [7.2024e-01],
        [1.6765e-01],
        [1.4062e-01],
        [2.5346e-01],
        [1.4348e-01],
        [1.7394e-01],
        [5.1224e-01],
        [5.3003e-01],
        [7.0310e-02],
        [6.7588e-01],
        [4.4773e-01],
        [4.0013e-01],
        [6.8070e-02],
        [2.6520e-01],
        [1.3172e-03],
        [4.7475e-01],
        [3.6202e-01],
        [4.5034e-01],
        [5.1787e-01],
        [3.3955e-0

In [38]:
class GNNIndependentCascade(torch.nn.Module):
  def __init__(self, in_dim, hidden_dim, n_nodes, n_edges, num_layers=2):
    super(GNNIndependentCascade, self).__init__()
    self.n = n_nodes
    self.m = n_edges

    self.num_layers = num_layers

    self.node_embed = nn.Embedding(n_nodes, in_dim)
    #self.edge_embed = nn.Parameter(torch.rand((n_edges, hidden_dim)))
    self.edge_embed = nn.Parameter(torch.Tensor(n_nodes, n_nodes, hidden_dim))

    self.convs = nn.ModuleList([
      GCNConv(in_dim if i == 0 else hidden_dim, hidden_dim) 
      for i in range(num_layers)
    ])

    self.edge_predictor = nn.Sequential(
      nn.Linear(2 * hidden_dim + hidden_dim, hidden_dim),
      nn.ELU(),
      nn.Linear(hidden_dim, 1),
    )
    #nn.Linear(hidden_dim, 1)

  def forward(self, data):
    edge_index = data.edge_index
    x = self.node_embed(torch.arange(self.n))

    # Node embedding
    for i in range(self.num_layers):
      x = self.convs[i](x, edge_index)
      x = F.gelu(x)
      x = torch.dropout(x, p=0.1, train=self.training)

    # Edge probability prediction
    src, dst = edge_index
    edge_feat = self.edge_embed[src, dst]
    edge_repr = torch.cat([x[src], x[dst], edge_feat], dim=1)
    edge_probs = torch.sigmoid(self.edge_predictor(edge_repr))

    #edge_probs = torch.sigmoid(torch.sum(x[row] * x[col], dim=1))

    return edge_probs

def train_model(model, optimizer, data, cascades, num_epochs, batch_size = 50):
  model.train()
  #batches = DataLoader(cascades, batch_size=10, shuffle=True)
  #print(batches)

  for epoch in range(num_epochs):
    loss = 0.0
    rng.shuffle(cascades)
    batches = [cascades[i:i+batch_size] for i in range(0, len(cascades), batch_size)]

    for batch in batches:
      optimizer.zero_grad()
      edge_probs = model.forward(data)

      batch_loss = compute_loss(edge_probs, data.edge_index, batch)
      batch_loss.backward()
      optimizer.step()
      loss += batch_loss.item()
    
    if epoch % 10 == 0 or epoch == num_epochs - 1:
      print(f"Epoch {epoch+1}/{num_epochs}, Total Loss: {loss:.4f}")

n = G.number_of_nodes()
m = G.number_of_edges()
features = torch.eye(n)
data = create_dataset(G, features)

model = GNNIndependentCascade(64, 128, n, m, num_layers=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
train_model(model, optimizer, data, cascades, 100, len(cascades) // 2)

Epoch 1/100, Total Loss: 12891.2979
Epoch 11/100, Total Loss: 9531.5522
Epoch 21/100, Total Loss: 9340.5542
Epoch 31/100, Total Loss: 9307.4482
Epoch 41/100, Total Loss: 9307.5200
Epoch 51/100, Total Loss: 9309.3545
Epoch 61/100, Total Loss: 9307.1001
Epoch 71/100, Total Loss: 9303.3945
Epoch 81/100, Total Loss: 9301.2231
Epoch 91/100, Total Loss: 9303.2954
Epoch 100/100, Total Loss: 9302.9438


In [39]:
model.eval()
l1_error = 0
l2_error = 0
edge_probs = model(data)

for i, e in enumerate(G.edges()):
  u, v = e
  p = G[u][v]['weight']
  l1_error += abs(p - edge_probs[i].item())
  l2_error += (p - edge_probs[i].item())**2

print(l1_error)
print(l1_error / G.number_of_edges())
print(edge_probs)

94.76169292003178
0.0981986455129863
tensor([[9.3022e-02],
        [6.3758e-01],
        [4.3496e-01],
        [3.8436e-01],
        [9.0986e-01],
        [6.2361e-01],
        [5.6749e-01],
        [4.8672e-01],
        [6.5809e-01],
        [6.7918e-01],
        [5.9955e-01],
        [7.0689e-03],
        [4.3678e-01],
        [1.5315e-02],
        [8.9735e-02],
        [2.6516e-04],
        [5.7472e-04],
        [1.0013e-03],
        [6.6801e-04],
        [6.4688e-01],
        [3.1434e-01],
        [1.0394e-01],
        [3.3242e-01],
        [2.6411e-01],
        [4.1514e-01],
        [5.7190e-04],
        [2.4836e-01],
        [1.5797e-01],
        [5.2332e-04],
        [5.1393e-01],
        [2.9643e-01],
        [5.8134e-01],
        [3.2381e-04],
        [3.7160e-01],
        [2.8276e-01],
        [3.7047e-01],
        [8.4071e-02],
        [4.1159e-01],
        [1.7177e-01],
        [3.7117e-01],
        [4.9841e-01],
        [1.6960e-01],
        [3.6264e-01],
        [1.5552e-