In [2]:
from collections import defaultdict
import torch
import os   
import pandas as pd
from torch_geometric.datasets import TUDataset
from torch_geometric.data import Data, InMemoryDataset

In [3]:
# Record the installed PyTorch build so the results below can be tied to it.
print(torch.__version__)

2.4.0.dev20240417


In [4]:
# Load the MUTAG benchmark from the TUDataset collection, stored under data/TUDataset.
dataset = TUDataset(
    root='data/TUDataset',
    name='MUTAG',
)

In [5]:
# Take the first graph of the dataset as a sample to inspect.
data = dataset[0]

In [6]:
# Show the sample graph's summary, a separator, then a few descriptive statistics.
print(data)
print('=============================================================')

# Each statistic goes on its own line; a single print call emits them all.
print(
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
    sep='\n',
)

Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])
Number of nodes: 17
Number of edges: 38
Average node degree: 2.24
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [7]:
## This is for an undirected graph

def edge_index(path):
    """Read an undirected edge list and return it as two directed endpoint lists.

    Args:
        path: Text file with one edge per line, written as two node ids
            separated by a single space, with no header row.

    Returns:
        A ``(sources, dests)`` tuple of equal-length lists of Python ints.
        Every input line ``u v`` contributes the pair ``u -> v`` immediately
        followed by ``v -> u``, so the result describes both directions.
    """
    edge_df = pd.read_csv(path, sep=" ", header=None)
    edge_df.columns = ['u', 'v']

    sources = []
    dests = []
    # Read the columns by label. Integer keys on a label-indexed row depend on
    # pandas' deprecated positional fallback, and iterrows() is far slower than
    # walking the two columns directly.
    for u, v in zip(edge_df['u'].tolist(), edge_df['v'].tolist()):
        sources += (u, v)
        dests += (v, u)
    return sources, dests
# Location of the Facebook network's space-separated edge list.
ADJACENCY_PATH = "/Users/rezatabrizi/Downloads/facebook/adj.txt"

# Row 0 holds edge sources and row 1 edge destinations, as integer node ids.
sources, dests = edge_index(ADJACENCY_PATH)
edge_index_tensor = torch.tensor(
    [sources, dests],
    dtype=torch.long,
)

In [8]:
class CascadeRegression(InMemoryDataset):
    """In-memory dataset of diffusion cascades over one fixed network.

    Despite its name the class serves both tasks:

    * ``task='regression'``: one graph-level float target per cascade.
    * ``task='classification'``: one 0/1 label per node, marking the nodes
      that end up activated in the cascade.

    Every sample shares ``edge_index_tensor`` and has node features
    ``[activation, degree centrality, eigenvector centrality, betweenness
    centrality]``, where ``activation`` is 1 for the nodes known to be active
    at input time.

    Raw files are read from ``<root>/raw/<task>/<name>/``:

    * ``node_features.txt``: ``node deg eig btw`` per line, whitespace separated.
    * ``cascades.txt`` (regression): first token is skipped, the middle tokens
      are seed node ids and the last token is the float target.
    * ``cascades.txt`` (classification): first token is skipped, the remaining
      tokens are ``node:time`` pairs.

    Processed data is cached under
    ``<root>/processed/<task>/<name>/<observation>/data.pt``.

    NOTE: the cache location depends only on root/task/name/observation.
    After changing the raw files, ``edge_index_tensor`` or this processing
    code, delete that directory so the samples are rebuilt.

    Args:
        root: Dataset root directory.
        name: Network name, used as a sub-directory.
        edge_index_tensor: ``[2, num_edges]`` long tensor shared by all samples.
        task: ``'classification'`` or ``'regression'``.
        observation: Observation time. For classification it must lie in
            ``[0, 3]``; nodes activated at or before it form the input.
    """

    def __init__(self, root, name, edge_index_tensor, task, observation):
        assert task in ['classification', 'regression'], "Task must be either 'classification' or 'regression'"
        if task == 'classification':
            assert 0 <= observation <= 3, "Observation time must be between 0 and 3 for classification tasks"

        # Set these before the base constructor runs: raw_dir, processed_dir
        # and process() all read them.
        self.name = name
        self.task = task
        self.observation = observation
        self.edge_index_tensor = edge_index_tensor
        super(CascadeRegression, self).__init__(root)
        self.load(self.processed_paths[0])

    @property
    def raw_dir(self):
        """Directory holding the raw text files for this task and network."""
        return os.path.join(self.root, "raw", self.task, self.name)

    @property
    def processed_dir(self):
        """Cache directory, separated by task, network and observation time."""
        return os.path.join(self.root, "processed", self.task, self.name, str(self.observation))

    @property
    def num_features(self):
        """Number of node feature columns, or 0 when the data has no ``x``."""
        return self.data.x.size(1) if self.data.x is not None else 0

    @property
    def raw_file_names(self):
        """Raw files expected in ``raw_dir``."""
        files = ['cascades', 'node_features']
        return [f'{file}.txt' for file in files]

    @property
    def processed_file_names(self):
        """Name of the single processed file inside ``processed_dir``."""
        return 'data.pt'

    def __repr__(self):
        return f'{self.__class__.__name__} network: {self.name} task: {self.task} observation-window t={str(self.observation)} (Number of graphs: {len(self)}, Number of features [activation, deg cent, eigen cent, btwns cemt]: {self.num_features})'

    def process(self):
        """Turn the raw text files into ``Data`` objects and save them.

        Raises:
            ValueError: If the node feature file has no entries or contains a
                negative node id.
        """
        cascades_file_path, node_features_file_path = self.raw_paths
        with open(cascades_file_path, 'r') as file:
            cascades_data = file.readlines()

        with open(node_features_file_path, 'r') as file:
            node_features_data = file.readlines()

        # node id -> [degree, eigenvector, betweenness] centralities.
        # A plain dict is used so that lookups never insert phantom entries.
        node_features = {}
        for node_feature in node_features_data:
            fields = node_feature.split()
            if not fields:
                continue  # tolerate blank or trailing lines
            node, degree_centrality, eigenvector_centrality, betweeness_centrality = fields
            node_features[int(node)] = [
                float(degree_centrality),
                float(eigenvector_centrality),
                float(betweeness_centrality),
            ]

        if not node_features:
            raise ValueError(f"No node features found in {node_features_file_path}")
        if min(node_features) < 0:
            raise ValueError("Node ids must be non-negative because they are used as row indices")

        # Rows are indexed by node id, so size the matrix by the largest id and
        # zero-fill it. Ids missing from the file then get a defined all-zero
        # row instead of uninitialised memory.
        num_nodes = max(node_features) + 1
        known_nodes = torch.tensor(list(node_features), dtype=torch.long)
        centralities = torch.tensor(list(node_features.values()), dtype=torch.float)

        # Column 0 is the per-cascade activation flag and columns 1..3 are the
        # centralities. The centralities are identical for every cascade, so
        # they are filled in once and cloned per sample.
        base_x = torch.zeros((num_nodes, 1 + centralities.size(1)), dtype=torch.float)
        base_x[known_nodes, 1:] = centralities

        def as_index(nodes):
            # Keep only nodes that have a feature row; others are ignored.
            return torch.tensor(sorted(n for n in nodes if n in node_features), dtype=torch.long)

        # Every cascade becomes one Data object. X always has one row per node
        # (activation flag plus centralities); only the label depends on the
        # task: a graph-level float for regression, per-node 0/1 for
        # classification.
        data_list = []
        for line in cascades_data:
            cascade = line.strip().split()
            if not cascade:
                continue  # tolerate blank or trailing lines

            if self.task == "regression":
                seeds = set(map(int, cascade[1:-1]))
                y = torch.tensor([float(cascade[-1])], dtype=torch.float)
            else:  # classification; __init__ guarantees there is no third task
                activations = [node_activation.split(':') for node_activation in cascade[1:]]
                seeds = {int(node) for node, time in activations if int(time) <= self.observation}
                final_activations = {int(node) for node, _ in activations}
                # The target is the FINAL activation state. Labelling with
                # `seeds` would just copy the input activation column into y.
                y = torch.zeros(num_nodes, dtype=torch.long)
                y[as_index(final_activations)] = 1

            x = base_x.clone()
            x[as_index(seeds), 0] = 1.0
            data_list.append(Data(x=x, y=y, edge_index=self.edge_index_tensor))

        self.save(data_list, self.processed_paths[0])
        

In [9]:
# Both Facebook datasets share every constructor argument except the task.
fb_dataset_kwargs = dict(
    root="data",
    name="facebook",
    edge_index_tensor=edge_index_tensor,
    observation=2,
)
fb_regression_ds = CascadeRegression(task="regression", **fb_dataset_kwargs)
fb_classification_ds = CascadeRegression(task="classification", **fb_dataset_kwargs)

In [10]:
# Display the dataset summary produced by CascadeRegression.__repr__.
fb_regression_ds



CascadeRegression network: facebook task: regression observation-window t=2 (Number of graphs: 2000, Number of features [activation, deg cent, eigen cent, btwns cemt]: 4)

In [11]:
# Shape of the regression targets across the whole dataset.
fb_regression_ds.y.shape

torch.Size([2000])

In [44]:
import torch
from torch_geometric.nn import GATConv, global_mean_pool
import torch.nn.functional as F
import torch.nn as nn

class GAT(torch.nn.Module):
    """Graph-level predictor: MLP encoder, stacked GAT layers, mean pooling.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Width of the node embeddings used by all GAT layers.
        output_dim: Size of the per-graph output vector.
        num_layers: ``num_layers - 1`` residual 8-head GAT layers are placed
            between the first and the final attention layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()

        def attention(heads):
            # All attention layers keep the embedding width and average heads.
            return GATConv(hidden_dim, hidden_dim, heads=heads, concat=False, dropout=0.2)

        # A small MLP encodes the raw node features into the hidden width.
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, hidden_dim),
        )
        self.first_gat = attention(8)
        self.gat_layers = nn.ModuleList([attention(8) for _ in range(num_layers - 1)])
        self.final_gat = attention(1)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        """Return one ``output_dim`` vector per graph in ``data``."""
        edges = data.edge_index
        graph_ids = data.batch

        hidden = self.mlp(data.x)
        hidden = self.first_gat(hidden, edges)

        # Residual blocks: dropout -> attention -> ELU, then add the block input.
        for conv in self.gat_layers:
            residual = hidden
            hidden = F.dropout(hidden, p=0.2, training=self.training)
            hidden = F.elu(conv(hidden, edges)) + residual

        hidden = self.final_gat(hidden, edges)
        pooled = global_mean_pool(hidden, graph_ids)
        return self.fc(pooled)

In [45]:
# Regression model: one scalar output per graph, 64-wide embeddings, 6 layers.
model = GAT(
    input_dim=fb_regression_ds.num_features,
    hidden_dim=64,
    output_dim=1,
    num_layers=6,
)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.005,
    weight_decay=5e-4,
)

In [24]:
# DataLoader lives in torch_geometric.loader; the torch_geometric.data alias
# is deprecated.
from torch_geometric.loader import DataLoader
from torch.utils.data import random_split

# 70% / 15% / 15% train / validation / test split. The test share takes the
# remainder so the three sizes always add up to the dataset size.
total_size = len(fb_regression_ds)
train_size = int(0.7 * total_size)
valid_size = int(0.15 * total_size)
test_size = total_size - train_size - valid_size

# Seed the split so the same graphs land in each subset on every run.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, valid_dataset, test_dataset = random_split(
    fb_regression_ds,
    [train_size, valid_size, test_size],
    generator=split_generator,
)

# Only training benefits from shuffling; a fixed order keeps the evaluation
# batches identical between epochs.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)



In [56]:
# Peek at the regression target of one training graph.
train_loader.dataset[2].y

tensor([46.6090])

In [58]:
def train(loader):
    """Run one optimisation pass over ``loader`` and return the mean MSE.

    Uses the notebook-level ``model`` and ``optimizer``.

    Args:
        loader: Iterable of batches; each must expose ``y`` and be accepted
            by ``model``.

    Returns:
        The mean squared error per target over everything seen in this pass,
        or ``nan`` when ``loader`` yields nothing.
    """
    model.train()
    total_loss = 0.0
    total_targets = 0
    for data in loader:
        optimizer.zero_grad()
        target = data.y.view(-1, 1)
        loss = F.mse_loss(model(data), target)
        loss.backward()
        optimizer.step()
        # mse_loss is already a mean over the batch. Weight it by the batch
        # size so a short final batch does not count as much as a full one.
        batch_targets = target.size(0)
        total_loss += loss.item() * batch_targets
        total_targets += batch_targets
    return total_loss / total_targets if total_targets else float("nan")

def test(loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data in loader:
            out = model(data)
            loss = F.mse_loss(out, data.y.view(-1, 1))
            total_loss += loss.item()
    return total_loss / len(loader)

In [59]:
# Iterate over the DataLoaders, not the raw dataset splits. Passing a dataset
# makes train()/test() walk single unbatched graphs, i.e. one optimiser step
# per graph instead of one per mini-batch.
NUM_EPOCHS = 100
for epoch in range(NUM_EPOCHS):
    train_loss = train(train_loader)
    valid_loss = test(valid_loader)
    print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}')

# Final held-out evaluation, run once after training has finished.
test_loss = test(test_loader)
print(f'Test MSE Loss: {test_loss:.4f}')

KeyboardInterrupt: 