# Synthetic Experiments using Stochastic Block Models

In [21]:
import pickle as pkl
from torch_geometric.data import DataLoader
from itertools import combinations
import random
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from src.utils.CreateFeatures import CreateFeatures
from src.pygcn.GCN_synthetic import SiameseGNN
from torch_geometric.data import DataLoader
from torch_geometric.utils import to_networkx

import torch
import torch.nn as nn
import torch
import torch_geometric.data as data
from typing import Union
from src.utils.graphs import laplacian_embeddings, random_walk_embeddings, degree_matrix, identity
from torch_geometric.utils import to_networkx
import networkx as nx
import numpy as np
import itertools

from src.utils.sample import sample_pairs

In [2]:
from sklearn.metrics import precision_score, recall_score

def adjusted_f1_score(y_true, y_pred, beta=1.0):
    """
    Calculate the adjusted F1 score.
    
    Parameters:
    y_true (list or array): True labels.
    y_pred (list or array): Predicted labels.
    beta (float): Weight factor.
    
    Returns:
    float: Adjusted F1 score.
    """
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    
    if precision == 0 and recall == 0:
        return 0.0
    
    adjusted_f1 = (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)
    return adjusted_f1


In [9]:
def run_model(model, train_data, val_data, lr):
    torch.manual_seed(42)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
    criterion = nn.BCELoss()  # Changed to BCEWithLogitsLoss for numerical stability

    for epoch in tqdm(range(10)):
        model.train()
        train_losses = []
        for data1, data2, label in train_data:

            optimizer.zero_grad()
            out = model(data1, data2)

            label = torch.tensor(label).view(1).float()
            loss = criterion(out.squeeze(0), label)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        scheduler.step()

        model.eval()
        with torch.no_grad():
            val_losses = []

            val_pred = []
            val_truth = []

            correct = 0
            total = 0
            for data1, data2, label in val_data:
                out = model(data1, data2)
                label = torch.tensor(label).view(1).float()
                val_loss = criterion(out.squeeze(0), label)
                val_losses.append(val_loss.item())

                predictions = torch.round(out.squeeze())

                val_pred.append(predictions)
                val_truth.append(label)

                correct += (predictions == label).sum().item()
                total += 1

            val_loss = sum(val_losses) / len(val_losses)
            val_accuracy = correct / total

        val_f1 = f1_score(val_truth, val_pred)
        print(f'Epoch: {epoch+1}, Training Loss: {sum(train_losses)/len(train_losses)}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}, Validation F1 Score: {val_f1}')
    return val_accuracy, val_f1

## Clique Data

In [10]:
import os
import glob
import pickle
import json

# Assuming root_dir is the path to your root directory
root_dir = 'results/synthetic/'

clique_data = {}
cp_times = {}
label_data = {}

# Walk through all directories and files in root_dir
for dirpath, dirnames, filenames in os.walk(root_dir):
    # If there's a data.p file in this directory, read it
    args_file = os.path.join(dirpath, 'args.json')
    if os.path.isfile(args_file):
        with open(args_file, 'rb') as f:
            arg_data = json.load(f)
            clique_size = arg_data['size_clique']

    data_file = os.path.join(dirpath, 'data.p')
    if os.path.isfile(data_file):
        with open(data_file, 'rb') as f:
            data = pickle.load(f)
            clique_data[clique_size] = data

    # If there's a time.json file in this directory, read it
    time_file = os.path.join(dirpath, 'time.json')
    if os.path.isfile(time_file):
        with open(time_file, 'r') as f:
            time_data = json.load(f)
            cp_times[clique_size] = time_data

    label_file = os.path.join(dirpath, 'labels.p')
    if os.path.isfile(label_file):
        with open(label_file, 'rb') as f:
            data = pickle.load(f)
            label_data[clique_size] = data

In [11]:
sizes = [20, 30, 40, 50, 60, 70, 80]

In [14]:
for s in [20]:
    for j, i in enumerate(clique_data[s]):
        edge_index = i.edge_index.to(torch.int64)
        networkx_graph = to_networkx(i)
        adjacency = nx.adjacency_matrix(networkx_graph)
        
        attributes = np.eye(adjacency.shape[0])
        clique_data[s][j].x = attributes
    
    train = clique_data[s][:1000]
    train_labels = label_data[s][:1000]

    val = clique_data[s][1000:2000]
    val_labels = label_data[s][1000:2000]

    test = clique_data[s][2000:]
    test_labels = label_data[s][2000:]

    graph_pairs_train = sample_pairs(train,train_labels,nsamples=2000)
    graph_pairs_val = sample_pairs(train,val_labels,nsamples=2000)

    for j in graph_pairs_train:
        j[2] = int(j[2].item())
    for j in graph_pairs_val:
        j[2] = int(j[2].item())

    # Define hyperparameter grids
    learning_rates = [0.001]
    dropout_rates = [0.1]
    sort_k_values = [30]
    hidden_units_values = [16]

    # Best params: lr=0.001, dropout_rate=0.1, sort_k=40, hidden_units=64

    # Create combinations of hyperparameters
    hyperparameter_combinations = list(itertools.product(learning_rates, dropout_rates, sort_k_values, hidden_units_values))

    best_hyperparams = None
    best_val_score = 0

    for lr, dropout_rate, sort_k, hidden_units in hyperparameter_combinations:
        print(f"Running with lr={lr}, dropout_rate={dropout_rate}, sort_k={sort_k}, hidden_units={hidden_units}")
        model = SiameseGNN(hidden_units, sort_k, dropout_rate)
        val_accuracy, val_f1 = run_model(model, graph_pairs_train, graph_pairs_val, lr)
        
        # Update the best hyperparameters based on validation F1 score
        if val_f1 > best_val_score:
            best_val_score = val_f1
            best_hyperparams = (lr, dropout_rate, sort_k, hidden_units)

    print(f"Best Hyperparameters: Learning Rate: {best_hyperparams[0]}, Dropout Rate: {best_hyperparams[1]}, Sort-k: {best_hyperparams[2]}, Hidden Units: {best_hyperparams[3]}")
    print(f"Best Validation F1 Score: {best_val_score}")

1000 positive and 1000 negative examples
1000 positive and 1000 negative examples
Running with lr=0.001, dropout_rate=0.1, sort_k=30, hidden_units=16


 10%|█         | 1/10 [01:09<10:21, 69.03s/it]

Epoch: 1, Training Loss: 0.6488162154513123, Validation Loss: 0.6768332703160156, Validation Accuracy: 0.6011168727562824, Validation F1 Score: 0.0


 20%|██        | 2/10 [02:17<09:11, 68.97s/it]

Epoch: 2, Training Loss: 0.6456865868027151, Validation Loss: 0.6743163139543354, Validation Accuracy: 0.6011168727562824, Validation F1 Score: 0.0


 30%|███       | 3/10 [03:26<08:00, 68.67s/it]

Epoch: 3, Training Loss: 0.6453274700351926, Validation Loss: 0.674170755607748, Validation Accuracy: 0.6011168727562824, Validation F1 Score: 0.0


 40%|████      | 4/10 [04:35<06:53, 68.84s/it]

Epoch: 4, Training Loss: 0.6440438593809421, Validation Loss: 0.6739354103649096, Validation Accuracy: 0.6011168727562824, Validation F1 Score: 0.0


 50%|█████     | 5/10 [05:41<05:40, 68.04s/it]

Epoch: 5, Training Loss: 0.6446300898688111, Validation Loss: 0.6738682423614628, Validation Accuracy: 0.6011168727562824, Validation F1 Score: 0.0


 60%|██████    | 6/10 [06:48<04:30, 67.61s/it]

Epoch: 6, Training Loss: 0.6437767698214605, Validation Loss: 0.6823341381302572, Validation Accuracy: 0.580374950139609, Validation F1 Score: 0.05565529622980251


 70%|███████   | 7/10 [07:56<03:23, 67.77s/it]

Epoch: 7, Training Loss: 0.6401357015656971, Validation Loss: 0.7013331853539809, Validation Accuracy: 0.562026326286398, Validation F1 Score: 0.14084507042253522


 80%|████████  | 8/10 [09:05<02:16, 68.05s/it]

Epoch: 8, Training Loss: 0.6324059018184329, Validation Loss: 0.7634722070362067, Validation Accuracy: 0.5416832867969685, Validation F1 Score: 0.25918762088974856


 90%|█████████ | 9/10 [10:14<01:08, 68.24s/it]

Epoch: 9, Training Loss: 0.6216219229873465, Validation Loss: 0.8910496097650954, Validation Accuracy: 0.5105704028719585, Validation F1 Score: 0.3806158505805149


100%|██████████| 10/10 [11:25<00:00, 68.57s/it]

Epoch: 10, Training Loss: 0.6094959310710512, Validation Loss: 0.9289528059891652, Validation Accuracy: 0.5185480654168328, Validation F1 Score: 0.45947156291983876
Best Hyperparameters: Learning Rate: 0.001, Dropout Rate: 0.1, Sort-k: 30, Hidden Units: 16
Best Validation F1 Score: 0.45947156291983876





In [34]:
torch.save(model.state_dict(), "models/sgnn-topk30-16hidden-20clique.pt")

In [22]:
def dist_labels_to_changepoint_labels_adjusted(labels: Union[np.ndarray, list], tolerance=2):
    """
    Convert graph distribution labels (phase) to change-point labels (0 or 1) using adjustment mechanism with level of tolerance

    :param labels:
    :param tolerance (int): flag as change points the timestamps at +/- tolerance around a change-point
    :return:
    """

    if isinstance(labels, list):
        labels = np.array(labels)

    cps = np.concatenate([np.zeros(1).astype(int), (abs(labels[1:] - labels[:-1]) > 0).astype(int)],axis=0)

    for i in range(1,tolerance+1):
        cps = (cps + np.concatenate([np.zeros(i), cps[:-i]], axis=0) + np.concatenate([cps[i:], np.zeros(i)], axis=0) > 0)

    return cps

In [36]:
for j, i in enumerate(clique_data[20]):
    edge_index = i.edge_index.to(torch.int64)
    networkx_graph = to_networkx(i)
    adjacency = nx.adjacency_matrix(networkx_graph)
    
    attributes = np.eye(adjacency.shape[0])
    clique_data[20][j].x = attributes
    
data = clique_data[s]
labels = label_data[s]

In [37]:
with open('results/test_synthetic/data.p', 'wb') as f:
    pickle.dump(data, f)

with open('results/test_synthetic/labels.p', 'wb') as f:
    pickle.dump(labels, f)