# Synthetic Experiments using Stochastic Block Models

In [7]:
import pickle as pkl
from torch_geometric.loader import DataLoader
from itertools import combinations
import random
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import argparse

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from src.utils.CreateFeatures import CreateFeatures
from src.pygcn.GCN_batched import GraphSiamese
from torch_geometric.utils import to_networkx
from src.utils.embedding import GCN

import torch
import torch.nn as nn
import torch
import torch_geometric.data as data
from typing import Union
from src.utils.graphs import laplacian_embeddings, random_walk_embeddings, degree_matrix, identity
from torch_geometric.utils import to_networkx
import networkx as nx
import numpy as np
import itertools

from src.utils.sample import sample_pairs
from src.utils.misc import collate
from detect import detect_change_point

In [9]:
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import f1_score

def run_model(model, train_loader, val_loader):
    torch.manual_seed(42)
    optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
    scheduler = StepLR(optimizer, step_size=100, gamma=0.1)
    criterion = nn.BCEWithLogitsLoss()  # Changed to BCEWithLogitsLoss for numerical stability

    for epoch in tqdm(range(30)):
        model.train()
        train_losses = []
        for data1, data2, labels in train_loader:
            optimizer.zero_grad()
            out = model(data1, data2)
    
            labels = labels.float().view(-1, 1)  # Ensure labels are of the shape (batch_size, 1)
            loss = criterion(out, labels)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        scheduler.step()

        model.eval()
        with torch.no_grad():
            val_losses = []

            val_pred = []
            val_truth = []

            correct = 0
            total = 0
            for data1, data2, labels in val_loader:
                out = model(data1, data2)

                labels = labels.float().view(-1, 1)  # Ensure labels are of the shape (batch_size, 1)
                val_loss = criterion(out, labels)
                val_losses.append(val_loss.item())

                predictions = torch.round(out)

                val_pred.extend(predictions.cpu().numpy())
                val_truth.extend(labels.cpu().numpy())

                correct += (predictions == labels).sum().item()
                total += labels.size(0)

            val_loss = sum(val_losses) / len(val_losses)
            val_accuracy = correct / total

        val_f1 = f1_score(val_truth, val_pred)
        print(f'Epoch: {epoch+1}, Training Loss: {sum(train_losses)/len(train_losses)}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}, Validation F1 Score: {val_f1}')
    return val_accuracy, val_f1


## Clique Data

In [10]:
import os
import glob
import pickle
import json

# Assuming root_dir is the path to your root directory
root_dir = 'results/synthetic/'

clique_data = {}
cp_times = {}
label_data = {}

# Walk through all directories and files in root_dir
for dirpath, dirnames, filenames in os.walk(root_dir):
    # If there's a data.p file in this directory, read it
    args_file = os.path.join(dirpath, 'args.json')
    if os.path.isfile(args_file):
        with open(args_file, 'rb') as f:
            arg_data = json.load(f)
            clique_size = arg_data['size_clique']

    data_file = os.path.join(dirpath, 'data.p')
    if os.path.isfile(data_file):
        with open(data_file, 'rb') as f:
            data = pickle.load(f)
            clique_data[clique_size] = data

    # If there's a time.json file in this directory, read it
    time_file = os.path.join(dirpath, 'time.json')
    if os.path.isfile(time_file):
        with open(time_file, 'r') as f:
            time_data = json.load(f)
            cp_times[clique_size] = time_data

    label_file = os.path.join(dirpath, 'labels.p')
    if os.path.isfile(label_file):
        with open(label_file, 'rb') as f:
            data = pickle.load(f)
            label_data[clique_size] = data

In [11]:
sizes = [20, 30, 40, 50, 60, 70, 80]

In [12]:
for s in [20]:
    for j, i in enumerate(clique_data[s]):
        edge_index = i.edge_index.to(torch.int64)
        networkx_graph = to_networkx(i)
        adjacency = nx.adjacency_matrix(networkx_graph)
        
        attributes = np.eye(adjacency.shape[0])
        clique_data[s][j].x = attributes
    
    train = clique_data[s][:1000]
    train_labels = label_data[s][:1000]

    val = clique_data[s][1000:2000]
    val_labels = label_data[s][1000:2000]

    test = clique_data[s][2000:]
    test_labels = label_data[s][2000:]

    graph_pairs_train = sample_pairs(train,train_labels,nsamples=2000)
    graph_pairs_val = sample_pairs(train,val_labels,nsamples=2000)

    for j in graph_pairs_train:
        j[2] = int(j[2].item())
    for j in graph_pairs_val:
        j[2] = int(j[2].item())

    training_data_pairs = DataLoader(graph_pairs_train, batch_size=32, shuffle=True, collate_fn=collate,
                               drop_last=True)
    validation_data_pairs = DataLoader(graph_pairs_val, batch_size=32, shuffle=True, collate_fn=collate,
                               drop_last=True)

    input_dim = training_data_pairs.dataset[0][0].x.shape[1]

    # Define hyperparameter grids
    learning_rates = [0.01]
    dropout_rates = [0.05]
    sort_k_values = [30]
    hidden_units_values = [16]

    # Create combinations of hyperparameters
    hyperparameter_combinations = list(itertools.product(learning_rates, dropout_rates, sort_k_values, hidden_units_values))

    for lr, dropout_rate, sort_k, hidden_units in hyperparameter_combinations:
        embedding = GCN(input_dim=input_dim, hidden_dim=hidden_units, layers=3, dropout=dropout_rate)
        model = GraphSiamese(embedding, sort_k, nlinear = 2, nhidden=hidden_units, dropout = dropout_rate)
        val_accuracy, val_f1 = run_model(model, training_data_pairs, validation_data_pairs)

    model_name = f"models/sgnn-topk{sort_k}-64hidden-{s}clique.pt"
    torch.save(model.state_dict(), model_name)

    time_test = [t-2000 for t in cp_times[s] if t>=2000]

    with open(f'results/test_synthetic/{s}-data.p', 'wb') as f:
        pickle.dump(test, f)

    with open(f'results/test_synthetic/{s}-labels.p', 'wb') as f:
        pickle.dump(test_labels, f)

    with open(f'results/test_synthetic/{s}-time.json', 'w') as f:
        json.dump(time_test, f)

1000 positive and 1000 negative examples
1000 positive and 1000 negative examples


  3%|▎         | 1/30 [02:36<1:15:47, 156.80s/it]

Epoch: 1, Training Loss: 0.777213262957196, Validation Loss: 0.7506718858178839, Validation Accuracy: 0.6272590361445783, Validation F1 Score: 0.0


  7%|▋         | 2/30 [05:13<1:13:11, 156.85s/it]

Epoch: 2, Training Loss: 0.7407366896784583, Validation Loss: 0.7265052860041699, Validation Accuracy: 0.6276355421686747, Validation F1 Score: 0.0


 10%|█         | 3/30 [07:49<1:10:19, 156.27s/it]

Epoch: 3, Training Loss: 0.7221748038779857, Validation Loss: 0.7145954248416855, Validation Accuracy: 0.6283885542168675, Validation F1 Score: 0.0


 13%|█▎        | 4/30 [10:25<1:07:41, 156.21s/it]

Epoch: 4, Training Loss: 0.7124104596847711, Validation Loss: 0.7078666155596813, Validation Accuracy: 0.6276355421686747, Validation F1 Score: 0.0


 17%|█▋        | 5/30 [13:02<1:05:10, 156.44s/it]

Epoch: 5, Training Loss: 0.7069445065287656, Validation Loss: 0.7040091401123139, Validation Accuracy: 0.6265060240963856, Validation F1 Score: 0.0
