# Synthetic Experiments using Stochastic Block Models

In [1]:
import pickle as pkl
from torch_geometric.data import DataLoader
from itertools import combinations
import random
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from src.utils.CreateFeatures import CreateFeatures
from src.pygcn.GCN_synthetic import SiameseGNN
from torch_geometric.data import DataLoader
from torch_geometric.utils import to_networkx

import torch
import torch.nn as nn
import torch
import torch_geometric.data as data

from src.utils.graphs import laplacian_embeddings, random_walk_embeddings, degree_matrix, identity
from torch_geometric.utils import to_networkx
import networkx as nx
import numpy as np
import itertools

In [2]:
def create_synthetic_pairs(data, cp_time):
    all_pairs = list(combinations(range(200), 2))
    random_pairs = random.sample(all_pairs, 1000)

    graph_pairs = []
    for i in random_pairs:
        first, second = i[0], i[1]

        if first < cp_time and second < cp_time:
            y_label = 1
        elif first >= cp_time and second >= cp_time:
            y_label = 1
        else:
            y_label = 0

        graph_pairs.append((data[first], data[second], y_label))

    flattened_train, flattened_test = train_test_split(graph_pairs, test_size=0.40, random_state=42)
    flattened_test, flattened_val = train_test_split(graph_pairs, test_size=0.5, random_state=42)

    return flattened_train, flattened_test, flattened_val

In [3]:
def run_model(train_data, val_data, lr, dropout_rate, sort_k, hidden_units):
    torch.manual_seed(42)
    model = SiameseGNN(hidden_units, sort_k, dropout_rate)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=10, gamma=0.1)  # Adjust step_size and gamma as needed
    criterion = nn.BCELoss()

    for epoch in tqdm(range(10)):
        model.train()
        train_losses = []
        for data1, data2, label in train_data:
            optimizer.zero_grad()
            out = model(data1, data2)
            label = torch.tensor(label).view(1).float()
            loss = criterion(out.squeeze(0), label)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        scheduler.step()  # Add this line to update the learning rate

        model.eval()
        with torch.no_grad():
            val_losses = []

            val_pred = []
            val_truth = []

            correct = 0
            total = 0
            for data1, data2, label in val_data:
                out = model(data1, data2)
                label = torch.tensor(label).view(1).float()
                val_loss = criterion(out.squeeze(0), label)
                val_losses.append(val_loss.item())

                predictions = torch.round(out.squeeze())

                val_pred.append(predictions)
                val_truth.append(label)

                correct += (predictions == label).sum().item()
                total += 1

            val_loss = sum(val_losses) / len(val_losses)
            val_accuracy = correct / total

        print(f'Epoch: {epoch+1}, Training Loss: {sum(train_losses)/len(train_losses)}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}, Validation F1 Score: {f1_score(val_truth, val_pred)}')
    return val_accuracy, f1_score(val_truth, val_pred)

## Merge Data

In [4]:
import os
import glob
import pickle
import json

# Assuming root_dir is the path to your root directory
root_dir = 'results/synthetic/merge'

merge_data = {}
cp_times = {}

# Walk through all directories and files in root_dir
for dirpath, dirnames, filenames in os.walk(root_dir):
    # If there's a data.p file in this directory, read it
    args_file = os.path.join(dirpath, 'args.json')
    if os.path.isfile(args_file):
        with open(args_file, 'rb') as f:
            arg_data = json.load(f)
            p_value = arg_data['p']

    data_file = os.path.join(dirpath, 'data.p')
    if os.path.isfile(data_file):
        with open(data_file, 'rb') as f:
            data = pickle.load(f)
            merge_data[p_value] = data

    # If there's a time.json file in this directory, read it
    time_file = os.path.join(dirpath, 'time.json')
    if os.path.isfile(time_file):
        with open(time_file, 'r') as f:
            time_data = json.load(f)
            cp_times[p_value] = time_data

### Degree Matrix

In [6]:
for j, i in enumerate(merge_data[0.3]):
    edge_index = i.edge_index.to(torch.int64)
    networkx_graph = to_networkx(i)
    adjacency = nx.to_scipy_sparse_array(networkx_graph, format='csr')
            
    x = np.diag(degree_matrix(adjacency).todense(), k=0).reshape(-1,1)
    merge_data[0.3][j].x = x

flattened_train, flattened_test, flattened_val = create_synthetic_pairs(merge_data[0.3], cp_times[0.3])

positive_samples = [item for item in flattened_train if item[2] == 1]
negative_samples = [item for item in flattened_train if item[2] == 0]

# Calculate the difference in count
diff = len(negative_samples) - len(positive_samples)

# Upsample positive samples
if diff > 0:
    positive_samples_upsampled = positive_samples * (diff // len(positive_samples)) + random.sample(positive_samples, diff % len(positive_samples))
    balanced_data = negative_samples + positive_samples + positive_samples_upsampled
else:
    balanced_data = flattened_train

random.shuffle(balanced_data)

# Define hyperparameter grids
learning_rates = [0.001, 0.01]
dropout_rates = [0.01, 0.05, 0.1]
sort_k_values = [20, 40, 100]
hidden_units_values = [16, 32, 64]

# Best params: lr=0.001, dropout_rate=0.1, sort_k=40, hidden_units=64

# Create combinations of hyperparameters
hyperparameter_combinations = list(itertools.product(learning_rates, dropout_rates, sort_k_values, hidden_units_values))

best_hyperparams = None
best_val_score = 0

for lr, dropout_rate, sort_k, hidden_units in hyperparameter_combinations:
    print(f"Running with lr={lr}, dropout_rate={dropout_rate}, sort_k={sort_k}, hidden_units={hidden_units}")
    val_accuracy, val_f1 = run_model(balanced_data, flattened_val, lr, dropout_rate, sort_k, hidden_units)
    
    # Update the best hyperparameters based on validation F1 score
    if val_f1 > best_val_score:
        best_val_score = val_f1
        best_hyperparams = (lr, dropout_rate, sort_k, hidden_units)

print(f"Best Hyperparameters: Learning Rate: {best_hyperparams[0]}, Dropout Rate: {best_hyperparams[1]}, Sort-k: {best_hyperparams[2]}, Hidden Units: {best_hyperparams[3]}")
print(f"Best Validation F1 Score: {best_val_score}")

Running with lr=0.001, dropout_rate=0.1, sort_k=40, hidden_units=64


 10%|█         | 1/10 [00:27<04:08, 27.57s/it]

Epoch: 1, Training Loss: 0.6963474320868651, Validation Loss: 0.6749369359016418, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 20%|██        | 2/10 [00:54<03:35, 26.98s/it]

Epoch: 2, Training Loss: 0.6881913221875826, Validation Loss: 0.6731768381595612, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 30%|███       | 3/10 [01:19<03:03, 26.28s/it]

Epoch: 3, Training Loss: 0.6878298337260882, Validation Loss: 0.6728987216949462, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 40%|████      | 4/10 [01:45<02:36, 26.08s/it]

Epoch: 4, Training Loss: 0.6858611965676149, Validation Loss: 0.6678843367099762, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 50%|█████     | 5/10 [02:10<02:09, 25.83s/it]

Epoch: 5, Training Loss: 0.6866415205597878, Validation Loss: 0.6671185410022735, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 60%|██████    | 6/10 [02:36<01:42, 25.67s/it]

Epoch: 6, Training Loss: 0.682774517138799, Validation Loss: 0.6678065538406373, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 70%|███████   | 7/10 [03:01<01:16, 25.52s/it]

Epoch: 7, Training Loss: 0.6831856133540471, Validation Loss: 0.6664223670959473, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 80%|████████  | 8/10 [03:27<00:51, 25.59s/it]

Epoch: 8, Training Loss: 0.6828666445116202, Validation Loss: 0.6666052949428558, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 90%|█████████ | 9/10 [03:52<00:25, 25.68s/it]

Epoch: 9, Training Loss: 0.6841998573640983, Validation Loss: 0.6663987874984741, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


100%|██████████| 10/10 [04:18<00:00, 25.83s/it]


Epoch: 10, Training Loss: 0.682914386789004, Validation Loss: 0.6665736401081085, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321
Running with lr=0.01, dropout_rate=0.1, sort_k=40, hidden_units=64


 10%|█         | 1/10 [00:25<03:51, 25.67s/it]

Epoch: 1, Training Loss: 0.7029932231083512, Validation Loss: 0.6732045084238052, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 20%|██        | 2/10 [00:52<03:33, 26.64s/it]

Epoch: 2, Training Loss: 0.6873510731756687, Validation Loss: 0.6672911652326584, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 30%|███       | 3/10 [01:18<03:04, 26.33s/it]

Epoch: 3, Training Loss: 0.6813986180226008, Validation Loss: 0.6654541726112366, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 40%|████      | 4/10 [01:44<02:36, 26.01s/it]

Epoch: 4, Training Loss: 0.6813455215593179, Validation Loss: 0.6653825563192367, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 50%|█████     | 5/10 [02:09<02:08, 25.76s/it]

Epoch: 5, Training Loss: 0.6813181965549787, Validation Loss: 0.6653651654720306, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 60%|██████    | 6/10 [02:35<01:42, 25.60s/it]

Epoch: 6, Training Loss: 0.6813281909128031, Validation Loss: 0.6653626537322999, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 70%|███████   | 7/10 [03:00<01:16, 25.53s/it]

Epoch: 7, Training Loss: 0.6817549201349418, Validation Loss: 0.6653983318805694, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 80%|████████  | 8/10 [03:26<00:51, 25.69s/it]

Epoch: 8, Training Loss: 0.6814154241482416, Validation Loss: 0.6653938090801239, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


 90%|█████████ | 9/10 [03:51<00:25, 25.57s/it]

Epoch: 9, Training Loss: 0.6816403838992119, Validation Loss: 0.6665417976379394, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321


100%|██████████| 10/10 [04:17<00:00, 25.76s/it]

Epoch: 10, Training Loss: 0.681710265527169, Validation Loss: 0.6653460693359375, Validation Accuracy: 0.62, Validation F1 Score: 0.7654320987654321
Best Hyperparameters: Learning Rate: 0.001, Dropout Rate: 0.1, Sort-k: 40, Hidden Units: 64
Best Validation F1 Score: 0.7654320987654321





### Random-Walk

In [10]:
for j, i in enumerate(merge_data):
    edge_index = i.edge_index.to(torch.int64)
    networkx_graph = to_networkx(i)
    adjacency = nx.to_scipy_sparse_array(networkx_graph, format='csr')
            
    x = random_walk_embeddings(adjacency, k=1)
    merge_data[j].x = x

flattened_train, flattened_test, flattened_val = create_synthetic_pairs(merge_data, cp_time)
run_model(flattened_train, flattened_val)

 20%|██        | 1/5 [00:28<01:54, 28.59s/it]

Epoch: 1, Training Loss: 0.7054354806492726, Validation Loss: 0.6984284114837647, Validation Accuracy: 0.522, Validation F1 Score: 0.6859395532194481


 40%|████      | 2/5 [00:57<01:25, 28.55s/it]

Epoch: 2, Training Loss: 0.6988908539215724, Validation Loss: 0.6956806327104569, Validation Accuracy: 0.522, Validation F1 Score: 0.6859395532194481


 60%|██████    | 3/5 [01:25<00:57, 28.54s/it]

Epoch: 3, Training Loss: 0.6965483835836251, Validation Loss: 0.6966160855293274, Validation Accuracy: 0.522, Validation F1 Score: 0.6859395532194481


 80%|████████  | 4/5 [01:53<00:28, 28.43s/it]

Epoch: 4, Training Loss: 0.6961831471323967, Validation Loss: 0.6942015937566757, Validation Accuracy: 0.522, Validation F1 Score: 0.6859395532194481


100%|██████████| 5/5 [02:21<00:00, 28.35s/it]

Epoch: 5, Training Loss: 0.6951501431067785, Validation Loss: 0.6936068749427795, Validation Accuracy: 0.522, Validation F1 Score: 0.6859395532194481





### Laplacian Embeddings

In [11]:
for j, i in enumerate(merge_data):
    edge_index = i.edge_index.to(torch.int64)
    networkx_graph = to_networkx(i)
    adjacency = nx.to_scipy_sparse_array(networkx_graph, format='csr')
            
    x = laplacian_embeddings(adjacency, k=1)
    merge_data[j].x = x

flattened_train, flattened_test, flattened_val = create_synthetic_pairs(merge_data, cp_time)
run_model(flattened_train, flattened_val)

 20%|██        | 1/5 [00:29<01:56, 29.02s/it]

Epoch: 1, Training Loss: 0.7086347258090973, Validation Loss: 0.6929218556880951, Validation Accuracy: 0.51, Validation F1 Score: 0.6754966887417219


 40%|████      | 2/5 [00:58<01:27, 29.20s/it]

Epoch: 2, Training Loss: 0.6984774100780488, Validation Loss: 0.6929834187030792, Validation Accuracy: 0.51, Validation F1 Score: 0.6754966887417219


 60%|██████    | 3/5 [01:27<00:58, 29.30s/it]

Epoch: 3, Training Loss: 0.6966335082550844, Validation Loss: 0.694138139128685, Validation Accuracy: 0.51, Validation F1 Score: 0.6754966887417219


 80%|████████  | 4/5 [01:57<00:29, 29.52s/it]

Epoch: 4, Training Loss: 0.6972869330644608, Validation Loss: 0.6941545931100845, Validation Accuracy: 0.51, Validation F1 Score: 0.6754966887417219


100%|██████████| 5/5 [02:27<00:00, 29.51s/it]

Epoch: 5, Training Loss: 0.6956800158818562, Validation Loss: 0.6944538348913193, Validation Accuracy: 0.51, Validation F1 Score: 0.6754966887417219





### Identity Embeddings

In [5]:
for j, i in enumerate(merge_data):
    edge_index = i.edge_index.to(torch.int64)
    networkx_graph = to_networkx(i)
    adjacency = nx.to_scipy_sparse_array(networkx_graph, format='csr')
            
    x = identity(400)
    merge_data[j].x = x

flattened_train, flattened_test, flattened_val = create_synthetic_pairs(merge_data, cp_time)
run_model(flattened_train, flattened_val)

 20%|██        | 1/5 [00:29<01:57, 29.30s/it]

Epoch: 1, Training Loss: 0.7082885177433491, Validation Loss: 0.6926153062582016, Validation Accuracy: 0.508, Validation F1 Score: 0.6737400530503979


 40%|████      | 2/5 [00:58<01:27, 29.21s/it]

Epoch: 2, Training Loss: 0.7004396438598632, Validation Loss: 0.6938644955158234, Validation Accuracy: 0.492, Validation F1 Score: 0.0


 60%|██████    | 3/5 [01:28<00:58, 29.42s/it]

Epoch: 3, Training Loss: 0.6983883129556974, Validation Loss: 0.6947316192388534, Validation Accuracy: 0.492, Validation F1 Score: 0.0


 80%|████████  | 4/5 [01:58<00:29, 29.75s/it]

Epoch: 4, Training Loss: 0.6970296726624171, Validation Loss: 0.6948082302808761, Validation Accuracy: 0.492, Validation F1 Score: 0.0


100%|██████████| 5/5 [02:26<00:00, 29.36s/it]

Epoch: 5, Training Loss: 0.6975281210740407, Validation Loss: 0.6932941085100174, Validation Accuracy: 0.492, Validation F1 Score: 0.0





## Clique Data

In [5]:
with open("results/synthetic/06_04_11:13:29_clique_cp_1_T_200_n_400_p_0.2_q_0.05_20_0/data.p", "rb") as f:
    clique_data = pkl.load(f)

cp_time = 133

FileNotFoundError: [Errno 2] No such file or directory: 'results/synthetic/06_04_11:13:29_clique_cp_1_T_200_n_400_p_0.2_q_0.05_20_0/data.p'

### Degree Matrix

In [8]:
for j, i in enumerate(clique_data):
    edge_index = i.edge_index.to(torch.int64)
    networkx_graph = to_networkx(i)
    adjacency = nx.to_scipy_sparse_array(networkx_graph, format='csr')
            
    x = np.diag(degree_matrix(adjacency).todense(), k=0).reshape(-1,1)
    clique_data[j].x = x

flattened_train, flattened_test, flattened_val = create_synthetic_pairs(clique_data, cp_time)
run_model(flattened_train, flattened_val)

 20%|██        | 1/5 [00:15<01:02, 15.72s/it]

Epoch: 1, Training Loss: 0.6992556133369605, Validation Loss: 0.6923871096372605, Validation Accuracy: 0.516, Validation F1 Score: 0.6807387862796834


 40%|████      | 2/5 [00:30<00:45, 15.18s/it]

Epoch: 2, Training Loss: 0.6929134220878284, Validation Loss: 0.689760978102684, Validation Accuracy: 0.516, Validation F1 Score: 0.6807387862796834


 60%|██████    | 3/5 [00:44<00:29, 14.77s/it]

Epoch: 3, Training Loss: 0.6868213406205177, Validation Loss: 0.6814346568584442, Validation Accuracy: 0.516, Validation F1 Score: 0.6807387862796834


 80%|████████  | 4/5 [00:59<00:14, 14.58s/it]

Epoch: 4, Training Loss: 0.6622537118693193, Validation Loss: 0.6425437939763069, Validation Accuracy: 0.67, Validation F1 Score: 0.7433903576982893


100%|██████████| 5/5 [01:12<00:00, 14.59s/it]

Epoch: 5, Training Loss: 0.5708100362246236, Validation Loss: 0.5006951180100441, Validation Accuracy: 0.79, Validation F1 Score: 0.8037383177570093





### Random Walk

In [9]:
for j, i in enumerate(clique_data):
    edge_index = i.edge_index.to(torch.int64)
    networkx_graph = to_networkx(i)
    adjacency = nx.to_scipy_sparse_array(networkx_graph, format='csr')
            
    x = random_walk_embeddings(adjacency, k=1)
    clique_data[j].x = x

flattened_train, flattened_test, flattened_val = create_synthetic_pairs(clique_data, cp_time)
run_model(flattened_train, flattened_val)

 20%|██        | 1/5 [00:14<00:58, 14.74s/it]

Epoch: 1, Training Loss: 0.7019147059818109, Validation Loss: 0.6959783440828323, Validation Accuracy: 0.522, Validation F1 Score: 0.6859395532194481


 40%|████      | 2/5 [00:28<00:43, 14.42s/it]

Epoch: 2, Training Loss: 0.6939904629190763, Validation Loss: 0.6951362727880478, Validation Accuracy: 0.522, Validation F1 Score: 0.6859395532194481


 60%|██████    | 3/5 [00:43<00:28, 14.31s/it]

Epoch: 3, Training Loss: 0.6924663008749485, Validation Loss: 0.6945376887321472, Validation Accuracy: 0.522, Validation F1 Score: 0.6859395532194481


 80%|████████  | 4/5 [00:56<00:14, 14.11s/it]

Epoch: 4, Training Loss: 0.6916149450341861, Validation Loss: 0.6942118445634842, Validation Accuracy: 0.522, Validation F1 Score: 0.6859395532194481


100%|██████████| 5/5 [01:11<00:00, 14.24s/it]

Epoch: 5, Training Loss: 0.6918824293216069, Validation Loss: 0.694335107088089, Validation Accuracy: 0.522, Validation F1 Score: 0.6859395532194481





### Laplacian Embeddings

In [10]:
for j, i in enumerate(clique_data):
    edge_index = i.edge_index.to(torch.int64)
    networkx_graph = to_networkx(i)
    adjacency = nx.to_scipy_sparse_array(networkx_graph, format='csr')
            
    x = laplacian_embeddings(adjacency, k=1)
    clique_data[j].x = x

flattened_train, flattened_test, flattened_val = create_synthetic_pairs(clique_data, cp_time)
run_model(flattened_train, flattened_val)

 20%|██        | 1/5 [00:13<00:55, 13.99s/it]

Epoch: 1, Training Loss: 0.7026044877370199, Validation Loss: 0.7127146036624908, Validation Accuracy: 0.532, Validation F1 Score: 0.6945169712793734


 40%|████      | 2/5 [00:28<00:42, 14.22s/it]

Epoch: 2, Training Loss: 0.698015845467647, Validation Loss: 0.7064780175685883, Validation Accuracy: 0.532, Validation F1 Score: 0.6945169712793734


 60%|██████    | 3/5 [00:42<00:28, 14.33s/it]

Epoch: 3, Training Loss: 0.6962834357221921, Validation Loss: 0.7058776593208314, Validation Accuracy: 0.532, Validation F1 Score: 0.6945169712793734


 80%|████████  | 4/5 [00:56<00:14, 14.09s/it]

Epoch: 4, Training Loss: 0.6937986141443253, Validation Loss: 0.7009501012563706, Validation Accuracy: 0.532, Validation F1 Score: 0.6945169712793734


100%|██████████| 5/5 [01:10<00:00, 14.19s/it]

Epoch: 5, Training Loss: 0.6910516557594141, Validation Loss: 0.7095978164672851, Validation Accuracy: 0.532, Validation F1 Score: 0.6945169712793734





### Identity Matrix

In [11]:
for j, i in enumerate(clique_data):
    edge_index = i.edge_index.to(torch.int64)
    networkx_graph = to_networkx(i)
    adjacency = nx.to_scipy_sparse_array(networkx_graph, format='csr')
            
    x = identity(400)
    clique_data[j].x = x

flattened_train, flattened_test, flattened_val = create_synthetic_pairs(clique_data, cp_time)
run_model(flattened_train, flattened_val)

 20%|██        | 1/5 [00:14<00:58, 14.61s/it]

Epoch: 1, Training Loss: 0.7043468555559714, Validation Loss: 0.6890924557447433, Validation Accuracy: 0.536, Validation F1 Score: 0.6979166666666666


 40%|████      | 2/5 [00:28<00:42, 14.23s/it]

Epoch: 2, Training Loss: 0.6917967011034488, Validation Loss: 0.6863912776708603, Validation Accuracy: 0.536, Validation F1 Score: 0.6979166666666666


 60%|██████    | 3/5 [00:42<00:28, 14.17s/it]

Epoch: 3, Training Loss: 0.6876791834831237, Validation Loss: 0.6763330624103546, Validation Accuracy: 0.536, Validation F1 Score: 0.6979166666666666


 80%|████████  | 4/5 [00:56<00:14, 14.16s/it]

Epoch: 4, Training Loss: 0.6813007913529873, Validation Loss: 0.6608687014579773, Validation Accuracy: 0.73, Validation F1 Score: 0.7906976744186046


100%|██████████| 5/5 [01:10<00:00, 14.03s/it]

Epoch: 5, Training Loss: 0.6493846730391184, Validation Loss: 0.5793172103762627, Validation Accuracy: 0.736, Validation F1 Score: 0.7480916030534351



