In [1]:
# Re-import modified modules automatically before each cell executes, so edits
# to local source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import embedders
import networkx as nx
import torch

In [3]:
# Split a pairwise dataset by node, so that both endpoints of every pair land in the same split
from sklearn.model_selection import train_test_split


def split_dataset(X, y, **kwargs):
    """Split a pairwise dataset by node so that no pair straddles train and test.

    The rows of ``X`` and the entries of ``y`` are treated as a flattened
    ``n_nodes x n_nodes`` grid of node pairs. Nodes (not pairs) are partitioned
    with ``train_test_split``; each returned split keeps only the pairs whose
    two endpoints both belong to that split. Pairs joining a train node to a
    test node are therefore dropped from both splits.

    Args:
        X: 2-D array/tensor of shape ``(n_nodes**2, n_dims)``.
        y: Array/tensor holding ``n_nodes**2`` values, one per row of ``X``.
        **kwargs: Forwarded unchanged to ``train_test_split`` (for example
            ``test_size`` and ``random_state``).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The ``X_*`` parts have
        shape ``(n_split_nodes**2, n_dims)`` and the ``y_*`` parts have shape
        ``(n_split_nodes**2,)``.

    Raises:
        ValueError: If the number of rows of ``X`` is not a perfect square, or
            if ``y`` does not hold exactly one value per row of ``X``.
    """
    n_pairs, n_dims = X.shape

    # round() instead of int() so a float square root that lands just below the
    # true integer root is not truncated; the equality check rejects the rest.
    n_nodes = round(n_pairs**0.5)
    if n_nodes * n_nodes != n_pairs:
        raise ValueError(
            f"X must have a perfect-square number of rows (n_nodes**2), got {n_pairs}"
        )

    y_flat = y.reshape(-1)
    if y_flat.shape[0] != n_pairs:
        raise ValueError(
            f"y must hold one value per row of X: expected {n_pairs}, got {y_flat.shape[0]}"
        )

    # Unflatten into a (node, node, ...) grid so pairs can be selected by node index
    X_grid = X.reshape(n_nodes, n_nodes, -1)
    y_grid = y_flat.reshape(n_nodes, n_nodes)

    # Partition the nodes; the test fraction and seed come from **kwargs
    idx_train, idx_test = train_test_split(list(range(n_nodes)), **kwargs)

    # Keep only the pairs whose two endpoints are in the same split
    X_train = X_grid[idx_train][:, idx_train].reshape(-1, n_dims)
    y_train = y_grid[idx_train][:, idx_train].reshape(-1)

    X_test = X_grid[idx_test][:, idx_test].reshape(-1, n_dims)
    y_test = y_grid[idx_test][:, idx_test].reshape(-1)

    return X_train, X_test, y_train, y_test

In [14]:
import pandas as pd
from tqdm.notebook import tqdm

# Hyperparameters
# DATASETS = ["karate_club", "lesmis", "football"]
DATASETS = ["adjnoun", "dolphins", "polbooks"]
COMPONENT_SIG = [(-1, 2), (0, 2), (1, 2)]
USE_SPECIAL_DIMS = False
N_FEATURES = "d_choose_2"
USE_DISTS = True
TEST_SIZE = 0.2
TOTAL_ITERATIONS = 5_000
MAX_DEPTH = 3
N_TRIALS = 100
SCALE_LR = 0
# LR = 1e-3
LR = 1e-4
# Give up on a dataset after this many failed trials in a row, so a systematic
# failure cannot keep the loop spinning forever.
MAX_CONSECUTIVE_FAILURES = 20

# 10% of the budget is burn-in; the remainder is regular training. Deriving the
# second number by subtraction keeps the two summing to TOTAL_ITERATIONS.
BURN_IN_ITERATIONS = int(0.1 * TOTAL_ITERATIONS)
TRAINING_ITERATIONS = TOTAL_ITERATIONS - BURN_IN_ITERATIONS

# Run benchmark: collect N_TRIALS successful trials for every dataset
results = []
my_tqdm = tqdm(total=len(DATASETS) * N_TRIALS)
i = 0  # Seed / trial id; shared across datasets so every attempt gets a unique value
for dataset in DATASETS:
    dists, labels, adj = embedders.dataloaders.load(dataset)
    dists = dists / dists.max()  # Rescale so the largest distance is 1
    results_dataset = []
    consecutive_failures = 0
    while len(results_dataset) < N_TRIALS:
        # Always move on to a fresh seed. Retrying a failed trial with the same
        # seed would replay the same failure and never terminate.
        seed = i
        i += 1
        try:
            pm = embedders.manifolds.ProductManifold(signature=COMPONENT_SIG)

            torch.manual_seed(seed)
            X_embed, losses = embedders.coordinate_learning.train_coords(
                pm,
                dists,
                burn_in_iterations=BURN_IN_ITERATIONS,
                training_iterations=TRAINING_ITERATIONS,
                scale_factor_learning_rate=SCALE_LR,
                burn_in_learning_rate=LR * 0.1,
                learning_rate=LR,
            )

            X, y, pm_new = embedders.link_prediction.make_link_prediction_dataset(X_embed, pm, adj, add_dists=USE_DISTS)
            X_train, X_test, y_train, y_test = split_dataset(X, y, test_size=TEST_SIZE, random_state=seed)
            res = embedders.benchmarks.benchmark(
                X=None,
                y=None,
                X_train=X_train,
                X_test=X_test,
                y_train=y_train,
                y_test=y_test,
                pm=pm_new,
                max_depth=MAX_DEPTH,
                task="classification",
                use_special_dims=USE_SPECIAL_DIMS,
                n_features=N_FEATURES,
            )
            res["dataset"] = dataset
            res["trial"] = seed
        except Exception as e:
            # Best effort: report the failed trial and try the next seed
            consecutive_failures += 1
            print(f"[{dataset}] trial with seed {seed} failed: {type(e).__name__}: {e}")
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                print(
                    f"[{dataset}] giving up after {consecutive_failures} consecutive failures; "
                    f"keeping {len(results_dataset)} of {N_TRIALS} trials"
                )
                break
            continue

        consecutive_failures = 0
        results_dataset.append(res)
        my_tqdm.update(1)
    results += results_dataset
my_tqdm.close()

results = pd.DataFrame(results)

  0%|          | 0/300 [00:00<?, ?it/s]

Top CC has 112 nodes; original graph has 112 nodes.


  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [11]:
# Ensure `results` is a DataFrame. The benchmark cell already converts it on its
# last line, so this only matters when `results` is still a plain list of rows.
results = pd.DataFrame(results)

In [12]:
# Per-dataset mean of every column, taken over all rows (trials) of that dataset
pd.DataFrame(results).groupby("dataset").mean()

Unnamed: 0_level_0,sklearn_dt,sklearn_rf,product_dt,product_rf,tangent_dt,tangent_rf,knn,ps_perceptron,trial
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
football,0.877755,0.838367,0.876939,0.832653,0.877143,0.838163,0.820612,0.831837,249.5
karate_club,0.930612,0.882653,0.93,0.888776,0.931837,0.884286,0.867551,0.602245,49.5
lesmis,0.95375,0.92918,0.953555,0.922539,0.9525,0.927148,0.940781,0.921406,149.5


In [13]:
# Write the results table to CSV, leaving out the row index
results.to_csv("../data/results/link_prediction2.csv", index=False)