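# Source-Localization Baselines (GraphSL) for Root Prediction
This notebook explores a non-linear, graph-based approach to predicting the root node of parsed dependency trees. For each language it tunes three GraphSL source-localization methods (LPSI, NetSleuth and OJC) on simulated diffusion cascades, keeps the best-scoring one, and uses it to predict the root of every test tree. No GCN is trained in the cells below.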

In [82]:
# !pip install GraphSL

In [1]:
import ast
import random
import warnings

import networkx as nx
import numpy as np
import pandas as pd
import torch
from scipy import sparse

from GraphSL.Prescribed import LPSI, NetSleuth, OJC
from GraphSL.utils      import diffusion_generation, split_dataset

warnings.filterwarnings("ignore")

In [2]:
# --- Diffusion-simulation settings (forwarded to GraphSL helpers below) ---
INFECT_PROB   = 0.3   # passed as `infect_prob` to diffusion_generation
SEED_RATIO    = 0.2   # passed as `seed_ratio` to diffusion_generation
SIM_NUM_TRAIN = 100   # passed as `sim_num` when simulating cascades for tuning
TRAIN_RATIO   = 0.8   # passed as `train_ratio` to split_dataset

# --- Reproducibility ---
# The cascades used for tuning and prediction are simulated, so fix the RNG
# state once up front to make "Restart Kernel → Run All" repeatable.
# NOTE(review): which generator(s) GraphSL draws from is not visible in this
# notebook; the three common ones are seeded here — confirm this is sufficient.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [3]:
train_df = pd.read_csv("../../data/train.csv")  # has language,n,edgelist,root
test_df  = pd.read_csv("../../data/test.csv")   # has id,language,n,edgelist

In [4]:
def build_graph(edgelist_str, n):
    """Build an undirected graph on nodes 1..n from a stringified edge list.

    Parameters
    ----------
    edgelist_str : str
        Python-literal text of the edges; parsed with ``ast.literal_eval``.
    n : int
        Node count. Nodes ``1`` through ``n`` are always inserted, so a node
        that appears in no edge is still present in the result.

    Returns
    -------
    networkx.Graph
        Graph holding nodes ``1..n`` plus every parsed edge.
    """
    graph = nx.Graph()
    # Insert the nodes first so isolated ones are not lost.
    graph.add_nodes_from(range(1, n + 1))
    graph.add_edges_from(ast.literal_eval(edgelist_str))
    return graph

def _graph_from_row(row):
    """Turn one data-frame row into its graph via ``build_graph``."""
    return build_graph(row["edgelist"], row["n"])


# Store the parsed graph of every sentence in a new "G" column of both frames.
for frame in (train_df, test_df):
    frame["G"] = frame.apply(_graph_from_row, axis=1)

In [5]:
# Per-language model selection.
# best_models maps language -> one of
#   ("lpsi", <LPSI instance>, alpha)
#   ("ns",   <NetSleuth instance>, beta)
#   ("ojc",  (ojc_model, tgt, num_src))
best_models = {}

for lang, grp in train_df.groupby("language"):
    print(f"\n→ Tuning on {lang}…")
    # Only the first graph of the language serves as the representative;
    # the other graphs of the group are not consulted in this cell.
    G0    = grp.iloc[0]["G"]
    A0_sp = nx.to_scipy_sparse_array(G0, dtype=float, format="csr")

    # simulate cascades on the representative graph
    data_dict = diffusion_generation(
        graph       = {"adj_mat": A0_sp},
        diff_type   = "IC",
        sim_num     = SIM_NUM_TRAIN,
        seed_ratio  = SEED_RATIO,
        infect_prob = INFECT_PROB
    )

    # split the simulations
    # NOTE(review): `val_ds` is never used below — every score compared in this
    # cell comes from `.train(...)` on `train_ds`. Evaluate on `val_ds` if a
    # genuine validation score is wanted.
    adj_mat, train_ds, val_ds = split_dataset(data_dict, train_ratio=TRAIN_RATIO)

    # --- 2a) tune LPSI with a restricted α-grid (light regularization) ---
    lpsi       = LPSI()
    alpha_grid = np.logspace(-4, -1, 10)   # prefer smaller α to avoid over-smoothing
    alpha, theta, auc_l, f1_l, _ = lpsi.train(
        adj_mat, train_ds,
        alpha_list=alpha_grid,
        num_thres=10
    )
    print(f"   LPSI  → α={alpha:.4f}, θ={theta:.3f},  train AUC={auc_l:.3f}")

    # --- 2b) tune NetSleuth ---
    ns    = NetSleuth()
    beta, auc_n, f1_n = ns.train(adj_mat, train_ds)
    print(f"   NS    → β={beta:.3f},        train AUC={auc_n:.3f}")

    # --- 2c) tune OJC ---
    # Train exactly once and branch on the size of the result. Catching the
    # unpacking ValueError and calling `train` again would repeat the whole
    # (slow, verbose) tuning run.
    ojc     = OJC()
    ojc_out = tuple(ojc.train(adj_mat, train_ds))
    if len(ojc_out) == 6:
        # API variant returning six values
        ojc_model, tgt, num_src, auc_o, f1_o, _ = ojc_out
    elif len(ojc_out) == 3:
        # API variant returning three values; no score is taken from it, so
        # OJC is ranked with a placeholder of 0.0.
        # NOTE(review): the tuning log prints "Y = …, train_auc = …", which
        # suggests these three values may be (Y, auc, f1) rather than
        # (model, target, num_source) — confirm against the installed GraphSL.
        ojc_model, tgt, num_src = ojc_out
        auc_o, f1_o = 0.0, 0.0
    else:
        raise ValueError(
            f"unexpected number of values from OJC.train: {len(ojc_out)}"
        )
    print(f"   OJC   → K={num_src:.3f},     train AUC={auc_o:.3f}")

    # --- 2d) pick the best by (training) AUC ---
    choices = {
        "lpsi": (auc_l, ("lpsi", lpsi,       alpha)),
        "ns":   (auc_n, ("ns",   ns,         beta)),
        "ojc":  (auc_o, ("ojc",  (ojc_model, tgt, num_src)))
    }
    winner = max(choices.values(), key=lambda x: x[0])[1]
    best_models[lang] = winner
    print(f" → {lang:12s}  ✅ picked {winner[0].upper():8s} (AUC={choices[winner[0]][0]:.3f})")


→ Tuning on Arabic…
alpha = 0.0001, train_auc = 0.554
alpha = 0.00021544346900318845, train_auc = 0.554
alpha = 0.00046415888336127773, train_auc = 0.554
alpha = 0.001, train_auc = 0.554
alpha = 0.002154434690031882, train_auc = 0.554
alpha = 0.004641588833612777, train_auc = 0.554
alpha = 0.01, train_auc = 0.554
alpha = 0.021544346900318822, train_auc = 0.554
alpha = 0.046415888336127774, train_auc = 0.554
alpha = 0.1, train_auc = 0.554
thres = 0.091, train_f1 = 0.316
thres = 0.182, train_f1 = 0.250
thres = 0.273, train_f1 = 0.308
thres = 0.364, train_f1 = 0.182
thres = 0.455, train_f1 = 0.222
thres = 0.545, train_f1 = 0.000
thres = 0.636, train_f1 = 0.000
thres = 0.727, train_f1 = 0.000
thres = 0.818, train_f1 = 0.000
thres = 0.909, train_f1 = 0.000
   LPSI  → α=0.0001, θ=0.091,  val AUC=0.554
k = 2, train_auc = 0.441
k = 5, train_auc = 0.441
k = 10, train_auc = 0.441
   NS    → β=2.000,        val AUC=0.441
Y = 2, train_auc = 0.441
Y = 5, train_auc = 0.441
Y = 10, train_auc = 0.441

In [6]:
# Predict one root per test graph with the method selected for its language.
preds = []

for _, row in test_df.iterrows():
    model_info = best_models[row["language"]]
    method     = model_info[0]

    # relabel nodes to 0…n–1 so that tensor indices line up with node ids
    G = nx.convert_node_labels_to_integers(row["G"], first_label=0)
    n = G.number_of_nodes()

    # 3a) simulate one cascade on this graph and keep its last column
    # NOTE(review): the cascade starts from simulated seeds, so predictions
    # change from run to run unless the RNGs are seeded — confirm this is the
    # intended input for root prediction.
    A_sp = nx.to_scipy_sparse_array(G, dtype=float, format="csr")
    out  = diffusion_generation(
        graph       = {"adj_mat": A_sp},
        diff_type   = "IC",
        sim_num     = 1,
        seed_ratio  = SEED_RATIO,
        infect_prob = INFECT_PROB
    )
    final_inf = out["diff_mat"][0][:, -1]

    # 3b) diffusion vector as a torch tensor on the predictor's device
    device = (model_info[1].device
              if method == "lpsi"
              else torch.device("cpu"))  # NS and OJC run on CPU
    d_t = torch.tensor(final_inf, dtype=torch.float32, device=device)

    # 3c) dispatch to the right predictor
    if method == "lpsi":
        _, lpsi_model, alpha = model_info
        # The dense Laplacian is only consumed by LPSI, so build it here
        # instead of on every row.
        # NOTE(review): this is the unnormalised Laplacian D - A; confirm that
        # is the form LPSI.predict expects.
        A_np = A_sp.toarray()
        L_np = np.diag(A_np.sum(axis=1)) - A_np
        L_t  = torch.tensor(L_np, dtype=torch.float32, device=device)
        scores = lpsi_model.predict(L_t, n, alpha, d_t)

    elif method == "ns":
        _, ns_model, beta = model_info
        scores = ns_model.predict(G, beta, d_t)

    elif method == "ojc":
        _, (ojc_model, tgt, num_src) = model_info
        scores = ojc_model.predict(G, tgt, num_src, d_t)

    else:
        raise RuntimeError(f"unknown method {method}")

    root_pred = int(scores.argmax().item() + 1)  # back → 1-indexed
    preds.append({"id": row["id"], "root_pred": root_pred})

submission = pd.DataFrame(preds)

In [7]:
# Score the predictions against the labelled test set.
truth = pd.read_csv("../../data/labeled_test.csv")  # id,root
cmp   = submission.merge(truth, on="id")

# The merge is an inner join: predictions whose id has no label vanish and
# duplicated ids multiply rows, either of which would silently change the
# accuracy denominator. Report it instead of hiding it (printed rather than
# warned, because warnings are globally silenced in this notebook).
n_unlabeled = len(submission) - int(submission["id"].isin(truth["id"]).sum())
if n_unlabeled or len(cmp) != len(submission):
    print(f"NOTE: merge produced {len(cmp)} rows for {len(submission)} predictions "
          f"({n_unlabeled} predictions have no label)")

cmp["correct"] = cmp["root_pred"] == cmp["root"]
acc = cmp["correct"].mean()

print(f"\nTest accuracy: {acc:.3%} ({cmp['correct'].sum()}/{len(cmp)})")
print("Some mis‐predictions:")
print(cmp.loc[~cmp["correct"], ["id","root_pred","root"]].head())


Test accuracy: 9.226% (959/10395)
Some mis‐predictions:
   id  root_pred  root
0   1          2     4
1   2          2    17
2   3          2     5
3   4          1    15
4   5          1     9
