# Fine-tune Node2Vec on the directed, unweighted citation graph (train split)

In [2]:
import pandas as pd
import networkx as nx
import numpy as np
import optuna
from node2vec import Node2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pickle

# === 1. Load data & splits
# The CSV is expected to provide the columns 'citing', 'cited', 'label' and
# 'split' (these are the only columns read below).
split_path = "D:/NLP/tfidf_xgboost/split_train_val/citation_pairs_split_train_val.csv"
df = pd.read_csv(split_path)
train_df = df[df['split']=='train']
val_df   = df[df['split']=='val']

# === 2. Build an unweighted directed graph from positive citations only
# Only rows with split == 'train' and label == 1 contribute edges.
train_pos = train_df[train_df['label']==1]
# One row per distinct (citing, cited) pair; 'weight' is the pair's
# multiplicity. The count is kept for reference but is NOT stored on the
# graph edges.
edge_weights = (
    train_pos
    .groupby(['citing','cited'])
    .size()
    .reset_index(name='weight')
)

G = nx.DiGraph()
# add all nodes from every split (so that isolated ones are included, if any)
G.add_nodes_from(pd.unique(df[['citing','cited']].values.ravel()))
# Bulk insert instead of a per-row iterrows() loop: same edges, same order,
# without building a Series object for every row.
# NOTE(review): no 'weight' attribute is set on the edges although Node2Vec
# is later given weight_key='weight'; presumably the library falls back to a
# default weight for edges without that attribute - confirm.
G.add_edges_from(
    zip(map(int, edge_weights['citing']), map(int, edge_weights['cited']))
)

print(f"Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# === 3. Prepare train/val pairs + labels
# Pairs and labels are kept as parallel lists in the row order of each split.
train_pairs = list(zip(train_df['citing'], train_df['cited']))
train_y     = train_df['label'].tolist()

val_pairs = list(zip(val_df['citing'], val_df['cited']))
val_y     = val_df['label'].tolist()

# === 4. Define Optuna objective
def objective(trial):
    """Train one Node2Vec configuration and score it by validation ROC AUC.

    Reads the module-level graph ``G`` and the ``train_pairs`` / ``train_y`` /
    ``val_pairs`` / ``val_y`` lists prepared above.

    Args:
        trial: The Optuna trial used to sample the hyperparameters
            ``p``, ``q``, ``dims``, ``walk_length``, ``num_walks``,
            ``window`` and ``epochs``.

    Returns:
        ROC AUC on the validation pairs of a logistic regression fitted on
        the Hadamard-product features of the training pairs.
    """
    # sample hyperparams
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    p      = trial.suggest_float("p", 0.25, 4.0, log=True)
    q      = trial.suggest_float("q", 0.25, 4.0, log=True)
    dims   = trial.suggest_categorical("dims", [64, 128, 256])
    wl     = trial.suggest_int("walk_length", 20, 100)
    nw     = trial.suggest_int("num_walks", 10, 50)
    window = trial.suggest_int("window", 5, 15)
    # 'epochs' was passed to fit() below without ever being defined, which
    # raised NameError on every trial. It must be sampled under the name
    # "epochs" because the final training step reads best['epochs'].
    # NOTE(review): the 1..5 range is reconstructed from the recorded trial
    # log (values 1 and 5 were observed) - confirm the intended bounds.
    epochs = trial.suggest_int("epochs", 1, 5)

    # train Node2Vec
    n2v = Node2Vec(
        G,
        dimensions=dims,
        walk_length=wl,
        num_walks=nw,
        p=p, q=q,
        weight_key='weight',
        workers=4,
        quiet=True
    )
    model = n2v.fit(window=window, min_count=1, epochs=epochs)

    # embed feature for a pair as Hadamard product
    def feat(u, v):
        # Vectors are looked up under the string form of the node id.
        vu = model.wv.get_vector(str(u))
        vv = model.wv.get_vector(str(v))
        return vu * vv

    X_train = np.vstack([feat(u, v) for u, v in train_pairs])
    X_val   = np.vstack([feat(u, v) for u, v in val_pairs])

    # simple classifier + AUC
    clf = LogisticRegression(max_iter=200)
    clf.fit(X_train, train_y)
    # Column 1 of predict_proba is the probability of the positive class.
    preds = clf.predict_proba(X_val)[:,1]
    return roc_auc_score(val_y, preds)

# === 5. Run study
# Maximise the value returned by objective() over 30 trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best AUC:", study.best_value)
print("Best params:", study.best_params)

# === 6. Train final model with best params and save embeddings
best = study.best_params
n2v_final = Node2Vec(
    G,
    dimensions=best['dims'], walk_length=best['walk_length'],
    num_walks=best['num_walks'], p=best['p'], q=best['q'],
    weight_key='weight', workers=4, quiet=False,
)
model_final = n2v_final.fit(
    window=best['window'], min_count=1, epochs=best['epochs']
)


def _collect_embeddings(graph, keyed_vectors):
    """Map int(node) -> vector for every graph node present in keyed_vectors."""
    collected = {}
    for node in graph.nodes():
        key = str(node)  # vectors are looked up under the string node id
        if key in keyed_vectors:
            collected[int(node)] = keyed_vectors.get_vector(key)
    return collected


# extract and pickle embeddings
embeddings = _collect_embeddings(G, model_final.wv)
out_path = "D:/NLP/tfidf_xgboost/split_train_val/citation_node2vec_tuned.pkl"
with open(out_path, "wb") as f:
    pickle.dump(embeddings, f)

print(f"✅ Saved tuned embeddings to {out_path}")


Graph: 138499 nodes, 982760 weighted edges


[I 2025-05-16 11:56:14,848] A new study created in memory with name: no-name-e48fa9e6-8be0-4fde-ba45-522047301e75
  p      = trial.suggest_loguniform("p", 0.25, 4.0)
  q      = trial.suggest_loguniform("q", 0.25, 4.0)
[I 2025-05-16 12:00:20,792] Trial 0 finished with value: 0.6768239081431762 and parameters: {'p': 0.9770158376766437, 'q': 0.9600119783631689, 'dims': 64, 'walk_length': 50, 'num_walks': 16, 'window': 13, 'epochs': 1}. Best is trial 0 with value: 0.6768239081431762.
  p      = trial.suggest_loguniform("p", 0.25, 4.0)
  q      = trial.suggest_loguniform("q", 0.25, 4.0)
[I 2025-05-16 12:05:29,911] Trial 1 finished with value: 0.7224612559769644 and parameters: {'p': 0.5949122429295841, 'q': 3.5165434307548096, 'dims': 256, 'walk_length': 91, 'num_walks': 23, 'window': 12, 'epochs': 1}. Best is trial 1 with value: 0.7224612559769644.
  p      = trial.suggest_loguniform("p", 0.25, 4.0)
  q      = trial.suggest_loguniform("q", 0.25, 4.0)
[I 2025-05-16 12:09:40,519] Trial 2 fin

Best AUC: 0.9638755961319057
Best params: {'p': 1.340278589091097, 'q': 0.30263732679550054, 'dims': 256, 'walk_length': 23, 'num_walks': 44, 'window': 5, 'epochs': 5}


Computing transition probabilities: 100%|████████████████████████████████████| 138499/138499 [01:06<00:00, 2096.69it/s]


✅ Saved tuned embeddings to D:/NLP/tfidf_xgboost/split_train_val/citation_node2vec_tuned.pkl
