In [1]:
# Fine-tuning SPECTER for Citation Link Prediction

In [1]:
import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_repo

# 1. Φόρτωση abstracts
abstracts = pd.read_csv(
    r'C:\Users\mysmu\Desktop\Natural Language Processing\nlp-cse-uoi-2025\data_new\abstracts.txt',
    sep=r'\|\-\-\|', engine='python',
    header=None, names=['paper','abstract'], dtype={'paper':int}
).fillna('')

papers = abstracts['paper'].tolist()
docs   = abstracts['abstract'].tolist()



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
##

In [None]:
##

In [2]:
# 2. Precompute SPECTER embeddings (pretrained, no fine-tune)
model_name = "allenai/specter"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModel.from_pretrained(model_name)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def embed_specter(texts, batch_size=16):
    embs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True,
                        max_length=512, return_tensors="pt").to(device)
        with torch.no_grad():
            out = model(**enc).last_hidden_state[:,0]
        embs.append(out.cpu().numpy())
    return np.vstack(embs)

spec_emb = embed_specter(docs, batch_size=16)  # shape (N_papers, 768)



In [3]:
# Save for later reuse
np.save(r'D:\NLP\citation_link_prediction\specter_pretrained.npy', spec_emb)

In [None]:
# Test specter results

In [2]:
# 3. Build paper→index map
paper2idx = {p:i for i,p in enumerate(papers)}
spec_emb = np.load(r'D:\NLP\citation_link_prediction\specter_pretrained.npy',allow_pickle=True)

# 4. Load edge lists
train = pd.read_csv(r'D:\NLP\citation_link_prediction\train_edges.csv', dtype=int)
val   = pd.read_csv(r'D:\NLP\citation_link_prediction\val_edges.csv',   dtype=int)

In [8]:
print(train)

         citing   cited  label
0        107029   20782      0
1          4599  128992      0
2          1940   44762      1
3          6825   90145      0
4          7560   71423      1
...         ...     ...    ...
1965515   20166   95126      1
1965516   32075   31113      0
1965517   13248   13231      1
1965518   51552   33486      1
1965519   12942   19343      1

[1965520 rows x 3 columns]


In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# — your existing stuff —
paper2idx = {p:i for i,p in enumerate(papers)}
spec_emb  = np.load(r'D:\NLP\citation_link_prediction\specter_pretrained.npy', allow_pickle=True)
train     = pd.read_csv(r'D:\NLP\citation_link_prediction\train_edges.csv', dtype=int)
val       = pd.read_csv(r'D:\NLP\citation_link_prediction\val_edges.csv',   dtype=int)

# helper to fetch embeddings
def get_emb(paper_id):
    idx = paper2idx[paper_id]
    return spec_emb[idx]

# compute features for a DataFrame (train or val)
def add_specter_features(df):
    n = len(df)
    cos_sims   = np.empty(n, dtype=float)
    dots       = np.empty(n, dtype=float)
    abs_diffs  = np.empty(n, dtype=float)
    
    for i, (u, v) in enumerate(tqdm(zip(df['citing'], df['cited']),total=n,desc="Specter feats")):
        eu = get_emb(u)
        ev = get_emb(v)
        
        # dot-product
        dots[i] = np.dot(eu, ev)
        # cosine similarity
        cos_sims[i] = cosine_similarity(eu.reshape(1, -1), ev.reshape(1, -1))[0,0]
        # L1 (sum of absolute differences)
        abs_diffs[i] = np.sum(np.abs(eu - ev))
    
    df['spec_dot']          = dots
    df['spec_cosine_sim']   = cos_sims
    df['spec_abs_diff_l1']  = abs_diffs
    return df

train = add_specter_features(train)
val   = add_specter_features(val)


Specter feats: 100%|███████████████████████████████████████████████████████| 1965520/1965520 [14:28<00:00, 2263.15it/s]
Specter feats: 100%|█████████████████████████████████████████████████████████| 218390/218390 [01:37<00:00, 2230.49it/s]


In [11]:
print(len(train.columns))

6


In [12]:
features = [
  'spec_dot',
  'spec_cosine_sim',
  'spec_abs_diff_l1',
]

X_train = train[features]
y_train = train['label']

X_val   = val[features]
y_val   = val['label']


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, log_loss

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf',    LogisticRegression(
                   solver='lbfgs',
                   max_iter=1000,   # αύξησε αν δεν συγκρατεί
                   class_weight='balanced'  # αν έχεις ανισορροπία κλάσεων
               ))
])
pipe.fit(X_train, y_train)

from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

# προβλέψεις κατηγορίας
y_pred  = pipe.predict(X_val)
# προβλεπόμενες πιθανότητες για θετική κλάση
y_proba = pipe.predict_proba(X_val)[:, 1]

print(classification_report(y_val, y_pred))
print("Accuracy :", accuracy_score(y_val, y_pred))
print("ROC AUC  :", roc_auc_score(y_val, y_proba))
print("Log Loss :", log_loss(y_val, y_proba))



              precision    recall  f1-score   support

           0       0.85      0.88      0.86    109195
           1       0.87      0.85      0.86    109195

    accuracy                           0.86    218390
   macro avg       0.86      0.86      0.86    218390
weighted avg       0.86      0.86      0.86    218390

Accuracy : 0.8617381748248546
ROC AUC  : 0.9331476715379775
Log Loss : 0.33054176481348324


In [14]:
train.to_csv(r'D:\NLP\citation_link_prediction\train_specter_sim.csv', index=False)
val.to_csv(r'D:\NLP\citation_link_prediction\val_specter_sim.csv', index=False)


In [None]:
# PCA on 768 dims to 98% variance

In [2]:
# 3. Build paper→index map
paper2idx = {p:i for i,p in enumerate(papers)}
spec_emb = np.load(r'D:\NLP\citation_link_prediction\specter_pretrained.npy',allow_pickle=True)

# 4. Load edge lists
train = pd.read_csv(r'D:\NLP\citation_link_prediction\train_edges.csv', dtype=int)
val   = pd.read_csv(r'D:\NLP\citation_link_prediction\val_edges.csv',   dtype=int)

In [3]:
import numpy as np
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95, svd_solver='full', whiten=False)  
# svd_solver='full' εξασφαλίζει ακριβή υπολογισμό του 98%
reduced_emb = pca.fit_transform(spec_emb)

print("Αρχικός αριθμός διαστάσεων:", spec_emb.shape[1])
print("Νέος αριθμός διαστάσεων:", pca.n_components_)
cum_var = np.cumsum(pca.explained_variance_ratio_)
print("Συσσωρευμένη διακύμανση με", pca.n_components_, "συνιστώσες:", cum_var[-1])


Αρχικός αριθμός διαστάσεων: 768
Νέος αριθμός διαστάσεων: 157
Συσσωρευμένη διακύμανση με 157 συνιστώσες: 0.9502669807927918


In [4]:
def make_pair_features(u, v):
    eu = reduced_emb[paper2idx[u]]
    ev = reduced_emb[paper2idx[v]]
    return np.hstack([
        eu,
        ev,
        np.abs(eu - ev),   # element‐wise diff
        eu * ev            # element‐wise product
    ])  # shape = (4 * k,)

# apply to each row of train/val
X_train = np.vstack([
    make_pair_features(u, v)
    for u,v in zip(train['citing'], train['cited'])
])
y_train = train['label'].values

X_val   = np.vstack([
    make_pair_features(u, v)
    for u,v in zip(val['citing'], val['cited'])
])
y_val   = val['label'].values


In [None]:
# "new" 90% Accuracy mlp with embeds hadamard/abs_dif/u/v (best so far 21/5/2025 night ) 

In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

mlp_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp',    MLPClassifier(
                  hidden_layer_sizes=(256, 128),
                  activation='relu',
                  alpha=1e-4,            # L2 regularization
                  batch_size=16,
                  max_iter=20,
                  early_stopping=True,
                  n_iter_no_change=5,
                  random_state=42,
                  verbose=True
              ))
])

mlp_pipe.fit(X_train, y_train)


Iteration 1, loss = 0.26496088
Validation score: 0.897483
Iteration 2, loss = 0.25458015
Validation score: 0.898897
Iteration 3, loss = 0.25029979
Validation score: 0.899853
Iteration 4, loss = 0.24759928
Validation score: 0.900840




In [None]:
# "new"  88% Accuracy mlp with abs_diff (embed)/hadamard only 

In [20]:
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

y_proba_mlp = mlp_pipe.predict_proba(X_val)[:,1]
y_pred_mlp  = mlp_pipe.predict(X_val)

print("Accuracy :", accuracy_score(y_val, y_pred_mlp))
print("ROC AUC  :", roc_auc_score(y_val, y_proba_mlp))
print("Log Loss :", log_loss(y_val, y_proba_mlp))


Accuracy : 0.8844223636613397
ROC AUC  : 0.9563316399272859
Log Loss : 0.267465022499169


In [None]:
# pytorch for faster training times with GPU/ CUDA

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# 1) Προετοιμασία δεδομένων
X_train_t = torch.from_numpy(X_train).float()
y_train_t = torch.from_numpy(y_train).float()    # float για BCEWithLogitsLoss
X_val_t   = torch.from_numpy(X_val).float()
y_val_t   = torch.from_numpy(y_val).float()

train_ds = TensorDataset(X_train_t, y_train_t)
val_ds   = TensorDataset(X_val_t, y_val_t)

batch_size = 64
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  pin_memory=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, pin_memory=True)

# 2) Συσκευή (GPU αν υπάρχει)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 3) Ορισμός μοντέλου
class CitationMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
           # single logit
        )
    def forward(self, x):
        return self.net(x).squeeze(1)  # σχήμα [B]

model = CitationMLP(X_train.shape[1]).to(device)

# 4) Loss & optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

# 5) Training loop με verbose & early stopping
n_epochs = 20
best_val_acc = 0.0
patience, counter = 5, 0

for epoch in range(1, n_epochs+1):
    # --- Training step ---
    model.train()
    running_loss = 0.0
    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(Xb)               # [B]
        loss   = criterion(logits, yb)   # BCEWithLogitsLoss
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * Xb.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)

    # --- Validation step ---
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for Xb, yb in val_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            logits = model(Xb)
            probs  = torch.sigmoid(logits)
            preds  = (probs > 0.5).long()
            correct += (preds == yb.long()).sum().item()
            total   += yb.size(0)
    val_acc = correct / total

    print(f"Epoch {epoch:02d} — train loss: {epoch_loss:.4f} — val acc: {val_acc:.4f}")

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        counter = 0
        torch.save(model.state_dict(), "best_citation_mlp.pt")
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered.")
            break

# 6) Φόρτωση των καλύτερων βαρών
model.load_state_dict(torch.load("best_citation_mlp.pt"))
print(f"Best validation accuracy: {best_val_acc:.4f}")


Using device: cuda
Epoch 01 — train loss: 0.2652 — val acc: 0.8921
Epoch 02 — train loss: 0.2617 — val acc: 0.8919
Epoch 03 — train loss: 0.2613 — val acc: 0.8929
Epoch 04 — train loss: 0.2615 — val acc: 0.8898
Epoch 05 — train loss: 0.2613 — val acc: 0.8927
Epoch 06 — train loss: 0.2613 — val acc: 0.8905
Epoch 07 — train loss: 0.2612 — val acc: 0.8918


KeyboardInterrupt: 

In [None]:
# Baseline logistic regression in order to compare results with tfidf

In [2]:
import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report

# 1. Φόρτωση abstracts
abstracts = pd.read_csv(
    r'C:\Users\mysmu\Desktop\Natural Language Processing\nlp-cse-uoi-2025\data_new\abstracts.txt',
    sep=r'\|\-\-\|', engine='python',
    header=None, names=['paper','abstract'], dtype={'paper':int}
).fillna('')

papers = abstracts['paper'].tolist()
docs   = abstracts['abstract'].tolist()



  from .autonotebook import tqdm as notebook_tqdm
