# Fine-tuning SentenceTransformer to align text with graph embeddings (128D)


In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# === Step 1: Load and preprocess data ===

In [None]:
# Load the question/embedding table. Each row provides an `id`, a
# ';'-separated `questions` string and a 128-D embedding in columns "0".."127".
df = pd.read_csv("/kaggle/input/qa-embeding/combined_doi_questions_embeddings.csv")

# Convert the embedding columns once instead of re-slicing them for every row.
embedding_columns = [str(i) for i in range(128)]
embedding_matrix = df[embedding_columns].astype(float).values

# Build the flat sample list and the per-id grouping in a single pass so the
# two views can never disagree about which samples exist.
all_samples = []
id_to_samples = {}
for doc_id, raw_questions, embedding in zip(df['id'], df['questions'], embedding_matrix):
    if pd.isna(raw_questions):
        # A missing cell would otherwise turn into the literal question "nan".
        continue
    # Drop empty fragments produced by trailing or doubled ';' separators.
    questions = [q.strip() for q in str(raw_questions).split(';') if q.strip()]
    if not questions:
        continue
    doc_samples = [(question, embedding) for question in questions]
    all_samples.extend(doc_samples)
    # Extend rather than assign: rows sharing an id keep all of their questions.
    id_to_samples.setdefault(doc_id, []).extend(doc_samples)

if not all_samples:
    raise ValueError("No usable (question, embedding) samples were found in the CSV.")

# Split by document id (90% train / 10% test) so that all questions of one
# document end up on the same side of the split.
unique_ids = list(id_to_samples.keys())
train_ids, test_ids = train_test_split(
    unique_ids,
    test_size=0.1,  # 10% of the ids are held out for testing
    random_state=42  # For reproducibility
)

# Expand the id split back into flat (question, embedding) sample lists.
train_samples = [sample for doc_id in train_ids for sample in id_to_samples[doc_id]]
test_samples = [sample for doc_id in test_ids for sample in id_to_samples[doc_id]]

print(f"Total samples: {len(all_samples)}")
print(f"Training samples: {len(train_samples)} ({len(train_samples)/len(all_samples)*100:.1f}%)")
print(f"Test samples: {len(test_samples)} ({len(test_samples)/len(all_samples)*100:.1f}%)")


# Dataset Class

In [None]:
class QuestionGraphDataset(Dataset):
    """Map-style dataset over (question text, target embedding) pairs.

    Items are tokenized lazily on access and returned as a dict holding the
    ``input_ids``, ``attention_mask`` and ``target`` tensors.

    Args:
        samples: Indexable collection of ``(question, embedding)`` pairs.
        tokenizer: Callable tokenizer invoked with the question text and the
            ``truncation``/``padding``/``max_length``/``return_tensors`` options.
        max_len: Length every question is padded or truncated to.
    """

    def __init__(self, samples, tokenizer, max_len=64):
        self.samples, self.tokenizer, self.max_len = samples, tokenizer, max_len

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        text, graph_vector = self.samples[idx]
        tokens = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading axis of size 1; drop it so
        # each item is a single example rather than a one-element batch.
        item = {
            field: tokens[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['target'] = torch.tensor(graph_vector, dtype=torch.float)
        return item


# Model

In [None]:
class QuestionEncoder(nn.Module):
    """Transformer encoder followed by a linear projection to ``out_dim``.

    Args:
        pretrained_model: Name or path handed to ``AutoModel.from_pretrained``.
        out_dim: Size of the projected output vector.
        pooling: How token states are reduced to one vector per input.
            ``"cls"`` (default, the original behaviour) takes the first
            token's state; ``"mean"`` averages the states of the tokens whose
            attention-mask entry is non-zero, which is the pooling shown in
            the E5 model cards.

    Raises:
        ValueError: If ``pooling`` is neither ``"cls"`` nor ``"mean"``.
    """

    def __init__(self, pretrained_model, out_dim=128, pooling="cls"):
        super().__init__()
        # Validate before loading weights so a typo fails fast and cheaply.
        if pooling not in ("cls", "mean"):
            raise ValueError(f"pooling must be 'cls' or 'mean', got {pooling!r}")
        self.pooling = pooling
        self.encoder = AutoModel.from_pretrained(pretrained_model)
        self.projection = nn.Linear(self.encoder.config.hidden_size, out_dim)

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return the projected sentence vectors."""
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state
        if self.pooling == "mean":
            # Zero out padded positions, then divide by the number of real
            # tokens; the clamp avoids 0/0 for an all-padding row.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        else:
            pooled = hidden[:, 0]  # first ([CLS]) token
        return self.projection(pooled)

# Training function

In [None]:
def train_model(model, train_dataloader, test_dataloader, epochs=200, lr=2e-5, device='cuda' if torch.cuda.is_available() else 'cpu', save_path="question_encoder.pt"):
    """Fine-tune ``model`` with an MSE loss, evaluating after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_dataloader: Yields dict batches with ``input_ids``,
            ``attention_mask`` and ``target`` tensors; used for optimisation.
        test_dataloader: Same batch format; used for the per-epoch evaluation.
        epochs: Number of passes over ``train_dataloader``.
        lr: Learning rate for the AdamW optimizer.
        device: Device the model and batches are moved to. The default is
            resolved once, when the function is defined.
        save_path: Destination of the ``state_dict`` written after the last epoch.

    Returns:
        A list with one dict per epoch holding ``epoch``, ``train_loss``,
        ``test_loss`` and ``test_cosine``.

    Raises:
        ValueError: If either dataloader has no batches.
    """
    num_train_batches = len(train_dataloader)
    num_test_batches = len(test_dataloader)
    # Fail before any training work instead of dividing by zero afterwards.
    if num_train_batches == 0 or num_test_batches == 0:
        raise ValueError("train_dataloader and test_dataloader must both be non-empty.")

    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    history = []

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_train_loss = 0.0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1} (Train)"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            target = batch['target'].to(device)

            optimizer.zero_grad()
            output = model(input_ids, attention_mask)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / num_train_batches

        # Evaluate on test set
        model.eval()
        total_test_loss = 0.0
        cosine_total = 0.0
        num_test_samples = 0
        with torch.no_grad():
            for batch in test_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                target = batch['target'].to(device)

                output = model(input_ids, attention_mask)
                total_test_loss += criterion(output, target).item()

                # Row-wise cosine for the whole batch in one call; only the
                # running sum is kept, so no outputs need to be buffered.
                cosine_total += nn.functional.cosine_similarity(output, target, dim=1).sum().item()
                num_test_samples += target.size(0)

        avg_test_loss = total_test_loss / num_test_batches
        cosine_sim = cosine_total / num_test_samples

        print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}, Test Cosine Similarity: {cosine_sim:.4f}")
        history.append({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'test_loss': avg_test_loss,
            'test_cosine': cosine_sim,
        })

    # Save model
    torch.save(model.state_dict(), save_path)
    print(f"✅ Model saved to {save_path}")
    return history

# Eval Function

In [None]:
def evaluate_model(model, test_dataloader, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Compute the MSE loss and mean cosine similarity of ``model`` on a dataloader.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_dataloader: Sized iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``target`` tensors.
        device: Device the model and batches are moved to. The default is
            resolved once, when the function is defined.

    Returns:
        ``(avg_test_loss, cosine_sim)``: the mean of the per-batch MSE losses
        and the mean row-wise cosine similarity over all samples.

    Raises:
        ValueError: If ``test_dataloader`` has no batches.
    """
    num_batches = len(test_dataloader)
    if num_batches == 0:
        raise ValueError("test_dataloader is empty; nothing to evaluate.")

    model.to(device)
    model.eval()
    criterion = nn.MSELoss()
    total_test_loss = 0.0
    cosine_total = 0.0
    num_samples = 0

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            target = batch['target'].to(device)

            output = model(input_ids, attention_mask)
            total_test_loss += criterion(output, target).item()

            # Row-wise cosine for the whole batch in one call; only the
            # running sum is kept, so no outputs need to be buffered.
            cosine_total += nn.functional.cosine_similarity(output, target, dim=1).sum().item()
            num_samples += target.size(0)

    avg_test_loss = total_test_loss / num_batches
    cosine_sim = cosine_total / num_samples

    print(f"Evaluation - Test MSE Loss: {avg_test_loss:.4f}, Test Cosine Similarity: {cosine_sim:.4f}")
    return avg_test_loss, cosine_sim

# Finetune model

In [None]:
# Checkpoints to fine-tune. Each one is trained, evaluated on the held-out
# questions and finally probed with a sample retrieval query.
model_names = ["intfloat/e5-large-v2"]
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for model_name in model_names:
    print(f"\n=== Fine-Tuning {model_name} ===")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Datasets share the tokenizer; only the training loader is shuffled.
    train_dataset = QuestionGraphDataset(train_samples, tokenizer, max_len=64)
    test_dataset = QuestionGraphDataset(test_samples, tokenizer, max_len=64)
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    # Checkpoint file is named after the last path component of the model id.
    model = QuestionEncoder(pretrained_model=model_name, out_dim=128)
    save_path = f"{model_name.rsplit('/', 1)[-1]}_question_encoder.pt"
    train_model(
        model,
        train_dataloader,
        test_dataloader,
        epochs=400,
        lr=2e-5,
        save_path=save_path,
    )

    print(f"\n=== Evaluating {model_name} ===")
    evaluate_model(model, test_dataloader)

    # --- Query inference: embed one question and rank documents against it ---
    model.eval()
    query = "How do the studies of neutrino oscillations in the Sudbury Neutrino Observatory (SNO), the strong coupling dynamics of the standard Higgs sector, and the evolution of color exchange in QCD hard scattering collectively contribute to advancing our understanding of fundamental particle interactions and their implications for experimental observations at high-energy facilities like the LHC?"
    encoded = tokenizer(
        query,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=64,
    )
    encoded = {key: tensor.to(device) for key, tensor in encoded.items()}

    with torch.no_grad():
        query_output = model(encoded['input_ids'], encoded['attention_mask'])
        query_vector = query_output.cpu().numpy()

    # Rank every row of the table by cosine similarity to the query vector.
    doi_ids = df['id'].tolist()
    doi_embeddings = df[[str(i) for i in range(128)]].values
    similarities = cosine_similarity(query_vector, doi_embeddings)[0]
    top_indices = np.argsort(similarities)[::-1][:5]
    top_dois = [doi_ids[i] for i in top_indices]
    top_scores = [similarities[i] for i in top_indices]

    print(f"\n🔍 Top 5 closest papers for {model_name}:")
    for rank, (doi, score) in enumerate(zip(top_dois, top_scores), start=1):
        print(f"{rank}. DOI: {doi} — Similarity: {score:.4f}")