In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load your dataset
df = pd.read_csv("/data2/home/rohitsunil/termreport/arxiv_new.csv")
df = df.rename(columns={'summary': 'abstract'})

# Load the pretrained tokenizer and model for Baseline (Bi-Encoder)
tokenizer_baseline = AutoTokenizer.from_pretrained('microsoft/MiniLM-L12-H384-uncased')
minilm_baseline = AutoModel.from_pretrained('microsoft/MiniLM-L12-H384-uncased')
minilm_baseline.to(device)

# Load the pretrained tokenizer and model for CTPE (Cross-Encoder)
tokenizer_ctpe = AutoTokenizer.from_pretrained('microsoft/MiniLM-L12-H384-uncased')
minilm_ctpe = AutoModel.from_pretrained('microsoft/MiniLM-L12-H384-uncased')
minilm_ctpe.to(device)

# Maximum sequence length for MiniLM
max_length = 128  # Adjust as needed

# Function to tokenize and encode texts for Baseline (Bi-Encoder)
def tokenize_texts_baseline(texts):
    return tokenizer_baseline(
        texts.tolist(),
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'  # Return PyTorch tensors
    )

# Function to tokenize and encode texts for CTPE (Cross-Encoder)
def tokenize_texts_ctpe(titles, abstracts):
    # MiniLM does not support token_type_ids, so we'll concatenate titles and abstracts with [SEP]
    concatenated_texts = [f"{title} [SEP] {abstract}" for title, abstract in zip(titles, abstracts)]
    return tokenizer_ctpe(
        concatenated_texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )

# Tokenize titles and abstracts for Baseline
title_encodings_baseline = tokenize_texts_baseline(df['title'])
abstract_encodings_baseline = tokenize_texts_baseline(df['abstract'])

# Create positive pairs for Baseline
positive_pairs_baseline = pd.DataFrame({
    'title_input_ids': list(title_encodings_baseline['input_ids']),
    'title_attention_mask': list(title_encodings_baseline['attention_mask']),
    'abstract_input_ids': list(abstract_encodings_baseline['input_ids']),
    'abstract_attention_mask': list(abstract_encodings_baseline['attention_mask']),
    'label': 1
})

# Create negative pairs by shuffling abstracts for Baseline
shuffled_abstracts = df['abstract'].sample(frac=1).reset_index(drop=True)
shuffled_abstract_encodings_baseline = tokenize_texts_baseline(shuffled_abstracts)

negative_pairs_baseline = pd.DataFrame({
    'title_input_ids': list(title_encodings_baseline['input_ids']),
    'title_attention_mask': list(title_encodings_baseline['attention_mask']),
    'abstract_input_ids': list(shuffled_abstract_encodings_baseline['input_ids']),
    'abstract_attention_mask': list(shuffled_abstract_encodings_baseline['attention_mask']),
    'label': 0
})

# Combine and shuffle Baseline pairs
all_pairs_baseline = pd.concat([positive_pairs_baseline, negative_pairs_baseline], ignore_index=True)
all_pairs_baseline = all_pairs_baseline.sample(frac=1).reset_index(drop=True)

# Split into training and validation sets for Baseline
train_pairs_baseline, val_pairs_baseline = train_test_split(all_pairs_baseline, test_size=0.1, random_state=42)

print(f"Baseline - Training pairs: {len(train_pairs_baseline)}, Validation pairs: {len(val_pairs_baseline)}")

# Prepare data for CTPE (Cross-Encoder)
positive_pairs_ctpe = pd.DataFrame({
    'title': df['title'],
    'abstract': df['abstract'],
    'label': 1
})

negative_pairs_ctpe = pd.DataFrame({
    'title': df['title'],
    'abstract': shuffled_abstracts,
    'label': 0
})


         


2024-11-28 22:34:14.440335: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-28 22:34:14.443157: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-28 22:34:14.478484: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-28 22:34:14.478505: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-28 22:34:14.478536: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

Using device: cuda




tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/133M [00:00<?, ?B/s]

  return torch.load(checkpoint_file, map_location="cpu")


Baseline - Training pairs: 45797, Validation pairs: 5089


In [2]:
# Combine and shuffle CTPE pairs
all_pairs_ctpe = pd.concat([positive_pairs_ctpe, negative_pairs_ctpe], ignore_index=True)
all_pairs_ctpe = all_pairs_ctpe.sample(frac=1).reset_index(drop=True)

# Split into training and validation sets for CTPE
train_pairs_ctpe, val_pairs_ctpe = train_test_split(all_pairs_ctpe, test_size=0.1, random_state=42)

print(f"CTPE - Training pairs: {len(train_pairs_ctpe)}, Validation pairs: {len(val_pairs_ctpe)}")

# Dataset for Baseline (Bi-Encoder)
class BaselinePairDataset(Dataset):
    def __init__(self, pairs):
        self.title_input_ids = pairs['title_input_ids'].values
        self.title_attention_mask = pairs['title_attention_mask'].values
        self.abstract_input_ids = pairs['abstract_input_ids'].values
        self.abstract_attention_mask = pairs['abstract_attention_mask'].values
        self.labels = pairs['label'].values.astype(np.float32)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'title_input_ids': torch.tensor(self.title_input_ids[idx], dtype=torch.long),
            'title_attention_mask': torch.tensor(self.title_attention_mask[idx], dtype=torch.long),
            'abstract_input_ids': torch.tensor(self.abstract_input_ids[idx], dtype=torch.long),
            'abstract_attention_mask': torch.tensor(self.abstract_attention_mask[idx], dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# Dataset for CTPE (Cross-Encoder)
class CTPEDataset(Dataset):
    def __init__(self, pairs):
        self.titles = pairs['title'].values
        self.abstracts = pairs['abstract'].values
        self.labels = pairs['label'].values.astype(np.float32)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'title': self.titles[idx],
            'abstract': self.abstracts[idx],
            'label': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# Create datasets
train_dataset_baseline = BaselinePairDataset(train_pairs_baseline)
val_dataset_baseline = BaselinePairDataset(val_pairs_baseline)

train_dataset_ctpe = CTPEDataset(train_pairs_ctpe)
val_dataset_ctpe = CTPEDataset(val_pairs_ctpe)

# Create data loaders
batch_size = 32  # Adjust as needed based on your GPU capacity

train_loader_baseline = DataLoader(train_dataset_baseline, batch_size=batch_size, shuffle=True)
val_loader_baseline = DataLoader(val_dataset_baseline, batch_size=batch_size)

train_loader_ctpe = DataLoader(train_dataset_ctpe, batch_size=batch_size, shuffle=True)
val_loader_ctpe = DataLoader(val_dataset_ctpe, batch_size=batch_size)

# Baseline (Bi-Encoder) Model
class BaselineDocumentSimilarityModel(nn.Module):
    def __init__(self, minilm_model):
        super(BaselineDocumentSimilarityModel, self).__init__()
        self.minilm = minilm_model
        self.dropout = nn.Dropout(0.1)
        self.cosine_similarity = nn.CosineSimilarity(dim=1)
    
    def forward(self, title_input_ids, title_attention_mask, abstract_input_ids, abstract_attention_mask):
        # Process titles
        title_outputs = self.minilm(
            input_ids=title_input_ids,
            attention_mask=title_attention_mask
        )
        title_pooled_output = title_outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
        
        # Process abstracts
        abstract_outputs = self.minilm(
            input_ids=abstract_input_ids,
            attention_mask=abstract_attention_mask
        )
        abstract_pooled_output = abstract_outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
        
        # Apply dropout
        title_vector = self.dropout(title_pooled_output)
        abstract_vector = self.dropout(abstract_pooled_output)
        
        # Compute cosine similarity
        similarity = self.cosine_similarity(title_vector, abstract_vector)
        return similarity

# CTPE (Cross-Encoder) Model
class CTPEDocumentSimilarityModel(nn.Module):
    def __init__(self, minilm_model):
        super(CTPEDocumentSimilarityModel, self).__init__()
        self.minilm = minilm_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.minilm.config.hidden_size, 1)  # Output a single score
    
    def forward(self, input_ids, attention_mask):
        outputs = self.minilm(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.last_hidden_state[:, 0, :]  # Using [CLS] token representation
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output).squeeze(-1)  # Shape: (batch_size)
        return logits

# Initialize the models
model_baseline = BaselineDocumentSimilarityModel(minilm_baseline)
model_baseline.to(device)

model_ctpe = CTPEDocumentSimilarityModel(minilm_ctpe)
model_ctpe.to(device)

# Define the loss functions
def contrastive_loss(similarity, label):
    """
    Contrastive loss function for Baseline (Bi-Encoder) model.
    Args:
        similarity: Cosine similarity scores between title and abstract embeddings.
        label: Ground truth labels (1 for positive pairs, 0 for negative pairs).
    Returns:
        Computed contrastive loss.
    """
    margin = 0.1  # Margin value
    loss = torch.mean(
        label * torch.pow(1 - similarity, 2) +
        (1 - label) * torch.pow(torch.clamp(similarity - margin, min=0.0), 2)
    )
    return loss

def binary_cross_entropy_loss(logits, labels):
    """
    Binary Cross-Entropy Loss with logits for CTPE (Cross-Encoder) model.
    Args:
        logits: Raw output scores from the classifier.
        labels: Ground truth labels (1 for positive pairs, 0 for negative pairs).
    Returns:
        Computed binary cross-entropy loss.
    """
    loss_fct = nn.BCEWithLogitsLoss()
    return loss_fct(logits, labels)

# Define the optimizers
optimizer_baseline = torch.optim.AdamW(model_baseline.parameters(), lr=2e-5)
optimizer_ctpe = torch.optim.AdamW(model_ctpe.parameters(), lr=2e-5)



CTPE - Training pairs: 45797, Validation pairs: 5089


In [7]:
# Training Parameters
num_epochs = 2  # Adjust as needed based on your dataset and resources

# Training Loop for Baseline (Bi-Encoder)
def train_baseline(model, dataloader, optimizer):
    """
    Training loop for the Baseline (Bi-Encoder) model.
    Args:
        model: BaselineDocumentSimilarityModel instance.
        dataloader: DataLoader for the training data.
        optimizer: Optimizer for the model.
    Returns:
        Average training loss for the epoch.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc='Training Baseline'):
        # Move data to GPU
        title_input_ids = batch['title_input_ids'].to(device)
        title_attention_mask = batch['title_attention_mask'].to(device)
        abstract_input_ids = batch['abstract_input_ids'].to(device)
        abstract_attention_mask = batch['abstract_attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        similarities = model(
            title_input_ids,
            title_attention_mask,
            abstract_input_ids,
            abstract_attention_mask
        )
        
        # Compute loss
        loss = contrastive_loss(similarities, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss

# Training Loop for CTPE (Cross-Encoder)
def train_ctpe(model, dataloader, optimizer):
    """
    Training loop for the CTPE (Cross-Encoder) model.
    Args:
        model: CTPEDocumentSimilarityModel instance.
        dataloader: DataLoader for the training data.
        optimizer: Optimizer for the model.
    Returns:
        Average training loss for the epoch.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc='Training CTPE'):
        # Move data to GPU
        titles = batch['title']
        abstracts = batch['abstract']
        labels = batch['label'].to(device)
        
        # Tokenize the concatenated title and abstract
        encodings = tokenize_texts_ctpe(titles, abstracts)
        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # Compute loss
        loss = binary_cross_entropy_loss(logits, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss

# Evaluation Function for Baseline (Bi-Encoder)
def evaluate_baseline(model, dataloader):
    """
    Evaluation loop for the Baseline (Bi-Encoder) model.
    Args:
        model: BaselineDocumentSimilarityModel instance.
        dataloader: DataLoader for the validation data.
    Returns:
        Tuple containing average validation loss, accuracy, and F1 score.
    """
    model.eval()
    total_loss = 0
    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating Baseline'):
            # Move data to GPU
            title_input_ids = batch['title_input_ids'].to(device)
            title_attention_mask = batch['title_attention_mask'].to(device)
            abstract_input_ids = batch['abstract_input_ids'].to(device)
            abstract_attention_mask = batch['abstract_attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            # Forward pass
            similarities = model(
                title_input_ids,
                title_attention_mask,
                abstract_input_ids,
                abstract_attention_mask
            )
            
            # Compute loss
            loss = contrastive_loss(similarities, labels)
            total_loss += loss.item()
            
            # Compute predictions
            preds = (similarities >= 0.5).float()
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(preds.cpu().numpy())
    
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    return avg_loss, accuracy, f1

# Evaluation Function for CTPE (Cross-Encoder)
def evaluate_ctpe(model, dataloader):
    """
    Evaluation loop for the CTPE (Cross-Encoder) model.
    Args:
        model: CTPEDocumentSimilarityModel instance.
        dataloader: DataLoader for the validation data.
    Returns:
        Tuple containing average validation loss, accuracy, and F1 score.
    """
    model.eval()
    total_loss = 0
    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating CTPE'):
            # Move data to GPU
            titles = batch['title']
            abstracts = batch['abstract']
            labels = batch['label'].to(device)
            
            # Tokenize the concatenated title and abstract
            encodings = tokenize_texts_ctpe(titles, abstracts)
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)
            
            # Forward pass
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            # Compute loss
            loss = binary_cross_entropy_loss(logits, labels)
            total_loss += loss.item()
            
            # Compute predictions
            probs = torch.sigmoid(logits)
            preds = (probs >= 0.5).float()
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(preds.cpu().numpy())
    
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    return avg_loss, accuracy, f1

# Training and Evaluation Loop
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    
    # Train Baseline
    train_loss_baseline = train_baseline(model_baseline, train_loader_baseline, optimizer_baseline)
    print(f'Baseline - Training Loss: {train_loss_baseline:.4f}')
    
    # Train CTPE
    train_loss_ctpe = train_ctpe(model_ctpe, train_loader_ctpe, optimizer_ctpe)
    print(f'CTPE - Training Loss: {train_loss_ctpe:.4f}')
    
    # Evaluate Baseline
    val_loss_baseline, val_acc_baseline, val_f1_baseline = evaluate_baseline(model_baseline, val_loader_baseline)
    print(f'Baseline - Validation Loss: {val_loss_baseline:.4f}, Accuracy: {val_acc_baseline:.4f}, F1 Score: {val_f1_baseline:.4f}')
    
    # Evaluate CTPE
    val_loss_ctpe, val_acc_ctpe, val_f1_ctpe = evaluate_ctpe(model_ctpe, val_loader_ctpe)
    print(f'CTPE - Validation Loss: {val_loss_ctpe:.4f}, Accuracy: {val_acc_ctpe:.4f}, F1 Score: {val_f1_ctpe:.4f}')

# Function to compute accuracy for Baseline (Bi-Encoder)
def compute_accuracy_baseline(model, dataloader, threshold=0.5):
    """
    Computes accuracy for the Baseline (Bi-Encoder) model.
    Args:
        model: BaselineDocumentSimilarityModel instance.
        dataloader: DataLoader for the validation data.
        threshold: Threshold for classifying similarity scores.
    Returns:
        Computed accuracy.
    """
    model.eval()
    total_correct = 0
    total_examples = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Accuracy Evaluation Baseline'):
            # Move data to GPU
            title_input_ids = batch['title_input_ids'].to(device)
            title_attention_mask = batch['title_attention_mask'].to(device)
            abstract_input_ids = batch['abstract_input_ids'].to(device)
            abstract_attention_mask = batch['abstract_attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            # Forward pass
            similarities = model(
                title_input_ids,
                title_attention_mask,
                abstract_input_ids,
                abstract_attention_mask
            )
            
            # Compute accuracy
            preds = (similarities >= threshold).float()
            correct = (preds == labels).float().sum()
            total_correct += correct.item()
            total_examples += len(labels)
    
    accuracy = total_correct / total_examples
    return accuracy

# Function to compute accuracy for CTPE (Cross-Encoder)
def compute_accuracy_ctpe(model, dataloader, threshold=0.5):
    """
    Computes accuracy for the CTPE (Cross-Encoder) model.
    Args:
        model: CTPEDocumentSimilarityModel instance.
        dataloader: DataLoader for the validation data.
        threshold: Threshold for classifying similarity scores.
    Returns:
        Computed accuracy.
    """
    model.eval()
    total_correct = 0
    total_examples = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Accuracy Evaluation CTPE'):
            # Move data to GPU
            titles = batch['title']
            abstracts = batch['abstract']
            labels = batch['label'].to(device)
            
            # Tokenize the concatenated title and abstract
            encodings = tokenize_texts_ctpe(titles, abstracts)
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)
            
            # Forward pass
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            # Compute predictions
            probs = torch.sigmoid(logits)
            preds = (probs >= threshold).float()
            correct = (preds == labels).float().sum()
            total_correct += correct.item()
            total_examples += len(labels)
    
    accuracy = total_correct / total_examples
    return accuracy

# Final Evaluation
final_accuracy_baseline = compute_accuracy_baseline(model_baseline, val_loader_baseline)
final_accuracy_ctpe = compute_accuracy_ctpe(model_ctpe, val_loader_ctpe)

print(f'\nFinal Validation Accuracy:')
print(f'Baseline (Bi-Encoder): {final_accuracy_baseline:.4f}')
print(f'CTPE (Cross-Encoder): {final_accuracy_ctpe:.4f}')

# Function to get embeddings for Baseline (Bi-Encoder)
def get_embeddings_baseline(model, dataloader):
    """
    Extracts embeddings for titles and abstracts using the Baseline (Bi-Encoder) model.
    Args:
        model: BaselineDocumentSimilarityModel instance.
        dataloader: DataLoader for the validation data.
    Returns:
        Tuple of numpy arrays containing title embeddings and abstract embeddings.
    """
    model.eval()
    title_embeddings = []
    abstract_embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Extracting Baseline Embeddings'):
            # Titles
            title_input_ids = batch['title_input_ids'].to(device)
            title_attention_mask = batch['title_attention_mask'].to(device)
            title_outputs = model.minilm(
                input_ids=title_input_ids,
                attention_mask=title_attention_mask
            )
            title_pooled_output = model.dropout(title_outputs.last_hidden_state[:, 0, :]).cpu().numpy()
            title_embeddings.append(title_pooled_output)
            
            # Abstracts
            abstract_input_ids = batch['abstract_input_ids'].to(device)
            abstract_attention_mask = batch['abstract_attention_mask'].to(device)
            abstract_outputs = model.minilm(
                input_ids=abstract_input_ids,
                attention_mask=abstract_attention_mask
            )
            abstract_pooled_output = model.dropout(abstract_outputs.last_hidden_state[:, 0, :]).cpu().numpy()
            abstract_embeddings.append(abstract_pooled_output)
    
    title_embeddings = np.concatenate(title_embeddings, axis=0)
    abstract_embeddings = np.concatenate(abstract_embeddings, axis=0)
    return title_embeddings, abstract_embeddings

# Function to get embeddings for CTPE (Cross-Encoder)
def get_embeddings_ctpe(model, dataloader):
    """
    Extracts embeddings for title-abstract pairs using the CTPE (Cross-Encoder) model.
    Args:
        model: CTPEDocumentSimilarityModel instance.
        dataloader: DataLoader for the validation data.
    Returns:
        Numpy array containing pair embeddings.
    """
    model.eval()
    pair_embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Extracting CTPE Embeddings'):
            titles = batch['title']
            abstracts = batch['abstract']
            encodings = tokenize_texts_ctpe(titles, abstracts)
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)
            outputs = model.minilm(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            pooled_output = model.dropout(outputs.last_hidden_state[:, 0, :]).cpu().numpy()
            pair_embeddings.append(pooled_output)
    
    pair_embeddings = np.concatenate(pair_embeddings, axis=0)
    return pair_embeddings

# Extract Embeddings
print("\nExtracting Embeddings for Baseline (Bi-Encoder)...")
title_embeddings_baseline, abstract_embeddings_baseline = get_embeddings_baseline(model_baseline, val_loader_baseline)
print(f'Title Embeddings Shape (Baseline): {title_embeddings_baseline.shape}')
print(f'Abstract Embeddings Shape (Baseline): {abstract_embeddings_baseline.shape}')

print("\nExtracting Embeddings for CTPE (Cross-Encoder)...")
pair_embeddings_ctpe = get_embeddings_ctpe(model_ctpe, val_loader_ctpe)
print(f'Pair Embeddings Shape (CTPE): {pair_embeddings_ctpe.shape}')

# Compute per-sample cosine similarities
def compute_per_sample_cosine_similarity(title_embeds, abstract_embeds):
    """
    Computes cosine similarity for each pair of title and abstract embeddings.
    Args:
        title_embeds (np.ndarray): Embeddings for titles, shape (n_samples, hidden_size)
        abstract_embeds (np.ndarray): Embeddings for abstracts, shape (n_samples, hidden_size)
    Returns:
        np.ndarray: Array of cosine similarity scores, shape (n_samples,)
    """
    # Normalize embeddings
    title_norm = title_embeds / np.linalg.norm(title_embeds, axis=1, keepdims=True)
    abstract_norm = abstract_embeds / np.linalg.norm(abstract_embeds, axis=1, keepdims=True)
    
    # Compute element-wise cosine similarity
    cosine_sim = np.sum(title_norm * abstract_norm, axis=1)
    return cosine_sim

# Compute per-sample similarities for the validation set
similarities_baseline_correct = compute_per_sample_cosine_similarity(title_embeddings_baseline, abstract_embeddings_baseline)


# Compute Similarity Scores for CTPE
# CTPE already models the similarity, using sigmoid-activated logits as similarity scores
print("\nUsing CTPE logits as similarity scores...")
similarity_scores_ctpe = torch.sigmoid(torch.tensor(pair_embeddings_ctpe)).numpy()
print(f'Similarity Scores (CTPE): {similarity_scores_ctpe[:5]}')




Epoch 1/2


Training Baseline:   0%|          | 0/1432 [00:00<?, ?it/s]

  'title_input_ids': torch.tensor(self.title_input_ids[idx], dtype=torch.long),
  'title_attention_mask': torch.tensor(self.title_attention_mask[idx], dtype=torch.long),
  'abstract_input_ids': torch.tensor(self.abstract_input_ids[idx], dtype=torch.long),
  'abstract_attention_mask': torch.tensor(self.abstract_attention_mask[idx], dtype=torch.long),
Training Baseline: 100%|██████████| 1432/1432 [02:49<00:00,  8.43it/s]


Baseline - Training Loss: 0.1781


Training CTPE: 100%|██████████| 1432/1432 [01:45<00:00, 13.56it/s]


CTPE - Training Loss: 0.0179


Evaluating Baseline: 100%|██████████| 160/160 [00:06<00:00, 25.13it/s]


Baseline - Validation Loss: 0.1489, Accuracy: 0.7326, F1 Score: 0.7706


Evaluating CTPE: 100%|██████████| 160/160 [00:05<00:00, 31.93it/s]


CTPE - Validation Loss: 0.0312, Accuracy: 0.9896, F1 Score: 0.9896

Epoch 2/2


  'title_input_ids': torch.tensor(self.title_input_ids[idx], dtype=torch.long),
  'title_attention_mask': torch.tensor(self.title_attention_mask[idx], dtype=torch.long),
  'abstract_input_ids': torch.tensor(self.abstract_input_ids[idx], dtype=torch.long),
  'abstract_attention_mask': torch.tensor(self.abstract_attention_mask[idx], dtype=torch.long),
Training Baseline: 100%|██████████| 1432/1432 [02:49<00:00,  8.43it/s]


Baseline - Training Loss: 0.1263


Training CTPE: 100%|██████████| 1432/1432 [01:45<00:00, 13.60it/s]


CTPE - Training Loss: 0.0141


Evaluating Baseline: 100%|██████████| 160/160 [00:06<00:00, 25.18it/s]


Baseline - Validation Loss: 0.1067, Accuracy: 0.8112, F1 Score: 0.8313


Evaluating CTPE: 100%|██████████| 160/160 [00:04<00:00, 32.47it/s]


CTPE - Validation Loss: 0.0284, Accuracy: 0.9914, F1 Score: 0.9914


  'title_input_ids': torch.tensor(self.title_input_ids[idx], dtype=torch.long),
  'title_attention_mask': torch.tensor(self.title_attention_mask[idx], dtype=torch.long),
  'abstract_input_ids': torch.tensor(self.abstract_input_ids[idx], dtype=torch.long),
  'abstract_attention_mask': torch.tensor(self.abstract_attention_mask[idx], dtype=torch.long),
Accuracy Evaluation Baseline: 100%|██████████| 160/160 [00:06<00:00, 25.07it/s]
Accuracy Evaluation CTPE: 100%|██████████| 160/160 [00:04<00:00, 32.50it/s]



Final Validation Accuracy:
Baseline (Bi-Encoder): 0.8112
CTPE (Cross-Encoder): 0.9914

Extracting Embeddings for Baseline (Bi-Encoder)...


Extracting Baseline Embeddings: 100%|██████████| 160/160 [00:06<00:00, 24.95it/s]


Title Embeddings Shape (Baseline): (5089, 384)
Abstract Embeddings Shape (Baseline): (5089, 384)

Extracting Embeddings for CTPE (Cross-Encoder)...


Extracting CTPE Embeddings: 100%|██████████| 160/160 [00:04<00:00, 32.85it/s]

Pair Embeddings Shape (CTPE): (5089, 384)

Using CTPE logits as similarity scores...
Similarity Scores (CTPE): [[0.3580209  0.40198722 0.47324777 ... 0.35267502 0.37274146 0.44419846]
 [0.3628221  0.40083635 0.47095144 ... 0.3529054  0.38161293 0.44883114]
 [0.60188454 0.6139684  0.4991282  ... 0.6318804  0.57787585 0.546747  ]
 [0.5537811  0.5808418  0.50977176 ... 0.60095686 0.5214416  0.5432543 ]
 [0.59874874 0.61812985 0.5098833  ... 0.63340414 0.6241622  0.5477605 ]]



