In [4]:
import pandas as pd
import numpy as np
import torch  # Not strictly necessary for this model but kept for consistency
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc
from sklearn.preprocessing import normalize
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import gensim.downloader as api  # For loading GloVe embeddings

# -----------------------------
# 1. Data Preparation
# -----------------------------

# Load the dataset and expose the 'summary' column under the name 'abstract'
df = pd.read_csv("/data2/home/rohitsunil/termreport/arxiv_new.csv")
df = df.rename(columns={'summary': 'abstract'})

# Positive pairs: every title together with its own abstract (label 1)
positive_pairs = df[['title', 'abstract']].copy()
positive_pairs['label'] = 1

# Negative pairs: every title together with an abstract drawn from a
# seeded permutation of the abstract column (label 0)
shuffled_abstracts = df['abstract'].sample(frac=1, random_state=42).reset_index(drop=True)
negative_pairs = pd.DataFrame({
    # Raw arrays are used so the pairing is purely positional and cannot be
    # silently re-aligned on index labels.
    'title': df['title'].to_numpy(),
    'abstract': shuffled_abstracts.to_numpy(),
    'label': 0
})

# A random permutation can leave an abstract on its original row; such a row
# is really a positive pair and would be mislabelled as 0, so drop it.
is_true_pair = negative_pairs['abstract'].to_numpy() == df['abstract'].to_numpy()
negative_pairs = negative_pairs[~is_true_pair].reset_index(drop=True)

# Combine and shuffle all pairs
all_pairs = pd.concat([positive_pairs, negative_pairs], ignore_index=True)
all_pairs = all_pairs.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into training and validation sets (90% / 10%)
# NOTE(review): the split is made over pairs, so the same title or abstract
# text can appear in both splits (e.g. once in a positive pair and once in a
# negative pair) - confirm this is acceptable for the reported metrics.
train_pairs, val_pairs = train_test_split(all_pairs, test_size=0.1, random_state=42)

print(f"Training pairs: {len(train_pairs)}, Validation pairs: {len(val_pairs)}")

# -----------------------------
# 2. Load Pre-trained GloVe Embeddings
# -----------------------------

# Download (on first use) and load the pre-trained GloVe vectors through Gensim
print("Loading GloVe embeddings...")
glove = api.load("glove-wiki-gigaword-100")  # This may take a while if not already downloaded

# Read the dimensionality from the loaded vectors instead of repeating the
# number from the model name, so the two cannot drift apart if the model
# identifier above is ever changed.
embedding_dim = glove.vector_size

# Function to convert text to embedding by averaging word embeddings
def text_to_embedding(text, embedding_model, embedding_dim):
    """
    Convert a text to a single vector by averaging its word embeddings.

    The text is lower-cased and split on whitespace; tokens that are not
    present in ``embedding_model`` are ignored.

    Args:
        text (str): The input text. Non-string values (for example the NaN
            that pandas uses for a missing cell) are treated as empty text.
        embedding_model: Mapping-like object supporting ``token in model``
            and ``model[token]`` (e.g. the loaded GloVe vectors).
        embedding_dim (int): Length of the zero vector returned when no
            token of the text is found in the model.

    Returns:
        np.ndarray: Mean of the embeddings of the known tokens, or a
        float32 zero vector of length ``embedding_dim`` if there are none.
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash on ``.lower()``.
        return np.zeros(embedding_dim, dtype=np.float32)

    known_vectors = [
        embedding_model[token]
        for token in text.lower().split()
        if token in embedding_model
    ]
    if not known_vectors:
        # float32 keeps the fallback from being the only float64 row when the
        # results are later stacked with float32 embedding averages.
        return np.zeros(embedding_dim, dtype=np.float32)
    return np.mean(known_vectors, axis=0)

# Turn every title and abstract of both splits into an averaged embedding,
# then rescale the rows with sklearn's `normalize`.
print("Converting texts to embeddings...")


def _embed_texts(texts):
    """Embed each text in *texts* and collect the vectors in one array."""
    return np.array([text_to_embedding(text, glove, embedding_dim) for text in tqdm(texts)])


# Raw (un-normalised) embeddings; the progress bars appear in this order:
# train titles, train abstracts, validation titles, validation abstracts.
train_titles_emb = _embed_texts(train_pairs['title'])
train_abstracts_emb = _embed_texts(train_pairs['abstract'])
val_titles_emb = _embed_texts(val_pairs['title'])
val_abstracts_emb = _embed_texts(val_pairs['abstract'])

# Targets as plain arrays, in the same row order as the embeddings
train_labels = train_pairs['label'].values
val_labels = val_pairs['label'].values

# Normalize the embeddings to unit vectors
train_titles_emb, train_abstracts_emb = normalize(train_titles_emb), normalize(train_abstracts_emb)
val_titles_emb, val_abstracts_emb = normalize(val_titles_emb), normalize(val_abstracts_emb)

# -----------------------------
# 3. Feature Extraction
# -----------------------------

# Baseline (Bi-Encoder): Compute cosine similarity between title and abstract embeddings
# CTPE (Cross-Encoder): Concatenate title and abstract embeddings and use as input features

# CTPE input: the title embedding followed by the abstract embedding of the
# same pair, joined column-wise into one feature row per pair
train_features_ctpe = np.hstack((train_titles_emb, train_abstracts_emb))
val_features_ctpe = np.hstack((val_titles_emb, val_abstracts_emb))

# -----------------------------
# 4. Define Datasets and DataLoaders
# -----------------------------

# Since we're using precomputed embeddings, we can use custom datasets

class BaselineDataset(Dataset):
    """Dataset of precomputed (title, abstract, label) triples.

    All three inputs are copied into float32 tensors at construction time;
    items are served as dictionaries keyed by 'title', 'abstract' and 'label'.
    """

    def __init__(self, titles, abstracts, labels):
        as_float = dict(dtype=torch.float32)
        self.titles = torch.tensor(titles, **as_float)
        self.abstracts = torch.tensor(abstracts, **as_float)
        self.labels = torch.tensor(labels, **as_float)

    def __len__(self):
        # One item per label
        return len(self.labels)

    def __getitem__(self, idx):
        return dict(
            title=self.titles[idx],
            abstract=self.abstracts[idx],
            label=self.labels[idx],
        )

class CTPEDataset(Dataset):
    """Dataset of precomputed feature rows with their labels.

    Both inputs are copied into float32 tensors at construction time; items
    are served as dictionaries keyed by 'features' and 'label'.
    """

    def __init__(self, features, labels):
        as_float = dict(dtype=torch.float32)
        self.features = torch.tensor(features, **as_float)
        self.labels = torch.tensor(labels, **as_float)

    def __len__(self):
        # One item per label
        return len(self.labels)

    def __getitem__(self, idx):
        return dict(
            features=self.features[idx],
            label=self.labels[idx],
        )

# Wrap the precomputed arrays in datasets (one train/validation pair per model)
train_dataset_baseline = BaselineDataset(
    train_titles_emb, train_abstracts_emb, train_labels
)
val_dataset_baseline = BaselineDataset(
    val_titles_emb, val_abstracts_emb, val_labels
)

train_dataset_ctpe = CTPEDataset(train_features_ctpe, train_labels)
val_dataset_ctpe = CTPEDataset(val_features_ctpe, val_labels)

# Batch the datasets: training loaders reshuffle, validation loaders keep
# the dataset order
batch_size = 32

train_loader_baseline = DataLoader(
    train_dataset_baseline, batch_size=batch_size, shuffle=True
)
val_loader_baseline = DataLoader(
    val_dataset_baseline, batch_size=batch_size, shuffle=False
)

train_loader_ctpe = DataLoader(
    train_dataset_ctpe, batch_size=batch_size, shuffle=True
)
val_loader_ctpe = DataLoader(
    val_dataset_ctpe, batch_size=batch_size, shuffle=False
)

# -----------------------------
# 5. Define the Models
# -----------------------------

# Baseline (Bi-Encoder): Compute cosine similarity and use contrastive loss
# Since we're using Logistic Regression and GloVe, we'll implement a simple neural network

class BaselineNN(nn.Module):
    """Linear layer plus sigmoid over the element-wise title*abstract product.

    Args:
        input_dim (int): Size of the last dimension of the title and
            abstract tensors passed to ``forward``.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, 1)  # one score per pair
        self.sigmoid = nn.Sigmoid()

    def forward(self, title, abstract):
        """Return one value in (0, 1) per pair.

        The two inputs are multiplied element-wise, mapped to a single score
        by the linear layer and squashed with a sigmoid; dimension 1 of the
        result is then squeezed away (assumes inputs shaped
        (batch, input_dim), giving an output shaped (batch,)).
        """
        score = self.fc(title * abstract)
        return self.sigmoid(score).squeeze(dim=1)

# CTPE (Cross-Encoder): Use a neural network classifier on concatenated embeddings

class CTPE_NN(nn.Module):
    """Two-layer classifier: Linear(input_dim, 128) -> ReLU -> Dropout(0.3)
    -> Linear(128, 1) -> Sigmoid.

    Args:
        input_dim (int): Size of the last dimension of the feature tensor
            passed to ``forward``.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are created in the same order as they are applied
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, features):
        """Return one value in (0, 1) per feature row.

        Dimension 1 of the sigmoid output is squeezed away (assumes features
        shaped (batch, input_dim), giving an output shaped (batch,)).
        """
        hidden = self.dropout(self.relu(self.fc1(features)))
        probability = self.sigmoid(self.fc2(hidden))
        return probability.squeeze(dim=1)

# Instantiate both models on the CPU (no GPU needed); the CTPE model takes
# twice the embedding width as its input size
cpu_device = torch.device('cpu')
baseline_model = BaselineNN(embedding_dim).to(cpu_device)
ctpe_model = CTPE_NN(2 * embedding_dim).to(cpu_device)

# -----------------------------
# 6. Define Loss Functions and Optimizers
# -----------------------------

# Binary cross-entropy, shared by both models
criterion = nn.BCELoss()

# One Adam optimizer per model, both with the same learning rate
learning_rate = 1e-3
optimizer_baseline = optim.Adam(baseline_model.parameters(), lr=learning_rate)
optimizer_ctpe = optim.Adam(ctpe_model.parameters(), lr=learning_rate)

# -----------------------------
# 7. Training and Evaluation Functions
# -----------------------------

def train_baseline(model, dataloader, optimizer, criterion):
    """Train the baseline model for one pass over ``dataloader``.

    Args:
        model: Module called as ``model(titles, abstracts)``.
        dataloader: Iterable of batches, each a dict with the keys
            'title', 'abstract' and 'label'.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        float: Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(dataloader, desc='Training Baseline'):
        targets = batch['label']

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        predictions = model(batch['title'], batch['abstract'])
        batch_loss = criterion(predictions, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

def evaluate_baseline(model, dataloader, criterion):
    """Evaluate the baseline model on ``dataloader`` without updating it.

    Predictions are obtained by thresholding the model outputs at 0.5. The
    shapes of the last batch's outputs and labels are printed as a sanity
    check.

    Args:
        model: Module called as ``model(titles, abstracts)``.
        dataloader: Iterable of batches, each a dict with the keys
            'title', 'abstract' and 'label'.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        tuple: ``(avg_loss, accuracy, f1)`` where ``avg_loss`` is the mean of
        the per-batch losses and the other two come from
        ``accuracy_score`` / ``f1_score`` over all collected predictions.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    last_shapes = None
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating Baseline'):
            titles = batch['title']
            abstracts = batch['abstract']
            labels = batch['label']

            outputs = model(titles, abstracts)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds = (outputs >= 0.5).float()
            all_labels.extend(labels.numpy())
            all_preds.extend(preds.numpy())

            # Counting batches here (instead of calling len(dataloader)
            # afterwards) also works for iterables without a length.
            num_batches += 1
            last_shapes = (outputs.shape, labels.shape)

    if num_batches == 0:
        # Previously this surfaced as an UnboundLocalError from the shape
        # prints below, which hid the real cause.
        raise ValueError("evaluate_baseline received a dataloader that yields no batches")

    # Print shapes (of the final batch) for verification
    print(f"Outputs Shape: {last_shapes[0]}")
    print(f"Labels Shape: {last_shapes[1]}")

    avg_loss = total_loss / num_batches
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    return avg_loss, accuracy, f1

def train_ctpe(model, dataloader, optimizer, criterion):
    """Train the CTPE model for one pass over ``dataloader``.

    Args:
        model: Module called as ``model(features)``.
        dataloader: Iterable of batches, each a dict with the keys
            'features' and 'label'.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        float: Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(dataloader, desc='Training CTPE'):
        targets = batch['label']

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        predictions = model(batch['features'])
        batch_loss = criterion(predictions, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

def evaluate_ctpe(model, dataloader, criterion):
    """Evaluate the CTPE model on ``dataloader`` without updating it.

    Predictions are obtained by thresholding the model outputs at 0.5.

    Args:
        model: Module called as ``model(features)``.
        dataloader: Iterable of batches, each a dict with the keys
            'features' and 'label'.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        tuple: ``(avg_loss, accuracy, f1)`` where ``avg_loss`` is the sum of
        the per-batch losses divided by ``len(dataloader)`` and the other two
        come from ``accuracy_score`` / ``f1_score`` over all predictions.
    """
    model.eval()
    running_loss = 0
    collected_labels, collected_preds = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating CTPE'):
            targets = batch['label']
            probabilities = model(batch['features'])
            running_loss += criterion(probabilities, targets).item()

            # Hard decision at 0.5, stored as floats like the labels
            hard_preds = (probabilities >= 0.5).float()
            collected_labels.extend(targets.numpy())
            collected_preds.extend(hard_preds.numpy())

    mean_loss = running_loss / len(dataloader)
    return (
        mean_loss,
        accuracy_score(collected_labels, collected_preds),
        f1_score(collected_labels, collected_preds),
    )

# -----------------------------
# 8. Training and Evaluation Loop
# -----------------------------

num_epochs = 3  # Adjust as needed

# Each epoch: train both models on the training split, then score both on
# the validation split, printing the metrics after every step.
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    # --- training pass: baseline first, then CTPE ---
    train_loss_baseline = train_baseline(
        baseline_model, train_loader_baseline, optimizer_baseline, criterion
    )
    print(f'Baseline - Training Loss: {train_loss_baseline:.4f}')

    train_loss_ctpe = train_ctpe(
        ctpe_model, train_loader_ctpe, optimizer_ctpe, criterion
    )
    print(f'CTPE - Training Loss: {train_loss_ctpe:.4f}')

    # --- validation pass: baseline first, then CTPE ---
    val_loss_baseline, val_acc_baseline, val_f1_baseline = evaluate_baseline(
        baseline_model, val_loader_baseline, criterion
    )
    print(f'Baseline - Validation Loss: {val_loss_baseline:.4f}, Accuracy: {val_acc_baseline:.4f}, F1 Score: {val_f1_baseline:.4f}')

    val_loss_ctpe, val_acc_ctpe, val_f1_ctpe = evaluate_ctpe(
        ctpe_model, val_loader_ctpe, criterion
    )
    print(f'CTPE - Validation Loss: {val_loss_ctpe:.4f}, Accuracy: {val_acc_ctpe:.4f}, F1 Score: {val_f1_ctpe:.4f}')



# Baseline: collect the validation labels and the raw model outputs
baseline_model.eval()
all_labels_baseline = []
all_scores_baseline = []
with torch.no_grad():
    for batch in tqdm(val_loader_baseline, desc='Extracting Baseline Scores'):
        titles = batch['title']
        abstracts = batch['abstract']
        labels = batch['label'].numpy()
        outputs = baseline_model(titles, abstracts)
        all_labels_baseline.extend(labels)
        all_scores_baseline.extend(outputs.numpy())

# CTPE: collect the validation labels and the raw model outputs
ctpe_model.eval()
all_labels_ctpe = []
all_scores_ctpe = []
with torch.no_grad():
    for batch in tqdm(val_loader_ctpe, desc='Extracting CTPE Scores'):
        features = batch['features']
        labels = batch['label'].numpy()
        outputs = ctpe_model(features)
        all_labels_ctpe.extend(labels)
        all_scores_ctpe.extend(outputs.numpy())

# Threshold the scores once per model (0.5 cut-off) instead of rebuilding
# the same prediction array for every metric.
preds_baseline = (np.array(all_scores_baseline) >= 0.5).astype(float)
preds_ctpe = (np.array(all_scores_ctpe) >= 0.5).astype(float)

print("\nFinal Validation Metrics:")
print(
    f"Baseline (Bi-Encoder) - Accuracy: {accuracy_score(all_labels_baseline, preds_baseline):.4f}, "
    f"F1 Score: {f1_score(all_labels_baseline, preds_baseline):.4f}"
)
print(
    f"CTPE (Cross-Encoder) - Accuracy: {accuracy_score(all_labels_ctpe, preds_ctpe):.4f}, "
    f"F1 Score: {f1_score(all_labels_ctpe, preds_ctpe):.4f}"
)


Training pairs: 45797, Validation pairs: 5089
Loading GloVe embeddings...
Converting texts to embeddings...


100%|██████████| 45797/45797 [00:01<00:00, 34291.08it/s]
100%|██████████| 45797/45797 [00:11<00:00, 4143.71it/s]
100%|██████████| 5089/5089 [00:00<00:00, 33711.03it/s]
100%|██████████| 5089/5089 [00:01<00:00, 4100.40it/s]



Epoch 1/3


Training Baseline: 100%|██████████| 1432/1432 [00:01<00:00, 971.53it/s]


Baseline - Training Loss: 0.6911


Training CTPE: 100%|██████████| 1432/1432 [00:02<00:00, 614.11it/s]


CTPE - Training Loss: 0.6900


Evaluating Baseline: 100%|██████████| 160/160 [00:00<00:00, 2501.01it/s]


Outputs Shape: torch.Size([1])
Labels Shape: torch.Size([1])
Baseline - Validation Loss: 0.6888, Accuracy: 0.7088, F1 Score: 0.7315


Evaluating CTPE: 100%|██████████| 160/160 [00:00<00:00, 1805.98it/s]


CTPE - Validation Loss: 0.6763, Accuracy: 0.6310, F1 Score: 0.6945

Epoch 2/3


Training Baseline: 100%|██████████| 1432/1432 [00:01<00:00, 1323.02it/s]


Baseline - Training Loss: 0.6872


Training CTPE: 100%|██████████| 1432/1432 [00:02<00:00, 572.77it/s]


CTPE - Training Loss: 0.6443


Evaluating Baseline: 100%|██████████| 160/160 [00:00<00:00, 2435.10it/s]


Outputs Shape: torch.Size([1])
Labels Shape: torch.Size([1])
Baseline - Validation Loss: 0.6847, Accuracy: 0.7106, F1 Score: 0.7371


Evaluating CTPE: 100%|██████████| 160/160 [00:00<00:00, 1802.36it/s]


CTPE - Validation Loss: 0.6093, Accuracy: 0.6854, F1 Score: 0.7210

Epoch 3/3


Training Baseline: 100%|██████████| 1432/1432 [00:01<00:00, 1311.40it/s]


Baseline - Training Loss: 0.6834


Training CTPE: 100%|██████████| 1432/1432 [00:02<00:00, 559.05it/s]


CTPE - Training Loss: 0.5993


Evaluating Baseline: 100%|██████████| 160/160 [00:00<00:00, 2437.34it/s]


Outputs Shape: torch.Size([1])
Labels Shape: torch.Size([1])
Baseline - Validation Loss: 0.6807, Accuracy: 0.7300, F1 Score: 0.7275


Evaluating CTPE: 100%|██████████| 160/160 [00:00<00:00, 1528.81it/s]


CTPE - Validation Loss: 0.5787, Accuracy: 0.7031, F1 Score: 0.7204


Extracting Baseline Scores: 100%|██████████| 160/160 [00:00<00:00, 2756.96it/s]
Extracting CTPE Scores: 100%|██████████| 160/160 [00:00<00:00, 1744.72it/s]


Final Validation Metrics:
Baseline (Bi-Encoder) - Accuracy: 0.7300, F1 Score: 0.7275
CTPE (Cross-Encoder) - Accuracy: 0.7031, F1 Score: 0.7204



