In [1]:
import pandas as pd
import torch
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

import torch.nn as nn
from transformers import AutoModel

import torch.optim as optim
import torch.nn.functional as F

import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the dataset ONLY 300
df = pd.read_csv("IEEE_Papers_Dataset.csv")
df = df.dropna(subset=['title', 'abstract'])
df = df.head(400)
df.reset_index(drop=True, inplace=True)

# Use a pre-trained tokenizer (e.g., BERT-based)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

class PaperDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        title = str(self.data.iloc[idx]["title"])
        abstract = str(self.data.iloc[idx]["abstract"])
        input_text = title + " " + abstract  # Combine title and abstract
        
        tokens = self.tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "paper_index": idx  # Used for retrieval
        }


# Create dataset and DataLoader
dataset = PaperDataset(df, tokenizer, 256)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [3]:
class PaperRecommender(nn.Module):
    def __init__(self, model_name="bert-base-uncased", embedding_dim=768, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.embedding_dim = embedding_dim
        self.fc = nn.Linear(self.embedding_dim, embedding_dim)  # Projection layer
        self.dropout = nn.Dropout(dropout)
        self.normalize = nn.functional.normalize  # L2 normalization for retrieval

    def forward(self, input_ids, attention_mask):
        input_ids = input_ids.to(next(self.parameters()).device)  # Ensure inputs are on the same device as the model
        attention_mask = attention_mask.to(next(self.parameters()).device)
        
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # CLS token representation
        embedding = self.fc(self.dropout(pooled_output))
        return self.normalize(embedding, p=2, dim=1)  # Normalize embeddings

In [4]:
# Initialize model
model = PaperRecommender().to("cuda")
optimizer = optim.Adam(model.parameters(), lr=1e-5)

def contrastive_loss(embeddings):
    """Loss function using pairwise Euclidean distances in the batch."""
    # Normalize embeddings (optional, helps numerical stability)
    embeddings = F.normalize(embeddings, p=2, dim=1)  
    # Compute pairwise Euclidean distances
    distance_matrix = torch.cdist(embeddings, embeddings, p=2)  # (batch_size, batch_size)
    # Minimize the sum of all distances (encourages compact embedding space)
    loss = distance_matrix.sum() / (embeddings.shape[0] ** 2)  # Normalize by batch_size^2
    return loss

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for batch in dataloader:
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")

        optimizer.zero_grad()
        embeddings = model(input_ids, attention_mask)  # Shape: (batch_size, embedding_dim)

        # Compute loss using pairwise distances between all embeddings
        loss = contrastive_loss(embeddings)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 0.2748
Epoch 2, Loss: 0.3242
Epoch 3, Loss: 0.2133
Epoch 4, Loss: 0.1660
Epoch 5, Loss: 0.1453
Epoch 6, Loss: 0.1224
Epoch 7, Loss: 0.1211
Epoch 8, Loss: 0.1167
Epoch 9, Loss: 0.1015
Epoch 10, Loss: 0.1081


In [5]:
def recommend_papers(query, model, df, top_k=5):
    model.eval()
    
    # Tokenize and encode query
    query_tokens = tokenizer(query, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to("cuda")
    
    with torch.no_grad():
        query_embedding = model(query_tokens["input_ids"], query_tokens["attention_mask"]).cpu().numpy()

    # Compute Euclidean distances between query and all paper embeddings
    paper_embeddings = []
    paper_indices = []

    for batch in dataloader:
        batch_input_ids = batch["input_ids"].to("cuda")
        batch_attention_mask = batch["attention_mask"].to("cuda")

        with torch.no_grad():
            batch_embeddings = model(batch_input_ids, batch_attention_mask).cpu().numpy()
            paper_embeddings.append(batch_embeddings)
            paper_indices.extend(batch["paper_index"].numpy())  # Store original indices

    paper_embeddings = np.vstack(paper_embeddings)  # Stack all embeddings
    paper_indices = np.array(paper_indices)

    # Compute pairwise Euclidean distances
    distances = np.linalg.norm(paper_embeddings - query_embedding, axis=1)

    # Get top-k closest papers (smallest distances)
    top_indices = np.argsort(distances)[:top_k]

    print("\nRecommended Papers:")
    for idx in top_indices:
        paper_idx = paper_indices[idx]
        print(f"Title: {df.iloc[paper_idx]['title']}\nAbstract: {df.iloc[paper_idx]['abstract']}\nDistance: {distances[idx]:.4f}\n")

In [6]:
recommend_papers("deep learning for edge computing", model, df)


Recommended Papers:
Title: Corrections to “Complex Permittivity of NaOH Solutions Used in Liquid-Metal Circuits”
Abstract: In the above article 
[1]
, 
Table 1
, Figs. 4 and 5, and Appendices A and B of the associated supplementary materials unfortunately contain minor errors. This article serves to correct those errors.
Distance: 0.0740

Title: Correction to “A Novel Unbalance Compensation Method for Distribution Solid-State Transformer Based on Reduced Order Generalized Integrator”
Abstract: 1. In page 108598, the title “IV ICOMPENSATION ABILITY ANALYSIS” should be “IV COMPENSATION ABILITY ANALYSIS”.
Distance: 0.0743

Title: Comments on “On Favorable Propagation in Massive MIMO Systems and Different Antenna Configurations”
Abstract: It is shown that the condition of Theorem 1 in (X. Wu, N. C. Beaulieu, D. Liu, “On favorable propagation in massive MIMO systems and different antenna configurations,” IEEE Access, vol. 5, pp. 5578-5593, May 2017) never holds in practice and that Theorem