In [1]:
import pandas as pd
import torch
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import tensorflow_hub as hub

import torch.nn as nn
from transformers import AutoModel

import torch.optim as optim
import torch.nn.functional as F

import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split




In [2]:
# train 70%, val 20%, test 10%
df = pd.read_csv("database_clean.csv")
df = df.dropna(subset=['title', 'abstract'])
df = df.head(400)
df.reset_index(drop=True, inplace=True)

X_train, X_temp = train_test_split(df, test_size=0.3, random_state=42)
X_val, X_test = train_test_split(X_temp, test_size=1/3, random_state=42)
print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

Train: (280, 8), Validation: (80, 8), Test: (40, 8)


In [3]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/distiluse-base-multilingual-cased-v1")

class PaperDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        title = str(self.data.iloc[idx]["title"])
        abstract = str(self.data.iloc[idx]["abstract"])
        input_text = title + " " + abstract  # Combine title and abstract
        
        tokens = self.tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "paper_index": idx  # Used for retrieval
        }


# Create dataset and DataLoader
dataset_train = PaperDataset(X_train, tokenizer, 256)
dataloader = DataLoader(dataset_train, batch_size=8, shuffle=True)

In [4]:
for batch in dataloader:
    print("Batch Input IDs Shape:", batch["input_ids"].shape)
    print("Batch Attention Mask Shape:", batch["attention_mask"].shape)
    break  # Check only the first batch

Batch Input IDs Shape: torch.Size([8, 256])
Batch Attention Mask Shape: torch.Size([8, 256])


In [5]:
class PaperRecommender(nn.Module):
    def __init__(self, model_name="sentence-transformers/distiluse-base-multilingual-cased-v1", embedding_dim=768, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.embedding_dim = embedding_dim
        self.fc = nn.Linear(self.embedding_dim, embedding_dim)  # Projection layer
        self.dropout = nn.Dropout(dropout)
        self.normalize = nn.functional.normalize  # L2 normalization for retrieval

    def forward(self, input_ids, attention_mask):
        input_ids = input_ids.to(next(self.parameters()).device)  # Ensure inputs are on the same device as the model
        attention_mask = attention_mask.to(next(self.parameters()).device)
        
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # CLS token representation
        embedding = self.fc(self.dropout(pooled_output))
        return self.normalize(embedding, p=2, dim=1)  # Normalize embeddings

In [6]:
# Initialize model
model = PaperRecommender().to("cuda")
optimizer = optim.Adam(model.parameters(), lr=1e-5)

def contrastive_loss(embeddings):
    """Loss function using pairwise Euclidean distances in the batch."""
    # Normalize embeddings (optional, helps numerical stability)
    embeddings = F.normalize(embeddings, p=2, dim=1)  
    # Compute pairwise Euclidean distances
    distance_matrix = torch.cdist(embeddings, embeddings, p=2)  # (batch_size, batch_size)
    # Minimize the sum of all distances (encourages compact embedding space)
    loss = distance_matrix.sum() / (embeddings.shape[0] ** 2)  # Normalize by batch_size^2
    return loss

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for batch in dataloader:
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")

        optimizer.zero_grad()
        embeddings = model(input_ids, attention_mask)  # Shape: (batch_size, embedding_dim)

        # Compute loss using pairwise distances between all embeddings
        loss = contrastive_loss(embeddings)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Epoch 1, Loss: 0.1867
Epoch 2, Loss: 0.1280
Epoch 3, Loss: 0.1028
Epoch 4, Loss: 0.0711
Epoch 5, Loss: 0.0523
Epoch 6, Loss: 0.0305
Epoch 7, Loss: 0.0162
Epoch 8, Loss: 0.0096
Epoch 9, Loss: 0.0067
Epoch 10, Loss: 0.0051


In [7]:
def recommend_papers(query, model, df, top_k=5):
    model.eval()
    
    # Tokenize and encode query
    query_tokens = tokenizer(query, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to("cuda")
    
    with torch.no_grad():
        query_embedding = model(query_tokens["input_ids"], query_tokens["attention_mask"]).cpu().numpy()

    # Compute Euclidean distances between query and all paper embeddings
    paper_embeddings = []
    paper_indices = []

    for batch in dataloader:
        batch_input_ids = batch["input_ids"].to("cuda")
        batch_attention_mask = batch["attention_mask"].to("cuda")

        with torch.no_grad():
            batch_embeddings = model(batch_input_ids, batch_attention_mask).cpu().numpy()
            paper_embeddings.append(batch_embeddings)
            paper_indices.extend(batch["paper_index"].numpy())  # Store original indices

    paper_embeddings = np.vstack(paper_embeddings)  # Stack all embeddings
    paper_indices = np.array(paper_indices)

    # Compute pairwise Euclidean distances
    distances = np.linalg.norm(paper_embeddings - query_embedding, axis=1)

    # Get top-k closest papers (smallest distances)
    top_indices = np.argsort(distances)[:top_k]

    print("\nRecommended Papers:")
    for idx in top_indices:
        paper_idx = paper_indices[idx]
        print(f"Title: {df.iloc[paper_idx]['title']}\nAbstract: {df.iloc[paper_idx]['abstract']}\nDistance: {distances[idx]:.4f}\n")

In [8]:
recommend_papers("deep learning for edge computing", model, df)


Recommended Papers:
Title: Analysis of EV Charging Coordination Efficiency in Presence of Cheating Customers
Abstract: Charging coordination is employed to efficiently serve electric vehicle (EV) charging requests without overloading the distribution network. Parameters such as parking duration, battery state-of-charge (SoC), and charging amount are provided by EVs to the charging coordination center to schedule their charging requests efficiently. The existing literature assumes that the customers always provide correct information. Unfortunately, customers may provide false information to gain higher charging priority. Assessing the impact of cheating behavior represents a significant and open problem. Herein paper, the impact of providing false information (e.g., parking duration) on the efficiency of the charging coordination mechanism is investigated. The charging coordination strategy is formulated as a linear optimization problem. Two different objectives are used to assess the