In [1]:
import pandas as pd
import torch
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import tensorflow_hub as hub

import torch.nn as nn
from transformers import AutoModel

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torch.optim as optim
import torch.nn.functional as F

import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split




In [2]:
# train 70%, val 20%, test 10%
df = pd.read_csv("database_clean.csv")
df = df.dropna(subset=['title', 'abstract'])
#df = df.head(400)
df.reset_index(drop=True, inplace=True)

X_train, X_temp = train_test_split(df, test_size=0.3, random_state=42)
X_val, X_test = train_test_split(X_temp, test_size=1/3, random_state=42)
print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

Train: (385, 8), Validation: (110, 8), Test: (56, 8)


In [3]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/distiluse-base-multilingual-cased-v1")

class PaperDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        title = str(self.data.iloc[idx]["title"])
        abstract = str(self.data.iloc[idx]["abstract"])
        input_text = title + " " + abstract  # Combine title and abstract
        
        tokens = self.tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "paper_index": idx  # Used for retrieval
        }


# Create dataset and DataLoader
dataset_train = PaperDataset(X_train, tokenizer, 256)
dataloader = DataLoader(dataset_train, batch_size=8, shuffle=True)

In [4]:
for batch in dataloader:
    print("Batch Input IDs Shape:", batch["input_ids"].shape)
    print("Batch Attention Mask Shape:", batch["attention_mask"].shape)
    break  # Check only the first batch

Batch Input IDs Shape: torch.Size([8, 256])
Batch Attention Mask Shape: torch.Size([8, 256])


In [5]:
class PaperRecommender(nn.Module):
    def __init__(self, model_name="sentence-transformers/distiluse-base-multilingual-cased-v1", embedding_dim=768, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.embedding_dim = embedding_dim
        self.fc = nn.Linear(self.embedding_dim, embedding_dim)  # Projection layer
        self.dropout = nn.Dropout(dropout)
        self.normalize = nn.functional.normalize  # L2 normalization for retrieval

    def forward(self, input_ids, attention_mask):
        input_ids = input_ids.to(next(self.parameters()).device)  # Ensure inputs are on the same device as the model
        attention_mask = attention_mask.to(next(self.parameters()).device)
        
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # CLS token representation
        embedding = self.fc(self.dropout(pooled_output))
        return self.normalize(embedding, p=2, dim=1)  # Normalize embeddings

In [6]:
def compute_tfidf_similarity(df):
    corpus = df['title'] + " " + df['abstract']
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix

similarity_matrix = compute_tfidf_similarity(df)
# Convert to tensor for GPU computation
similarity_matrix = torch.tensor(similarity_matrix, dtype=torch.float32).to("cuda")
print(similarity_matrix)

threshold = 0.0785

positive_pairs = similarity_matrix > threshold
negative_pairs = ~positive_pairs
print(positive_pairs)
print(negative_pairs)

tensor([[1.0000, 0.0750, 0.0598,  ..., 0.0521, 0.0948, 0.0936],
        [0.0750, 1.0000, 0.0727,  ..., 0.0725, 0.0826, 0.0915],
        [0.0598, 0.0727, 1.0000,  ..., 0.0706, 0.0856, 0.0740],
        ...,
        [0.0521, 0.0725, 0.0706,  ..., 1.0000, 0.0538, 0.0668],
        [0.0948, 0.0826, 0.0856,  ..., 0.0538, 1.0000, 0.1076],
        [0.0936, 0.0915, 0.0740,  ..., 0.0668, 0.1076, 1.0000]],
       device='cuda:0')
tensor([[ True, False, False,  ..., False,  True,  True],
        [False,  True, False,  ..., False,  True,  True],
        [False, False,  True,  ..., False,  True, False],
        ...,
        [False, False, False,  ...,  True, False, False],
        [ True,  True,  True,  ..., False,  True,  True],
        [ True,  True, False,  ..., False,  True,  True]], device='cuda:0')
tensor([[False,  True,  True,  ...,  True, False, False],
        [ True, False,  True,  ...,  True, False, False],
        [ True,  True, False,  ...,  True, False,  True],
        ...,
        [ Tr

In [7]:
# Initialize model
model = PaperRecommender().to("cuda")
optimizer = optim.Adam(model.parameters(), lr=1e-5)

def contrastive_loss(embeddings, similarity_matrix, indices, margin=0.5):
    """
    Contrastive loss using TF-IDF similarity as ground truth.
    
    embeddings: (batch_size, embedding_dim)
    similarity_matrix: Precomputed TF-IDF cosine similarity.
    indices: Indices of batch samples in dataset.
    margin: Margin for contrastive loss.
    """
    batch_size = embeddings.shape[0]

    # Ensure embeddings are L2 normalized
    embeddings = F.normalize(embeddings, p=2, dim=1)

    # Compute cosine similarity
    cosine_sim = torch.mm(embeddings, embeddings.T)  # (batch_size, batch_size)
    cosine_distances = 1 - cosine_sim  # Convert similarity to distance

    # Extract ground truth similarity values for batch samples
    ground_truth_similarities = similarity_matrix[indices][:, indices]

    # Define positive and negative pairs
    threshold = 0.0785  # Adjust this value if needed
    positive_pairs = (ground_truth_similarities > threshold).float()
    negative_pairs = (ground_truth_similarities <= threshold).float()

    # Compute losses
    positive_loss = (cosine_distances * positive_pairs).sum() / (positive_pairs.sum() + 1e-8)
    negative_loss = torch.clamp(margin - cosine_distances, min=0) * negative_pairs
    negative_loss = negative_loss.sum() / (negative_pairs.sum() + 1e-8)

    loss = positive_loss + negative_loss

    return loss


num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0

    for batch in dataloader:
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        indices = batch["paper_index"].cpu().numpy()  

        optimizer.zero_grad()
        embeddings = model(input_ids, attention_mask)

        # Compute loss
        loss = contrastive_loss(embeddings, similarity_matrix, indices, margin=0.5)
        
        if loss.item() == 0:
            print(f"Warning: Zero loss at epoch {epoch+1}. Debug required.")

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Avg Loss: {total_loss / len(dataloader):.4f}")



Epoch 1, Avg Loss: 0.3925
Epoch 2, Avg Loss: 0.3717
Epoch 3, Avg Loss: 0.3610
Epoch 4, Avg Loss: 0.3470
Epoch 5, Avg Loss: 0.3437
Epoch 6, Avg Loss: 0.3293
Epoch 7, Avg Loss: 0.3034
Epoch 8, Avg Loss: 0.2916
Epoch 9, Avg Loss: 0.2927
Epoch 10, Avg Loss: 0.2669


In [8]:
def recommend_papers(query, model, df, top_k=5):
    model.eval()
    
    # Tokenize and encode query
    query_tokens = tokenizer(query, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to("cuda")
    
    with torch.no_grad():
        query_embedding = model(query_tokens["input_ids"], query_tokens["attention_mask"]).cpu().numpy()

    # Compute Euclidean distances between query and all paper embeddings
    paper_embeddings = []
    paper_indices = []

    for batch in dataloader:
        batch_input_ids = batch["input_ids"].to("cuda")
        batch_attention_mask = batch["attention_mask"].to("cuda")

        with torch.no_grad():
            batch_embeddings = model(batch_input_ids, batch_attention_mask).cpu().numpy()
            paper_embeddings.append(batch_embeddings)
            paper_indices.extend(batch["paper_index"].numpy())  # Store original indices

    paper_embeddings = np.vstack(paper_embeddings)  # Stack all embeddings
    paper_indices = np.array(paper_indices)

    # Compute pairwise Euclidean distances
    distances = np.linalg.norm(paper_embeddings - query_embedding, axis=1)

    # Get top-k closest papers (smallest distances)
    top_indices = np.argsort(distances)[:top_k]

    print("\nRecommended Papers:")
    for idx in top_indices:
        paper_idx = paper_indices[idx]
        print(f"Title: {df.iloc[paper_idx]['title']}\nAbstract: {df.iloc[paper_idx]['abstract']}\nDistance: {distances[idx]:.4f}\n")

In [9]:
recommend_papers("deep learning for edge computing", model, df)


Recommended Papers:
Title: Automated Software Test Data Generation With Generative Adversarial Networks
Abstract: With the rapid increase of software scale and complexity, the cost of traditional software testing methods will increase faster than the scale of software. In order to improve test efficiency, it is particularly important to automatically generate high-quality test cases. This paper introduces a framework for automatic test data generation based on the generative adversarial network (GAN). GAN is employed to train a generative model over execution path information to learn the behavior of the software. Then we can use the trained generative model to produce new test data, and select the test data that can improve the branch coverage according to our proposed selection strategy. Compared to prior work, our proposed method is able to handle programs under test with large-scale branches without analyzing branch expressions. In the experiment, we exhibit the performance of our

In [10]:
# Generate embeddings for all papers using the trained model
def get_embeddings(dataloader, model):
    model.eval()
    embeddings = []
    paper_indices = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to("cuda")
            attention_mask = batch["attention_mask"].to("cuda")

            batch_embeddings = model(input_ids, attention_mask)  # Keep as CUDA tensor
            embeddings.append(batch_embeddings)  # Store without converting to NumPy
            paper_indices.extend(batch["paper_index"].numpy())

    return torch.cat(embeddings, dim=0), np.array(paper_indices)  # Return PyTorch tensor

# Get embeddings for train and test sets
train_dataloader = DataLoader(PaperDataset(X_train, tokenizer, 256), batch_size=8, shuffle=False)
test_dataloader = DataLoader(PaperDataset(X_test, tokenizer, 256), batch_size=8, shuffle=False)

train_embeddings, train_indices = get_embeddings(train_dataloader, model)
test_embeddings, test_indices = get_embeddings(test_dataloader, model)

# Compute cosine similarity in CUDA
train_embeddings = F.normalize(train_embeddings, p=2, dim=1)  # Normalize embeddings
test_embeddings = F.normalize(test_embeddings, p=2, dim=1)

print("Sample Test Embedding:", test_embeddings[0][:15])  # First 15 values
print("Sample Train Embedding:", train_embeddings[0][:15])  # First 15 values

similarity_matrix = torch.matmul(test_embeddings, train_embeddings.T).cpu().numpy()  # Cosine similarity

# Select top-N most similar papers
top_n = 10
top_indices = np.argsort(-similarity_matrix, axis=1)[:, :top_n]

# Print recommended papers
recommended_paper_ids = []

for i, test_idx in enumerate(top_indices):
    recommended_for_test = []
    print(f"\nTest Paper {i+1}:")
    
    for j, train_idx in enumerate(test_idx):
        recommended_paper_id = X_train.iloc[train_indices[train_idx]]["Id"]
        recommended_for_test.append(recommended_paper_id)
        
        print(f"  {j+1}. Recommended Paper ID: {recommended_paper_id} (Similarity: {similarity_matrix[i, train_idx]:.4f})")
    
    recommended_paper_ids.append(recommended_for_test)

Sample Test Embedding: tensor([ 0.0412, -0.0199, -0.0312,  0.0079,  0.0086, -0.0558,  0.0416, -0.0383,
        -0.1401,  0.0331, -0.0152,  0.0166,  0.0149, -0.0541, -0.0188],
       device='cuda:0')
Sample Train Embedding: tensor([ 0.0310, -0.0439, -0.0171, -0.0420, -0.0154, -0.0615,  0.0401, -0.0684,
        -0.0966, -0.0058, -0.0105,  0.0442,  0.0455, -0.0300, -0.0260],
       device='cuda:0')

Test Paper 1:
  1. Recommended Paper ID: 180 (Similarity: 0.8937)
  2. Recommended Paper ID: 144 (Similarity: 0.8934)
  3. Recommended Paper ID: 331 (Similarity: 0.8930)
  4. Recommended Paper ID: 379 (Similarity: 0.8926)
  5. Recommended Paper ID: 53 (Similarity: 0.8918)
  6. Recommended Paper ID: 109 (Similarity: 0.8913)
  7. Recommended Paper ID: 299 (Similarity: 0.8908)
  8. Recommended Paper ID: 529 (Similarity: 0.8907)
  9. Recommended Paper ID: 583 (Similarity: 0.8905)
  10. Recommended Paper ID: 512 (Similarity: 0.8905)

Test Paper 2:
  1. Recommended Paper ID: 432 (Similarity: 0.9299)