In [52]:
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, logging
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from tqdm.notebook import tqdm as notebook_tqdm
import nltk
import os
import warnings
warnings.filterwarnings('ignore')

In [4]:
os.environ["WANDB_DISABLED"] = "true"

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [21]:
class KeywordDataset(Dataset):
    def __init__(self, abstracts, keywords, tokenizer, max_length=1024, max_target_length=128):
        self.tokenizer = tokenizer
        self.abstracts = abstracts
        self.keywords = keywords
        self.max_length = max_length
        self.max_target_length = max_target_length
        
    def __len__(self):
        return len(self.abstracts)
    
    def __getitem__(self, idx):
        abstract = "Based on the following paper abstract, predict the keywords of the paper: " + self.abstracts[idx]
        
        # Convert keyword list to a comma-separated string
        keyword_list = self.keywords[idx]
        if isinstance(keyword_list, list):
            keywords_text = ", ".join(keyword_list)
        else:
            keywords_text = keyword_list
        
        # Encode the inputs
        inputs = self.tokenizer(
            abstract, 
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        # Encode the targets
        targets = self.tokenizer(
            keywords_text,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        input_ids = inputs.input_ids.squeeze()
        attention_mask = inputs.attention_mask.squeeze()
        labels = targets.input_ids.squeeze()
        
        # Replace padding token id with -100 so it's ignored in loss calculation
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }


In [22]:
def preprocess_data(train_df, test_df):
    """Clean and prepare the dataframes for training"""
    # Clean abstracts
    for df in [train_df, test_df]:
        df['abstract_content'] = df['abstract_content'].str.replace("\n\n", " ", regex=False)
        df['abstract_content'] = df['abstract_content'].str.replace("\n", " ", regex=False)
        df['abstract_content'] = df['abstract_content'].str.strip()
        
        # Ensure keywords are in list format
        if not isinstance(df['keywords'].iloc[0], list):
            df['keywords'] = df['keywords'].apply(lambda x: x.split() if isinstance(x, str) else x)
            
    return train_df, test_df

In [24]:
def prepare_training_data(train_df, test_df, model_name="facebook/bart-base"):
    """Prepare datasets for BART training"""
    # Initialize tokenizer
    tokenizer = BartTokenizer.from_pretrained(model_name)
    
    # Create datasets
    train_dataset = KeywordDataset(
        train_df['abstract_content'].tolist(),
        train_df['keywords'].tolist(),
        tokenizer
    )
    
    eval_dataset = KeywordDataset(
        test_df['abstract_content'].tolist(),
        test_df['keywords'].tolist(),
        tokenizer
    )
    
    return train_dataset, eval_dataset, tokenizer

In [25]:
def train_bart_model(train_dataset, eval_dataset, tokenizer, model_name="facebook/bart-base"):
    """Train the BART model for keyword extraction"""
    # Initialize model
    model = BartForConditionalGeneration.from_pretrained(model_name)
    model.to(device)
    
    # Define training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir="./bart-keyword-model",
        eval_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=10,
        predict_with_generate=True,
        generation_max_length=128,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        gradient_accumulation_steps=4,
        report_to="none"
    )
    
    # Initialize trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )
    
    # Train the model
    print("Starting training...")
    trainer.train()
    
    return model, trainer

In [35]:
def calculate_fuzzy_map_score(true_keywords, predicted_keywords, fuzzy_threshold=80):
    """
    Calculate Mean Average Precision with fuzzy matching
    """
    if not predicted_keywords or not true_keywords:
        return 0.0
    
    # Track which true keywords have been matched
    matched_keywords = set()
    precision_sum = 0.0
    num_hits = 0
    
    # Check each predicted keyword in order
    for i, pred_kw in enumerate(predicted_keywords):
        # Try to match with any unmatched true keyword
        for true_kw in true_keywords:
            if true_kw in matched_keywords:
                continue
                
            # Calculate similarity
            similarity = fuzz.ratio(pred_kw.lower(), true_kw.lower())
            
            # If it's a match, count it and mark as matched
            if similarity >= fuzzy_threshold:
                matched_keywords.add(true_kw)
                num_hits += 1
                precision_sum += num_hits / (i + 1)  # Precision at recall point i+1
                break
    
    # Calculate final MAP
    if num_hits > 0:
        return precision_sum / min(len(true_keywords), len(predicted_keywords))
    else:
        return 0.0

In [63]:
def calculate_semantic_map_score(true_keywords, predicted_keywords, semantic_model, similarity_threshold=0.5):
    """
    Calculate Mean Average Precision with semantic similarity matching
    """
    if not predicted_keywords or not true_keywords:
        return 0.0
    
    # Track which true keywords have been matched
    matched_keywords = set()
    precision_sum = 0.0
    num_hits = 0

    semantic_model.show_progress_bar = False

    #Converting predicted and true keywords to embeddings
    true_embeddings = semantic_model.encode(true_keywords, convert_to_tensor=True, show_progress_bar=False)
    predicted_embeddings = semantic_model.encode(predicted_keywords, convert_to_tensor=True, show_progress_bar=False)

    for i, pred_emb in enumerate(predicted_embeddings):
        unmatched_indices = [j for j, kw in enumerate(true_keywords) if kw not in matched_keywords]
        if not unmatched_indices:
            break
        
        unmatched_true_embs = true_embeddings[unmatched_indices]
        cos_scores = util.cos_sim(pred_emb, unmatched_true_embs)[0] #calculating cosine similarity to all unmatched keywords
        
        # Find best match
        best_idx = torch.argmax(cos_scores).item()
        best_score = cos_scores[best_idx].item()

        if best_score >= similarity_threshold:
            matched_kw = true_keywords[unmatched_indices[best_idx]]
            matched_keywords.add(matched_kw)
            num_hits += 1
            precision_sum += num_hits / (i + 1)
    
    if num_hits > 0:
        return precision_sum / min(len(true_keywords), len(predicted_keywords))
    else:
        return 0.0

In [64]:
def evaluate_keyword_extraction(model, tokenizer, test_df, fuzzy_threshold=50, semantic_threshold=0.5, top_k=5):
    """Evaluate the model using MAP with fuzzy matching"""
    model.eval()
    semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
    fuzzy_map_scores = []
    semantic_map_scores = []
    
    print("Evaluating model...")
    for i, row in notebook_tqdm(test_df.iterrows(), total=len(test_df)):
        abstract = row['abstract_content']
        true_keywords = row['keywords']
        
        # Tokenize input
        inputs = tokenizer(abstract, return_tensors="pt", max_length=1024, 
                          truncation=True).to(device)
        
        # Generate keywords
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids, 
                attention_mask=inputs.attention_mask,
                max_length=128,
                num_beams=5,
                early_stopping=True,
            )
        
        # Decode and split the generated keywords
        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predicted_keywords = [k.strip() for k in pred_text.split(',')][:top_k]
        
        # Calculate Fuzzy MAP score
        fuzzy_map_score = calculate_fuzzy_map_score(true_keywords, predicted_keywords, fuzzy_threshold)
        fuzzy_map_scores.append(fuzzy_map_score)

        # Calculate Semantic MAP score
        semantic_map_score = calculate_semantic_map_score(true_keywords, predicted_keywords, semantic_model, semantic_threshold)
        semantic_map_scores.append(semantic_map_score)
    
    # Calculate average MAP scores
    avg_fuzzy_map = sum(fuzzy_map_scores) / len(fuzzy_map_scores)
    avg_semantic_map = sum(semantic_map_scores) / len(semantic_map_scores)
    #print(f"MAP@{top_k} Score with fuzzy threshold {fuzzy_threshold}: {avg_map:.4f}")
    
    return avg_fuzzy_map, avg_semantic_map

In [27]:
def example_predictions(model, tokenizer, test_df, num_examples=5):
    """Show example predictions from the model"""
    indices = np.random.choice(len(test_df), min(num_examples, len(test_df)), replace=False)
    
    for idx in indices:
        abstract = test_df.iloc[idx]['abstract_content']
        true_keywords = test_df.iloc[idx]['keywords']
        
        # Tokenize input
        inputs = tokenizer(abstract, return_tensors="pt", max_length=1024, 
                          truncation=True).to(device)
        
        # Generate keywords
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids, 
                attention_mask=inputs.attention_mask,
                max_length=128,
                num_beams=5,
                early_stopping=True
            )
        
        # Decode and split the generated keywords
        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predicted_keywords = [k.strip() for k in pred_text.split(',')]
        
        print("\n" + "="*80)
        print("ABSTRACT SNIPPET:")
        print(abstract[:300] + "..." if len(abstract) > 300 else abstract)
        print("\nTRUE KEYWORDS:")
        print(true_keywords)
        print("\nPREDICTED KEYWORDS:")
        print(predicted_keywords)

In [None]:
train_data = []
with open("data/training-data.ndjson", "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]  # skip blank lines
train_df = pd.DataFrame(train_data)
train_df = train_df[['abstract_content', 'keywords']]

In [None]:
test_data = []
with open("data/test-data.ndjson", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]  # skip blank lines
test_df = pd.DataFrame(test_data)
test_df = test_df[['abstract_content', 'keywords']]

In [30]:
#Preprocessing
train_df, test_df = preprocess_data(train_df, test_df)
print(f"Beginning training loop for BART model trained on abstract contents")
print(f"Training on {len(train_df)} examples, testing on {len(test_df)} examples")
    
train_dataset, eval_dataset, tokenizer = prepare_training_data(train_df, test_df)
    
#Model Training
model, trainer = train_bart_model(train_dataset, eval_dataset, tokenizer)
    
#Model Evaluation
fuzzy_map_score, semantic_map_score = evaluate_keyword_extraction(model, tokenizer, test_df)
print(f"Fuzzy Matching MAP score for model trained on abstracts: {fuzzy_map_score}")
print(f"Semantic Matching MAP score for model trained on abstracts: {semantic_map_score}")
    
#Visualizing Predictions
#example_predictions(model, tokenizer, test_df)

Beginning training loop for BART model trained on abstract contents
Training on 16000 examples, testing on 4000 examples


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Starting training...


Epoch,Training Loss,Validation Loss
1,2.8169,2.516139
2,2.5464,2.425934
3,2.3403,2.395354
4,2.2188,2.362302
5,2.0569,2.374098
6,1.9229,2.365574
7,1.884,2.366636
8,1.7966,2.383343
9,1.756,2.400268
10,1.7593,2.397155


Evaluating model...


100%|██████████| 4000/4000 [16:04<00:00,  4.15it/s]

MAP score for model trained on abstracts: 0.40561243055555835





In [65]:
fuzzy_map_score, semantic_map_score = evaluate_keyword_extraction(model, tokenizer, test_df)
print(f"Fuzzy Matching MAP score for model trained on abstracts: {fuzzy_map_score}")
print(f"Semantic Matching MAP score for model trained on abstracts: {semantic_map_score}")

Evaluating model...


  0%|          | 0/4000 [00:00<?, ?it/s]

Fuzzy Matching MAP score for model trained on abstracts: 0.40561243055555835
Semantic Matching MAP score for model trained on abstracts: 0.5819333333333369


In [68]:
fuzzy_map_score, semantic_map_score = evaluate_keyword_extraction(model, tokenizer, test_df, semantic_threshold=0.6)
print(f"Fuzzy Matching MAP score for model trained on abstracts: {fuzzy_map_score}")
print(f"Semantic Matching MAP score for model trained on abstracts: {semantic_map_score}")

Evaluating model...


  0%|          | 0/4000 [00:00<?, ?it/s]

Fuzzy Matching MAP score for model trained on abstracts: 0.40561243055555835
Semantic Matching MAP score for model trained on abstracts: 0.4505481944444467


In [69]:
example_predictions(model, tokenizer, test_df)


ABSTRACT SNIPPET:
Deep Convolutional Neural Networks (DCNNs) were originally inspired by principles of biological vision, have evolved into best current computational models of object recognition, and consequently indicate strong architectural and functional parallelism with the ventral visual pathway throughout comp...

TRUE KEYWORDS:
['vision', 'attention', 'brain', 'deep neural networks', 'eye tracking', 'saliency map', 'object recognition', 'face detection']

PREDICTED KEYWORDS:
['Deep Convolutional Neural Networks (DCNNs)', 'Attention Mechanism (ADM)', 'Human-like Attention']

ABSTRACT SNIPPET:
State-of-the-art parametric and non-parametric style transfer approaches are prone to either distorted local style patterns due to global statistics alignment, or unpleasing artifacts resulting from patch mismatching. In this paper, we study a novel semi-parametric neural style transfer framework th...

TRUE KEYWORDS:
['Neural style transfer · Graph neural networks · Attention-based messag