# Install libraries

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Installing collected packages: keybert
Successfully installed keybert-0.8.5


In [3]:
!pip install keybert sentence-transformers



In [4]:
!pip install keybert transformers



In [5]:
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Tuple
import re
from collections import defaultdict
from keybert import KeyBERT
from tqdm.notebook import tqdm
import time
import pickle
from datetime import datetime
import os
import pandas as pd
import string

# Load dataset SemEval

In [6]:
# Load dataset
ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/673k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/231k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2717 [00:00<?, ? examples/s]

## See some samples

In [7]:
# Take first 10 samples for testing
test_samples = list(ds['train'].select(range(10)))

# Display first sample structure
print("Sample data structure:")
print(test_samples[0])

# Display all 10 sentences and their entities
print("\nSamples with their entities:")
for i, sample in enumerate(test_samples):
    entities = re.findall(r'<e[12]>(.*?)</e[12]>', sample['sentence'])
    print(f"\n{i+1}. Sentence: {sample['sentence']}")
    print(f"   Entities: {entities}")

Sample data structure:
{'sentence': 'The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.', 'relation': 3}

Samples with their entities:

1. Sentence: The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.
   Entities: ['configuration', 'elements']

2. Sentence: The <e1>child</e1> was carefully wrapped and bound into the <e2>cradle</e2> by means of a cord.
   Entities: ['child', 'cradle']

3. Sentence: The <e1>author</e1> of a keygen uses a <e2>disassembler</e2> to look at the raw assembly code.
   Entities: ['author', 'disassembler']

4. Sentence: A misty <e1>ridge</e1> uprises from the <e2>surge</e2>.
   Entities: ['ridge', 'surge']

5. Sentence: The <e1>student</e1> <e2>association</e2> is the voice of the undergraduate student population of the State University of New York at Buffalo.
   Entities: ['student', 'association']

6. Sentence: Th

# Load model & define functions

In [9]:
from typing_extensions import NoReturn
def clean_sentence(text: str) -> str:
    """Remove XML tags from text"""
    return re.sub(r'</?e[12]>', '', text).strip()


def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove mentions (@user)
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags (#hashtag)
    text = re.sub(r'#\w+', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def extract_keyphrases(text: str, model, top_n: int = 2) -> List[str]:
    """Extract keyphrases using KeyBERT"""
    keyphrases = model.extract_keywords(text,
                                      keyphrase_ngram_range=(3, 3),
                                      stop_words='english',
                                      top_n=top_n,
                                      nr_candidates=5,
                                      use_mmr=True, diversity=0.4)
    return [k[0] for k in keyphrases]


## Check the samples

## Evaluation method

In [11]:
def evaluate_matches(true_entities: List[str], extracted_phrases: List[str], partial_match: bool = True) -> Tuple[float, float, float]:
    """Calculate precision, recall, and F1 score"""
    if partial_match:
        # Count each match only once by tracking which true entities have been matched
        matched_true = set()
        matched_extracted = set()

        for i, ext in enumerate(extracted_phrases):
            for j, gold in enumerate(true_entities):
                if (ext.lower() in gold.lower() or gold.lower() in ext.lower()):
                    matched_extracted.add(i)
                    matched_true.add(j)

        matches = len(matched_true)  # Count unique matches
    else:
        matches = sum(1 for ext in extracted_phrases
                     if any(ext.lower() == gold.lower() for gold in true_entities))

    precision = matches / len(extracted_phrases) if extracted_phrases else 0
    recall = matches / len(true_entities) if true_entities else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1



## Process dataset and save results

In [15]:
def process_dataset_in_batches(modelname, mode, ngram, nr_cand, diversity, dataset, model, batch_size=200, save_every=1000):
    """
    Process the dataset in batches
    """
    start_time = time.time()
    all_metrics = []
    all_results = []
    total_samples = len(dataset)

    # Create progress bar
    pbar = tqdm(total=total_samples, desc="Processing samples")

    for i in range(0, total_samples, batch_size):
        # Get batch
        batch = dataset.select(range(i, min(i + batch_size, total_samples)))
        batch_metrics = []
        batch_results = []

        # Process each sample in batch
        for sample in batch:
            true_entities = re.findall(r'<e[12]>(.*?)</e[12]>', sample['sentence'])
            clean_text = clean_sentence(sample['sentence'])
            # Preprocess
            preprocessed_text = preprocess_text(clean_text)
            extracted_phrases = extract_keyphrases(preprocessed_text, model)

            # Calculate metrics
            precision, recall, f1 = evaluate_matches(true_entities, extracted_phrases)
            batch_metrics.append((precision, recall, f1))

            # Store detailed results
            batch_results.append({
                # 'sentence': sample['sentence'],
                # 'true_entities': true_entities,
                # 'extracted_phrases': extracted_phrases,
                'mode' : mode,
                'ngram' : ngram,
                'nr_cand' : nr_cand,
                'diversity' : diversity,
                'modelname' : modelname,
                'precision': precision,
                'recall': recall,
                'f1'  : f1
                # 'metrics': {'precision': precision, 'recall': recall, 'f1': f1}
            })

        # Update main lists
        all_metrics.extend(batch_metrics)
        all_results.extend(batch_results)

        # Update progress bar
        pbar.update(len(batch))

    pbar.close()

    # Calculate final averages
    avg_metrics = calculate_average_metrics(all_metrics)

    # Print time taken
    time_taken = time.time() - start_time
    print(f"\nTotal time taken: {time_taken:.2f} seconds")

    save(modelname, mode, ngram, nr_cand, diversity, avg_metrics, time_taken)

    return all_results, avg_metrics

def save(name, mode, ngram, nr_cand, diversity, avg_metrics, time):
    """Save results to CSV file, appending new rows if the file exists."""
    # Define the filename
    filename = "/content/drive/MyDrive/NLP_Proj/Results/Model_Exploration_mmr.csv"

    # Prepare data for DataFrame
    data = {
              'model_name' : name,
               'mode' : mode,
                'ngram' : ngram,
                'nr_cand' : nr_cand,
                'diversity' : diversity,
                'avg_metrics' : avg_metrics,
                'time' : time
    }

    # Create a DataFrame
    df = pd.DataFrame(data)

    # Check if the file exists to determine mode
    if os.path.exists(filename):
        # Append to existing CSV
        df.to_csv(filename, mode='a', header=False, index=False)
    else:
        # Create a new CSV file with header
        df.to_csv(filename, mode='w', header=True, index=False)

    print(f"\nSaved results to {filename}")

def calculate_average_metrics(metrics):
    """Calculate average metrics from list of (precision, recall, f1) tuples"""
    avg_precision = np.mean([m[0] for m in metrics])
    avg_recall = np.mean([m[1] for m in metrics])
    avg_f1 = np.mean([m[2] for m in metrics])

    return {
        'precision': avg_precision,
        'recall': avg_recall,
        'f1': avg_f1
    }

# Process both training and test dataset

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
names = ['all-mpnet-base-v2', 'sentence-transformers/paraphrase-MiniLM-L6-v2', 'roberta-base', 'roberta-large', 'albert-base-v2']

for name in names:
  print(f"Loading KeyBERT model : {name}")
  model = KeyBERT(model=name)
  print("Starting full dataset processing...")
  results, avg_metrics = process_dataset_in_batches(name,'Train', '(3,3)', 5, 0.4 , ds['train'], model)

  print("\nFinal Train Average Metrics:")
  print(f"Precision: {avg_metrics['precision']:.3f}")
  print(f"Recall: {avg_metrics['recall']:.3f}")
  print(f"F1 Score: {avg_metrics['f1']:.3f}")

  print("Starting test dataset processing...")
  test_results, test_metrics = process_dataset_in_batches(name,'Test', '(3,3)', 5, 0.4, ds['test'], model)


  print("\nTest Set Metrics:")
  print(f"Precision: {test_metrics['precision']:.3f}")
  print(f"Recall: {test_metrics['recall']:.3f}")
  print(f"F1 Score: {test_metrics['f1']:.3f}")





Loading KeyBERT model : all-mpnet-base-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Starting full dataset processing...


Processing samples:   0%|          | 0/8000 [00:00<?, ?it/s]


Total time taken: 269.10 seconds

Saved results to /content/drive/MyDrive/NLP_Proj/Results/Model_Exploration_mmr.csv

Final Train Average Metrics:
Precision: 0.791
Recall: 0.765
F1 Score: 0.774
Starting test dataset processing...


Processing samples:   0%|          | 0/2717 [00:00<?, ?it/s]


Total time taken: 91.02 seconds

Saved results to /content/drive/MyDrive/NLP_Proj/Results/Model_Exploration_mmr.csv

Test Set Metrics:
Precision: 0.789
Recall: 0.758
F1 Score: 0.769
Loading KeyBERT model : sentence-transformers/paraphrase-MiniLM-L6-v2


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Starting full dataset processing...


Processing samples:   0%|          | 0/8000 [00:00<?, ?it/s]


Total time taken: 133.94 seconds

Saved results to /content/drive/MyDrive/NLP_Proj/Results/Model_Exploration_mmr.csv

Final Train Average Metrics:
Precision: 0.800
Recall: 0.774
F1 Score: 0.783
Starting test dataset processing...


Processing samples:   0%|          | 0/2717 [00:00<?, ?it/s]




Total time taken: 45.10 seconds

Saved results to /content/drive/MyDrive/NLP_Proj/Results/Model_Exploration_mmr.csv

Test Set Metrics:
Precision: 0.801
Recall: 0.771
F1 Score: 0.781
Loading KeyBERT model : roberta-base


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Starting full dataset processing...


Processing samples:   0%|          | 0/8000 [00:00<?, ?it/s]


Total time taken: 252.39 seconds

Saved results to /content/drive/MyDrive/NLP_Proj/Results/Model_Exploration_mmr.csv

Final Train Average Metrics:
Precision: 0.698
Recall: 0.672
F1 Score: 0.680
Starting test dataset processing...


Processing samples:   0%|          | 0/2717 [00:00<?, ?it/s]




Total time taken: 85.02 seconds

Saved results to /content/drive/MyDrive/NLP_Proj/Results/Model_Exploration_mmr.csv

Test Set Metrics:
Precision: 0.689
Recall: 0.658
F1 Score: 0.668
Loading KeyBERT model : roberta-large


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Starting full dataset processing...


Processing samples:   0%|          | 0/8000 [00:00<?, ?it/s]


Total time taken: 473.22 seconds

Saved results to /content/drive/MyDrive/NLP_Proj/Results/Model_Exploration_mmr.csv

Final Train Average Metrics:
Precision: 0.723
Recall: 0.697
F1 Score: 0.706
Starting test dataset processing...


Processing samples:   0%|          | 0/2717 [00:00<?, ?it/s]


Total time taken: 159.43 seconds

Saved results to /content/drive/MyDrive/NLP_Proj/Results/Model_Exploration_mmr.csv

Test Set Metrics:
Precision: 0.721
Recall: 0.690
F1 Score: 0.700
Loading KeyBERT model : albert-base-v2




config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Starting full dataset processing...


Processing samples:   0%|          | 0/8000 [00:00<?, ?it/s]


Total time taken: 230.39 seconds

Saved results to /content/drive/MyDrive/NLP_Proj/Results/Model_Exploration_mmr.csv

Final Train Average Metrics:
Precision: 0.699
Recall: 0.673
F1 Score: 0.682
Starting test dataset processing...


Processing samples:   0%|          | 0/2717 [00:00<?, ?it/s]


Total time taken: 78.39 seconds

Saved results to /content/drive/MyDrive/NLP_Proj/Results/Model_Exploration_mmr.csv

Test Set Metrics:
Precision: 0.696
Recall: 0.666
F1 Score: 0.676
