In [9]:
import json
import pandas as pd
import re

with open('./data/KPTimes.train.jsonl') as f:
    train_json = [json.loads(line) for line in f]

df_train = pd.DataFrame.from_dict(train_json)

with open('./data/KPTimes.test.jsonl') as f:
    test_json = [json.loads(line) for line in f]

df_test = pd.DataFrame.from_dict(test_json)

In [10]:
print(df_train.shape)
print(df_test.shape)

# Function to remove keywords not found in the abstract
def filter_keywords(row):
    abstract = row['abstract'].lower()  # Convert abstract to lowercase
    keywords = [kw.strip() for kw in row['keyword'].split(';')]  # Split keywords and remove spaces
    filtered_keywords = [kw for kw in keywords if kw.lower() in abstract]  # Keep only present keywords
    return ';'.join(filtered_keywords)  # Convert back to string

# Apply the function to each row
df_train['keyword'] = df_train.apply(filter_keywords, axis=1)
df_train = df_train[df_train['keyword'].str.strip() != ''] 
df_test['keyword'] = df_test.apply(filter_keywords, axis=1)
df_test = df_test[df_test['keyword'].str.strip() != ''] 

df_train = df_train[df_train['categories'].apply(lambda x: 'sports' in x)]
df_train = df_train[df_train['keyword'].apply(lambda x: len(x.split(';')) <= 4)]
df_train = df_train[df_train['abstract'].apply(lambda x: len(x.split()) <= 300)]
df_test = df_test[df_test['categories'].apply(lambda x: 'sports' in x)]
df_test= df_test[df_test['keyword'].apply(lambda x: len(x.split(';')) <= 4)]
df_test = df_test[df_test['abstract'].apply(lambda x: len(x.split()) <= 300)]

df_train = df_train[~df_train['keyword'].apply(lambda x: any(len(keyword.split()) > 3 for keyword in x.split(';')))]
df_test = df_test[~df_test['keyword'].apply(lambda x: any(len(keyword.split()) > 3 for keyword in x.split(';')))]

min_text_length = 20000
max_text_length = 0
avg_text_length = 0
sum = 0
for abstract in df_train['abstract']:
    sum += len(abstract.split())
    if(len(abstract.split()) > max_text_length):
        max_text_length = len(abstract.split())
    if(len(abstract.split()) < min_text_length):
        min_text_length = len(abstract.split())
    

print(sum/df_train.shape[0])
print(min_text_length)
print(max_text_length)
print(df_train.shape)
print(df_test.shape)
df_train


# Flatten the lists and find unique values
# unique_categories = set(category for sublist in df_train['categories'] for category in sublist)

# # Count unique categories
# num_unique_categories = len(unique_categories)

# print("Unique Categories:", unique_categories)
# print("Number of Unique Categories:", num_unique_categories)


# df_train.to_pickle("politics_train.pkl")

# df_test.to_pickle("politics_test.pkl")

(259923, 6)
(20000, 6)
113.69983561643835
13
300
(9125, 6)
(341, 6)


Unnamed: 0,id,categories,date,title,abstract,keyword
26,ny0066354,"[sports, basketball]",2014/06/23,Baylor Center Out of N.B.A. Draft,Baylor center Isaiah Austin will withdraw from...,Isaiah Austin
74,ny0013630,"[sports, soccer]",2013/11/08,Eight Advance in Europa League,"Tottenham, Fiorentina and Red Bull Salzburg ma...",Fiorentina
116,ny0257400,"[sports, baseball]",2011/01/20,A’s Bolster Bullpen With Fuentes,The Oakland Athletics added another proven arm...,Oakland Athletics
173,ny0289778,"[sports, golf]",2016/01/11,Spieth Wins at 30 Under,Jordan Spieth cruised to victory at the Hyunda...,Jordan Spieth
226,ny0102483,"[sports, baseball]",2015/12/15,Orioles Re-Sign Darren O’Day,"The All-Star reliever Darren O’Day, 33, re-sig...",Orioles
...,...,...,...,...,...,...
259765,ny0222531,"[sports, ncaafootball]",2010/11/18,Fox Goes for Big Ten,Fox Sports acquired the rights to carry the ne...,Football;Big Ten Conference
259779,ny0046839,"[sports, rugby]",2014/11/02,All Blacks Trounce Eagles,New Zealand’s All Blacks turned what was bille...,USA Golden Eagles;All Blacks;New Zealand
259818,ny0179553,"[sports, golf]",2007/08/10,Milestone for 12-Year-Old,Alexis Thompson became the youngest quarterfin...,Golf
259851,ny0032322,"[sports, football]",2013/12/04,Cribbs Is Put on Injured Reserve,The Jets signed the returner Darius Reynaud on...,Football;Jets;Darius Reynaud


In [11]:
import string

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove mentions (@user)
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags (#hashtag)
    text = re.sub(r'#\w+', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def clean_sentence(text: str) -> str:
    """Remove XML tags from text"""
    return re.sub(r'</?e[12]>', '', text).strip()

In [12]:
from keybert import KeyBERT
from typing import List, Tuple
from tqdm.notebook import tqdm

def extract_keyphrases(ngram, diversity, text: str, model: KeyBERT, top_n: int) -> List[str]:
    """Extract keyphrases using KeyBERT"""
    keyphrases = model.extract_keywords(text,
                                      keyphrase_ngram_range=ngram,
                                      stop_words='english',
                                      top_n=top_n,
                                      use_mmr=True, diversity=diversity)
    return [k[0] for k in keyphrases]

# Initialize model
print("Loading KeyBERT model...")
model = KeyBERT('sentence-transformers/paraphrase-MiniLM-L6-v2')

Loading KeyBERT model...


In [13]:
import re
from typing import List, Tuple

def normalize(text: str) -> str:
    """Normalize text by lowercasing and removing extra spaces."""
    return re.sub(r'\s+', ' ', text.strip().lower())

def evaluate_matches(true_entities: List[str], extracted_phrases: List[str], partial_match: bool = True) -> Tuple[float, float, float]:
    """Calculate precision, recall, and F1 score with improved matching logic."""
    
    # Normalize inputs
    true_entities = [normalize(entity) for entity in true_entities]
    extracted_phrases = [normalize(phrase) for phrase in extracted_phrases]

    if partial_match:
        matched_true = set()
        matched_extracted = set()

        for i, ext in enumerate(extracted_phrases):
            for j, gold in enumerate(true_entities):
                if ext in gold or gold in ext:  # Partial match condition
                    matched_extracted.add(i)
                    matched_true.add(j)

        matches = len(matched_true)  # Count unique matches
    else:
        matches = len(set(extracted_phrases) & set(true_entities))  # Exact match

    precision = matches / len(extracted_phrases) if extracted_phrases else 0
    recall = matches / len(true_entities) if true_entities else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

In [14]:
import time
import os
import numpy as np
from datasets import Dataset
from sentence_transformers import SentenceTransformer, util

def process_dataset_in_batches(mode, ngram, diversity, dataframe, model, batch_size=200):
    """
    Process the dataset in batches
    """

    dataset = Dataset.from_pandas(dataframe[['abstract', 'keyword']])
    start_time = time.time()
    all_metrics = []
    all_results = []
    total_samples = len(dataset)

    for i in range(0, total_samples, batch_size):
        print(f"Progress {i*100/total_samples}%")
        # Get batch
        batch = dataset.select(range(i, min(i + batch_size, total_samples)))
        batch_metrics = []
        batch_results = []

        # Process each sample in batch
        for sample in batch:
            sentence = sample['abstract']
            true_entities = sample['keyword'].split(';')
            clean_text = clean_sentence(sentence)# Preprocess
            preprocessed_text = preprocess_text(clean_text)
            ## Example for ngram (1,3)
            extracted_phrases = extract_keyphrases(ngram, diversity, preprocessed_text, model, 3)


            # Calculate metrics
            precision, recall, f1 = evaluate_matches(true_entities, extracted_phrases)
            batch_metrics.append((precision, recall, f1))

            # Store detailed results
            batch_results.append({
                # 'sentence': sample['sentence'],
                # 'true_entities': true_entities,
                # 'extracted_phrases': extracted_phrases,
                'precision': precision,
                'recall': recall,
                'f1'  : f1
                # 'metrics': {'precision': precision, 'recall': recall, 'f1': f1}
            })

        # Update main lists
        all_metrics.extend(batch_metrics)
        all_results.extend(batch_results)


    # Calculate final averages
    avg_metrics = calculate_average_metrics(all_metrics)

    # Print time taken
    time_taken = time.time() - start_time
    print(f"\nTotal time taken: {time_taken:.2f} seconds")

    save(mode, ngram, diversity, avg_metrics, time_taken)

    return all_results, avg_metrics

def save(mode, ngram, diversity, avg_metrics, time):
    """Save results to CSV file, appending new rows if the file exists."""
    # Define the filename
    filename = "./data/Results/MMR_results_new2.csv"

    # Prepare data for DataFrame
    data = {
               'mode' : mode,
                'ngram' : str(ngram),
                'diversity' : str(diversity),
                'avg_metrics' : avg_metrics,
                'time' : time
    }

    # Create a DataFrame
    df = pd.DataFrame(data)

    # Check if the file exists to determine mode
    if os.path.exists(filename):
        # Append to existing CSV
        df.to_csv(filename, mode='a', header=False, index=False)
    else:
        # Create a new CSV file with header
        df.to_csv(filename, mode='w', header=True, index=False)

    print(f"\nSaved results to {filename}")

def calculate_average_metrics(metrics):
    """Calculate average metrics from list of (precision, recall, f1) tuples"""
    avg_precision = np.mean([m[0] for m in metrics])
    avg_recall = np.mean([m[1] for m in metrics])
    avg_f1 = np.mean([m[2] for m in metrics])

    return {
        'precision': avg_precision,
        'recall': avg_recall,
        'f1': avg_f1
    }

In [15]:
from sentence_transformers import SentenceTransformer, util

# Take first 10 samples for testing
ds = Dataset.from_pandas(df_train[['abstract', 'keyword']])
test_samples = list(ds.select(range(10)))

# Process test samples
for i, sample in enumerate(test_samples):
    # Get original sentence and true entities
    sentence = sample['abstract']
    true_entities = sample['keyword'].split(';')

    # Clean sentence and extract keyphrases
    clean_text = clean_sentence(sentence)
    # Preprocess
    preprocessed_text = preprocess_text(clean_text)
    ## Example for ngram (1,3) and diversity = 0.2
    extracted_phrases = extract_keyphrases((3,3), 0.4, preprocessed_text, model, 3)

    precision, recall, f1 = evaluate_matches(true_entities, extracted_phrases)

    # Print results
    print(f"\nSample {i+1}:")
    print(f"Clean text: {clean_text}")
    print(f"True entities: {true_entities}")
    print(f"Extracted keyphrases: {extracted_phrases}")
    print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")
    print("-" * 80)


Sample 1:
Clean text: Baylor center Isaiah Austin will withdraw from the N.B.A. draft after having been found to have a rare genetic disorder. Austin has Marfan syndrome, a disorder that affects the connective tissue and can weaken the aorta. He announced in April that he was leaving Baylor to declare for the N.B.A. draft, which is Thursday.
True entities: ['Isaiah Austin']
Extracted keyphrases: ['austin withdraw nba', 'leaving baylor declare', 'marfan syndrome disorder']
Precision: 0.0, Recall: 0.0, F1: 0
--------------------------------------------------------------------------------

Sample 2:
Clean text: Tottenham, Fiorentina and Red Bull Salzburg made it four group wins in four games in the Europa League to qualify for the knockout phase with two games to spare. Erik Lamela scored his first Tottenham goal in a 2-1 home victory over Sheriff Tiraspol of Moldova. Fiorentina won by the same score at Pandurii Targu Jiu in Romania. Salzburg won, 3-1, at Standard Liège. Also clinching a

In [16]:
print("Starting full dataset processing...")
print(f"\nProcessing ngram=(3, 3), diversity=0.4")
results, avg_metrics = process_dataset_in_batches('Train', (3, 3), 0.4, df_train, model, 3)
print("\nFinal Average Train Metrics:")
print(f"Precision: {avg_metrics['precision']:.3f}")
print(f"Recall: {avg_metrics['recall']:.3f}")
print(f"F1 Score: {avg_metrics['f1']:.3f}")

test_results, test_metrics = process_dataset_in_batches('Test', (3, 3), 0.4, df_test, model, 3)
print("\nFinal Average Test Metrics:")
print(f"Precision: {test_metrics['precision']:.3f}")
print(f"Recall: {test_metrics['recall']:.3f}")
print(f"F1 Score: {test_metrics['f1']:.3f}")

Starting full dataset processing...

Processing ngram=(3, 3), diversity=0.4
Progress 0.0%
Progress 0.03287671232876712%
Progress 0.06575342465753424%
Progress 0.09863013698630137%
Progress 0.13150684931506848%
Progress 0.1643835616438356%
Progress 0.19726027397260273%
Progress 0.23013698630136986%
Progress 0.26301369863013696%
Progress 0.2958904109589041%
Progress 0.3287671232876712%
Progress 0.36164383561643837%
Progress 0.39452054794520547%
Progress 0.4273972602739726%
Progress 0.4602739726027397%
Progress 0.4931506849315068%
Progress 0.5260273972602739%
Progress 0.5589041095890411%
Progress 0.5917808219178082%
Progress 0.6246575342465753%
Progress 0.6575342465753424%
Progress 0.6904109589041096%
Progress 0.7232876712328767%
Progress 0.7561643835616438%
Progress 0.7890410958904109%
Progress 0.821917808219178%
Progress 0.8547945205479452%
Progress 0.8876712328767123%
Progress 0.9205479452054794%
Progress 0.9534246575342465%
Progress 0.9863013698630136%
Progress 1.0191780821917809%
Pro