# Vanilla Term Extraction Baseline

Simple word-boundary matching baseline:
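- Extract all unique terms from the training data, normalized (lowercased, Unicode NFKC, straight quotes, collapsed whitespace)
- Match those terms in the normalized dev sentences, requiring a non-word character (or the string edge) on both sides of each match
- No machine learning, just exact, word-bounded string matching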


In [None]:
import pandas as pd
import json
import os

# Directories: inputs are read from `data_path`, predictions are written to `pred_dir`.
data_path = "../data"
pred_dir = "predictions"

# Input files (train / dev splits) and the output file for the dev predictions.
train_data_path = os.path.join(data_path, "subtask_a_train.json")
dev_data_path = os.path.join(data_path, "subtask_a_dev.json")
pred_path = os.path.join(pred_dir, "subtask_a_dev_vanilla_preds.json")

# Create the output directory up front; a pre-existing directory is not an error.
os.makedirs(pred_dir, exist_ok=True)

713


### Load Training Data and Extract Terms

In [7]:
import re
import unicodedata

# Typographic quote characters mapped to their plain ASCII counterparts.
_QUOTE_TABLE = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / typographic apostrophe
    "`": "'",        # backtick used as an apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
})


def normalize_text(s: str) -> str:
    """Return a canonical form of ``s`` used for term/sentence matching.

    Steps, in order:
      1. Unicode NFKC normalization (compatibility characters are folded).
      2. Lowercasing.
      3. Curly quotes / backticks are replaced by straight ``'`` and ``"``.
      4. Every run of whitespace is collapsed to a single space.
      5. Leading and trailing whitespace is stripped.

    Args:
        s: Raw text (a term or a sentence).

    Returns:
        The normalized string; may be empty.
    """
    # NFKC must run before lowercasing: compatibility normalization can itself
    # introduce uppercase letters (e.g. the trade-mark sign becomes "TM"),
    # which would otherwise survive in the result.
    s = unicodedata.normalize("NFKC", s)
    s = s.lower()
    # Opening and closing single quotes are both folded, so a quoted span does
    # not keep a stray typographic opening quote.
    s = s.translate(_QUOTE_TABLE)
    # Collapse whitespace runs (spaces, tabs, newlines) into one space.
    s = re.sub(r"\s+", " ", s)
    return s.strip()


In [None]:
# 1) Build the term vocabulary from the training split
import json

# The loop below calls `tqdm(...)`, so the progress-bar class must be imported.
# A bare `import tqdm` only binds the module, which is not callable and fails
# on a fresh kernel.
from tqdm import tqdm

with open(train_data_path, "r", encoding="utf-8") as f:
    train_data = json.load(f)

# Collect every distinct, non-empty normalized term seen in the training data.
term_vocab = set()
for entry in train_data["data"]:
    for term in entry["term_list"]:
        norm_term = normalize_text(term)
        if norm_term:
            term_vocab.add(norm_term)

# Sorted so that the order of predicted terms is deterministic across runs.
terms = sorted(term_vocab)
print(f"# termini unici nel vocabolario train (normalizzati): {len(terms)}")

# Compile each term's pattern once instead of once per (sentence, term) pair.
# The lookarounds require that the match is not directly preceded or followed
# by a word character, i.e. an approximate but robust word boundary that also
# works for terms starting or ending with punctuation.
term_patterns = [
    (term, re.compile(r"(?<!\w)" + re.escape(term) + r"(?!\w)"))
    for term in terms
]

# 2) Apply the vocabulary to the dev split
with open(dev_data_path, "r", encoding="utf-8") as f:
    dev_data = json.load(f)

dump_pred = {"data": []}
for entry in tqdm(dev_data["data"], desc="Processing dev data", total=len(dev_data["data"])):
    # Sentences go through the same normalization as the vocabulary terms.
    sentence_text = normalize_text(entry["sentence_text"])

    dump_pred["data"].append({
        "document_id": entry["document_id"],
        "paragraph_id": entry["paragraph_id"],
        "sentence_id": entry["sentence_id"],
        # Every vocabulary term found in the sentence, in vocabulary order.
        "term_list": [
            term for term, pattern in term_patterns if pattern.search(sentence_text)
        ],
    })

# 3) Save the predictions once, after the loop
with open(pred_path, "w", encoding="utf-8") as f:
    json.dump(dump_pred, f, ensure_ascii=False, indent=4)

print(f"Salvato file di predizioni: {pred_path}")


Processing dev data: 100%|██████████| 577/577 [00:27<00:00, 20.68it/s]


### Evaluation 

In [9]:
def micro_f1_score(gold_standard, system_output):
    """
    Compute micro-averaged precision, recall and F1 over per-item term sets.

    Items are paired positionally; within each pair the terms are compared as
    sets, and the TP / FP / FN counts are summed over all pairs before the
    ratios are taken. A ratio whose denominator is zero is reported as 0.

    Returns:
        (precision, recall, f1, true_positives, false_positives, false_negatives)
    """
    tp_total = fp_total = fn_total = 0

    for gold_terms, predicted_terms in zip(gold_standard, system_output):
        gold_set, predicted_set = set(gold_terms), set(predicted_terms)
        tp_total += len(gold_set & predicted_set)
        fp_total += len(predicted_set - gold_set)
        fn_total += len(gold_set - predicted_set)

    predicted_count = tp_total + fp_total   # everything the system returned
    gold_count = tp_total + fn_total        # everything it should have returned

    precision = tp_total / predicted_count if predicted_count > 0 else 0
    recall = tp_total / gold_count if gold_count > 0 else 0

    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

    return precision, recall, f1, tp_total, fp_total, fn_total


def type_f1_score(gold_standard, system_output):
    """
    Compute type-level precision, recall and F1.

    All per-item term lists are pooled into one set of distinct gold terms and
    one set of distinct system terms, so each term counts once no matter how
    many items it occurs in. A ratio whose denominator is zero is reported as 0.

    Returns:
        (type_precision, type_recall, type_f1)
    """
    gold_types = set().union(*gold_standard)
    system_types = set().union(*system_output)

    hits = len(gold_types & system_types)
    spurious = len(system_types - gold_types)
    missed = len(gold_types - system_types)

    type_precision = hits / (hits + spurious) if (hits + spurious) > 0 else 0
    type_recall = hits / (hits + missed) if (hits + missed) > 0 else 0

    pr_sum = type_precision + type_recall
    type_f1 = 2 * (type_precision * type_recall) / pr_sum if pr_sum > 0 else 0

    return type_precision, type_recall, type_f1

### Evaluate Performance

In [11]:
# Line up gold and predicted term lists sentence by sentence, then score them.
# NOTE(review): predicted terms are in normalize_text() form, while gold terms
# are compared exactly as stored in the dev file -- confirm the gold terms are
# already in that form, otherwise true matches are counted as errors.

# Gold terms per (document, paragraph, sentence) id; if an id occurs more than
# once, the first occurrence is kept.
dev_gold_map = {}
for entry in dev_data['data']:
    sentence_key = (entry['document_id'], entry['paragraph_id'], entry['sentence_id'])
    dev_gold_map.setdefault(sentence_key, entry['term_list'])

# Predicted terms per id; if an id occurs more than once, the last one is kept.
pred_map = {
    (entry['document_id'], entry['paragraph_id'], entry['sentence_id']): entry['term_list']
    for entry in dump_pred['data']
}

# Walk the gold ids in sorted order; an id without a prediction scores as [].
ordered_keys = sorted(dev_gold_map)
gold_standard = [dev_gold_map[sentence_key] for sentence_key in ordered_keys]
system_output = [pred_map.get(sentence_key, []) for sentence_key in ordered_keys]

# Score
precision, recall, f1, tp, fp, fn = micro_f1_score(gold_standard, system_output)
type_precision, type_recall, type_f1 = type_f1_score(gold_standard, system_output)

# Report
banner = "=" * 60
print(banner)
print("VANILLA BASELINE RESULTS")
print(banner)

print("\nMicro-averaged Metrics:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print(f"  TP={tp}, FP={fp}, FN={fn}")

print("\nType-level Metrics:")
print(f"  Type Precision: {type_precision:.4f}")
print(f"  Type Recall:    {type_recall:.4f}")
print(f"  Type F1 Score:  {type_f1:.4f}")
print(banner)

VANILLA BASELINE RESULTS

Micro-averaged Metrics:
  Precision: 0.4295
  Recall:    0.7428
  F1 Score:  0.5443
  TP=335, FP=445, FN=116

Type-level Metrics:
  Type Precision: 0.6587
  Type Recall:    0.5661
  Type F1 Score:  0.6089
