In [1]:
import json, os
def load_json(path: str):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
    

def save_json(obj, path: str):
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved cleaned predictions to {path}")

In [2]:
DATA_DIR = "../data"
PRED_DIR = "../src/predictions"

TRAIN_FILE = os.path.join(DATA_DIR, "subtask_a_train.json")
DEV_FILE = os.path.join(DATA_DIR, "subtask_a_dev.json")

# BERT_DEV_PRED_FILE = os.path.join(
#     PRED_DIR, "subtask_a_dev_bert_token_classification_preds_clean.json"
# )
BERT_DEV_PRED_FILE = os.path.join(
    PRED_DIR, "subtask_a_dev_bert_token_classification_preds_extended_clean.json"
)
SPACY_DEV_PRED_FILE = os.path.join(
    PRED_DIR, "subtask_a_dev_spacy_trained_preds.json"
)

ENSEMBLE_OUT_FILE = os.path.join(
    PRED_DIR, "subtask_a_dev_ensemble_bert_spacy_dictfilter.json"
)

os.makedirs(PRED_DIR, exist_ok=True)

In [3]:
import re
import unicodedata


def normalize_term(t: str) -> str:
    t = t.lower().strip()
    t = " ".join(t.split())
    return t

def normalize_text(s: str) -> str:
    """
    Normalize a term or sentence:
      - lowercase
      - Unicode normalization (NFKC)
      - normalize quotes/apostrophes
      - collapse multiple spaces
      - strip leading/trailing spaces
    """
    s = s.lower()
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("’", "'").replace("`", "'").replace("“", '"').replace("”", '"')
    s = re.sub(r"\s+", " ", s)
    return s.strip()

In [4]:
# ==============================
# Train vocabulary from gold terms
# ==============================

def build_train_vocab(train_data: dict) -> set:
    """
    Build a normalized vocabulary of gold terms from the training set.
    Each term is normalized with `normalize_text`.
    """
    vocab = set()
    for entry in train_data["data"]:
        for term in entry.get("term_list", []):
            norm_term = normalize_text(term)
            if norm_term:
                vocab.add(norm_term)
    return vocab

def build_term_map(pred_json: dict) -> dict:
    """
    Build a mapping:
        (document_id, paragraph_id, sentence_id) -> list of predicted terms
    from a prediction JSON in the ATE-IT format.
    """
    m = {}
    for e in pred_json["data"]:
        key = (e["document_id"], e["paragraph_id"], e["sentence_id"])
        m[key] = e.get("term_list", []) or []
    return m

In [5]:
from collections import Counter

def build_train_vocab_with_freq(train_data):
    freq = Counter()
    for e in train_data["data"]:
        for term in e.get("term_list", []):
            norm = normalize_text(term)
            if norm:
                freq[norm] += 1
    strong = {t for t, c in freq.items() if c >= 3}
    weak   = {t for t, c in freq.items() if c == 1}
    return freq, strong, weak


For each sentence:

keeps all BERT terms as baseline,

adds spaCy terms only if their normalized form appears in the train vocabulary (and they’re multi-word and not duplicates).

In [6]:
def norm(t: str) -> str:
    if not t:
        return ""
    t = t.lower().strip()
    t = " ".join(t.split())
    t = t.replace("’", "'")
    t = t.strip(".,;:-'\"()[]{}")
    return t

In [None]:
GENERIC_HEADS = {
    "rifiuti", "materiali", "utenti", "plastica", "carta",
    "residui", "tariffe", "gestore", "servizio", "modalità",
    "conferimento", "costi", "parte", "quota", "impianto"
}

def filter_generic_unigrams(terms, train_vocab_norm):
    filtered = []
    for t in terms:
        tokens = t.split()
        if len(tokens) == 1:
            # tienilo solo se:
            # 1) è nel vocabolario di train (compare come termine vero)
            #    oppure
            # 2) è una sigla tipo "tmb", "r.a.e.e."
            if normalize_text(t) not in train_vocab_norm and normalize_text(t) in GENERIC_HEADS:
                continue
        filtered.append(t)
    return filtered


In [7]:
GENERIC_BAD = {
    "parte", "gestione", "città", "territorio", "comune",
    "ore", "no", "si", "anno", "mese", "giorno"
} 


def upgrade_with_longer_spacy(bert_terms, spacy_terms, train_vocab_norm):
    """
    Upgrade BERT terms to longer spaCy spans ONLY WHEN BENEFICIAL.
    """
    final = []
    seen = set()
    
    spacy_norm_map = {norm(t): t for t in spacy_terms or []}

    for b in bert_terms or []:
        b_norm = norm(b)
        if not b_norm or b_norm in GENERIC_BAD:
            continue

        best = None

        # search longest valid spaCy span containing the BERT term
        for s_norm, s in spacy_norm_map.items():
            if len(s.split()) < 2:
                continue
            if b_norm in s_norm and s_norm in train_vocab_norm:
                if best is None or len(s_norm.split()) > len(norm(best).split()):
                    best = s

        chosen = best if best else b
        c_norm = norm(chosen)

        if c_norm not in seen and c_norm not in GENERIC_BAD:
            final.append(chosen)
            seen.add(c_norm)

    return final


In [8]:
def merge_bert_spacy_with_dict(bert_terms, spacy_terms, train_vocab_norm):
    """
    BEST ensemble so far:
    1. upgrade BERT with spaCy
    2. add dictionary-filtered spaCy spans
    3. skip generic or meaningless words
    """
    upgraded = upgrade_with_longer_spacy(
        bert_terms=bert_terms,
        spacy_terms=spacy_terms,
        train_vocab_norm=train_vocab_norm,
    )

    final = upgraded[:]
    seen = {norm(t) for t in upgraded}

    for s in spacy_terms or []:
        s_norm = norm(s)

        if len(s.split()) < 2:
            continue
        if s_norm not in train_vocab_norm:
            continue
        if s_norm in seen:
            continue
        if s_norm in GENERIC_BAD:
            continue

        final.append(s)
        seen.add(s_norm)

    return final


In [9]:
def merge_sentence(bert_terms, spacy_terms, train_vocab_norm):
    merged = merge_bert_spacy_with_dict(
        bert_terms=bert_terms,
        spacy_terms=spacy_terms,
        train_vocab_norm=train_vocab_norm
    )
    # dedupe and normalize
    seen = set()
    final = []
    for t in merged:
        n = norm(t)
        if n not in seen:
            final.append(n)
            seen.add(n)
    return final


In [10]:
                   
def micro_f1_score(gold_standard, system_output):
  """
  Evaluates a term extraction system's performance using Precision, Recall,
  and F1 score based on individual term matching (micro-average).

  Args:
    gold_standard: A list of lists, where each inner list contains the
        gold standard terms for an item.
    system_output: A list of lists, where each inner list contains the
                   terms extracted by the system for the corresponding item.

  Returns:
    A tuple containing the Precision, Recall, and F1 score.
  """
  total_true_positives = 0
  total_false_positives = 0
  total_false_negatives = 0

  # Iterate through each item's gold standard and system output terms
  for gold, system in zip(gold_standard, system_output):
    # Convert to sets for efficient comparison
    gold_set = set(gold)
    system_set = set(system)

    # Calculate True Positives, False Positives, and False Negatives for the current item
    true_positives = len(gold_set.intersection(system_set))
    false_positives = len(system_set - gold_set)
    false_negatives = len(gold_set - system_set)

    # Accumulate totals across all items
    total_true_positives += true_positives
    total_false_positives += false_positives
    total_false_negatives += false_negatives

  # Calculate Precision, Recall, and F1 score (micro-average)
  precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0
  recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0
  f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

  return precision, recall, f1

In [11]:
def type_f1_score(gold_standard, system_output):
  """
  Evaluates a term extraction system's performance using Type Precision,
  Type Recall, and Type F1 score based on the set of unique terms extracted
  at least once across the entire dataset.

  Args:
    gold_standard: A list of lists, where each inner list contains the
                   gold standard terms for an item.
    system_output: A list of lists, where each inner list contains the
                   terms extracted by the system for the corresponding item.

  Returns:
    A tuple containing the Type Precision, Type Recall, and Type F1 score.
  """

  # Get the set of all unique gold standard terms across the dataset
  all_gold_terms = set()
  for item_terms in gold_standard:
    all_gold_terms.update(item_terms)

  # Get the set of all unique system extracted terms across the dataset
  all_system_terms = set()
  for item_terms in system_output:
    all_system_terms.update(item_terms)

  # Calculate True Positives (terms present in both sets)
  type_true_positives = len(all_gold_terms.intersection(all_system_terms))

  # Calculate False Positives (terms in system output but not in gold standard)
  type_false_positives = len(all_system_terms - all_gold_terms)

  # Calculate False Negatives (terms in gold standard but not in system output)
  type_false_negatives = len(all_gold_terms - all_system_terms)

  # Calculate Type Precision, Type Recall, and Type F1 score
  type_precision = type_true_positives / (type_true_positives + type_false_positives) if (type_true_positives + type_false_positives) > 0 else 0
  type_recall = type_true_positives / (type_true_positives + type_false_negatives) if (type_true_positives + type_false_negatives) > 0 else 0
  type_f1 = 2 * (type_precision * type_recall) / (type_precision + type_recall) if (type_precision + type_recall) > 0 else 0

  return type_precision, type_recall, type_f1

###   BUILD BERT + SPACY ENSEMBLE USING merge_sentence()

In [12]:
from tqdm import tqdm
import json

# ---- Load train data and build vocabulary ----
with open(TRAIN_FILE, "r", encoding="utf-8") as f:
    train_data = json.load(f)

train_vocab_norm = build_train_vocab(train_data)
print(f"# unique normalized terms from train gold: {len(train_vocab_norm)}")

# ---- Load dev gold (for evaluation) ----
with open(DEV_FILE, "r", encoding="utf-8") as f:
    dev_data = json.load(f)

# ---- Load BERT and spaCy predictions ----
with open(BERT_DEV_PRED_FILE, "r", encoding="utf-8") as f:
    bert_pred = json.load(f)

with open(SPACY_DEV_PRED_FILE, "r", encoding="utf-8") as f:
    spacy_pred = json.load(f)

# Convert JSON predictions → dict[(doc,par,sent)] → [terms...]
bert_map = build_term_map(bert_pred)
spacy_map = build_term_map(spacy_pred)

# ---- Build ensemble predictions using merge_sentence ----
ensemble_output = {"data": []}

print("Building improved BERT+spaCy ensemble ...")

for idx, row in enumerate(tqdm(dev_data["data"])):

    key = (row["document_id"], row["paragraph_id"], row["sentence_id"])

    bert_terms = bert_map.get(key, []) or []
    spacy_terms = spacy_map.get(key, []) or []

    #  NEW MERGE FUNCTION 
    merged_terms = merge_sentence(
        bert_terms=bert_terms,
        spacy_terms=spacy_terms,
        train_vocab_norm=train_vocab_norm
    )

    # Debug on first 3
    if idx < 3:
        print("\n---------------------------------------")
        print("Sentence", idx)
        print("TEXT:", row["sentence_text"])
        print("  BERT  :", bert_terms)
        print("  SPACY :", spacy_terms)
        print("  MERGED:", merged_terms)

    # Save
    ensemble_output["data"].append({
        "document_id": row["document_id"],
        "paragraph_id": row["paragraph_id"],
        "sentence_id": row["sentence_id"],
        "term_list": merged_terms,
    })





# unique normalized terms from train gold: 713
Building improved BERT+spaCy ensemble ...


100%|██████████| 577/577 [00:00<00:00, 110401.60it/s]


---------------------------------------
Sentence 0
TEXT: Non Domestica; CAMPEGGI, DISTRIBUTORI CARBURANTI, PARCHEGGI; 1,22; 4,73 
  BERT  : []
  SPACY : []
  MERGED: []

---------------------------------------
Sentence 1
TEXT: Il presente disciplinare per la gestione dei centri di raccolta comunali è stato redatto ai sensi e per effetto del DM 13/05/2009, pubblicato sulla G.U. n. 165 del 18/07/2009, con il quale sono state apportate le modifiche sostanziali al DM 08/04/2008, Disciplina dei centri di raccolta dei rifiuti urbani raccolti in modo differenziato, come previsto dall'art. 183, comma 7, lettera cc) del Dlgs 3 aprile 2006, n. 152, e ss.mm.ii.
  BERT  : ['disciplinare', 'gestione', 'centri di raccolta comunali', 'disciplina', 'centri di raccolta dei rifiuti urbani raccolti']
  SPACY : ['gestione dei centri di raccolta comunali', 'centri di raccolta dei rifiuti urbani raccolti']
  MERGED: ['disciplinare', 'centri di raccolta comunali', 'disciplina', 'centri di raccolta dei rifiu




#### Save predictions

In [13]:
# ---- Save final merged predictions ----
with open(ENSEMBLE_OUT_FILE, "w", encoding="utf-8") as f:
    json.dump(ensemble_output, f, ensure_ascii=False, indent=2)

print(f"\nEnsemble predictions saved to: {ENSEMBLE_OUT_FILE}")



Ensemble predictions saved to: ../src/predictions\subtask_a_dev_ensemble_bert_spacy_dictfilter.json


In [None]:
import re
import string

def normalize_term(t: str) -> str:
    t = t.lower().strip()

    # spazi multipli
    t = " ".join(t.split())

    # normalizza separatori tipo "-", "/", "," quando servono come lista
    t = t.replace(" - ", " ").replace("-", " - ")
    t = t.replace("/", " / ")

    # togli punteggiatura ai bordi
    t = t.strip(string.punctuation + "«»“”'\"[]()")

    # normalizza sigle r.a.e.e. -> raee
    t = re.sub(r"\.", "", t)

    return t


In [14]:

# Extract gold + predicted lists
dev_gold = [entry["term_list"] for entry in dev_data["data"]]
ensemble_preds = [entry["term_list"] for entry in ensemble_output["data"]]

precision, recall, f1 = micro_f1_score(dev_gold, ensemble_preds)
type_precision, type_recall, type_f1 = type_f1_score(dev_gold, ensemble_preds)

print("\n=====================================================")
print("    IMPROVED BERT + SPACY + DICTIONARY MERGE")
print("=====================================================")

print("\nMicro-averaged Metrics:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")

print("\nType-level Metrics:")
print(f"  Type Precision: {type_precision:.4f}")
print(f"  Type Recall:    {type_recall:.4f}")
print(f"  Type F1 Score:  {type_f1:.4f}")



    IMPROVED BERT + SPACY + DICTIONARY MERGE

Micro-averaged Metrics:
  Precision: 0.7388
  Recall:    0.6962
  F1 Score:  0.7169

Type-level Metrics:
  Type Precision: 0.6881
  Type Recall:    0.6198
  Type F1 Score:  0.6522


In [15]:
import pandas as pd
def get_fp_fn_from_listformat(gold_entries, pred_entries):
    """
    gold_entries: list of rows from dev_data["data"]
    pred_entries: list of rows from ensemble_output["data"]
    
    Each entry has:
        - document_id
        - paragraph_id
        - sentence_id
        - term_list (list of terms)
    
    Returns DataFrames:
        fp_df (false positives)
        fn_df (false negatives)
    """

    gold_rows = []
    pred_rows = []

    # --- Expand GOLD ---
    for e in gold_entries:
        doc = e["document_id"]
        par = e["paragraph_id"]
        sid = e["sentence_id"]
        for t in e["term_list"]:
            t_norm = t.lower().strip()
            if t_norm:
                gold_rows.append((doc, par, sid, t_norm))

    # --- Expand PRED ---
    for e in pred_entries:
        doc = e["document_id"]
        par = e["paragraph_id"]
        sid = e["sentence_id"]
        for t in e["term_list"]:
            t_norm = t.lower().strip()
            if t_norm:
                pred_rows.append((doc, par, sid, t_norm))

    gold_set = set(gold_rows)
    pred_set = set(pred_rows)

    fp = pred_set - gold_set
    fn = gold_set - pred_set

    fp_df = pd.DataFrame(list(fp),
                         columns=["document_id", "paragraph_id", "sentence_id", "term"])
    fn_df = pd.DataFrame(list(fn),
                         columns=["document_id", "paragraph_id", "sentence_id", "term"])

    return fp_df, fn_df


In [16]:
gold_entries = dev_data["data"]      # gold JSON
pred_entries = ensemble_output["data"]  # merged predictions JSON

fp_df, fn_df = get_fp_fn_from_listformat(gold_entries, pred_entries)

print("False Positives:", len(fp_df))
print("False Negatives:", len(fn_df))

display(fp_df.head(20))
display(fn_df.head(20))


False Positives: 111
False Negatives: 137


Unnamed: 0,document_id,paragraph_id,sentence_id,term
0,doc_sorrento_20,1,3,raccolta differenziata
1,doc_sorrento_15,2,0,plastica
2,doc_praiano_05,13,2,forme di gestione dei rifiuti
3,doc_santegidiodelmontealbino_03,50,11,banda stagnata
4,doc_caserta_06,16,1,obbligo di conferimento separato dei rifiuti
5,doc_salerno_03,2,27,materiali
6,doc_capaccio_06,10,1,oli conferiti
7,doc_gragnano_03,5,0,servizio di raccolta rsu
8,doc_nocerainferiore_06,2,1,porta
9,doc_sorrento_20,1,3,plastica


Unnamed: 0,document_id,paragraph_id,sentence_id,term
0,doc_nocerainferiore_06,10,0,calendario utenze domestiche per la raccolta d...
1,doc_poggiomarino_12,17,65,r1
2,doc_sorrento_22,2,0,bidone
3,doc_salerno_05,7,6,"plastica, acciaio e alluminio"
4,doc_capaccio_10,9,4,busta con legaccio
5,doc_praiano_05,10,0,raccolta/ritiro
6,doc_capaccio_10,3,3,sacchetto trasparente
7,doc_francavillais_09,19,1,ruote gommate
8,doc_santegidiodelmontealbino_03,50,11,"metalli, acciaio, alluminio e banda stagnata"
9,doc_sorrento_22,2,0,plastica
