### Data Loading and Processing

In [14]:
import json
import os
import random
from pathlib import Path
from typing import List, Dict, Tuple

import spacy
from spacy.tokens import DocBin, Doc
from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy.pipeline import EntityRuler
from tqdm import tqdm


In [16]:
def load_jsonl(path: str) -> List[Dict]:
    """Load a JSON lines file or JSON array file."""
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read().strip()
    if not text:
        return []
    try:
        # Try parsing as single JSON object/array
        data = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to JSONL (one JSON per line)
        data = []
        for line in text.splitlines():
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data


def build_sentence_gold_map(records: List[Dict]) -> List[Dict]:
    """Convert dataset rows into list of sentences with aggregated terms.
    
    Handles both formats:
    - Records with 'term_list' field (list of terms) for input files in json format
    - Records with individual 'term' field (one term per row) for input files in csv format
    """
    out = {}
    
    # Support both dict with 'data' key and plain list
    if isinstance(records, dict) and 'data' in records:
        rows = records['data']
    else:
        rows = records
    
    for r in rows:
        key = (r.get('document_id'), r.get('paragraph_id'), r.get('sentence_id'))
        if key not in out:
            out[key] = {
                'document_id': r.get('document_id'),
                'paragraph_id': r.get('paragraph_id'),
                'sentence_id': r.get('sentence_id'),
                'sentence_text': r.get('sentence_text', ''),
                'terms': []
            }
        
        # Support both 'term_list' (list) and 'term' (single value)
        if isinstance(r.get('term_list'), list):
            for t in r.get('term_list'):
                if t and t not in out[key]['terms']:
                    out[key]['terms'].append(t)
        else:
            term = r.get('term')
            if term and term not in out[key]['terms']:
                out[key]['terms'].append(term)
    
    return list(out.values())


# Test: Load a small sample
test_data = {
    'data': [
        {
            'document_id': 'doc1',
            'paragraph_id': 'p1',
            'sentence_id': 's1',
            'sentence_text': 'La tassa di successione è un tributo.',
            'term_list': ['tassa di successione', 'tributo']
        }
    ]
}

test_sentences = build_sentence_gold_map(test_data)
assert len(test_sentences) == 1
assert test_sentences[0]['terms'] == ['tassa di successione', 'tributo']
print("✓ Data loading functions work correctly")

✓ Data loading functions work correctly


In [17]:
# Load actual training and dev data
train_data = load_jsonl('../../data/subtask_a_train.json')
dev_data = load_jsonl('../../data/subtask_a_dev.json')

train_sentences = build_sentence_gold_map(train_data)
dev_sentences = build_sentence_gold_map(dev_data)

print(f"Training sentences: {len(train_sentences)}")
print(f"Dev sentences: {len(dev_sentences)}")
print(f"\nExample sentence:")
print(f"  Text: {train_sentences[6]['sentence_text']}")
print(f"  Terms: {train_sentences[6]['terms']}")

Training sentences: 2308
Dev sentences: 577

Example sentence:
  Text: AFFIDAMENTO DEL “SERVIZIO DI SPAZZAMENTO, RACCOLTA, TRASPORTO E SMALTIMENTO/RECUPERO DEI RIFIUTI URBANI ED ASSIMILATI E SERVIZI COMPLEMENTARI DELLA CITTA' DI AGROPOLI” VALEVOLE PER UN QUINQUENNIO
  Terms: ['raccolta', 'recupero', 'servizio di raccolta', 'servizio di spazzamento', 'smaltimento', 'trasporto']


### Evaluation Metrics

In [18]:
def micro_f1_score(gold_standard, system_output):
    """
    Evaluates performance using Precision, Recall, and F1 score 
    based on individual term matching (micro-average).
    
    Args:
        gold_standard: List of lists, where each inner list contains gold standard terms
        system_output: List of lists, where each inner list contains extracted terms
    
    Returns:
        Tuple containing (precision, recall, f1, tp, fp, fn)
    """
    total_true_positives = 0
    total_false_positives = 0
    total_false_negatives = 0
    
    # Iterate through each item's gold standard and system output terms
    for gold, system in zip(gold_standard, system_output):
        # Convert to sets for efficient comparison
        gold_set = set(gold)
        system_set = set(system)
        
        # Calculate TP, FP, FN for the current item
        true_positives = len(gold_set.intersection(system_set))
        false_positives = len(system_set - gold_set)
        false_negatives = len(gold_set - system_set)
        
        # Accumulate totals across all items
        total_true_positives += true_positives
        total_false_positives += false_positives
        total_false_negatives += false_negatives
    
    # Calculate Precision, Recall, and F1 score (micro-average)
    precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0
    recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1, total_true_positives, total_false_positives, total_false_negatives


def type_f1_score(gold_standard, system_output):
    """
    Evaluates performance using Type Precision, Type Recall, and Type F1 score
    based on the set of unique terms extracted at least once across the entire dataset.
    
    Args:
        gold_standard: List of lists, where each inner list contains gold standard terms
        system_output: List of lists, where each inner list contains extracted terms
    
    Returns:
        Tuple containing (type_precision, type_recall, type_f1)
    """
    # Get the set of all unique gold standard terms across the dataset
    all_gold_terms = set()
    for item_terms in gold_standard:
        all_gold_terms.update(item_terms)
    
    # Get the set of all unique system extracted terms across the dataset
    all_system_terms = set()
    for item_terms in system_output:
        all_system_terms.update(item_terms)
    
    # Calculate True Positives (terms present in both sets)
    type_true_positives = len(all_gold_terms.intersection(all_system_terms))
    
    # Calculate False Positives (terms in system output but not in gold standard)
    type_false_positives = len(all_system_terms - all_gold_terms)
    
    # Calculate False Negatives (terms in gold standard but not in system output)
    type_false_negatives = len(all_gold_terms - all_system_terms)
    
    # Calculate Type Precision, Type Recall, and Type F1 score
    type_precision = type_true_positives / (type_true_positives + type_false_positives) if (type_true_positives + type_false_positives) > 0 else 0
    type_recall = type_true_positives / (type_true_positives + type_false_negatives) if (type_true_positives + type_false_negatives) > 0 else 0
    type_f1 = 2 * (type_precision * type_recall) / (type_precision + type_recall) if (type_precision + type_recall) > 0 else 0
    
    return type_precision, type_recall, type_f1


# Test: Simple case
gold_test = [['term1', 'term2'], ['term3']]
pred_test = [['term1', 'term4'], ['term3']]
precision, recall, f1, tp, fp, fn = micro_f1_score(gold_test, pred_test)
assert tp == 2  # term1 and term3
assert fp == 1  # term4
assert fn == 1  # term2
print("✓ Evaluation functions work correctly")
print(f"  Test metrics: P={precision:.2f}, R={recall:.2f}, F1={f1:.2f}")

# Test type-level metrics
type_p, type_r, type_f1 = type_f1_score(gold_test, pred_test)
print(f"  Type metrics: P={type_p:.2f}, R={type_r:.2f}, F1={type_f1:.2f}")

✓ Evaluation functions work correctly
  Test metrics: P=0.67, R=0.67, F1=0.67
  Type metrics: P=0.67, R=0.67, F1=0.67


## SpaCy import and Pipeline

In [None]:
#python -m spacy download it_core_news_sm

In [19]:
import spacy
# Load Italian model
try:
    nlp = spacy.load('it_core_news_sm')
    print("✓ Italian model loaded successfully")
except:
    print("Model not found. Install with: python -m spacy download it_core_news_sm")


✓ Italian model loaded successfully


### Text Normalization Helpers

In [20]:
def normalize_text_spacy(t: str) -> str:
    """
    Normalize raw sentence text before sending it to spaCy:
      - replace non-breaking spaces
      - collapse multiple spaces
      - unify curly quotes to plain apostrophes
    """
    t = t.replace("\u00a0", " ")
    t = " ".join(t.split())
    t = t.replace("’", "'").replace("`", "'")
    return t

In [21]:
def normalize_candidate_spacy(term: str) -> str:
    """
    Normalize a candidate term for comparison/merging:
      - strip whitespace
      - lowercase
      - collapse multiple spaces
      - strip leading/trailing punctuation and quotes
    """
    import string

    t = term.strip()
    if not t:
        return ""

    t = t.lower()
    t = " ".join(t.split())
    # remove punctuation only at boundaries
    t = t.strip(string.punctuation + "«»“”'\"")

    return t

In [22]:
def spacy_annotate_sentence(text: str) -> Dict:
    """
    Run spaCy on a sentence and return:
      - doc: the spaCy Doc object
      - tokens: list of token texts
      - lemmas: list of lemmas
      - pos: list of POS tags
      - noun_chunks: list of noun chunk texts
    """
    clean_text = normalize_text_spacy(text)
    doc = nlp(clean_text)

    tokens = [token.text for token in doc]
    lemmas = [token.lemma_ for token in doc]
    pos_tags = [token.pos_ for token in doc]
    noun_chunks = [chunk.text for chunk in doc.noun_chunks]

    return {
        "doc": doc,
        "tokens": tokens,
        "lemmas": lemmas,
        "pos": pos_tags,
        "noun_chunks": noun_chunks,
    }


### Extraction opf the sPaCy candidates (pattern + noun chunks)

In [23]:
def extract_spacy_candidates(doc) -> List[str]:
    """
    Extract term candidates from a spaCy Doc using:
      - noun chunks
      - simple POS patterns:
          * ADJ + NOUN
          * NOUN + NOUN
          * NOUN + ADP + NOUN   (e.g., "centro di raccolta")

    Returns:
        A list of *normalized* candidate strings (deduplicated, order-preserving).
    """
    candidates = []

    # 1) Direct noun chunks
    for chunk in doc.noun_chunks:
        candidates.append(chunk.text)

    # 2) POS-based patterns
    tokens = list(doc)
    n = len(tokens)

    i = 0
    while i < n:
        tok = tokens[i]

        # ADJ + NOUN
        if tok.pos_ == "ADJ" and i + 1 < n and tokens[i + 1].pos_ == "NOUN":
            span = doc[tok.i : tokens[i + 1].i + 1]
            candidates.append(span.text)

        # NOUN + NOUN
        if tok.pos_ == "NOUN" and i + 1 < n and tokens[i + 1].pos_ == "NOUN":
            span = doc[tok.i : tokens[i + 1].i + 1]
            candidates.append(span.text)

        # NOUN + ADP + NOUN  (e.g., "centro di raccolta")
        if (
            tok.pos_ == "NOUN"
            and i + 2 < n
            and tokens[i + 1].pos_ == "ADP"
            and tokens[i + 2].pos_ == "NOUN"
        ):
            span = doc[tok.i : tokens[i + 2].i + 1]
            candidates.append(span.text)

        i += 1

    # Normalize and deduplicate while preserving order
    norm_seen = set()
    final = []
    for c in candidates:
        norm = normalize_candidate_spacy(c)
        if norm and norm not in norm_seen:
            norm_seen.add(norm)
            final.append(norm)

    return final

In [24]:
def build_spacy_annotations(sentences: List[Dict]) -> List[Dict]:
    """
    For each sentence entry, add:
      - entry["spacy_tokens"]
      - entry["spacy_lemmas"]
      - entry["spacy_pos"]
      - entry["spacy_noun_chunks"]
      - entry["spacy_candidates"]  (list of normalized candidate strings)

    NOTE: this function mutates the input list in place and returns it.
    """
    total = len(sentences)
    print(f"→ Starting spaCy annotation on {total} sentences")

    for i, entry in enumerate(sentences):
        if i % 500 == 0:
            print(f"  [spaCy] processing sentence {i}/{total}")

        sent_text = entry["sentence_text"]
        ann = spacy_annotate_sentence(sent_text)

        entry["spacy_tokens"] = ann["tokens"]
        entry["spacy_lemmas"] = ann["lemmas"]
        entry["spacy_pos"] = ann["pos"]
        entry["spacy_noun_chunks"] = ann["noun_chunks"]

        # linguistic candidates
        spacy_cands = extract_spacy_candidates(ann["doc"])
        entry["spacy_candidates"] = spacy_cands

    print("→ spaCy annotation completed")
    return sentences


In [25]:
# -------------------------------------------------------------------
# Run spaCy on TRAIN and DEV and inspect a few debug examples
# -------------------------------------------------------------------

print("\nAnnotating TRAIN with spaCy...")
train_sentences = build_spacy_annotations(train_sentences)

print("\nAnnotating DEV with spaCy...")
dev_sentences = build_spacy_annotations(dev_sentences)

# Extract raw text and gold terms for DEV
dev_texts = [s["sentence_text"] for s in dev_sentences]
dev_gold = [s["terms"] for s in dev_sentences]

print(f"\n✓ spaCy annotation completed on TRAIN ({len(train_sentences)}) "
      f"and DEV ({len(dev_sentences)}) sentences")

# -------------------------------------------------------------------
# DEBUG: inspect the first few DEV sentences and their candidates
# -------------------------------------------------------------------
print("\n[DEBUG] Example DEV sentences with gold terms and spaCy candidates:\n")

num_debug_examples = 3
for idx in range(min(num_debug_examples, len(dev_sentences))):
    sent = dev_sentences[idx]
    print(f"--- DEV sentence #{idx} ---")
    print("Text:", sent["sentence_text"])
    print("Gold terms:", sent.get("terms", []))
    print("spaCy noun_chunks:", sent["spacy_noun_chunks"])
    print("spaCy candidates:", sent["spacy_candidates"])
    print()

# More global debug: basic stats about number of candidates per sentence
candidates_lengths = [len(s["spacy_candidates"]) for s in dev_sentences]
if candidates_lengths:
    avg_cands = sum(candidates_lengths) / len(candidates_lengths)
    max_cands = max(candidates_lengths)
    min_cands = min(candidates_lengths)
    print("[DEBUG] spaCy candidates statistics on DEV:")
    print(f"  Min #candidates: {min_cands}")
    print(f"  Max #candidates: {max_cands}")
    print(f"  Avg #candidates: {avg_cands:.2f}")

    # Show a sentence with many candidates as a sanity check
    max_idx = candidates_lengths.index(max_cands)
    print(f"\n[DEBUG] Sentence with the highest number of candidates (index {max_idx}):")
    print("  Text:", dev_sentences[max_idx]["sentence_text"])
    print("  #Candidates:", max_cands)
    print("  Candidates:", dev_sentences[max_idx]["spacy_candidates"])

# -------------------------------------------------------------------
# Build predictions for DEV using spaCy candidates
# -------------------------------------------------------------------
spacy_dev_preds: List[List[str]] = [s["spacy_candidates"] for s in dev_sentences]

print(f"\nBuilt spaCy predictions for DEV: {len(spacy_dev_preds)} sentences")
assert len(spacy_dev_preds) == len(dev_sentences), "Mismatch between preds and dev sentences!"




Annotating TRAIN with spaCy...
→ Starting spaCy annotation on 2308 sentences
  [spaCy] processing sentence 0/2308
  [spaCy] processing sentence 500/2308
  [spaCy] processing sentence 1000/2308
  [spaCy] processing sentence 1500/2308
  [spaCy] processing sentence 2000/2308
→ spaCy annotation completed

Annotating DEV with spaCy...
→ Starting spaCy annotation on 577 sentences
  [spaCy] processing sentence 0/577
  [spaCy] processing sentence 500/577
→ spaCy annotation completed

✓ spaCy annotation completed on TRAIN (2308) and DEV (577) sentences

[DEBUG] Example DEV sentences with gold terms and spaCy candidates:

--- DEV sentence #0 ---
Text: Non Domestica; CAMPEGGI, DISTRIBUTORI CARBURANTI, PARCHEGGI; 1,22; 4,73 
Gold terms: []
spaCy noun_chunks: ['Non Domestica', 'CAMPEGGI', 'DISTRIBUTORI CARBURANTI, PARCHEGGI']
spaCy candidates: ['non domestica', 'campeggi', 'distributori carburanti, parcheggi']

--- DEV sentence #1 ---
Text: Il presente disciplinare per la gestione dei centri di ra

### Save Predictions

In [26]:
def save_predictions(predictions: List[List[str]], 
                     sentences: List[Dict], 
                     output_path: str):
    """Save predictions in competition format."""
    output = {'data': []}
    for pred, sent in zip(predictions, sentences):
        output['data'].append({
            'document_id': sent['document_id'],
            'paragraph_id': sent['paragraph_id'],
            'sentence_id': sent['sentence_id'],
            'term_list': pred
        })
    
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(predictions)} predictions to {output_path}")

In [27]:
# Valutazione (se le metriche normalizzano già, usa dev_gold e spacy_dev_preds)
precision, recall, f1, tp, fp, fn = micro_f1_score(dev_gold, spacy_dev_preds)
type_precision, type_recall, type_f1 = type_f1_score(dev_gold, spacy_dev_preds)

print("\nspaCy linguistic pipeline Results:")
print("Micro-averaged metrics:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print(f"  TP={tp}, FP={fp}, FN={fn}")
print("\nType-level metrics:")
print(f"  Type Precision: {type_precision:.4f}")
print(f"  Type Recall:    {type_recall:.4f}")
print(f"  Type F1 Score:  {type_f1:.4f}")



# Salva predizioni in formato competizione (se hai già save_predictions definita)
output_path = "results/spacy_pipeline_dev_predictions.json"
save_predictions(spacy_dev_preds, dev_sentences, output_path)
print(f"✓ Predictions saved to: {output_path}")


spaCy linguistic pipeline Results:
Micro-averaged metrics:
  Precision: 0.0509
  Recall:    0.3725
  F1 Score:  0.0896
  TP=168, FP=3131, FN=283

Type-level metrics:
  Type Precision: 0.0408
  Type Recall:    0.4174
  Type F1 Score:  0.0743
Saved 577 predictions to results/spacy_pipeline_dev_predictions.json
✓ Predictions saved to: results/spacy_pipeline_dev_predictions.json


### Inspection

In [29]:
import pandas as pd

rows = []

print("\nBuilding detailed prediction table for DEBUG...")

for sent in dev_sentences:
    doc_id    = sent["document_id"]
    par_id    = sent["paragraph_id"]
    sent_id   = sent["sentence_id"]
    sent_text = sent["sentence_text"]

    preds = sent["spacy_candidates"]  # this is a list of strings

    for term in preds:
        rows.append({
            "document_id": doc_id,
            "paragraph_id": par_id,
            "sentence_id": sent_id,
            "sentence_text": sent_text,
            "term": term
        })

dev_df_spacy = pd.DataFrame(rows)

print("✓ Detailed prediction table created.")
print("  Number of predicted term occurrences:", len(dev_df_spacy))
dev_df_spacy.head()



Building detailed prediction table for DEBUG...
✓ Detailed prediction table created.
  Number of predicted term occurrences: 3299


Unnamed: 0,document_id,paragraph_id,sentence_id,sentence_text,term
0,doc_praiano_07,32,7,"Non Domestica; CAMPEGGI, DISTRIBUTORI CARBURAN...",non domestica
1,doc_praiano_07,32,7,"Non Domestica; CAMPEGGI, DISTRIBUTORI CARBURAN...",campeggi
2,doc_praiano_07,32,7,"Non Domestica; CAMPEGGI, DISTRIBUTORI CARBURAN...","distributori carburanti, parcheggi"
3,doc_caserta_06,3,1,Il presente disciplinare per la gestione dei c...,la gestione
4,doc_caserta_06,3,1,Il presente disciplinare per la gestione dei c...,centri


In [30]:
rows_gold = []

for sent in dev_sentences:
    doc_id    = sent["document_id"]
    par_id    = sent["paragraph_id"]
    sent_id   = sent["sentence_id"]
    sent_text = sent["sentence_text"]

    gold_terms = sent["terms"]  # lista dei gold

    for term in gold_terms:
        rows_gold.append({
            "document_id": doc_id,
            "paragraph_id": par_id,
            "sentence_id": sent_id,
            "sentence_text": sent_text,
            "term": term
        })

dev_df_gold = pd.DataFrame(rows_gold)
print("✓ Gold table created.")
print("  Number of gold term occurrences:", len(dev_df_gold))
dev_df_gold.head()


✓ Gold table created.
  Number of gold term occurrences: 451


Unnamed: 0,document_id,paragraph_id,sentence_id,sentence_text,term
0,doc_caserta_06,3,1,Il presente disciplinare per la gestione dei c...,disciplina dei centri di raccolta dei rifiuti ...
1,doc_caserta_06,3,1,Il presente disciplinare per la gestione dei c...,disciplinare per la gestione dei centri di rac...
2,doc_poggiomarino_01,6,1,"È un Servizio Supplementare di raccolta, rivol...",raccolta
3,doc_nola_05,2,2,ll servizio di raccolta dei rifiuti derivanti ...,servizio di raccolta dei rifiuti
4,doc_nola_05,2,2,ll servizio di raccolta dei rifiuti derivanti ...,sfalci e potature


In [31]:
def get_fp_fn(gold_df, pred_df):
    """Return DataFrames for false positives and false negatives."""
    def normalize(df):
        df = df.copy()
        df["term"] = df["term"].str.lower().str.strip()
        return df[["document_id", "paragraph_id", "sentence_id", "term"]]

    gold = normalize(gold_df)
    pred = normalize(pred_df)

    gold_set = set(gold.itertuples(index=False, name=None))
    pred_set = set(pred.itertuples(index=False, name=None))

    tp = gold_set & pred_set
    fp = pred_set - gold_set
    fn = gold_set - pred_set

    fp_df = pd.DataFrame(list(fp), columns=["document_id", "paragraph_id", "sentence_id", "term"])
    fn_df = pd.DataFrame(list(fn), columns=["document_id", "paragraph_id", "sentence_id", "term"])

    return fp_df, fn_df


In [32]:
fp_df, fn_df = get_fp_fn(dev_df_gold, dev_df_spacy)

print("\n=== COUNTS ===")
print("False Positives:", len(fp_df))
print("False Negatives:", len(fn_df))
print("True Positives:", len(set(dev_df_gold.apply(tuple, axis=1)) 
                              & set(dev_df_spacy.apply(tuple, axis=1))))



=== COUNTS ===
False Positives: 3116
False Negatives: 273
True Positives: 168


In [33]:
print("\n=== FALSE POSITIVES (examples) ===")
display(fp_df.head(20))

print("\n=== FALSE NEGATIVES (examples) ===")
display(fn_df.head(20))



=== FALSE POSITIVES (examples) ===


Unnamed: 0,document_id,paragraph_id,sentence_id,term
0,doc_agropoli_09,17,0,durata del contratto
1,doc_agropoli_09,17,2,spesa a carico
2,doc_sorrento_10,18,0,1° aprile
3,doc_salerno_05,7,2,igiene della casa
4,doc_santagnello_19,3,6,planimetrie degli immobili
5,doc_battipaglia_02,28,0,sindaco del comune
6,doc_capaccio_10,8,3,2° giovedì
7,doc_sarno_12,3,4,uranio presente nelle rocce
8,doc_nocerainferiore_06,3,0,torquato
9,doc_auletta_01,5,25,laboratori



=== FALSE NEGATIVES (examples) ===


Unnamed: 0,document_id,paragraph_id,sentence_id,term
0,doc_praiano_07,21,1,coefficiente proporzionale di produttività kb
1,doc_nola_02,16,8,frazione secca
2,doc_salerno_05,7,16,conferire
3,doc_caserta_02,64,9,isole ecologiche
4,doc_salerno_05,11,6,conferire
5,doc_nocerainferiore_06,6,0,servizio ecologia
6,doc_salerno_06,12,5,utenti
7,doc_caserta_02,66,2,tubi fluorescenti ed altri tubi contenenti mer...
8,doc_auletta_13,36,1,gestore dello spazzamento e lavaggio delle strade
9,doc_caserta_02,59,3,multimateriale
