### Setup and dataset loading

In [1]:
import pandas as pd
import numpy as np
import spacy
import math
from collections import Counter
from typing import List, Dict
import json

In [2]:
import pandas as pd

pd.set_option("display.max_colwidth", 200)

def load_csv(path):
    df = pd.read_csv(path)
    if "term" in df.columns:
        df["term"] = df["term"].apply(
            lambda x: x.strip() if isinstance(x, str) else ""
        )
    return df

train_path = "https://raw.githubusercontent.com/nicolaCirillo/ate-it/main/data/subtask_a_train.csv"
dev_path   = "https://raw.githubusercontent.com/nicolaCirillo/ate-it/main/data/subtask_a_dev.csv"

train_df = load_csv(train_path)
dev_df   = load_csv(dev_path)

print("Train shape:", train_df.shape)
print("Dev shape  :", dev_df.shape)

Train shape: (3423, 5)
Dev shape  : (779, 5)


### Evaluation metrics

In [3]:
def micro_f1(gold_df, pred_df):
    # normalize columns
    def normalize(df):
        df = df.copy()
        df["term"] = df["term"].str.lower().str.strip()
        return df[["document_id", "paragraph_id", "sentence_id", "term"]]

    gold = normalize(gold_df)
    pred = normalize(pred_df)

    gold_set = set(gold.itertuples(index=False, name=None))
    pred_set = set(pred.itertuples(index=False, name=None))

    tp = len(gold_set & pred_set)
    fp = len(pred_set - gold_set)
    fn = len(gold_set - pred_set)

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall    = tp / (tp + fn) if tp + fn else 0.0
    f1        = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return precision, recall, f1


def type_f1(gold_df, pred_df):
    gold_types = set(gold_df["term"].str.lower().str.strip())
    pred_types = set(pred_df["term"].str.lower().str.strip())

    gold_types.discard("")
    pred_types.discard("")

    tp = len(gold_types & pred_types)
    fp = len(pred_types - gold_types)
    fn = len(gold_types - pred_types)

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall    = tp / (tp + fn) if tp + fn else 0.0
    f1        = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return precision, recall, f1



### Load spaCy and stopwords

In [4]:
import nltk
from nltk.corpus import stopwords

# Make sure stopwords are available
try:
    it_stopwords = set(stopwords.words("italian"))
except:
    nltk.download("stopwords")
    it_stopwords = set(stopwords.words("italian"))

nlp = spacy.load("it_core_news_sm")


### 1. **Linguistic Filtering of Candidate Terms**

Before we extract statistical n-grams, we apply linguistic rules to discard
non-term-like spans.


In [5]:
# Unigram filter
def is_valid_unigram_token(tok):
    """
    Check if a single spaCy token is a valid unigram candidate.
    Valid = alphabetic, non-stopword, sufficiently long, noun/proper noun.
    """
    text = tok.text.lower().strip()

    if len(text) < 3:
        return False
    if text in it_stopwords:
        return False
    if not text.isalpha():
        return False
    if tok.pos_ not in {"NOUN", "PROPN"}:
        return False
    
    return True

# N-gram (n>=2) span filter
def is_valid_ngram_span_old(span_tokens):
    """
    Rules for a valid multi-word candidate:
    - No punctuation
    - First token: NOUN / PROPN / ADJ
    - Last token:  NOUN / PROPN / ADJ
    - Must contain at least one noun/proper noun inside
    - All alphabetic
    """
    # no punctuation
    for t in span_tokens:
        if t.is_punct:
            return False

    first = span_tokens[0]
    last  = span_tokens[-1]

    # start: allow NOUN / PROPN / ADJ as first token
    if first.pos_ not in {"NOUN", "PROPN", "ADJ"}:
        return False

    # end: must be noun or proper noun (your new constraint) or adjective "raccolta differenziata"
    if last.pos_ not in {"NOUN", "PROPN", "ADJ"}:
        return False

    # must contain at least one noun/proper noun somewhere
    if not any(t.pos_ in {"NOUN", "PROPN"} for t in span_tokens):
        return False

    # optional: all alphabetic
    if not all(t.text.isalpha() for t in span_tokens):
        return False

    return True


In [6]:
def is_valid_ngram_span(span_tokens):
    """
    Enhanced POS-based n-gram filter.
    Accepts:
      - base rules (start-end POS)
      - new syntactic patterns:
          * NOUN + ADP + DET + NOUN
          * NOUN + CCONJ + NOUN
          * ADJ  + NOUN + ADP + NOUN
    """

    # Reject punctuation inside span
    for t in span_tokens:
        if t.is_punct:
            return False

    first = span_tokens[0]
    last  = span_tokens[-1]

    # Base rule: valid start
    if first.pos_ not in {"NOUN", "PROPN", "ADJ"}:
        return False

    # Base rule: valid end
    if last.pos_ not in {"NOUN", "PROPN", "ADJ"}:
        return False

    # Must contain at least one noun or proper noun
    if not any(t.pos_ in {"NOUN", "PROPN"} for t in span_tokens):
        return False

    # All alphabetic tokens (optional, but keeps noise low)
    if not all(t.text.isalpha() for t in span_tokens):
        return False

    # NEW ADVANCED SYNTAX PATTERNS
    # Pattern: NOUN + ADP + DET + NOUN
    if len(span_tokens) == 4:
        if (span_tokens[0].pos_ == "NOUN" and
            span_tokens[1].pos_ == "ADP"  and
            span_tokens[2].pos_ == "DET"  and
            span_tokens[3].pos_ in {"NOUN", "PROPN"}):
            return True

    # Pattern: NOUN + CCONJ + NOUN
    if len(span_tokens) == 3:
        if (span_tokens[0].pos_ == "NOUN"  and
            span_tokens[1].pos_ == "CCONJ" and
            span_tokens[2].pos_ == "NOUN"):
            return True

    # Pattern: ADJ + NOUN + ADP + NOUN
    if len(span_tokens) == 4:
        if (span_tokens[0].pos_ == "ADJ"  and
            span_tokens[1].pos_ == "NOUN" and
            span_tokens[2].pos_ == "ADP"  and
            span_tokens[3].pos_ in {"NOUN", "PROPN"}):
            return True

    # If not matched custom patterns, return base accept
    return True


In [7]:
def spacy_dependency_subtrees(doc):
    """
    Extract candidate multiword terms from dependency subtrees
    rooted at NOUN/PROPN heads.
    This captures things like:
        - "raccolta differenziata"
        - "gestione dei rifiuti urbani"
        - "svuotamento dei carrellati condominiali"
    """

    candidates = []

    for token in doc:
        if token.pos_ in {"NOUN", "PROPN"}:
            subtree = list(token.subtree)
            # Avoid huge subtrees (>6 words)
            if 1 < len(subtree) <= 6:
                # Filter punctuation and spaces
                words = [t.text for t in subtree if not t.is_space and not t.is_punct]
                if len(words) >= 2:
                    candidates.append(" ".join(words).lower())

    return candidates


In [8]:
# Gold terms from the training set (from the `term` column)
gold_terms = train_df["term"].str.lower().str.strip()
gold_terms = gold_terms[gold_terms != ""]  # drop empty

# How often each string is annotated as a term
gold_counts = gold_terms.value_counts()

print("Number of unique gold terms:", len(gold_counts))
gold_counts.head(20)

Number of unique gold terms: 713


term
vetro                    69
porta a porta            66
rifiuti                  57
conferire                57
multimateriale           56
conferimento             55
indifferenziato          40
carta e cartone          36
plastica                 32
rifiuti organici         30
isole ecologiche         29
raccolta                 27
utenze domestiche        27
utente                   23
utenza                   23
utenti                   22
utenze non domestiche    20
umido                    19
centro di raccolta       19
isola ecologica          18
Name: count, dtype: int64

### 2. Statistical N-gram Extraction (TF, DF, TF-IDF)

Once we have linguistic filters, we extract valid n-grams from each document
and measure their importance statistically.

For each n-gram:
- **TF** (Term Frequency): how often it appears in the corpus  
- **DF** (Document Frequency): in how many documents it appears  
- **TF-IDF**: penalizes generic expressions and boosts informative ones  

This gives a first ranking of “term-like” units based solely on statistics.


In [9]:
import math
from collections import Counter

def build_ngram_tfidf_from_documents_pos(train_df, max_n=3):
    tf_counter = Counter()
    df_counter = Counter()

    doc_ids = train_df["document_id"].unique()
    N_docs = len(doc_ids)

    for doc_id in doc_ids:
        # concatenate all sentences of the document
        doc_texts = train_df[train_df["document_id"] == doc_id]["sentence_text"]
        full_text = " ".join(doc_texts)

        spacy_doc = nlp(full_text)
        tokens = [t for t in spacy_doc if not t.is_space]
        L = len(tokens)

        doc_ngrams = set()

        for n in range(1, max_n + 1):
            for i in range(L - n + 1):
                span_tokens = tokens[i:i+n]

                # unigram case
                if n == 1:
                    if not is_valid_unigram_token(span_tokens[0]):
                        continue
                else:
                    if not is_valid_ngram_span(span_tokens):
                        continue

                ngram = " ".join(t.text.lower() for t in span_tokens)

                tf_counter[ngram] += 1
                doc_ngrams.add(ngram)

        # DF increment
        for ng in doc_ngrams:
            df_counter[ng] += 1

    # compute TF-IDF
    tfidf = {}
    for ng, tf in tf_counter.items():
        df_val = df_counter.get(ng, 0)
        idf = math.log((1 + N_docs) / (1 + df_val)) + 1
        tfidf[ng] = tf * idf

    return (
        pd.Series(tf_counter),
        pd.Series(df_counter),
        pd.Series(tfidf).sort_values(ascending=False),
        N_docs,
    )





In [10]:
tf_series_pos, df_series_pos, tfidf_series_pos, N_docs = \
    build_ngram_tfidf_from_documents_pos(train_df, max_n=5)

tfidf_series_pos.head(20)

porta               937.244905
carta               851.663186
rifiuti             789.963230
raccolta            782.200573
vetro               597.029624
cartone             593.471020
conferimento        526.726652
contenitori         523.482496
imballaggi          500.018994
servizio            495.492322
porta a porta       453.938402
multimateriale      435.929855
plastica            435.028606
organici            401.966809
rifiuti organici    370.383598
frazione            357.815257
indifferenziato     355.399021
ore                 348.895607
carta e cartone     328.744094
utenze              322.041205
dtype: float64

## 3. **Supervised boosting using TRAIN gold**

For each n-gram:
- If it appears in the training gold annotations, we **boost its score**
- The boosting factor depends on how frequently it appears as a gold term
- Controlled by parameter **alpha**

This integrates weak supervision:
- gold terms become more prominent in the final ranking  
- irrelevant but frequent n-grams get lower priority


In [20]:
def build_supervised_tfidf(tfidf_series, gold_counts, alpha=1.0):
    """
    Boost TF-IDF scores for n-grams that appear more often as gold terms.

    tfidf_series: pandas Series, index = n-gram string, value = base TF-IDF
    gold_counts:  pandas Series, index = term string, value = gold frequency
    alpha:        strength of boosting (0 = no supervision effect)
    """
    if len(gold_counts) > 0:
        max_gold = gold_counts.max()
    else:
        max_gold = 1

    scores = {}

    for ng, base_tfidf in tfidf_series.items():
        # how many times this n-gram is annotated as a term
        gold_freq = gold_counts.get(ng, 0)

        if gold_freq > 0:
            norm = gold_freq / max_gold        # in (0,1]
            boost = 1.0 + alpha * norm         # >= 1.0
        else:
            boost = 1.0                        # no boost for non-gold

        scores[ng] = base_tfidf * boost #boosting frequency

    supervised_series = pd.Series(scores).sort_values(ascending=False)
    return supervised_series

In [21]:
alpha = 1.0  # try 1.0, 2.0, etc.
tfidf_supervised = build_supervised_tfidf(tfidf_series_pos, gold_counts, alpha=alpha)

tfidf_supervised.head(20)


rifiuti             1442.541551
vetro               1194.059247
raccolta            1088.279058
carta               1061.493247
conferimento         946.581231
porta                937.244905
porta a porta        888.140352
multimateriale       789.727999
plastica             636.781004
cartone              636.476166
indifferenziato      561.427439
rifiuti organici     531.419944
contenitori          523.482496
imballaggi           521.758950
carta e cartone      500.262752
servizio             495.492322
organici             401.966809
frazione             378.558170
utenze               354.712052
ore                  348.895607
dtype: float64

### **4. Build memory (vocabulary of statistical candidates)**

select the highest-scoring n-grams as our "memory":
a dictionary mapping n-gram length → set of n-gram tuples.
Only n-grams with supervised TF-IDF above a threshold are kept.

In [22]:
   
def build_ngram_memory_tfidf(tfidf_series, threshold):
    """
    Select n-grams with TF-IDF above threshold and build
    a dictionary memory_by_len[length] = set of tuples.
    """
    selected = tfidf_series[tfidf_series >= threshold]
    termlist = list(selected.index)

    memory_by_len = {}
    for phrase in termlist:
        tokens = phrase.split(" ")
        L = len(tokens)
        if L not in memory_by_len:
            memory_by_len[L] = set()
        memory_by_len[L].add(tuple(tokens))

    max_len = max(memory_by_len.keys()) if memory_by_len else 0
    return memory_by_len, max_len

threshold = 100.0  # TF-IDF threshold on the *boosted* scores
memory_by_len_sup, max_ngram_len_sup = build_ngram_memory_tfidf(tfidf_supervised, threshold)

print("Threshold:", threshold)
print("Max n-gram length in memory:", max_ngram_len_sup)
total_terms = sum(len(s) for s in memory_by_len_sup.values())
print("Number of n-grams selected:", total_terms)



Threshold: 100.0
Max n-gram length in memory: 5
Number of n-grams selected: 186


### **5. Extract candidates from each DEV sentence**

In [23]:
def tokenize(text: str):
    """Simple tokenizer compatible with our memory matcher."""
    doc = nlp(text)
    return [t.text.lower() for t in doc if t.is_alpha]

def extract_terms_from_sentence_ngram(sentence, memory_by_len, max_ngram_len):
    doc = nlp(sentence)

    # --- 1) Dependency-based subtree extraction
    dep_candidates = spacy_dependency_subtrees(doc)

    # --- 2) Standard n-gram scanning
    tokens = [t.text.lower() for t in doc if t.is_alpha]
    L = len(tokens)
    i = 0
    found_terms = []

    while i < L:
        matched = None
        matched_len = 0

        for span_len in range(max_ngram_len, 0, -1):
            if i + span_len > L:
                continue
            span = tuple(tokens[i:i+span_len])
            if span_len in memory_by_len and span in memory_by_len[span_len]:
                matched = " ".join(span)
                matched_len = span_len
                break

        if matched:
            found_terms.append(matched)
            i += matched_len
        else:
            i += 1

    # --- 3) Combine & deduplicate
    all_candidates = dep_candidates + found_terms
    seen = set()
    unique = []
    for t in all_candidates:
        if t not in seen:
            seen.add(t)
            unique.append(t)

    return unique


In [24]:
# Build prediction dataframe
rows = []

for _, row in dev_df.iterrows():
    doc_id    = row["document_id"]
    par_id    = row["paragraph_id"]
    sent_id   = row["sentence_id"]
    sent_text = row["sentence_text"]

    preds = extract_terms_from_sentence_ngram(
        sent_text,
        memory_by_len_sup,
        max_ngram_len_sup
    )

    for term in preds:
        rows.append({
            "document_id": doc_id,
            "paragraph_id": par_id,
            "sentence_id": sent_id,
            "sentence_text": sent_text,
            "term": term
        })

dev_df_tfidf_sup = pd.DataFrame(rows)

### Save predictions

In [77]:
import json
import os
from typing import Dict, List
import pandas as pd

def save_predictions_stats(pred_df: pd.DataFrame,
                           dev_df: pd.DataFrame,
                           output_path: str):

    output = {"data": []}

    # Group predicted terms by (doc, par, sentence)
    grouped = pred_df.groupby(
        ["document_id", "paragraph_id", "sentence_id"]
    )["term"].apply(list).to_dict()

    for _, row in dev_df.iterrows():
        key = (
            row["document_id"],
            row["paragraph_id"],
            row["sentence_id"],
        )

        term_list = grouped.get(key, [])

        output["data"].append({
            "document_id": row["document_id"],
            "paragraph_id": row["paragraph_id"],
            "sentence_id": row["sentence_id"],
            "term_list": term_list
        })

    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"✓ Saved predictions to {output_path}")




In [79]:
output_path = "../predictions/subtask_a_statistics_extraction_improved.json"


save_predictions_stats(
    pred_df=dev_df_tfidf_sup,
    dev_df=dev_df,
    output_path=output_path
)


✓ Saved predictions to ../predictions/subtask_a_statistics_extraction_improved.json


### Evaluation and debug

In [25]:
p_micro, r_micro, f1_micro = micro_f1(dev_df, dev_df_tfidf_sup)
p_type,  r_type,  f1_type  = type_f1(dev_df, dev_df_tfidf_sup)

print("=== SUPERVISED (GOLD-BOOSTED - IMPROVED) TF-IDF MODEL ===")
print("alpha      :", alpha)
print("threshold  :", threshold)
print(f"Micro P/R/F: {p_micro:.3f} / {r_micro:.3f} / {f1_micro:.3f}")
print(f"Type  P/R/F: {p_type:.3f} / {r_type:.3f} / {f1_type:.3f}")







=== SUPERVISED (GOLD-BOOSTED - IMPROVED) TF-IDF MODEL ===
alpha      : 1.0
threshold  : 100.0
Micro P/R/F: 0.083 / 0.271 / 0.127
Type  P/R/F: 0.047 / 0.322 / 0.082


In [None]:
def get_fp_fn(gold_df, pred_df):
    # normalize and reduce to comparable columns
    def normalize(df):
        df = df.copy()
        df["term"] = df["term"].str.lower().str.strip()
        return df[["document_id", "paragraph_id", "sentence_id", "term"]]

    gold = normalize(gold_df)
    pred = normalize(pred_df)

    gold_set = set(gold.itertuples(index=False, name=None))
    pred_set = set(pred.itertuples(index=False, name=None))

    tp = gold_set & pred_set
    fp = pred_set - gold_set
    fn = gold_set - pred_set

    # convert back to dataframes for readability
    fp_df = pd.DataFrame(list(fp), columns=["document_id", "paragraph_id", "sentence_id", "term"])
    fn_df = pd.DataFrame(list(fn), columns=["document_id", "paragraph_id", "sentence_id", "term"])

    return fp_df, fn_df

fp_df, fn_df = get_fp_fn(dev_df, dev_df_tfidf_sup)

print("False Positives:", len(fp_df))
print("False Negatives:", len(fn_df))


False Positives: 2133
False Negatives: 574


In [83]:
print("\n=== FALSE POSITIVES (predicted but not gold) ===")
display(fp_df.head(20))

print("\n=== FALSE NEGATIVES (gold but missed) ===")
display(fn_df.head(20))


=== FALSE POSITIVES (predicted but not gold) ===


Unnamed: 0,document_id,paragraph_id,sentence_id,term
0,doc_praiano_07,28,0,l' immediata eseguibilità del presente atto
1,doc_sarno_12,3,4,in alcuni casi
2,doc_praiano_05,13,2,gestione
3,doc_agropoli_09,17,2,dei rifiuti
4,doc_sorrento_15,2,0,di bonifica
5,doc_capaccio_21,1,7,in carta e cartone
6,doc_nola_02,8,23,frazione
7,doc_battipaglia_02,20,2,dell' ordinamento
8,doc_francavillais_02,12,17,tale organo
9,doc_nola_02,8,23,con il relativo codice cer



=== FALSE NEGATIVES (gold but missed) ===


Unnamed: 0,document_id,paragraph_id,sentence_id,term
0,doc_agropoli_09,29,1,
1,doc_caserta_02,66,1,porta a porta
2,doc_salerno_03,1,0,differenziata
3,doc_caserta_02,64,12,r.a.e.e.
4,doc_sorrento_22,2,0,pneumatici
5,doc_salerno_05,7,2,sacchetti di carta
6,doc_auletta_04,9,4,
7,doc_salerno_06,27,1,svuotamento dei carrellati condominiali
8,doc_agropoli_13,1,14,conferire
9,doc_auletta_13,36,1,gestore dello spazzamento e lavaggio


## Precision-Prior Filtering for Statistical Term Extraction
This section implements a refinement over pure TF-IDF statistical extraction.

We add:
- **Gold term normalization**
- **A prior precision estimate for each n-gram**
- **A dual-threshold filter** combining TF-IDF + precision-prior
- **Prediction extraction**
- **Evaluation**
- **Grid search hyperparameter tuning**


In [50]:
def normalize_gold_terms(gold_series):
    valid_terms = []

    for term in gold_series.dropna():
        term = term.lower().strip()
        if not term:
            continue
        doc = nlp(term)
        tokens = [t for t in doc if not t.is_space]

        if len(tokens) == 1:
            if is_valid_unigram_token(tokens[0]):
                valid_terms.append(term)
        else:
            if is_valid_ngram_span(tokens):
                valid_terms.append(term)

    return pd.Series(valid_terms)


In [51]:
gold_terms_clean = normalize_gold_terms(train_df["term"])
gold_counts = gold_terms_clean.value_counts()

print("Original gold count:", len(train_df["term"]))
print("Filtered gold count:", len(gold_terms_clean))


Original gold count: 3423
Filtered gold count: 1487


#### Building the precision-prior table

`prior_df` merges:
- `tf`: how often each n-gram appears in the whole corpus
- `gold`: how often it appears as a gold term
- `prec_prior`: an empirical estimate of the n-gram's "term-likeness"

### Interpretation:
- A high `prec_prior` means:
  - the n-gram appears frequently **as a gold term**
  - relative to how often it appears overall
- A low `prec_prior` means:
  - the n-gram is frequent but **rarely annotated** → likely NOT a term

This acts as a **weakly supervised precision signal**.


In [52]:
prior_df = pd.DataFrame({
    "tf": tf_series_pos,
    "gold": gold_counts
}).fillna(0)

prior_df["prec_prior"] = prior_df["gold"] / prior_df["tf"]

#### Building the precision-aware vocabulary (statistical memory)

This function combines:
- **TF-IDF score** (importance in corpus)
- **Precision-prior** (how often it's a gold term)
to filter n-grams.

We keep only the n-grams such as:
- "rifiuti urbani"
- "centro di raccolta"
- "plastica e metalli"

and discard irrelevant ones:
- "sindaco del comune"
- "1° aprile"
- "componenti essenziali"

Output:
- `memory_by_len[L]` = set of valid n-grams of length L
- `max_ngram_len_hp` = maximum length of any stored n-gram



In [None]:
prec_threshold = 0.2  # broader 0.1

def build_memory_with_prec_filter(tfidf_series, prior_df, tfidf_threshold, prec_threshold):
    # join tfidf with prior
    df = pd.DataFrame({"tfidf": tfidf_series}).join(prior_df[["prec_prior"]], how="left")
    df["prec_prior"] = df["prec_prior"].fillna(0.0)

    # keep only n-grams that pass both thresholds
    selected = df[(df["tfidf"] >= tfidf_threshold) & (df["prec_prior"] >= prec_threshold)]

    memory_by_len = {}
    for phrase in selected.index:
        tokens = phrase.split(" ")
        L = len(tokens)
        if L not in memory_by_len:
            memory_by_len[L] = set()
        memory_by_len[L].add(tuple(tokens))

    max_len = max(memory_by_len.keys()) if memory_by_len else 0
    return memory_by_len, max_len

tfidf_threshold = 20  #broad 2
memory_by_len_hp, max_ngram_len_hp = build_memory_with_prec_filter(
    tfidf_supervised,  # your gold-boosted tfidf
    prior_df,
    tfidf_threshold,
    prec_threshold,
)


#### Extracting candidate terms from DEV using the precision-prior memory

For each sentence:
1. It is tokenized.
2. We scan through all tokens and attempt to match:
   - longest possible n-grams first,
   - checking membership in `memory_by_len_hp`

This gives a *precision-filtered statistical baseline*.


In [61]:
rows = []

for _, row in dev_df.iterrows():
    doc_id    = row["document_id"]
    par_id    = row["paragraph_id"]
    sent_id   = row["sentence_id"]
    sent_text = row["sentence_text"]

    preds = extract_terms_from_sentence_ngram(
        sent_text,
        memory_by_len_hp,
        max_ngram_len_hp
    )

    for term in preds:
        rows.append({
            "document_id": doc_id,
            "paragraph_id": par_id,
            "sentence_id": sent_id,
            "sentence_text": sent_text,
            "term": term
        })

dev_pred_hp = pd.DataFrame(rows)
print("Number of predicted term occurrences (precision-prior model):", len(dev_pred_hp))
dev_pred_hp.head(10)


Number of predicted term occurrences (precision-prior model): 345


Unnamed: 0,document_id,paragraph_id,sentence_id,sentence_text,term
0,doc_caserta_06,3,1,"Il presente disciplinare per la gestione dei centri di raccolta comunali è stato redatto ai sensi e per effetto del DM 13/05/2009, pubblicato sulla G.U. n. 165 del 18/07/2009, con il quale sono st...",centri di raccolta
1,doc_caserta_06,3,1,"Il presente disciplinare per la gestione dei centri di raccolta comunali è stato redatto ai sensi e per effetto del DM 13/05/2009, pubblicato sulla G.U. n. 165 del 18/07/2009, con il quale sono st...",centri di raccolta
2,doc_salerno_05,24,17,Triciclo CCR/Servizio Ingombranti,ccr
3,doc_salerno_05,24,17,Triciclo CCR/Servizio Ingombranti,servizio ingombranti
4,doc_salerno_05,24,17,Triciclo CCR/Servizio Ingombranti,ccr
5,doc_salerno_05,24,17,Triciclo CCR/Servizio Ingombranti,servizio ingombranti
6,doc_caserta_06,6,2,- alla vigilanza nel rispetto delle norme del C.S.A. e sulla corretta gestione del centro di raccolta;,centro di raccolta
7,doc_praiano_05,15,1,Lunedì; Rifiuti Organici,rifiuti organici
8,doc_caserta_06,9,2,"Qualora l'utente fosse impossibilitato per forza maggiore ad effettuare il conferimento del rifiuto, può richiedere l'intervento dell'operatore ecologico presente.",rifiuto
9,doc_caserta_06,9,2,"Qualora l'utente fosse impossibilitato per forza maggiore ad effettuare il conferimento del rifiuto, può richiedere l'intervento dell'operatore ecologico presente.",rifiuto


In [62]:
p_micro, r_micro, f1_micro = micro_f1(dev_df, dev_pred_hp)
p_type,  r_type,  f1_type  = type_f1(dev_df, dev_pred_hp)

print("=== PRECISION-PRIOR FILTERED TF-IDF MODEL ===")
print(f"tfidf_threshold : {tfidf_threshold}")
print(f"prec_threshold  : {prec_threshold}")
print(f"Micro P/R/F     : {p_micro:.4f} / {r_micro:.4f} / {f1_micro:.4f}")
print(f"Type  P/R/F     : {p_type:.4f} / {r_type:.4f} / {f1_type:.4f}")


=== PRECISION-PRIOR FILTERED TF-IDF MODEL ===
tfidf_threshold : 20
prec_threshold  : 0.2
Micro P/R/F     : 0.8276 / 0.1540 / 0.2597
Type  P/R/F     : 0.9375 / 0.1860 / 0.3103


In [63]:
output_path = "../predictions/subtask_a_statistics_precision_strong.json"

save_predictions_stats(
    pred_df=dev_pred_hp,
    dev_df=dev_df,
    output_path=output_path,
)

✓ Saved predictions to ../predictions/subtask_a_statistics_precision_strong.json


## Grid Search Hyperparameter Tuning

We sweep:
- several **TF-IDF thresholds**
- several **precision-prior thresholds**

For each combination:
- build memory
- extract predictions
- compute metrics
- store results

This allows us to find the best trade-off between:
- precision
- recall
- vocabulary size
- number of predictions


In [40]:
import pandas as pd

# define grids – adjust as you like
tfidf_thresholds = [2.0, 5.0, 10.0, 20.0, 30.0, 40.0]
prec_thresholds  = [0.0, 0.1, 0.2, 0.3]

results = []

for tfidf_thr in tfidf_thresholds:
    for prec_thr in prec_thresholds:
        # 1) Build memory with both thresholds
        memory_by_len_hp, max_ngram_len_hp = build_memory_with_prec_filter(
            tfidf_supervised,
            prior_df,
            tfidf_threshold=tfidf_thr,
            prec_threshold=prec_thr,
        )

        # if memory is empty, skip
        if max_ngram_len_hp == 0:
            results.append({
                "tfidf_thr": tfidf_thr,
                "prec_thr": prec_thr,
                "micro_p": 0.0,
                "micro_r": 0.0,
                "micro_f1": 0.0,
                "type_p": 0.0,
                "type_r": 0.0,
                "type_f1": 0.0,
                "num_terms_in_memory": 0,
                "num_preds": 0,
            })
            continue

        # 2) Predict on dev
        rows = []
        for _, row in dev_df.iterrows():
            doc_id    = row["document_id"]
            par_id    = row["paragraph_id"]
            sent_id   = row["sentence_id"]
            sent_text = row["sentence_text"]

            preds = extract_terms_from_sentence_ngram(
                sent_text,
                memory_by_len_hp,
                max_ngram_len_hp
            )

            for term in preds:
                rows.append({
                    "document_id": doc_id,
                    "paragraph_id": par_id,
                    "sentence_id": sent_id,
                    "sentence_text": sent_text,
                    "term": term
                })

        dev_pred_hp = pd.DataFrame(rows)

        # 3) Evaluate
        p_micro, r_micro, f1_micro = micro_f1(dev_df, dev_pred_hp)
        p_type,  r_type,  f1_type  = type_f1(dev_df, dev_pred_hp)

        # 4) Store results
        num_terms_in_memory = sum(len(s) for s in memory_by_len_hp.values())
        num_preds = len(dev_pred_hp)

        results.append({
            "tfidf_thr": tfidf_thr,
            "prec_thr": prec_thr,
            "micro_p": p_micro,
            "micro_r": r_micro,
            "micro_f1": f1_micro,
            "type_p": p_type,
            "type_r": r_type,
            "type_f1": f1_type,
            "num_terms_in_memory": num_terms_in_memory,
            "num_preds": num_preds,
        })

# Collect into a DataFrame
grid_df = pd.DataFrame(results)

# Round for readability
for col in ["micro_p", "micro_r", "micro_f1", "type_p", "type_r", "type_f1"]:
    grid_df[col] = grid_df[col].apply(lambda x: float(f"{x:.3f}"))

grid_df


Unnamed: 0,tfidf_thr,prec_thr,micro_p,micro_r,micro_f1,type_p,type_r,type_f1,num_terms_in_memory,num_preds
0,2.0,0.0,0.092,0.294,0.141,0.076,0.446,0.13,9813,4045
1,2.0,0.1,0.684,0.208,0.319,0.714,0.289,0.412,269,548
2,2.0,0.2,0.8,0.169,0.28,0.821,0.227,0.356,212,386
3,2.0,0.3,0.857,0.092,0.167,0.842,0.132,0.229,140,196
4,5.0,0.0,0.105,0.303,0.156,0.091,0.446,0.151,5249,3716
5,5.0,0.1,0.685,0.207,0.318,0.719,0.285,0.408,242,539
6,5.0,0.2,0.804,0.168,0.278,0.831,0.223,0.352,185,377
7,5.0,0.3,0.866,0.091,0.165,0.861,0.128,0.223,113,187
8,10.0,0.0,0.113,0.308,0.166,0.1,0.426,0.161,3547,3503
9,10.0,0.1,0.687,0.203,0.313,0.733,0.273,0.398,203,530


In [41]:
# Sort by micro F1 (descending)
grid_df.sort_values("micro_f1", ascending=False).head(10)

Unnamed: 0,tfidf_thr,prec_thr,micro_p,micro_r,micro_f1,type_p,type_r,type_f1,num_terms_in_memory,num_preds
1,2.0,0.1,0.684,0.208,0.319,0.714,0.289,0.412,269,548
5,5.0,0.1,0.685,0.207,0.318,0.719,0.285,0.408,242,539
9,10.0,0.1,0.687,0.203,0.313,0.733,0.273,0.398,203,530
13,20.0,0.1,0.685,0.193,0.301,0.759,0.248,0.374,151,514
17,30.0,0.1,0.705,0.19,0.299,0.806,0.24,0.369,110,504
21,40.0,0.1,0.693,0.177,0.282,0.81,0.211,0.334,83,467
2,2.0,0.2,0.8,0.169,0.28,0.821,0.227,0.356,212,386
6,5.0,0.2,0.804,0.168,0.278,0.831,0.223,0.352,185,377
10,10.0,0.2,0.81,0.164,0.273,0.864,0.211,0.339,146,368
14,20.0,0.2,0.828,0.154,0.26,0.938,0.186,0.31,94,345
