# NLTK Term Extraction for Italian Text

This notebook demonstrates two approaches to term extraction:
1. **Baseline**: Simple substring and fuzzy matching
2. **Trained**: Statistical model using TF-IDF and collocations

Dataset: EVALITA 2025 ATE-IT (Automatic Term Extraction - Italian Testbed)

## Setup and Imports

In [5]:
import difflib
import json
import math
import os
import pickle
import re
from collections import Counter, defaultdict
from typing import List, Dict, Set, Tuple

import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.util import ngrams
from tqdm import tqdm

# Download required NLTK data.
# word_tokenize needs the Punkt sentence models. Older NLTK releases read them
# from the 'punkt' package, newer ones from 'punkt_tab', so fetch both; the
# unused one is simply ignored by the installed version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
print("Setup complete")

Setup complete


## Data Loading and Processing

In [2]:
def load_jsonl(path: str) -> List[Dict]:
    """Read a UTF-8 file holding either one JSON document or JSON Lines.

    The whole file is first parsed as a single JSON value and returned as-is
    (so a top-level object comes back as a dict, an array as a list). Only if
    that fails is the content treated as JSONL: each non-blank line is parsed
    on its own and the results are collected in a list.

    Args:
        path: Location of the file to read.

    Returns:
        The parsed JSON value, a list of per-line values, or an empty list
        when the file contains nothing but whitespace.

    Raises:
        json.JSONDecodeError: If a line of the JSONL fallback is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        content = handle.read().strip()

    if content == "":
        return []

    try:
        # Whole-file parse covers plain JSON objects and arrays.
        return json.loads(content)
    except json.JSONDecodeError:
        # Not a single document: parse line by line, ignoring blank lines.
        stripped_lines = (raw.strip() for raw in content.splitlines())
        return [json.loads(entry) for entry in stripped_lines if entry]


def build_sentence_gold_map(records: List[Dict]) -> List[Dict]:
    """Group dataset rows by sentence and collect each sentence's terms.

    Rows are keyed by (document_id, paragraph_id, sentence_id). The first row
    seen for a key supplies 'sentence_text'; every row for that key then
    contributes its terms, without duplicates and in first-seen order.

    Accepted inputs:
    - a plain list of rows, or a dict that wraps the rows under 'data';
    - rows with a 'term_list' list, or rows with a single 'term' value.

    Returns:
        One dict per sentence, in order of first appearance, with the keys
        'document_id', 'paragraph_id', 'sentence_id', 'sentence_text', 'terms'.
    """
    is_wrapped = isinstance(records, dict) and 'data' in records
    rows = records['data'] if is_wrapped else records

    grouped = {}
    for row in rows:
        doc_id = row.get('document_id')
        par_id = row.get('paragraph_id')
        sent_id = row.get('sentence_id')

        # setdefault keeps the entry created by the first row of a sentence.
        entry = grouped.setdefault((doc_id, par_id, sent_id), {
            'document_id': doc_id,
            'paragraph_id': par_id,
            'sentence_id': sent_id,
            'sentence_text': row.get('sentence_text', ''),
            'terms': []
        })

        # A list-valued 'term_list' wins; otherwise fall back to the single 'term'.
        term_list = row.get('term_list')
        candidates = term_list if isinstance(term_list, list) else [row.get('term')]

        collected = entry['terms']
        for candidate in candidates:
            # Skip empty values and terms already recorded for this sentence.
            if candidate and candidate not in collected:
                collected.append(candidate)

    return list(grouped.values())


# Test: Load a small sample
# One record in the wrapped {'data': [...]} layout carrying a 'term_list',
# i.e. one of the input formats build_sentence_gold_map accepts.
test_data = {
    'data': [
        {
            'document_id': 'doc1',
            'paragraph_id': 'p1',
            'sentence_id': 's1',
            'sentence_text': 'La tassa di successione è un tributo.',
            'term_list': ['tassa di successione', 'tributo']
        }
    ]
}

test_sentences = build_sentence_gold_map(test_data)
# Exactly one sentence is expected, with its terms kept in input order.
assert len(test_sentences) == 1
assert test_sentences[0]['terms'] == ['tassa di successione', 'tributo']
print("✓ Data loading functions work correctly")

✓ Data loading functions work correctly


In [3]:
# Load actual training and dev data
# (the paths are relative, so they resolve against the kernel's working directory)
train_data = load_jsonl('../data/subtask_a_train.json')
dev_data = load_jsonl('../data/subtask_a_dev.json')

# Collapse the raw rows into one entry per sentence with its gold terms.
train_sentences = build_sentence_gold_map(train_data)
dev_sentences = build_sentence_gold_map(dev_data)

print(f"Training sentences: {len(train_sentences)}")
print(f"Dev sentences: {len(dev_sentences)}")
print(f"\nExample sentence:")
# Note: the trailing "..." is literal text; the sentence is printed in full.
print(f"  Text: {train_sentences[6]['sentence_text']}...")
print(f"  Terms: {train_sentences[6]['terms']}")

Training sentences: 2308
Dev sentences: 577

Example sentence:
  Text: AFFIDAMENTO DEL “SERVIZIO DI SPAZZAMENTO, RACCOLTA, TRASPORTO E SMALTIMENTO/RECUPERO DEI RIFIUTI URBANI ED ASSIMILATI E SERVIZI COMPLEMENTARI DELLA CITTA' DI AGROPOLI” VALEVOLE PER UN QUINQUENNIO...
  Terms: ['raccolta', 'recupero', 'servizio di raccolta', 'servizio di spazzamento', 'smaltimento', 'trasporto']


## Evaluation Metrics

Using the official evaluation metrics from the competition.

In [4]:
def micro_f1_score(gold_standard, system_output):
    """
    Micro-averaged Precision, Recall and F1 over per-item term sets.

    Items are paired positionally. Within each pair the gold and system terms
    are compared as sets, and the resulting true/false positive and false
    negative counts are summed over all pairs before the ratios are taken.

    Args:
        gold_standard: List of lists, one list of gold terms per item
        system_output: List of lists, one list of extracted terms per item

    Returns:
        Tuple (precision, recall, f1, tp, fp, fn). A ratio whose denominator
        would be zero is reported as 0.
    """
    tp = 0
    fp = 0
    fn = 0

    for gold_terms, predicted_terms in zip(gold_standard, system_output):
        expected = set(gold_terms)
        predicted = set(predicted_terms)

        tp += len(expected & predicted)   # found and correct
        fp += len(predicted - expected)   # found but not in gold
        fn += len(expected - predicted)   # in gold but missed

    predicted_total = tp + fp
    gold_total = tp + fn

    if predicted_total > 0:
        precision = tp / predicted_total
    else:
        precision = 0

    if gold_total > 0:
        recall = tp / gold_total
    else:
        recall = 0

    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1, tp, fp, fn


def type_f1_score(gold_standard, system_output):
    """
    Type-level Precision, Recall and F1 over the whole dataset.

    All gold terms are pooled into one set and all system terms into another,
    so each distinct term counts once no matter how many items contain it.

    Args:
        gold_standard: List of lists, one list of gold terms per item
        system_output: List of lists, one list of extracted terms per item

    Returns:
        Tuple (type_precision, type_recall, type_f1). A ratio whose
        denominator would be zero is reported as 0.
    """
    # Pool the per-item lists into dataset-wide vocabularies.
    gold_types = set().union(*gold_standard)
    system_types = set().union(*system_output)

    hits = len(gold_types & system_types)        # extracted and in gold
    spurious = len(system_types - gold_types)    # extracted, not in gold
    missed = len(gold_types - system_types)      # in gold, never extracted

    if (hits + spurious) > 0:
        type_precision = hits / (hits + spurious)
    else:
        type_precision = 0

    if (hits + missed) > 0:
        type_recall = hits / (hits + missed)
    else:
        type_recall = 0

    if (type_precision + type_recall) > 0:
        type_f1 = 2 * (type_precision * type_recall) / (type_precision + type_recall)
    else:
        type_f1 = 0

    return type_precision, type_recall, type_f1


# Test: Simple case
# Two items: the first shares only 'term1' with the prediction, the second matches fully.
gold_test = [['term1', 'term2'], ['term3']]
pred_test = [['term1', 'term4'], ['term3']]
precision, recall, f1, tp, fp, fn = micro_f1_score(gold_test, pred_test)
assert tp == 2  # term1 and term3
assert fp == 1  # term4
assert fn == 1  # term2
print("✓ Evaluation functions work correctly")
print(f"  Test metrics: P={precision:.2f}, R={recall:.2f}, F1={f1:.2f}")

# Test type-level metrics
# (printed only; no assertion is made on the type-level values)
type_p, type_r, type_f1 = type_f1_score(gold_test, pred_test)
print(f"  Type metrics: P={type_p:.2f}, R={type_r:.2f}, F1={type_f1:.2f}")

✓ Evaluation functions work correctly
  Test metrics: P=0.67, R=0.67, F1=0.67
  Type metrics: P=0.67, R=0.67, F1=0.67


## Baseline Model: Substring and Fuzzy Matching

Simple approach:
- Exact substring matching on normalized text
- Fuzzy matching with similarity threshold for approximate matches

In [None]:
class NLTKSubstringBaseline:
    """Baseline using substring and fuzzy matching.

    A term from the vocabulary is predicted for a sentence when either
    - its normalized form occurs as a substring of the normalized sentence, or
    - it is a close match (difflib ratio >= ``threshold``) of one of the
      sentence's whitespace-separated tokens.

    Attributes are plain lists/numbers so the model can be pickled as a dict.
    """

    def __init__(self, threshold: float = 0.8, n_matches: int = 3):
        """
        Args:
            threshold: Minimum difflib similarity for a fuzzy match.
            n_matches: Maximum number of fuzzy matches kept per token.
        """
        self.terms = []       # original term spellings
        self.norm_terms = []  # normalized spellings, parallel to self.terms
        self.threshold = threshold  # Fuzzy match threshold
        self.n_matches = n_matches  # Max fuzzy matches per token

    def _normalize(self, text: str) -> str:
        """Lowercase, remove punctuation, normalize whitespace."""
        text = text.lower()
        text = re.sub(r"[^\w\sÀ-ÖØ-öø-ÿ]+", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def build(self, term_list: List[str]):
        """Prepare term vocabulary (empty entries are dropped)."""
        self.terms = [t for t in term_list if t]
        self.norm_terms = [self._normalize(t) for t in self.terms]

    def predict(self, sentences: List[str]) -> List[List[str]]:
        """Extract terms from sentences.

        Args:
            sentences: Raw sentence strings.

        Returns:
            One alphabetically sorted list of matched original terms per sentence.
        """
        # Map each normalized form to the FIRST original term producing it.
        # This mirrors list.index() on norm_terms but avoids a linear scan
        # for every fuzzy hit.
        first_original = {}
        for orig, nt in zip(self.terms, self.norm_terms):
            first_original.setdefault(nt, orig)

        # Fuzzy matches depend only on the token and the (fixed) vocabulary,
        # so compute them once per distinct token across all sentences.
        fuzzy_cache = {}

        results = []
        for s in tqdm(sentences, desc="Predicting", total=len(sentences)):
            ns = self._normalize(s)
            found = set()

            # Exact substring match. This is a plain substring test, so a
            # short term also matches inside a longer word.
            for orig, nt in zip(self.terms, self.norm_terms):
                if nt and nt in ns:
                    found.add(orig)

            # Fuzzy matching on tokens
            for token in ns.split():
                hits = fuzzy_cache.get(token)
                if hits is None:
                    close = difflib.get_close_matches(
                        token, self.norm_terms,
                        n=self.n_matches,
                        cutoff=self.threshold
                    )
                    hits = [first_original[m] for m in close]
                    fuzzy_cache[token] = hits
                found.update(hits)

            results.append(sorted(found))
        return results

    def save(self, path: str):
        """Save model to disk, creating the parent directory if needed."""
        model_data = {
            'terms': self.terms,
            'norm_terms': self.norm_terms,
            'threshold': self.threshold,
            'n_matches': self.n_matches
        }
        # Without this a fresh checkout (no output directory yet) fails in open().
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {path}")

    def load(self, path: str):
        """Load model from disk.

        SECURITY: pickle.load can execute arbitrary code; only load files
        that were produced by ``save`` from a trusted source.
        """
        with open(path, 'rb') as f:
            model_data = pickle.load(f)
        self.terms = model_data['terms']
        self.norm_terms = model_data['norm_terms']
        self.threshold = model_data['threshold']
        self.n_matches = model_data['n_matches']
        print(f"Model loaded from {path}")


# Test: Simple predictions
# Both vocabulary terms occur verbatim in the sentence, so each must be returned.
baseline = NLTKSubstringBaseline(threshold=0.8, n_matches=3)
baseline.build(['tributo', 'tassa di successione'])
test_preds = baseline.predict(['Il tributo è una tassa di successione.'])
assert 'tributo' in test_preds[0]
assert 'tassa di successione' in test_preds[0]
print("✓ Baseline model works correctly")
print(f"  Test predictions: {test_preds[0]}")

Predicting: 100%|██████████| 1/1 [00:00<?, ?it/s]

✓ Baseline model works correctly
  Test predictions: ['tassa di successione', 'tributo']





### Run and Evaluate Baseline Model


**Additional configurations to test**
- Test with different *threshold* and *n_matches* values

In [None]:
# Extract unique terms from training data
train_terms = set()
for s in train_sentences:
    train_terms.update(t for t in s['terms'] if t)

print(f"Unique training terms: {len(train_terms)}")

# Build baseline model
# (the vocabulary is sorted so its order does not depend on set iteration order)
baseline_model = NLTKSubstringBaseline(threshold=0.8, n_matches=3)
baseline_model.build(sorted(train_terms))

# Predict on dev set
# dev_texts and dev_gold are parallel lists, one entry per dev sentence.
dev_texts = [s['sentence_text'] for s in dev_sentences]
dev_gold = [s['terms'] for s in dev_sentences]

baseline_preds = baseline_model.predict(dev_texts)

# Evaluate
precision, recall, f1, tp, fp, fn = micro_f1_score(dev_gold, baseline_preds)
type_precision, type_recall, type_f1 = type_f1_score(dev_gold, baseline_preds)

print("\nBaseline Model Results:")
print("Micro-averaged metrics:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print(f"  TP={tp}, FP={fp}, FN={fn}")
print("\nType-level metrics:")
print(f"  Type Precision: {type_precision:.4f}")
print(f"  Type Recall:    {type_recall:.4f}")
print(f"  Type F1 Score:  {type_f1:.4f}")

# Store metrics for later comparison
# (precision/recall/f1 are generic names, so the values are copied into a dict here)
baseline_metrics = {
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'type_precision': type_precision,
    'type_recall': type_recall,
    'type_f1': type_f1
}

Unique training terms: 713


Predicting: 100%|██████████| 577/577 [00:02<00:00, 196.24it/s]


Baseline Model Results:
Micro-averaged metrics:
  Precision: 0.1599
  Recall:    0.7428
  F1 Score:  0.2632
  TP=335, FP=1760, FN=116

Type-level metrics:
  Type Precision: 0.5569
  Type Recall:    0.5661
  Type F1 Score:  0.5615





In [8]:
# Save baseline model
# (relative path: the file is written under the kernel's working directory)
baseline_model.save('models/nltk_baseline.pkl')

Model saved to models/nltk_baseline.pkl


## Trained Model: TF-IDF and Collocation Detection

Statistical approach that learns from training data:
- **TF-IDF**: Identifies important terms based on frequency and document distribution
- **Collocations**: Detects statistically significant n-grams (bigrams, trigrams)
- **PMI scores**: Measures how often words appear together vs. independently (cf. https://web.stanford.edu/~jurafsky/slp3/J.pdf)


**Additional configurations to test**
- Test with different *tfidf_threshold*, *collocation_threshold*, and *min_freq* values

In [9]:
class NLTKTrainedModel:
    """Statistical term extractor using TF-IDF and collocations.

    Training records, for every gold term, how often it was annotated and a
    TF-IDF style score derived from those counts, and it collects bigrams and
    trigrams of the training text whose PMI reaches ``collocation_threshold``.

    Prediction returns a known term for a sentence when either
    - its normalized form is a substring of the normalized sentence and its
      score is at least ``tfidf_threshold``, or
    - its normalized form equals a learned bigram/trigram found in the sentence.
    """

    def __init__(self, tfidf_threshold=0.1, collocation_threshold=3.0, min_freq=2):
        """
        Args:
            tfidf_threshold: Minimum score a known term needs to be predicted
                via substring matching.
            collocation_threshold: Minimum PMI for an n-gram to be kept.
            min_freq: Minimum corpus frequency for an n-gram to be scored.
        """
        self.tfidf_threshold = tfidf_threshold
        self.collocation_threshold = collocation_threshold
        self.min_freq = min_freq
        self._reset_learned_state()

    def _reset_learned_state(self):
        """Clear everything that train() learns, leaving hyper-parameters intact."""
        # Learned features
        self.term_tfidf = {}
        self.term_freq = Counter()
        self.doc_freq = Counter()
        self.bigram_scores = {}
        self.trigram_scores = {}
        self.known_terms = set()
        self.vocab = set()
        self.n_docs = 0

    def _normalize(self, text: str) -> str:
        """Lowercase, replace punctuation with spaces, collapse whitespace."""
        text = text.lower()
        text = re.sub(r"[^\w\sÀ-ÖØ-öø-ÿ]+", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize with NLTK's Italian tokenizer, or by whitespace as a fallback.

        Only LookupError (NLTK's signal for a missing tokenizer resource)
        triggers the fallback; any other error propagates instead of being
        silently swallowed.
        """
        try:
            return word_tokenize(text, language='italian')
        except LookupError:
            return text.split()

    def _compute_tfidf(self):
        """Compute TF-IDF: term frequency × log(N / document frequency)."""
        self.term_tfidf = {}
        for term in self.known_terms:
            if term not in self.term_freq or term not in self.doc_freq:
                continue
            tf = self.term_freq[term]
            df = self.doc_freq[term]
            idf = math.log(self.n_docs / df) if df > 0 else 0
            self.term_tfidf[term] = tf * idf

    def _extract_collocations(self, sentences: List[str]):
        """Find statistically significant word combinations.

        All sentences are tokenized into ONE token stream, so n-grams may span
        sentence boundaries. N-grams below ``min_freq`` are discarded; the
        rest are kept when their PMI is at least ``collocation_threshold``.
        """
        all_tokens = []
        for s in sentences:
            tokens = self._tokenize(self._normalize(s))
            all_tokens.extend(tokens)
            self.vocab.update(tokens)

        # Bigrams
        if len(all_tokens) >= 2:
            finder = BigramCollocationFinder.from_words(all_tokens)
            finder.apply_freq_filter(self.min_freq)
            measures = BigramAssocMeasures()

            for (w1, w2), score in finder.score_ngrams(measures.pmi):
                if score >= self.collocation_threshold:
                    self.bigram_scores[(w1, w2)] = score

        # Trigrams
        if len(all_tokens) >= 3:
            finder = TrigramCollocationFinder.from_words(all_tokens)
            finder.apply_freq_filter(self.min_freq)
            measures = TrigramAssocMeasures()

            for (w1, w2, w3), score in finder.score_ngrams(measures.pmi):
                if score >= self.collocation_threshold:
                    self.trigram_scores[(w1, w2, w3)] = score

    def train(self, sentences: List[str], term_lists: List[List[str]]):
        """Learn statistical features from labeled data.

        Each call trains from scratch: previously learned counts are cleared
        first, so repeated calls do not mix accumulated counts with the new
        ``n_docs``.

        Args:
            sentences: Raw sentence strings.
            term_lists: Gold terms per sentence, parallel to ``sentences``.
        """
        self._reset_learned_state()
        self.n_docs = len(sentences)

        # Collect term statistics
        for sent, terms in zip(sentences, term_lists):
            terms_in_doc = set()
            for term in terms:
                if term:
                    self.known_terms.add(term)
                    self.term_freq[term] += 1
                    terms_in_doc.add(term)

            for term in terms_in_doc:
                self.doc_freq[term] += 1

        # Compute TF-IDF scores
        self._compute_tfidf()

        # Extract collocations
        self._extract_collocations(sentences)

        print(f"Training complete:")
        print(f"  Unique terms: {len(self.known_terms)}")
        print(f"  Bigrams: {len(self.bigram_scores)}")
        print(f"  Trigrams: {len(self.trigram_scores)}")

    def _find_ngrams(self, tokens: List[str]) -> List[str]:
        """Find learned collocations in token sequence.

        Returns the space-joined trigrams first, then bigrams, each in order
        of appearance (duplicates are kept).
        """
        matches = []

        # Trigrams
        for i in range(len(tokens) - 2):
            trigram = (tokens[i], tokens[i+1], tokens[i+2])
            if trigram in self.trigram_scores:
                matches.append(" ".join(trigram))

        # Bigrams
        for i in range(len(tokens) - 1):
            bigram = (tokens[i], tokens[i+1])
            if bigram in self.bigram_scores:
                matches.append(" ".join(bigram))

        return matches

    def predict(self, sentences: List[str]) -> List[List[str]]:
        """Extract terms from new sentences.

        Args:
            sentences: Raw sentence strings.

        Returns:
            One alphabetically sorted list of predicted terms per sentence.
        """
        # Term normalization does not depend on the sentence, so do it once
        # instead of once per (sentence, term) and per (n-gram, term) pair.
        normalized_terms = {term: self._normalize(term) for term in self.known_terms}
        terms_by_norm = {}
        for term, norm_term in normalized_terms.items():
            terms_by_norm.setdefault(norm_term, []).append(term)

        results = []
        for sent in sentences:
            norm_sent = self._normalize(sent)
            tokens = self._tokenize(norm_sent)
            found = set()

            # Match high-scoring known terms. This is a plain substring test,
            # so a short term also matches inside a longer word.
            for term, norm_term in normalized_terms.items():
                if norm_term in norm_sent:
                    if self.term_tfidf.get(term, 0) >= self.tfidf_threshold:
                        found.add(term)

            # Find collocations: a learned n-gram counts only when it equals
            # the normalized form of a known term.
            for ng in self._find_ngrams(tokens):
                found.update(terms_by_norm.get(ng, ()))

            results.append(sorted(found))
        return results

    def save(self, path: str):
        """Save trained model, creating the parent directory if needed."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        model_data = {
            'tfidf_threshold': self.tfidf_threshold,
            'collocation_threshold': self.collocation_threshold,
            'min_freq': self.min_freq,
            'term_tfidf': self.term_tfidf,
            'term_freq': dict(self.term_freq),
            'doc_freq': dict(self.doc_freq),
            'bigram_scores': self.bigram_scores,
            'trigram_scores': self.trigram_scores,
            'known_terms': list(self.known_terms),
            'n_docs': self.n_docs,
            'vocab': list(self.vocab)
        }
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {path}")

    def load(self, path: str):
        """Load trained model.

        SECURITY: pickle.load can execute arbitrary code; only load files
        that were produced by ``save`` from a trusted source.
        """
        with open(path, 'rb') as f:
            data = pickle.load(f)
        self.tfidf_threshold = data['tfidf_threshold']
        self.collocation_threshold = data['collocation_threshold']
        self.min_freq = data['min_freq']
        self.term_tfidf = data['term_tfidf']
        self.term_freq = Counter(data['term_freq'])
        self.doc_freq = Counter(data['doc_freq'])
        self.bigram_scores = data['bigram_scores']
        self.trigram_scores = data['trigram_scores']
        self.known_terms = set(data['known_terms'])
        self.n_docs = data['n_docs']
        self.vocab = set(data['vocab'])
        print(f"Model loaded from {path}")


# Test: Simple training
# (collocation_threshold=2.0 here, lower than the constructor default of 3.0)
test_model = NLTKTrainedModel(tfidf_threshold=0.1, collocation_threshold=2.0)
test_sents = ['Il tributo è importante.', 'La tassa di successione è un tributo.']
test_terms = [['tributo'], ['tassa di successione', 'tributo']]
test_model.train(test_sents, test_terms)
# Two distinct gold terms overall; 'tributo' is annotated on both sentences.
assert len(test_model.known_terms) == 2
assert test_model.term_freq['tributo'] == 2
print("✓ Trained model initialization works correctly")

Training complete:
  Unique terms: 2
  Bigrams: 0
  Trigrams: 0
✓ Trained model initialization works correctly


### Train and Evaluate Trained Model

In [11]:
# Prepare training data
# train_texts and train_term_lists are parallel lists, one entry per training sentence.
train_texts = [s['sentence_text'] for s in train_sentences]
train_term_lists = [s['terms'] for s in train_sentences]

# Initialize and train model
trained_model = NLTKTrainedModel(
    tfidf_threshold=0.1,
    collocation_threshold=3.0,
    min_freq=2
)

trained_model.train(train_texts, train_term_lists)

# Show some learned features
print("\nTop 10 terms by TF-IDF:")
top_terms = sorted(trained_model.term_tfidf.items(), key=lambda x: x[1], reverse=True)[:10]
for term, score in top_terms:
    print(f"  {term}: {score:.2f}")

# The bigrams are listed in the dict's insertion order, not re-sorted here.
print(f"\nExample bigrams (first 10):")
for bigram, score in list(trained_model.bigram_scores.items())[:10]:
    print(f"  {' '.join(bigram)}: {score:.2f}")

Training complete:
  Unique terms: 713
  Bigrams: 4198
  Trigrams: 3671

Top 10 terms by TF-IDF:
  vetro: 242.19
  porta a porta: 234.60
  conferire: 210.96
  rifiuti: 210.96
  multimateriale: 208.25
  conferimento: 205.52
  indifferenziato: 162.21
  carta e cartone: 149.78
  plastica: 136.91
  rifiuti organici: 130.29

Example bigrams (first 10):
  177 178: 14.09
  178 179: 14.09
  179 183: 14.09
  234 341: 14.09
  autotrasporti distributori: 14.09
  birrerie hamburgerie: 14.09
  bonifico bancario: 14.09
  carburante librerie: 14.09
  carrozzerie autofficine: 14.09
  case vacanze: 14.09


In [12]:
# Predict on dev set
# (dev_texts and dev_gold are the lists built in the baseline evaluation cell)
trained_preds = trained_model.predict(dev_texts)

# Evaluate
# These names were also used for the baseline scores; the baseline values
# remain available in baseline_metrics.
precision, recall, f1, tp, fp, fn = micro_f1_score(dev_gold, trained_preds)
type_precision, type_recall, type_f1 = type_f1_score(dev_gold, trained_preds)

print("Trained Model Results:")
print("Micro-averaged metrics:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print(f"  TP={tp}, FP={fp}, FN={fn}")
print("\nType-level metrics:")
print(f"  Type Precision: {type_precision:.4f}")
print(f"  Type Recall:    {type_recall:.4f}")
print(f"  Type F1 Score:  {type_f1:.4f}")

# Store metrics for later comparison
trained_metrics = {
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'type_precision': type_precision,
    'type_recall': type_recall,
    'type_f1': type_f1
}

Trained Model Results:
Micro-averaged metrics:
  Precision: 0.2198
  Recall:    0.7428
  F1 Score:  0.3392
  TP=335, FP=1189, FN=116

Type-level metrics:
  Type Precision: 0.6143
  Type Recall:    0.5661
  Type F1 Score:  0.5892


In [13]:
# Save trained model
# (relative path: the file is written under the kernel's working directory)
trained_model.save('models/nltk_trained.pkl')

Model saved to models/nltk_trained.pkl


## Results Comparison

In [14]:
# pandas (pd) is imported with the other dependencies in the setup cell.

# Micro-averaged comparison
results_df = pd.DataFrame([
    {
        'Model': 'Baseline',
        'Precision': baseline_metrics['precision'],
        'Recall': baseline_metrics['recall'],
        'F1': baseline_metrics['f1']
    },
    {
        'Model': 'Trained',
        'Precision': trained_metrics['precision'],
        'Recall': trained_metrics['recall'],
        'F1': trained_metrics['f1']
    }
])

print("Micro-averaged Metrics:")
print(results_df.to_markdown(index=False))

# Type-level comparison
type_results_df = pd.DataFrame([
    {
        'Model': 'Baseline',
        'Type Precision': baseline_metrics['type_precision'],
        'Type Recall': baseline_metrics['type_recall'],
        'Type F1': baseline_metrics['type_f1']
    },
    {
        'Model': 'Trained',
        'Type Precision': trained_metrics['type_precision'],
        'Type Recall': trained_metrics['type_recall'],
        'Type F1': trained_metrics['type_f1']
    }
])

print("\n\nType-level Metrics:")
print(type_results_df.to_markdown(index=False))

# Show improvement: relative change of the trained model over the baseline, in percent.
# A baseline score of 0 makes the ratio undefined, so report NaN in that case
# instead of raising ZeroDivisionError.
baseline_f1 = baseline_metrics['f1']
baseline_type_f1 = baseline_metrics['type_f1']
f1_improvement = (
    (trained_metrics['f1'] - baseline_f1) / baseline_f1 * 100
    if baseline_f1 else float('nan')
)
type_f1_improvement = (
    (trained_metrics['type_f1'] - baseline_type_f1) / baseline_type_f1 * 100
    if baseline_type_f1 else float('nan')
)
print(f"\n\nMicro F1 Score improvement: {f1_improvement:+.1f}%")
print(f"Type F1 Score improvement: {type_f1_improvement:+.1f}%")

Micro-averaged Metrics:
| Model    |   Precision |   Recall |       F1 |
|:---------|------------:|---------:|---------:|
| Baseline |    0.159905 | 0.742794 | 0.263158 |
| Trained  |    0.219816 | 0.742794 | 0.339241 |


Type-level Metrics:
| Model    |   Type Precision |   Type Recall |   Type F1 |
|:---------|-----------------:|--------------:|----------:|
| Baseline |         0.556911 |      0.566116 |  0.561475 |
| Trained  |         0.61435  |      0.566116 |  0.589247 |


Micro F1 Score improvement: +28.9%
Type F1 Score improvement: +4.9%


## Save Predictions to Files

In [17]:
def save_predictions(predictions: List[List[str]],
                     sentences: List[Dict],
                     output_path: str):
    """Write predictions to a JSON file in the competition layout.

    The file holds a single object ``{"data": [...]}`` with one entry per
    (prediction, sentence) pair. Each entry copies the sentence's
    'document_id', 'paragraph_id' and 'sentence_id' and stores the predicted
    terms under 'term_list'.

    Args:
        predictions: Predicted terms per sentence.
        sentences: Sentence dicts, paired positionally with ``predictions``.
        output_path: Destination file; its directory is created if missing.
    """
    id_fields = ('document_id', 'paragraph_id', 'sentence_id')

    entries = []
    for terms, sentence in zip(predictions, sentences):
        entry = {field: sentence[field] for field in id_fields}
        entry['term_list'] = terms
        entries.append(entry)
    payload = {'data': entries}

    # A bare file name has no directory part; fall back to the current one.
    target_dir = os.path.dirname(output_path)
    if not target_dir:
        target_dir = '.'
    os.makedirs(target_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, ensure_ascii=False, indent=2)
    print(f"Saved {len(predictions)} predictions to {output_path}")


# Save both sets of predictions
# (both files pair the predictions with dev_sentences to recover the sentence ids)
save_predictions(baseline_preds, dev_sentences, 'predictions/subtask_a_dev_nltk_baseline_preds.json')
save_predictions(trained_preds, dev_sentences, 'predictions/subtask_a_dev_nltk_trained_preds.json')

Saved 577 predictions to predictions/subtask_a_dev_nltk_baseline_preds.json
Saved 577 predictions to predictions/subtask_a_dev_nltk_trained_preds.json


## Load and Test Saved Models

In [18]:
# Test loading baseline
# A freshly constructed model is populated from disk and must reproduce the
# in-memory model's prediction for the first dev sentence.
loaded_baseline = NLTKSubstringBaseline()
loaded_baseline.load('models/nltk_baseline.pkl')
test_preds_baseline = loaded_baseline.predict([dev_texts[0]])
assert test_preds_baseline[0] == baseline_preds[0]
print("✓ Baseline model saved and loaded correctly")

# Test loading trained model
# Same round-trip check; only the first dev sentence is compared.
loaded_trained = NLTKTrainedModel()
loaded_trained.load('models/nltk_trained.pkl')
test_preds_trained = loaded_trained.predict([dev_texts[0]])
assert test_preds_trained[0] == trained_preds[0]
print("✓ Trained model saved and loaded correctly")

print("\nAll models successfully saved and can be reloaded!")

Model loaded from models/nltk_baseline.pkl


Predicting: 100%|██████████| 1/1 [00:00<00:00, 261.80it/s]

✓ Baseline model saved and loaded correctly
Model loaded from models/nltk_trained.pkl
✓ Trained model saved and loaded correctly

All models successfully saved and can be reloaded!





## Example Predictions

In [19]:
# Show example predictions from both models
# example_idx selects one dev sentence; the four lists below are all indexed by it.
example_idx = 120
example_text = dev_texts[example_idx]
example_gold = dev_gold[example_idx]
example_baseline = baseline_preds[example_idx]
example_trained = trained_preds[example_idx]

print(f"Sentence: {example_text}\n")
print(f"Gold terms: {example_gold}\n")
print(f"Baseline predictions: {example_baseline}")
print(f"Trained predictions: {example_trained}\n")

# Show what each model got right/wrong
# (only the correct predictions, i.e. the intersection with gold, are printed)
baseline_correct = set(example_baseline) & set(example_gold)
trained_correct = set(example_trained) & set(example_gold)

print(f"Baseline correct: {baseline_correct}")
print(f"Trained correct: {trained_correct}")

Sentence: a. per incendi dei rifiuti nei contenitori € 2.000

Gold terms: ['rifiuti']

Baseline predictions: ['pe', 'rifiuti', 'rifiuto']
Trained predictions: ['pe', 'rifiuti']

Baseline correct: {'rifiuti'}
Trained correct: {'rifiuti'}
