# SpaCy Term Extraction for Italian Text
 **Trained**: Custom NER model fine-tuned for term extraction

Dataset: EvalITA 2025 ATE-IT (Automatic Term Extraction - Italian Testbed)

## Setup and Imports


In [9]:
#!python -m spacy download it_core_news_sm
#!python -m spacy download it_core_news_md
#!python -m spacy download it_core_news_lg

In [1]:
import json
import os
import random
from pathlib import Path
from typing import List, Dict, Tuple

import spacy
from spacy.tokens import DocBin, Doc
from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy.pipeline import EntityRuler
from tqdm import tqdm

# Load Italian model
try:
    nlp = spacy.load('it_core_news_md')
    print("✓ Italian model loaded successfully")
except:
    print("Model not found. Install with: python -m spacy download it_core_news_md")

✓ Italian model loaded successfully


## Data Loading and Processing

In [2]:
def load_jsonl(path: str) -> List[Dict]:
    """Load a JSON lines file or JSON array file."""
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read().strip()
    if not text:
        return []
    try:
        # Try parsing as single JSON object/array
        data = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to JSONL (one JSON per line)
        data = []
        for line in text.splitlines():
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data


def build_sentence_gold_map(records: List[Dict]) -> List[Dict]:
    """Convert dataset rows into list of sentences with aggregated terms.
    
    Handles both formats:
    - Records with 'term_list' field (list of terms) for input files in json format
    - Records with individual 'term' field (one term per row) for input files in csv format
    """
    out = {}
    
    # Support both dict with 'data' key and plain list
    if isinstance(records, dict) and 'data' in records:
        rows = records['data']
    else:
        rows = records
    
    for r in rows:
        key = (r.get('document_id'), r.get('paragraph_id'), r.get('sentence_id'))
        if key not in out:
            out[key] = {
                'document_id': r.get('document_id'),
                'paragraph_id': r.get('paragraph_id'),
                'sentence_id': r.get('sentence_id'),
                'sentence_text': r.get('sentence_text', ''),
                'terms': []
            }
        
        # Support both 'term_list' (list) and 'term' (single value)
        if isinstance(r.get('term_list'), list):
            for t in r.get('term_list'):
                if t and t not in out[key]['terms']:
                    out[key]['terms'].append(t)
        else:
            term = r.get('term')
            if term and term not in out[key]['terms']:
                out[key]['terms'].append(term)
    
    return list(out.values())


In [6]:
# Load actual training and dev data
train_data = load_jsonl('../../data/subtask_a_train.json')
dev_data = load_jsonl('../../data/subtask_a_dev.json')

train_sentences = build_sentence_gold_map(train_data)
dev_sentences = build_sentence_gold_map(dev_data)

print(f"Training sentences: {len(train_sentences)}")
print(f"Dev sentences: {len(dev_sentences)}")
print(f"\nExample sentence:")
print(f"  Text: {train_sentences[6]['sentence_text']}")
print(f"  Terms: {train_sentences[6]['terms']}")

# Build TRAIN+DEV sentence list for final training
train_dev_sentences = train_sentences + dev_sentences
print(f"TRAIN+DEV sentences: {len(train_dev_sentences)}")

# Texts and term lists for training
train_dev_texts = [s["sentence_text"] for s in train_dev_sentences]
train_dev_term_lists = [s["terms"] for s in train_dev_sentences]

# For evaluation on DEV (sanity check only)
dev_texts = [s["sentence_text"] for s in dev_sentences]
dev_gold = [s["terms"] for s in dev_sentences]

Training sentences: 2308
Dev sentences: 577

Example sentence:
  Text: AFFIDAMENTO DEL “SERVIZIO DI SPAZZAMENTO, RACCOLTA, TRASPORTO E SMALTIMENTO/RECUPERO DEI RIFIUTI URBANI ED ASSIMILATI E SERVIZI COMPLEMENTARI DELLA CITTA' DI AGROPOLI” VALEVOLE PER UN QUINQUENNIO
  Terms: ['raccolta', 'recupero', 'servizio di raccolta', 'servizio di spazzamento', 'smaltimento', 'trasporto']
TRAIN+DEV sentences: 2885


## Trained Model: Custom NER

Neural approach that learns from examples:
- Fine-tunes SpaCy's NER model on term extraction task
- Learns patterns and context from labeled data
- Can generalize to similar terms not seen during training

In [14]:
class SpacyTrainedModel:
    """Trainable NER model for term extraction."""
    
    def __init__(self, model: str = 'it_core_news_sm'):
        self.model_name = model
        self.nlp = None
    
    def _prepare_training_data(self, sentences: List[str], term_lists: List[List[str]]) -> List[Example]:
        """Convert to SpaCy training format with character-span annotations."""
        training_data = []
        
        for sent_text, terms in zip(sentences, term_lists):
            doc = self.nlp.make_doc(sent_text)
            entities = []
            
            # Find character spans for each term
            for term in terms:
                if not term:
                    continue
                
                # Find all occurrences
                start_idx = 0
                while True:
                    start_idx = sent_text.find(term, start_idx)
                    if start_idx == -1:
                        break
                    
                    end_idx = start_idx + len(term)
                    span = doc.char_span(start_idx, end_idx, label='TERM', alignment_mode='expand')
                    if span is not None:
                        entities.append((start_idx, end_idx, 'TERM'))
                    
                    start_idx = end_idx
            
            # Remove overlapping entities
            entities = self._remove_overlapping(entities)
            example = Example.from_dict(doc, {'entities': entities})
            training_data.append(example)
        
        return training_data
    
    def _remove_overlapping(self, entities: List[Tuple[int, int, str]]) -> List[Tuple[int, int, str]]:
        """Keep longer spans when entities overlap."""
        if not entities:
            return []
        
        # Sort by start, then by length (descending)
        entities = sorted(entities, key=lambda x: (x[0], -(x[1] - x[0])))
        
        non_overlapping = []
        for start, end, label in entities:
            overlaps = False
            for prev_start, prev_end, _ in non_overlapping:
                if not (end <= prev_start or start >= prev_end):
                    overlaps = True
                    break
            if not overlaps:
                non_overlapping.append((start, end, label))
        
        return non_overlapping
    
    def train(self, sentences: List[str], term_lists: List[List[str]], 
              n_iter: int = 30, dropout: float = 0.2, batch_size: int = 8):
        """Train NER model on labeled data."""
        print(f"Initializing model: {self.model_name}")
        
        # Load base model
        try:
            self.nlp = spacy.load(self.model_name)
        except:
            print(f"Model not found, using blank Italian model")
            self.nlp = spacy.blank('it')
        
        # Setup NER
        if 'ner' not in self.nlp.pipe_names:
            ner = self.nlp.add_pipe('ner')
        else:
            ner = self.nlp.get_pipe('ner')
        ner.add_label('TERM')
        
        # Prepare training data
        print("Preparing training examples...")
        train_examples = self._prepare_training_data(sentences, term_lists)
        train_examples = [ex for ex in train_examples if len(ex.reference.ents) > 0] # Keep only examples with entities
        print(f"Training on {len(train_examples)} examples")
        
        # Train
        other_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != 'ner']
        with self.nlp.disable_pipes(*other_pipes):
            #optimizer = self.nlp.begin_training()
            if self.model_name == 'it_core_news_sm':
                optimizer = self.nlp.resume_training()
            else:
                optimizer = self.nlp.begin_training()

            for iteration in tqdm(range(n_iter), desc="Training", total=n_iter):
                random.shuffle(train_examples)
                losses = {}
                batches = minibatch(train_examples, size=compounding(4.0, batch_size, 1.001))
                
                for batch in batches:
                    self.nlp.update(batch, drop=dropout, losses=losses)
                
                if iteration % 5 == 0:
                    print(f"  Iteration {iteration}: Loss = {losses.get('ner', 0):.3f}")
        
        print("Training complete!")
    
    def predict(self, sentences: List[str]) -> List[List[str]]:
        """Extract terms from sentences."""
        if self.nlp is None:
            raise RuntimeError("Model not trained. Call train() or load() first.")
        
        results = []
        for doc in self.nlp.pipe(sentences, batch_size=32):
            terms = [ent.text for ent in doc.ents if ent.label_ == 'TERM']
            results.append(terms)
        return results
    
    def save(self, path: str):
        """Save trained model."""
        if self.nlp is None:
            raise RuntimeError("No model to save")
        
        output_dir = Path(path)
        output_dir.mkdir(parents=True, exist_ok=True)
        self.nlp.to_disk(output_dir)
        print(f"Model saved to {output_dir}")
    
    def load(self, path: str):
        """Load trained model."""
        self.nlp = spacy.load(path)
        if 'ner' not in self.nlp.pipe_names:
            raise ValueError("Loaded model doesn't have NER component")
        print(f"Model loaded from {path}")

### Train on TRAIN+DEV

Note: This cell might take several minutes to run.

**Additional configurations to test**
- Keep overlapping entities
- Keep documents with 0 entities in the training set
- Change hyperparameters (*n_iter*, *dropout*, *batch_size*)

In [None]:
# Prepare training data
trained_model = SpacyTrainedModel(model="it_core_news_md")

trained_model.train(
    train_dev_texts,
    train_dev_term_lists,
    n_iter=40,   
    dropout=0.1,
    batch_size=8,
)

Initializing model: it_core_news_md
Preparing training examples...




Training on 763 examples


Training:   2%|▎         | 1/40 [00:07<04:51,  7.48s/it]

  Iteration 0: Loss = 3532.880


Training:  15%|█▌        | 6/40 [00:44<04:12,  7.43s/it]

  Iteration 5: Loss = 710.821


Training:  28%|██▊       | 11/40 [01:22<03:41,  7.64s/it]

  Iteration 10: Loss = 549.970


Training:  40%|████      | 16/40 [02:00<03:03,  7.64s/it]

  Iteration 15: Loss = 264.305


Training:  52%|█████▎    | 21/40 [02:39<02:25,  7.64s/it]

  Iteration 20: Loss = 181.076


Training:  65%|██████▌   | 26/40 [03:17<01:46,  7.60s/it]

  Iteration 25: Loss = 193.852


Training:  78%|███████▊  | 31/40 [03:56<01:10,  7.81s/it]

  Iteration 30: Loss = 114.109


Training:  90%|█████████ | 36/40 [04:35<00:30,  7.70s/it]

  Iteration 35: Loss = 120.093


Training: 100%|██████████| 40/40 [05:07<00:00,  7.68s/it]

Training complete!





In [17]:
# Save trained model
trained_model.save('models/spacy_trained_train_dev')

Model saved to models\spacy_trained_train_dev


In [10]:
# ================== PREDICTION ON TEST (FINAL RUN) ==================

test_data = load_jsonl('../../data/test.json')
test_sentences = build_sentence_gold_map(test_data)
test_texts = [s["sentence_text"] for s in test_sentences]


In [11]:
SPACY_MODEL_PATH = '../../src/final_train_dev_training/models/spacy_trained_train_dev'
nlp = spacy.load(SPACY_MODEL_PATH)
test_preds = []
for doc in nlp.pipe(test_texts, batch_size=32):
    # estrai SOLO entità TERM (quelle che hai addestrato)
    terms = [ent.text.lower() for ent in doc.ents if ent.label_ == "TERM"]
    # deduplica
    uniq = []
    seen = set()
    for t in terms:
        if t not in seen:
            seen.add(t)
            uniq.append(t)
    test_preds.append(uniq)

print(f"✓ Inference completed: {len(test_preds)} predictions")

✓ Inference completed: 1142 predictions


## Save Predictions to Files

In [12]:
def save_predictions(predictions: List[List[str]], 
                     sentences: List[Dict], 
                     output_path: str):
    """Save predictions in competition format."""
    output = {'data': []}
    for pred, sent in zip(predictions, sentences):
        output['data'].append({
            'document_id': sent['document_id'],
            'paragraph_id': sent['paragraph_id'],
            'sentence_id': sent['sentence_id'],
            'term_list': pred
        })
    
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(predictions)} predictions to {output_path}")



In [13]:
save_predictions(
    test_preds,
    test_sentences,
    'predictions/subtask_a_test_spacy_trained_preds_train_dev.json'
)


Saved 1142 predictions to predictions/subtask_a_test_spacy_trained_preds_train_dev.json
