# LLM-Based Few-Shot Term Extraction

This notebook demonstrates a few-shot term extraction approach using Large Language Models:
- Uses the Google Gemini API for term extraction
- Processes sentences in batches for efficiency
- No training required - relies on the LLM's general knowledge plus a handful of annotated examples included in the prompt

Dataset: EvalITA 2025 ATE-IT (Automatic Term Extraction - Italian Testbed)

## Setup and Imports

In [1]:
import pandas as pd
import json
import os
import re
from tqdm import tqdm

# Emit a confirmation line so the cell shows visible output once all imports succeed.
print("✓ Libraries imported")

✓ Libraries imported


## Data Loading and Processing

In [2]:
def load_jsonl(path: str):
    """Read ``path`` and return its parsed JSON content.

    The whole file is first decoded as a single JSON document. If that fails
    with ``json.JSONDecodeError`` the file is treated as JSON Lines and every
    non-blank line is decoded on its own, yielding a list of records.
    An empty or whitespace-only file yields ``[]``.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        content = handle.read().strip()

    if content == "":
        return []

    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Not one JSON value: fall back to one JSON document per line.
        stripped_lines = (raw_line.strip() for raw_line in content.splitlines())
        return [json.loads(entry) for entry in stripped_lines if entry]


def build_sentence_gold_map(records):
    """Group dataset rows by sentence and collect each sentence's unique terms.

    ``records`` is either an iterable of row dicts or a dict holding the rows
    under the ``'data'`` key. Rows sharing the same
    ``(document_id, paragraph_id, sentence_id)`` are merged into one entry;
    the sentence text is taken from the first row seen for that key. Terms come
    from ``term_list`` when it is a list, otherwise from the single ``term``
    field; empty values and duplicates are skipped.

    Returns a list of dicts with keys ``document_id``, ``paragraph_id``,
    ``sentence_id``, ``sentence_text`` and ``terms``, in first-seen order.
    """
    rows = records['data'] if isinstance(records, dict) and 'data' in records else records

    grouped = {}
    for row in rows:
        doc_id = row.get('document_id')
        par_id = row.get('paragraph_id')
        sent_id = row.get('sentence_id')

        # setdefault keeps the entry created by the first row of each sentence.
        entry = grouped.setdefault((doc_id, par_id, sent_id), {
            'document_id': doc_id,
            'paragraph_id': par_id,
            'sentence_id': sent_id,
            'sentence_text': row.get('sentence_text', ''),
            'terms': []
        })

        term_list = row.get('term_list')
        candidates = term_list if isinstance(term_list, list) else [row.get('term')]

        collected = entry['terms']
        for candidate in candidates:
            if candidate and candidate not in collected:
                collected.append(candidate)

    return list(grouped.values())


print("✓ Data loading functions defined")

✓ Data loading functions defined


In [3]:
# Load the training and dev splits. Paths are relative to the notebook's
# working directory.
train_data = load_jsonl('../data/subtask_a_train.json')
dev_data = load_jsonl('../data/subtask_a_dev.json')

# Collapse the rows into one record per sentence, each carrying its gold terms.
train_sentences = build_sentence_gold_map(train_data)
dev_sentences = build_sentence_gold_map(dev_data)

# Quick sanity check: split sizes plus the first dev sentence.
# NOTE(review): indexing [0] raises IndexError if the dev split is empty.
print(f"Training sentences: {len(train_sentences)}")
print(f"Dev sentences: {len(dev_sentences)}")
print(f"\nExample sentence:")
print(f"  Text: {dev_sentences[0]['sentence_text']}")
print(f"  Terms: {dev_sentences[0]['terms']}")

Training sentences: 2308
Dev sentences: 577

Example sentence:
  Text: Non Domestica; CAMPEGGI, DISTRIBUTORI CARBURANTI, PARCHEGGI; 1,22; 4,73 
  Terms: []


## Evaluation Metrics

Using the official evaluation metrics from the competition.

In [4]:
def micro_f1_score(gold_standard, system_output):
    """
    Micro-averaged precision, recall and F1 over per-sentence term sets.

    ``gold_standard`` and ``system_output`` are parallel sequences of term
    lists. Each pair is compared as sets and the true-positive, false-positive
    and false-negative counts are summed over all pairs before the ratios are
    computed. A ratio whose denominator is zero is reported as 0.

    Returns ``(precision, recall, f1, tp, fp, fn)``.
    """
    tp_total = fp_total = fn_total = 0

    for gold_terms, predicted_terms in zip(gold_standard, system_output):
        expected, predicted = set(gold_terms), set(predicted_terms)
        tp_total += len(expected & predicted)
        fp_total += len(predicted - expected)
        fn_total += len(expected - predicted)

    predicted_count = tp_total + fp_total
    gold_count = tp_total + fn_total

    precision = tp_total / predicted_count if predicted_count > 0 else 0
    recall = tp_total / gold_count if gold_count > 0 else 0

    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

    return precision, recall, f1, tp_total, fp_total, fn_total


def type_f1_score(gold_standard, system_output):
    """
    Type-level precision, recall and F1.

    All per-sentence term lists are pooled into one set of unique gold terms
    and one set of unique system terms for the whole dataset; the scores are
    computed on those two sets. A ratio whose denominator is zero is
    reported as 0.

    Returns ``(type_precision, type_recall, type_f1)``.
    """
    gold_types = set()
    for terms in gold_standard:
        gold_types |= set(terms)

    system_types = set()
    for terms in system_output:
        system_types |= set(terms)

    hits = len(gold_types & system_types)
    spurious = len(system_types - gold_types)
    misses = len(gold_types - system_types)

    type_precision = hits / (hits + spurious) if (hits + spurious) > 0 else 0
    type_recall = hits / (hits + misses) if (hits + misses) > 0 else 0

    pr_sum = type_precision + type_recall
    type_f1 = 2 * (type_precision * type_recall) / pr_sum if pr_sum > 0 else 0

    return type_precision, type_recall, type_f1


print("✓ Evaluation functions defined")

✓ Evaluation functions defined


## Initialize LLM Model

In [5]:
import os
from pathlib import Path
from dotenv import load_dotenv
import google.generativeai as genai
# Locate the .env file: first in the current working directory (correct when
# the notebook is run from the project root) ...
ROOT_DIR = Path.cwd()
dotenv_path = ROOT_DIR / ".env"
if not dotenv_path.exists():
    # ... otherwise fall back to the parent directory.
    dotenv_path = ROOT_DIR.parent / ".env"

load_dotenv(dotenv_path)

# The key is read from the process environment after load_dotenv has run.
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise RuntimeError(f"GEMINI_API_KEY not found in {dotenv_path}")
# Strip surrounding whitespace and any quotes left around the value in .env.
api_key = api_key.strip().strip('"').strip("'")
# Alternative: ask the user for the key interactively instead of using .env:
# api_key = input("Please enter your Gemini API key: ")
genai.configure(api_key=api_key)

# Model handle used for every generate_content call later in the notebook.
model = genai.GenerativeModel('gemini-2.5-flash')

In [6]:

# Maximum number of sentences sent to the LLM in one request.
batch_size = 20

# Few-shot system prompt shared by every request.
#
# The task section says "up to {batch_size}" sentences because the final batch
# is smaller whenever the dataset size is not a multiple of batch_size, and it
# asks the model to reuse the sentence numbers it was given instead of assuming
# numbering restarts at 1.
#
# NOTE(review): Examples 1 and 2 list nested terms (e.g. both "servizio di
# raccolta differenziata porta a porta" and "raccolta differenziata porta a
# porta"), while the instructions forbid nested terms - confirm which policy
# matches the gold annotations and align the examples accordingly.
system_prompt = f"""
You are an automatic term extraction agent for Italian municipal waste management texts.
You will receive a list of sentences as input.
Your role is to extract *all and only* waste management terms from each sentence.
Output a list of terms for each sentence.

A "term" in this task is:
- a single- or multi-word expression
- that refers to a concept in the waste management domain
- often nouns or noun phrases (sometimes adjectives or verbs as part of a phrase)

Non-terms are:
- generic function words (e.g., "e", "di", "per", "che")
- pure numbers or dates not part of a waste term
- person names, city names, street names (unless part of an official name of a waste service)

Strictly adhere to the Example Output Format:

Sentence 1: [term1; term2; term3]
Sentence 2: [term4]
Sentence 3: []

--------------------------------
ANNOTATED EXAMPLES (FEW-SHOT)
--------------------------------

Example 1
Sentence:
"Il presente disciplinare per la gestione dei centri di raccolta comunali è stato redatto ai sensi del DM 13/05/2009."
Expected output:
Sentence 1: [disciplinare per la gestione dei centri di raccolta comunali; centri di raccolta comunali]

Example 2
Sentence:
"Il servizio di raccolta differenziata porta a porta dei rifiuti urbani è attivo su tutto il territorio comunale."
Expected output:
Sentence 1: [servizio di raccolta differenziata porta a porta; raccolta differenziata porta a porta; rifiuti urbani]

Example 3
Sentence:
"Il pagamento della Tassa Rifiuti (TARI) avviene tramite il portale pagoPA."
Expected output:
Sentence 1: [tassa rifiuti; tari]

Example 4
Sentence:
"Il presente regolamento disciplina le modalità di conferimento dei rifiuti ingombranti presso l'isola ecologica comunale."
Expected output:
Sentence 1: [regolamento; modalità di conferimento; rifiuti ingombranti; isola ecologica comunale]

Example 5
Sentence:
"In questa frase non sono presenti termini di gestione dei rifiuti."
Expected output:
Sentence 1: []


YOUR TASK:
Now you will receive up to {batch_size} sentences in the following format:

Sentence k:
<sentence_text>

For each sentence k, you MUST output exactly one line in this format:

Sentence k: [term1; term2; term3]

Instructions:
* Extract only terms related to waste and waste management (e.g., tassa rifiuti, tari, isola ecologica, raccolta differenziata, impianto di trattamento rifiuti).
* Prefer complete multi-word terms (full span) over shorter fragments.
* Do NOT output nested terms: if you extract "impianto di trattamento rifiuti urbani", do NOT also output "trattamento rifiuti urbani", unless it appears as a separate term in the sentence.
* Ignore named entities (people, cities, streets) unless they are part of an official waste management term.
* If a sentence contains no relevant terms, output an empty list: Sentence k: [].
* You must output exactly one line for each input sentence, in the same order, reusing the sentence number k given in the input.
"""


print(f"✓ Few-shot prompt configured (batch size: {batch_size})")

✓ Few-shot prompt configured (batch size: 20)


## Configure Few-Shot Prompt

#### Save LLM responses

In [7]:
SAVE_PATH = "llm_responses.txt"

def save_llm_responses(response_list, path=SAVE_PATH):
    """
    Write the raw LLM batch responses to a text file.

    Each response is stripped of surrounding whitespace and followed by a
    ``---END-OF-BATCH---`` marker line. An existing file at ``path`` is
    overwritten. A one-line confirmation is printed when done.
    """
    terminator = "\n---END-OF-BATCH---\n"
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(text.strip() + terminator for text in response_list)
    print(f"✓ Saved {len(response_list)} batch responses to {path}")


In [8]:
# Process sentences in batches with LLM
print(f"Processing {len(dev_sentences)} sentences in batches of {batch_size}...")


def _build_user_prompt(batch, first_number):
    """Format one batch of sentence records as the numbered block sent to the LLM.

    Sentences keep their global, 1-based position in the dataset as their
    number, starting from ``first_number``.
    """
    return "\n\n".join(
        f"Sentence {first_number + offset}:\n {item['sentence_text']}"
        for offset, item in enumerate(batch)
    )


response_list = []

# One request per slice of `batch_size` sentences; the last slice may be shorter.
# Slicing gives a single request path instead of a separate copy for the tail.
with tqdm(total=len(dev_sentences)) as progress:
    for start in range(0, len(dev_sentences), batch_size):
        batch = dev_sentences[start:start + batch_size]
        user_prompt = _build_user_prompt(batch, first_number=start + 1)

        try:
            response = model.generate_content(
                f"System: {system_prompt}\nUser: {user_prompt}"
            )
            # NOTE(review): per the google-generativeai docs, `.text` raises when
            # the response carries no text part (e.g. a blocked candidate) -
            # confirm for the installed SDK version.
            response_list.append(response.text)
        except Exception:
            # A full run takes minutes: keep the batches already received in a
            # side file (the main file is left untouched), then let the
            # original error propagate.
            print(f"✗ Request failed for the batch starting at sentence {start + 1}")
            if response_list:
                save_llm_responses(response_list, path=f"{SAVE_PATH}.partial")
            raise

        progress.update(len(batch))

print(f"✓ Received {len(response_list)} batch responses from LLM")
save_llm_responses(response_list)

Processing 577 sentences in batches of 20...


100%|██████████| 577/577 [09:29<00:00,  1.01it/s]


✓ Received 29 batch responses from LLM
✓ Saved 29 batch responses to llm_responses.txt


### Parse LLM responses

In [9]:
# Reload the raw batch responses from the file written by save_llm_responses.
# SAVE_PATH is reused so the reader and the writer cannot drift apart.
with open(SAVE_PATH, "r", encoding="utf-8") as f:
    raw = f.read()

responses = raw.split("\n---END-OF-BATCH---\n")
# Every batch is followed by a marker, so the split leaves one blank trailing
# piece. Drop only that piece: an empty response in the middle must keep its
# slot, otherwise response k would no longer correspond to batch k.
if responses and not responses[-1].strip():
    responses.pop()
responses = [r.strip() for r in responses]


In [10]:
# Parse LLM responses to extract term lists
print("Parsing LLM responses...")

_SENTENCE_ID_RE = re.compile(r'Sentence (\d+):')
_TERM_LIST_RE = re.compile(r'\[(.*?)\]')


def _parse_response_lines(response_text):
    """Return ``(sentence_number, terms)`` for every answer line of one response.

    An answer line contains ``Sentence <n>:`` and an opening bracket. Terms are
    the ``;``-separated items inside the first ``[...]``, stripped and
    lower-cased; a line without a closed bracket pair yields an empty list.
    """
    parsed = []
    for line in response_text.split('\n'):
        if 'Sentence' not in line or '[' not in line:
            continue
        id_match = _SENTENCE_ID_RE.search(line)
        if not id_match:
            continue
        terms_match = _TERM_LIST_RE.search(line)
        if terms_match:
            terms = [t.strip().lower() for t in terms_match.group(1).split(';') if t.strip()]
        else:
            terms = []
        parsed.append((int(id_match.group(1)), terms))
    return parsed


def _align_batch(parsed, first_number, batch_len):
    """Assign parsed answers to the ``batch_len`` sentences of one batch.

    Sentence numbers are used when they are consistent, either the global
    numbers starting at ``first_number`` or numbers restarted at 1, so a missing
    line leaves only its own sentence empty. Otherwise answers are taken in
    order. Returns ``(predictions, missing)`` where ``missing`` counts the
    sentences that received no answer.
    """
    numbers = [number for number, _ in parsed]
    if numbers and all(first_number <= n < first_number + batch_len for n in numbers):
        base = first_number      # the model echoed the numbers it was given
    elif numbers and all(1 <= n <= batch_len for n in numbers):
        base = 1                 # the model renumbered the batch from 1
    else:
        base = None              # unusable numbers: fall back to line order

    slots = [None] * batch_len
    if base is None:
        for position, (_, terms) in enumerate(parsed[:batch_len]):
            slots[position] = terms
    else:
        for number, terms in parsed:
            if slots[number - base] is None:  # keep the first answer on duplicates
                slots[number - base] = terms

    missing = sum(slot is None for slot in slots)
    return [slot if slot is not None else [] for slot in slots], missing


# Use the responses reloaded from disk (not the in-memory list from the
# inference cell) so this step can be re-run on a fresh kernel without calling
# the LLM again. Response k is assumed to answer the k-th slice of
# `batch_size` sentences.
batch_starts = list(range(0, len(dev_sentences), batch_size))
if len(responses) != len(batch_starts):
    print(f"Warning: found {len(responses)} responses for {len(batch_starts)} batches")

llm_preds = []
for batch_index, start in enumerate(batch_starts):
    batch_len = min(batch_size, len(dev_sentences) - start)
    parsed = _parse_response_lines(responses[batch_index]) if batch_index < len(responses) else []
    batch_preds, missing = _align_batch(parsed, start + 1, batch_len)
    if missing or len(parsed) != batch_len:
        print(
            f"Warning: batch {batch_index + 1} has {len(parsed)} parsed lines "
            f"for {batch_len} sentences ({missing} left empty)"
        )
    llm_preds.extend(batch_preds)

# Each batch contributes exactly batch_len predictions, so lengths always match.
assert len(llm_preds) == len(dev_sentences)

print(f"✓ Parsed {len(llm_preds)} predictions")

Parsing LLM responses...
✓ Parsed 577 predictions


### Save LLM predictions

In [11]:
# Save predictions in competition format
def save_predictions(predictions, sentences, output_path):
    """Write predictions as a JSON file shaped ``{"data": [...]}``.

    ``predictions`` and ``sentences`` are paired positionally. Each output
    record carries the sentence's ``document_id``, ``paragraph_id`` and
    ``sentence_id`` plus the predicted ``term_list``. The parent directory of
    ``output_path`` is created if needed, and a confirmation line is printed.
    """
    records = [
        {
            'document_id': sentence['document_id'],
            'paragraph_id': sentence['paragraph_id'],
            'sentence_id': sentence['sentence_id'],
            'term_list': terms,
        }
        for terms, sentence in zip(predictions, sentences)
    ]

    # A bare file name has no directory component; use the current directory.
    target_dir = os.path.dirname(output_path) or '.'
    os.makedirs(target_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as handle:
        json.dump({'data': records}, handle, ensure_ascii=False, indent=2)
    print(f"✓ Saved {len(predictions)} predictions to {output_path}")


# Write the dev-set predictions; the path is relative to the working directory.
save_predictions(llm_preds, dev_sentences, 'predictions/subtask_a_dev_llm_few_shot_preds.json')

✓ Saved 577 predictions to predictions/subtask_a_dev_llm_few_shot_preds.json


## Evaluate Performance

In [12]:
# Gold term lists, in the same sentence order as the predictions
dev_gold = [sentence['terms'] for sentence in dev_sentences]

# Score the predictions per sentence (micro) and over the pooled vocabulary (type)
precision, recall, f1, tp, fp, fn = micro_f1_score(dev_gold, llm_preds)
type_precision, type_recall, type_f1 = type_f1_score(dev_gold, llm_preds)

# Assemble the report and emit it in one go; empty strings are blank lines.
rule = "=" * 60
report_lines = [
    "",
    rule,
    "LLM FEW-SHOT BASELINE RESULTS",
    rule,
    "",
    "Micro-averaged Metrics:",
    f"  Precision: {precision:.4f}",
    f"  Recall:    {recall:.4f}",
    f"  F1 Score:  {f1:.4f}",
    f"  TP={tp}, FP={fp}, FN={fn}",
    "",
    "Type-level Metrics:",
    f"  Type Precision: {type_precision:.4f}",
    f"  Type Recall:    {type_recall:.4f}",
    f"  Type F1 Score:  {type_f1:.4f}",
    rule,
]
print("\n".join(report_lines))


LLM FEW-SHOT BASELINE RESULTS

Micro-averaged Metrics:
  Precision: 0.4630
  Recall:    0.6652
  F1 Score:  0.5460
  TP=300, FP=348, FN=151

Type-level Metrics:
  Type Precision: 0.4037
  Type Recall:    0.7190
  Type F1 Score:  0.5171


In [13]:
# Show example predictions
print("Example Predictions:\n")

count = 0
for i in range(len(dev_sentences)):
    # Display only the first five sentences that have at least one gold term.
    if count >= 5 or not dev_gold[i]:
        continue

    gold_terms = set(dev_gold[i])
    predicted_terms = set(llm_preds[i])
    correct = gold_terms & predicted_terms
    missed = gold_terms - predicted_terms
    wrong = predicted_terms - gold_terms

    # Text is cut to 100 characters and each term list to five items.
    print(f"Sentence: {dev_sentences[i]['sentence_text'][:100]}...")
    print(f"Gold terms: {dev_gold[i][:5]}")
    print(f"LLM predictions: {llm_preds[i][:5]}")

    print(f"✓ Correct: {len(correct)}")
    print(f"✗ Missed: {len(missed)}")
    print(f"✗ Wrong: {len(wrong)}")
    print("-"*80)
    print()

    count += 1

Example Predictions:

Sentence: Il presente disciplinare per la gestione dei centri di raccolta comunali è stato redatto ai sensi e ...
Gold terms: ['disciplina dei centri di raccolta dei rifiuti urbani raccolti in modo differenziato', 'disciplinare per la gestione dei centri di raccolta comunali']
LLM predictions: ['disciplinare per la gestione dei centri di raccolta comunali', 'centri di raccolta comunali', 'disciplina dei centri di raccolta dei rifiuti urbani raccolti in modo differenziato', 'centri di raccolta dei rifiuti urbani raccolti in modo differenziato']
✓ Correct: 2
✗ Missed: 0
✗ Wrong: 2
--------------------------------------------------------------------------------

Sentence: È un Servizio Supplementare di raccolta, rivolto a famiglie con bambini al di sotto dei 3 anni o con...
Gold terms: ['raccolta']
LLM predictions: ['servizio supplementare di raccolta']
✓ Correct: 0
✗ Missed: 1
✗ Wrong: 1
-------------------------------------------------------------------------------