# LLM-Based Term Extraction (Zero-Shot)

This notebook demonstrates a zero-shot term extraction approach using Large Language Models:
- Uses Google Gemini API for term extraction
- Processes sentences in batches for efficiency
- No training required - relies on LLM's general knowledge

Dataset: EvalITA 2025 ATE-IT (Automatic Term Extraction - Italian Testbed)

## Setup and Imports

In [6]:
import pandas as pd
import json
import os
import re
from tqdm import tqdm

print("✓ Libraries imported")

✓ Libraries imported


## Data Loading and Processing

In [7]:
def load_jsonl(path: str):
    """Load a JSON lines file or JSON array file."""
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read().strip()
    if not text:
        return []
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        data = []
        for line in text.splitlines():
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data


def build_sentence_gold_map(records):
    """Convert dataset rows into list of sentences with aggregated terms."""
    out = {}
    
    if isinstance(records, dict) and 'data' in records:
        rows = records['data']
    else:
        rows = records
    
    for r in rows:
        key = (r.get('document_id'), r.get('paragraph_id'), r.get('sentence_id'))
        if key not in out:
            out[key] = {
                'document_id': r.get('document_id'),
                'paragraph_id': r.get('paragraph_id'),
                'sentence_id': r.get('sentence_id'),
                'sentence_text': r.get('sentence_text', ''),
                'terms': []
            }
        
        if isinstance(r.get('term_list'), list):
            for t in r.get('term_list'):
                if t and t not in out[key]['terms']:
                    out[key]['terms'].append(t)
        else:
            term = r.get('term')
            if term and term not in out[key]['terms']:
                out[key]['terms'].append(term)
    
    return list(out.values())


print("✓ Data loading functions defined")

✓ Data loading functions defined


In [8]:
# Load training and dev data
train_data = load_jsonl('../data/subtask_a_train.json')
dev_data = load_jsonl('../data/subtask_a_dev.json')

train_sentences = build_sentence_gold_map(train_data)
dev_sentences = build_sentence_gold_map(dev_data)

print(f"Training sentences: {len(train_sentences)}")
print(f"Dev sentences: {len(dev_sentences)}")
print(f"\nExample sentence:")
print(f"  Text: {dev_sentences[0]['sentence_text']}")
print(f"  Terms: {dev_sentences[0]['terms']}")

Training sentences: 2308
Dev sentences: 577

Example sentence:
  Text: Non Domestica; CAMPEGGI, DISTRIBUTORI CARBURANTI, PARCHEGGI; 1,22; 4,73 
  Terms: []


## Evaluation Metrics

Using the official evaluation metrics from the competition.

In [9]:
def micro_f1_score(gold_standard, system_output):
    """
    Evaluates performance using Precision, Recall, and F1 score 
    based on individual term matching (micro-average).
    """
    total_true_positives = 0
    total_false_positives = 0
    total_false_negatives = 0
    
    for gold, system in zip(gold_standard, system_output):
        gold_set = set(gold)
        system_set = set(system)
        
        true_positives = len(gold_set.intersection(system_set))
        false_positives = len(system_set - gold_set)
        false_negatives = len(gold_set - system_set)
        
        total_true_positives += true_positives
        total_false_positives += false_positives
        total_false_negatives += false_negatives
    
    precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0
    recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1, total_true_positives, total_false_positives, total_false_negatives


def type_f1_score(gold_standard, system_output):
    """
    Evaluates performance using Type Precision, Type Recall, and Type F1 score
    based on the set of unique terms extracted at least once across the entire dataset.
    """
    all_gold_terms = set()
    for item_terms in gold_standard:
        all_gold_terms.update(item_terms)
    
    all_system_terms = set()
    for item_terms in system_output:
        all_system_terms.update(item_terms)
    
    type_true_positives = len(all_gold_terms.intersection(all_system_terms))
    type_false_positives = len(all_system_terms - all_gold_terms)
    type_false_negatives = len(all_gold_terms - all_system_terms)
    
    type_precision = type_true_positives / (type_true_positives + type_false_positives) if (type_true_positives + type_false_positives) > 0 else 0
    type_recall = type_true_positives / (type_true_positives + type_false_negatives) if (type_true_positives + type_false_negatives) > 0 else 0
    type_f1 = 2 * (type_precision * type_recall) / (type_precision + type_recall) if (type_precision + type_recall) > 0 else 0
    
    return type_precision, type_recall, type_f1


print("✓ Evaluation functions defined")

✓ Evaluation functions defined


## Initialize LLM Model

In [None]:
import google.generativeai as genai
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get API key from environment
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))

# Initialize Gemini model
model = genai.GenerativeModel('gemini-2.5-flash')

print("✓ Gemini model initialized")

  from .autonotebook import tqdm as notebook_tqdm


TypeError: 'bool' object is not subscriptable

In [None]:
# Configure batch size for efficient processing
batch_size = 20

# Define zero-shot prompt for term extraction
system_prompt = f"""You are an automatic term extraction agent. You will receive a list of sentences as input.
Your role is to extract waste management terms from the sentences. Output a list of terms for each sentence.

Strictly adhere to the Example Output Format:

Example Output Format:
Sentence 1: [term1; term2; term3; term4]
Sentence 2: [term5; term6]
Sentence 3: []
Sentence 4: []
Sentence 5: [term7]

Instructions:
* Extract only terms, ignore named entities
* Do not extract nested terms
* Extract only terms related to waste management, ignoring other domains
* If a sentence contains no terms, output an empty list for that sentence
* You must output {batch_size} lists of terms, one for each sentence
"""

print(f"✓ Zero-shot prompt configured (batch size: {batch_size})")

## Configure Zero-Shot Prompt

In [None]:
# Process sentences in batches with LLM
print(f"Processing {len(dev_sentences)} sentences in batches of {batch_size}...")

response_list = []
user_prompt = ""

for i, sent_data in enumerate(tqdm(dev_sentences)):
    sent = sent_data['sentence_text']
    
    # Build prompt until batch size is reached
    if (i + 1) % batch_size == 0:
        user_prompt += f"Sentence {i + 1}:\n {sent}"
        
        # Send batch to LLM
        response = model.generate_content(
            f"System: {system_prompt}\nUser: {user_prompt}"
        )
        response_list.append(response.text)
        
        user_prompt = ""
    else:
        user_prompt += f"Sentence {i + 1}:\n {sent}\n\n"

# Process remaining sentences (last batch)
if user_prompt:
    user_prompt = user_prompt.rstrip()
    response = model.generate_content(
        f"System: {system_prompt}\nUser: {user_prompt}"
    )
    response_list.append(response.text)

print(f"✓ Received {len(response_list)} batch responses from LLM")


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 577/577 [11:52<00:00,  1.23s/it]


### Parse LLM responses

In [None]:
# Parse LLM responses to extract term lists
print("Parsing LLM responses...")

llm_preds = []
for response in response_list:
    for sent in response.split('\n'):
        if 'Sentence' in sent and '[' in sent:
            try:
                # Extract sentence ID
                id_match = re.search(r'Sentence (\d+):', sent)
                if not id_match:
                    continue
                
                # Extract terms from brackets
                terms_match = re.search(r'\[(.*?)\]', sent)
                if terms_match:
                    terms = terms_match.group(1).split(';')
                    terms = [term.strip().lower() for term in terms if term.strip()]
                else:
                    terms = []
                
                llm_preds.append(terms)
            except Exception as e:
                print(f"Warning: Could not parse line: {sent[:50]}...")
                llm_preds.append([])

# Verify output length matches input
if len(llm_preds) != len(dev_sentences):
    print(f"Warning: Output length ({len(llm_preds)}) doesn't match input ({len(dev_sentences)})")
    # Pad or truncate to match
    while len(llm_preds) < len(dev_sentences):
        llm_preds.append([])
    llm_preds = llm_preds[:len(dev_sentences)]

print(f"✓ Parsed {len(llm_preds)} predictions")

Output data is the same length as input data.


### Save LLM predictions

In [None]:
# Save predictions in competition format
def save_predictions(predictions, sentences, output_path):
    """Save predictions in competition format."""
    output = {'data': []}
    for pred, sent in zip(predictions, sentences):
        output['data'].append({
            'document_id': sent['document_id'],
            'paragraph_id': sent['paragraph_id'],
            'sentence_id': sent['sentence_id'],
            'term_list': pred
        })
    
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved {len(predictions)} predictions to {output_path}")


save_predictions(llm_preds, dev_sentences, 'predictions/subtask_a_dev_llm_zero_shot_preds.json')

## Evaluate Performance

In [None]:
# Prepare gold standard and predictions for evaluation
dev_gold = [s['terms'] for s in dev_sentences]

# Calculate metrics
precision, recall, f1, tp, fp, fn = micro_f1_score(dev_gold, llm_preds)
type_precision, type_recall, type_f1 = type_f1_score(dev_gold, llm_preds)

print("\n" + "="*60)
print("LLM ZERO-SHOT BASELINE RESULTS")
print("="*60)
print("\nMicro-averaged Metrics:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print(f"  TP={tp}, FP={fp}, FN={fn}")

print("\nType-level Metrics:")
print(f"  Type Precision: {type_precision:.4f}")
print(f"  Type Recall:    {type_recall:.4f}")
print(f"  Type F1 Score:  {type_f1:.4f}")
print("="*60)

NameError: name 'llm_preds' is not defined

In [None]:
# Show example predictions
print("Example Predictions:\n")

count = 0
for i in range(len(dev_sentences)):
    if len(dev_gold[i]) > 0 and count < 5:
        print(f"Sentence: {dev_sentences[i]['sentence_text'][:100]}...")
        print(f"Gold terms: {dev_gold[i][:5]}")
        print(f"LLM predictions: {llm_preds[i][:5]}")
        
        correct = set(dev_gold[i]) & set(llm_preds[i])
        missed = set(dev_gold[i]) - set(llm_preds[i])
        wrong = set(llm_preds[i]) - set(dev_gold[i])
        
        print(f"✓ Correct: {len(correct)}")
        print(f"✗ Missed: {len(missed)}")
        print(f"✗ Wrong: {len(wrong)}")
        print("-"*80)
        print()
        
        count += 1