In [103]:
import pandas as pd
import codecs
import json
from collections import defaultdict

def load_data(file_path):
  """
  Loads term annotations from a CSV or JSON file and returns a dictionary
  where keys are (doc_id, par_id, sent_id) tuples and values are lists of
  terms.

  CSV rows are unpacked positionally into five fields: doc_id, par_id,
  sent_id, an ignored fourth field, and the term. Each row contributes at
  most one term; a row whose term is empty still registers its sentence key
  with an empty list. The term column is read as text, so terms that look
  like numbers are kept as written instead of being parsed as numbers.

  JSON files are expected to hold a top-level "data" list whose rows provide
  "document_id", "paragraph_id", "sentence_id" and "term_list".

  Args:
    file_path: The path to the input file (CSV or JSON). The extension is
      matched case-insensitively; path-like objects are accepted.

  Returns:
    A plain dict mapping (doc_id, par_id, sent_id) to a list of terms.
    Looking up a missing key raises KeyError instead of silently inserting it.

  Raises:
    ValueError: If the file format is not supported.
  """
  path = str(file_path)
  lowered = path.lower()

  if lowered.endswith('.csv'):
    # Column 4 (the term) is forced to text: with dtype inference an
    # all-numeric term column would yield numbers, which have no .strip().
    df = pd.read_csv(path, converters={4: str}).fillna('')
    data = {}
    for doc_id, par_id, sent_id, _, term in df.itertuples(index=False):
      # setdefault registers the sentence even when the row has no term
      terms = data.setdefault((doc_id, par_id, sent_id), [])
      term = str(term).strip()
      if term:
        terms.append(term)
  elif lowered.endswith('.json'):
    with open(path, 'r', encoding='utf-8') as f:
      json_data = json.load(f)
    data = {(row["document_id"], row["paragraph_id"], row["sentence_id"]): row["term_list"]
            for row in json_data["data"]}
  else:
    raise ValueError("Unsupported file format. Only CSV and JSON files are supported.")
  return data

In [104]:
def micro_f1_score(gold_standard, system_output):
  """
  Evaluates a term extraction system's performance using Precision, Recall,
  and F1 score based on individual term matching (micro-average).

  Items are paired by position. Within an item, terms are compared as sets,
  so duplicates inside one item count once. If the two inputs differ in
  length, the unpaired items are scored against an empty counterpart:
  leftover gold terms count as false negatives and leftover system terms as
  false positives, rather than being silently dropped.

  Args:
    gold_standard: A list of lists, where each inner list contains the
                   gold standard terms for an item.
    system_output: A list of lists, where each inner list contains the
                   terms extracted by the system for the corresponding item.

  Returns:
    A tuple containing the Precision, Recall, and F1 score. A score whose
    denominator is zero is reported as 0.0.
  """
  # Local import keeps this cell runnable without touching the import cell
  from itertools import zip_longest

  total_true_positives = 0
  total_false_positives = 0
  total_false_negatives = 0

  # fillvalue=() pairs any unmatched item with an empty term collection
  for gold, system in zip_longest(gold_standard, system_output, fillvalue=()):
    gold_set = set(gold)
    system_set = set(system)

    total_true_positives += len(gold_set & system_set)
    total_false_positives += len(system_set - gold_set)
    total_false_negatives += len(gold_set - system_set)

  predicted = total_true_positives + total_false_positives
  expected = total_true_positives + total_false_negatives

  # Micro-average: ratios are taken over the pooled counts, not per item
  precision = total_true_positives / predicted if predicted > 0 else 0.0
  recall = total_true_positives / expected if expected > 0 else 0.0
  f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

  return precision, recall, f1

In [105]:
def type_f1_score(gold_standard, system_output):
  """
  Scores a term extraction system at the type level: every distinct term is
  counted once for the whole dataset, no matter how many items it occurs in.

  Args:
    gold_standard: A list of lists, where each inner list contains the
                   gold standard terms for an item.
    system_output: A list of lists, where each inner list contains the
                   terms extracted by the system for the corresponding item.

  Returns:
    A tuple (Type Precision, Type Recall, Type F1). A score whose
    denominator is zero is reported as 0.
  """
  # Pool the terms of all items into one vocabulary per side
  gold_vocab = set().union(*gold_standard)
  system_vocab = set().union(*system_output)

  matched = len(gold_vocab & system_vocab)     # in both vocabularies
  spurious = len(system_vocab - gold_vocab)    # extracted but not gold
  missed = len(gold_vocab - system_vocab)      # gold but never extracted

  extracted_total = matched + spurious
  if extracted_total > 0:
    type_precision = matched / extracted_total
  else:
    type_precision = 0

  gold_total = matched + missed
  if gold_total > 0:
    type_recall = matched / gold_total
  else:
    type_recall = 0

  score_sum = type_precision + type_recall
  if score_sum > 0:
    type_f1 = 2 * (type_precision * type_recall) / score_sum
  else:
    type_f1 = 0

  return type_precision, type_recall, type_f1

In [106]:
# Location of the gold-standard annotations handed to load_data; the path is
# relative, so it resolves against the notebook's working directory.
GOLD_STANDARD_PATH = "../data/subtask_a_dev.json"

In [107]:
# Read the gold-standard annotations, keyed by (doc_id, par_id, sent_id)
gold_standard_dict = load_data(GOLD_STANDARD_PATH)

# Drop the keys and keep the term lists, in the dictionary's iteration order
gold_standard = [*gold_standard_dict.values()]

Dataset	
Micro-Precision	Micro-Recall	Micro-F1	Type-Precision	Type-Recall	  Type-F1
	0.439	      0.616	         0.513	       0.372	       0.636	    0.47


In [108]:
import os
import glob

# Path to predictions directory and baseline
predictions_dir = "../src/predictions/"
gold_standard_file = "../data/subtask_a_dev.json"

# Column order of the results table. Passing it to the DataFrame constructor
# keeps the table well-formed (and sortable) even when nothing was evaluated.
result_columns = [
    'file_name',
    'micro_precision', 'micro_recall', 'micro_f1',
    'type_precision', 'type_recall', 'type_f1',
]

# Get all JSON files in the predictions directory. glob returns them in
# arbitrary order, so sort to make the evaluation order reproducible.
prediction_files = sorted(glob.glob(os.path.join(predictions_dir, "*.json")))

# Load gold standard once and fix the sentence order used for every system
gold_standard_dict = load_data(gold_standard_file)
gold_keys = list(gold_standard_dict)
gold_standard = [gold_standard_dict[key] for key in gold_keys]

# Store results
results = []

# Evaluate each prediction file
for pred_file in prediction_files:
    file_name = os.path.basename(pred_file)
    # Skip files whose name does not carry the "subtask_a_dev_" marker
    if "subtask_a_dev_" not in file_name:
        continue

    try:
        # Load system output
        system_output_dict = load_data(pred_file)

        # Pair predictions with gold by sentence key, not by file position:
        # positional pairing silently mis-scores a prediction file that is
        # reordered or incomplete.
        missing_keys = [key for key in gold_keys if key not in system_output_dict]
        extra_keys = set(system_output_dict) - set(gold_keys)
        if missing_keys or extra_keys:
            print(
                f"Warning: {file_name}: {len(missing_keys)} gold sentence(s) without "
                f"a prediction (scored as empty), {len(extra_keys)} predicted "
                f"sentence(s) not in the gold standard (ignored)"
            )
        system_output = [system_output_dict.get(key, []) for key in gold_keys]

        # Calculate micro-averaged metrics
        precision, recall, f1 = micro_f1_score(gold_standard, system_output)

        # Calculate type metrics
        type_precision, type_recall, type_f1 = type_f1_score(gold_standard, system_output)

        # Store results with cleaned file name
        clean_name = file_name.replace("subtask_a_dev_", "").replace("_preds", "").replace(".json", "")
        results.append({
            'file_name': clean_name,
            'micro_precision': round(precision, 3),
            'micro_recall': round(recall, 3),
            'micro_f1': round(f1, 3),
            'type_precision': round(type_precision, 3),
            'type_recall': round(type_recall, 3),
            'type_f1': round(type_f1, 3)
        })
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole comparison
        print(f"Error evaluating {file_name}: {e}")

# Create DataFrame and sort by micro F1 score (descending). The stable sort
# keeps tied systems in file-name order.
results_df = pd.DataFrame(results, columns=result_columns)
results_df = results_df.sort_values('micro_f1', ascending=False, kind='stable').reset_index(drop=True)

# Display the table
print("\n" + "="*100)
print("EVALUATION RESULTS")
print("="*100)
print(results_df.to_string(index=False))

# Save to CSV
output_csv = "subtask_a_evaluation_results.csv"
results_df.to_csv(output_csv, index=False)
print(f"\nResults saved to {output_csv}")


EVALUATION RESULTS
          file_name  micro_precision  micro_recall  micro_f1  type_precision  type_recall  type_f1
final_MD_submission            0.773         0.723     0.747           0.725        0.665    0.694
final_SM_submission            0.774         0.721     0.746           0.725        0.665    0.694
           final_MD            0.765         0.727     0.745           0.718        0.674    0.695
           final_SM            0.743         0.725     0.734           0.688        0.674    0.681
          bert_2e-5            0.732         0.710     0.721           0.677        0.649    0.662
   spacy_trained_MD            0.598         0.339     0.433           0.585        0.384    0.464
   spacy_trained_SM            0.476         0.326     0.387           0.405        0.368    0.385
  spacy_baseline_SM            0.289         0.341     0.313           0.349        0.360    0.354
  spacy_baseline_MD            0.289         0.341     0.313           0.349        0.360

## Research Questions and Considerations

### RQ1 — Are zero-shot/few-shot LLMs better than classical baselines or supervised systems?
To answer this question, I compared three groups:
- LLM zero-shot / few-shot → llm_zero_shot, llm_few_shot
- Rule-based / spaCy baselines → vanilla, spacy_baseline, spacy_term_extraction_patterns
- My supervised systems (BERT + ensemble) → all bert_* and ensemble_* models

#### Observations
- Zero-shot and few-shot LLMs outperform neither the baselines nor the supervised systems. Their micro-F1 scores (≈0.545) are significantly lower than those of my supervised models.
- LLMs tend to show high recall but very low precision, generating many false positives.

My supervised models clearly outperform the LLMs:
- Best ensemble reaches micro-F1 = 0.751
- LLM few/zero-shot are stuck around 0.545
- Rule-based baselines (e.g., vanilla) unexpectedly perform similarly to LLM zero-shot, confirming that the LLMs struggle without fine-tuning.

#### Conclusion
In this task, zero-shot/few-shot LLMs are not competitive.
Supervised approaches—especially BERT with linguistic filtering and reranking—are 20+ F1 points better.
For domain-specific ATE in Italian, contextual supervision is essential.

## RQ2 — Does model size or complexity improve extraction quality?
### Observations
- Basic BERT models reach micro-F1 ≈ 0.69–0.71.
- Hyperparameter tuning (e.g., lr = 2e-5) and text cleaning improve BERT performance: up to 0.733 micro-F1
- Increasing model complexity through an ensemble pipeline yields further gains: best ensemble = 0.751 micro-F1.

The ensemble combines:
- BERT predictions
- outputs of the trained spaCy model
- vocabulary-based filtering
- refined LLM reranking

Each component adds robustness and reduces false positives.

### Conclusion
Performance improves not because the model is larger, but because the pipeline is richer and more structured.

## RQ3 — Which approach performs best overall?
- Rule-based systems have reasonable recall but very low precision → low F1.
- spaCy trained improves over baselines but lacks contextual accuracy.
- LLMs without fine-tuning fail to reach competitive precision.
- BERT models are consistently strong and greatly benefit from cleaning + tuning.
- Ensemble models outperform all others, balancing recall and precision effectively.

The best-performing approach is the supervised ensemble pipeline combining
BERT + spaCy + vocabulary filtering, which achieves the highest and most stable results across all metrics.

## **Overall Improvements in My Pipeline**
Throughout the project, I introduced several pipeline enhancements that contributed to the final performance:
1. Text cleaning and normalization: reduced noise, improved token consistency, and removed truncated/partial terms → fewer false positives.

2. BERT fine-tuning optimization (learning rate 2e-5, more epochs, cleaned data): consistent improvement of 3–5 F1 points over vanilla BERT.

3. Training spaCy on the ATE-IT training set: added robustness for multi-word terms and syntactic detection.

4. Vocabulary-based filtering (strong + weak vocab): eliminated common false positives introduced by spaCy expansions.

5. LLM reranking: did not improve the metrics overall.

6. Final ensemble integration: combining signals from BERT + spaCy + vocabulary + LLM reranking yields the best model overall, outperforming any single component.