# Test pratico per colloquio tecnico con Rogue Waves AI
> Silvano Quarto, 30/06/2025 ore 16:00

## Setup & imports

In [None]:
!git clone https://github.com/silvano315/Information-extractor-from-text-with-OpenAI.git
%cd Information-extractor-from-text-with-OpenAI
!pip install -r requirements.txt

In [1]:
# Import libraries

from src.io.load_data import load_articles_json, load_ground_truth_json
from src.analysis.eda import EDA
from src.preprocessing.text_utils import preprocess_articles_dataset
from src.llm.openai_client import process_single_article
from src.validation.batch_processing import process_ground_truth_articles, save_final_results
from src.validation.metrics_evaluation import run_complete_evaluation

## Load Data

In [2]:
# Load cleaned articles from JSON file.
# `articles` is passed to EDA and indexed directly (articles[0], ...) by the sample extraction cells.

articles = load_articles_json("data/raw/clean_articles.json")

Loaded 699 articles from data/raw/clean_articles.json


## Articles exploration

In [3]:
# Initialize EDA class on the loaded articles; the stats and preview cells call its methods

eda = EDA(articles)

In [8]:
# Basic stats: text-length summary first, then token summary (one printed line each)
for stats_method in (eda.text_length_stats, eda.token_stats):
    print(stats_method())

{'num_documents': 699, 'avg_length_chars': 2443, 'min_length_chars': 1445, 'max_length_chars': 2985, 'avg_words': 352, 'min_words': 212, 'max_words': 419}
{'avg_tokens': 446, 'min_tokens': 262, 'max_tokens': 500, 'total_tokens': 311938}


In [5]:
# preview_articles() prints the previews itself and returns None, so call it
# directly; wrapping it in print() only appended a stray "None" to the output.
eda.preview_articles(3)


--- Article 1 (ID: fbf3a70f...) ---
Length: 2132 chars, 310 words
Preview: **Scientific Innovations in Raossi: A Look into Recent Biological Research**

*By Ramona Micca | December 27, 2024*

In the small town of Raossi, a significant advancement in biological research has e...
--------------------------------------------------

--- Article 2 (ID: 46e67824...) ---
Length: 2520 chars, 377 words
Preview: **Football Community Faces Setbacks in Campiglia Marittima Stazione**  
*By Ilaria Lussu | October 19, 2024*

In a disheartening turn of events for football enthusiasts in Campiglia Marittima Stazione...
--------------------------------------------------

--- Article 3 (ID: e3487f20...) ---
Length: 2317 chars, 333 words
Preview: **Drug Trafficking Investigation Leads to Suspect Arrest in Bruzzano Zeffirio**  
*By Eraldo Serao, April 4, 2025*

In a significant development in the ongoing fight against organized crime, law enfor...
--------------------------------------------------
None


## Preprocessing

In [4]:
# Minimal preprocessing step.
# First argument is the raw articles file; the second is presumably the
# destination of the preprocessed copy (confirm against preprocess_articles_dataset).
preprocessed_articles_filepath = preprocess_articles_dataset(
    "data/raw/clean_articles.json",
    "data/preprocessed/preprocessed_articles.json",
)

Loaded 699 articles from data/raw/clean_articles.json
Total articles processed: 699


## Extract information with OpenAI
> This is just a test section to extract information and validate outputs

In [3]:
# Peek at the first loaded record to see the raw structure that is handed to the extractor
print(articles[0])

{'id': 'fbf3a70f-e4de-45e7-8f12-33b957c11490', 'text': '**Scientific Innovations in Raossi: A Look into Recent Biological Research**\n\n*By Ramona Micca | December 27, 2024*\n\nIn the small town of Raossi, a significant advancement in biological research has emerged, thanks to the collaborative efforts of a dedicated team of scientists. This group, comprising Sig. Orlando Trobbiani, a researcher with a keen focus on biological sciences; Evangelista Vezzali, an inventor and data analyst; and Dott. Donatello Legnante, a postdoctoral fellow, has recently presented their latest findings that could have implications for the field of biology.\n\nThe team has been working diligently to explore new biological concepts and methodologies, aiming to enhance our understanding of complex biological systems. While specific details of their research have yet to be disclosed to the public, the team\'s collective expertise suggests a promising direction in the study of biology that could potentially ad

In [4]:
# Single-article smoke test on articles[0]: run the extractor, then print the raw result
sample_results = process_single_article(
    articles[0],
    model="gpt-4o-mini",
)
print(sample_results)

* 'schema_extra' has been renamed to 'json_schema_extra'


{'article_id': 'fbf3a70f-e4de-45e7-8f12-33b957c11490', 'extraction': {'people': [{'name': 'Orlando Trobbiani', 'roles': ['Researcher', 'Biological Sciences']}, {'name': 'Evangelista Vezzali', 'roles': ['Inventor', 'Data Analyst']}, {'name': 'Donatello Legnante', 'roles': ['Postdoctoral Fellow']}], 'topic': 'Science', 'subtopic': 'Biology', 'date': '2024-12-27'}, 'success': True, 'error': None, 'metadata': {'model': 'gpt-4o-mini', 'tokens_used': 1011}}


In [3]:
# Single-article smoke test on articles[1]: run the extractor, then print the raw result
sample_results = process_single_article(
    articles[1],
    model="gpt-4o-mini",
)
print(sample_results)

{'article_id': '46e67824-f1c2-425a-912a-02eaccd3c494', 'extraction': {'people': [{'name': 'Giuliano Bembo', 'roles': ['Trainer']}, {'name': 'Roman Filzi-Verri', 'roles': ['Club Owner', 'Agent']}, {'name': 'Ninetta Brunelleschi-Pepe', 'roles': ['Referee', 'Athlete']}, {'name': 'Giulio Stucchi', 'roles': ['Sports Commentator']}, {'name': 'Sebastiano Roncalli-Pedersoli', 'roles': ['Sports Photographer', 'Mascot']}], 'topic': 'Sports', 'subtopic': 'Football', 'date': '2024-10-19'}, 'success': True, 'error': None, 'metadata': {'model': 'gpt-4o-mini', 'tokens_used': 1175}}


In [4]:
# Single-article smoke test on articles[2]: run the extractor, then print the raw result
sample_results = process_single_article(
    articles[2],
    model="gpt-4o-mini",
)
print(sample_results)

{'article_id': 'e3487f20-56dd-4a16-a2ef-df7b756a459d', 'extraction': {'people': [{'name': 'Gelsomina Rizzoli', 'roles': ['suspected mafia boss']}, {'name': 'Marcella Priuli', 'roles': ['Detective', 'investigator']}, {'name': 'Augusto Romano', 'roles': ['Crime Scene Investigator', 'key witness']}, {'name': 'Serafina Mennea', 'roles': ['Judge']}, {'name': 'Livio Bonatti', 'roles': ['court clerk', 'coroner']}, {'name': 'Annalisa Panzera-Iadanza', 'roles': ['victim']}], 'topic': 'Crime', 'subtopic': 'Drug Trafficking', 'date': '2025-04-04'}, 'success': True, 'error': None, 'metadata': {'model': 'gpt-4o-mini', 'tokens_used': 1125}}


## Extract information for Validation
> Using only the UUIDs found in `200_gt_evaluation_human.json`

In [3]:
# Load ground truth data.
# NOTE(review): `ground_truth` is not referenced by any later cell visible here; the batch
# run is given the same file path instead — confirm whether this load is still needed.

ground_truth = load_ground_truth_json("data/raw/200_gt_evaluation_human.json")

Loaded 200 ground truth samples from data/raw/200_gt_evaluation_human.json


In [5]:
# Run the extraction over the articles matched against the ground-truth file.
# NOTE(review): this reads the raw articles file, not the file written to
# data/preprocessed/ by the Preprocessing section — confirm that is intended.
results = process_ground_truth_articles(
    articles_filepath="data/raw/clean_articles.json",
    ground_truth_filepath="data/raw/200_gt_evaluation_human.json",
    model="gpt-4o-mini",
    batch_size=10  # presumably the interval between intermediate saves (the log reports one after article 10)
)

Starting Ground Truth Article Processing
Loading datasets...
Loaded 699 articles from data/raw/clean_articles.json
Loaded 200 ground truth samples from data/raw/200_gt_evaluation_human.json
Found 200 UUIDs in ground truth
Matched 200 articles with ground truth
Coverage: 200/200 ground truth entries

Processing 200 articles...
Processing 1/200 - ID: 0142e7ab...
Processing 2/200 - ID: dfda542f...
Processing 3/200 - ID: 5b4f5864...
Processing 4/200 - ID: 778abe7e...
Processing 5/200 - ID: 6f3999bf...
Processing 6/200 - ID: c1d30eb2...
Processing 7/200 - ID: 0793ac90...
Processing 8/200 - ID: 6dbdaaf8...
Processing 9/200 - ID: 81516984...
Processing 10/200 - ID: 3c35dac8...
Saved intermediate results (batch 10)
Processing 11/200 - ID: 1ac4a5c6...
Processing 12/200 - ID: ea330a23...
Processing 13/200 - ID: a72d5548...
Processing 14/200 - ID: aef1cb78...
Processing 15/200 - ID: 6ede7871...
Processing 16/200 - ID: f6e6759e...
Processing 17/200 - ID: f74a4812...
Processing 18/200 - ID: f2bfee3

In [6]:
# Persist the extraction results. When there is nothing to save, say so
# explicitly instead of skipping silently, so a results file left over from an
# earlier run is not mistaken for the output of this one.
if results:
    save_final_results(results)
else:
    print("No extraction results to save - skipping save_final_results().")

Final results saved to: data/output/extraction_results.json


## Metrics Evaluation

In [2]:
# Score the saved predictions against the ground-truth file.
# The return value gets its own name so the extraction `results` produced by
# the batch-processing cell are not overwritten by the evaluation output
# (re-running the save cell afterwards would otherwise persist the wrong object).
evaluation_results = run_complete_evaluation(
    "data/output/extraction_results.json",
    "data/raw/200_gt_evaluation_human.json"
)

Starting Final Evaluation
Loading data...
Loaded 200 predictions
Loaded 200 ground truth entries

Evaluating entity and role extraction...
Evaluating topic/subtopic classification...

EVALUATION RESULTS:
   Entity F1: 0.812
   Role F1: 0.741
   Topic Accuracy: 0.975
   Subtopic Accuracy: 0.970
Detailed report saved to: data/output/evaluation_report.json
