In [1]:
# ==========================================
# PIPELINE BATCH PARA EXTRACCIÓN ESTRUCTURADA
# ==========================================

In [2]:
# -----------------------------
# 1. LIBRERÍAS.
# -----------------------------
from __future__ import annotations
from typing import List, Optional, Literal
from pydantic import BaseModel, Field
import pandas as pd
import json, pathlib
from openai import OpenAI

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

ModuleNotFoundError: No module named 'pydantic'

In [None]:
# -----------------------------
# 2. CONSTANTS.
# -----------------------------
# Project root as a plain string; it keeps its trailing slash because other
# cells build paths by appending to it.
project_path = "C:/Users/i_link/Maestría/Text Mining/nlp_dmuba/"
# Parquet dataset located under the project root.
dataset_file_path = f"{project_path}1-Scraping/dataset_consolidado/df.parquet"

In [None]:
# -----------------------------
# 3. DATA LOADING.
# -----------------------------
# Load the parquet dataset into a DataFrame.
df = pd.read_parquet(path=dataset_file_path)

In [None]:
# Optional: work on a small random sample while testing the pipeline.
sample = 50
# Keep only rows that have a value in "contenido".
df_with_content = df.dropna(subset=["contenido"])
# Cap the requested size at the number of available rows: DataFrame.sample
# raises ValueError when asked for more rows than exist (replace=False).
df_sample = (
    df_with_content
    .sample(min(sample, len(df_with_content)), random_state=42)
    .reset_index(drop=True)
)

In [None]:
# -----------------------------
# 4. READ THE OPENAI API KEY.
# -----------------------------
# The key is read from a text file under the project's .secrets folder;
# surrounding whitespace (e.g. a trailing newline) is stripped before use.
api_key_file = pathlib.Path(project_path + ".secrets/openai_api_key.txt")
key = api_key_file.read_text().strip()

client = OpenAI(api_key=key)

In [None]:
# -----------------------------
# 5. SYSTEM AND USER PROMPTS.
# -----------------------------
# System message used for every request. The prompt text itself is runtime
# data and is intentionally left in Spanish.
SYSTEM_PROMPT = """
Eres un analista económico-financiero especializado en Argentina.
Objetivo: extraer datos ESTRUCTURADOS de una noticia para modelar el MERVAL.
Reglas de oro:
- Usa SOLO el texto de la noticia.
- Si no hay evidencia clara: usa 0.0, "unknown" o null.
- Valores en [-1..1]; confianza y calidad en [0..1].
- horizonte_dias SOLO si se menciona explícitamente.
- Listas sin duplicados; tickers en MAYÚSCULAS.
"""

# Per-article user message; the {placeholders} are filled with str.format().
USER_TEMPLATE = """
Diario: {diario}
Fecha: {fecha}
Seccion: {seccion}
Titulo: {titulo}
Contenido: {contenido}

Devuelve SOLO el JSON con el esquema pedido.

"""

In [None]:
# -----------------------------
# 6. BUILD THE JSONL FILE FOR THE BATCH.
# -----------------------------
# One line per article: each line is a self-contained request
# (custom_id + method + url + body) for the /v1/responses endpoint.
batch_requests_path = pathlib.Path(project_path) / "batch_requests.jsonl"


def _text_or_unknown(value):
    """Return ``value`` as text, or "unknown" when it is missing (None/NaN/NaT).

    ``row.get(col, "unknown")`` only falls back when the column itself is
    absent; a present-but-null cell would otherwise reach the prompt as
    "nan" or "None".
    """
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return "unknown"
    return str(value)


with open(batch_requests_path, "w", encoding="utf-8") as f:
    for i, row in df_sample.iterrows():
        # Truncate the article body to 8000 characters to bound request size.
        contenido = (row.get("contenido") or "")[:8000]
        prompt = USER_TEMPLATE.format(
            diario=_text_or_unknown(row.get("diario")),
            fecha=_text_or_unknown(row.get("fecha")),
            seccion=_text_or_unknown(row.get("seccion")),
            titulo=_text_or_unknown(row.get("titulo")),
            contenido=contenido
        )

        request_dict = {
            # custom_id is what ties each output line back to its df_sample row.
            "custom_id": f"row_{i}",
            "method": "POST",
            "url": "/v1/responses",
            "body": {
                "model": "gpt-5-mini",
                "input": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt}
                ],
                # JSON output is requested through `text.format` in the raw
                # request body; `text_format` is an SDK-side helper argument,
                # not a REST field. JSON mode expects the word "JSON" to appear
                # in the input (the user template mentions it).
                # NOTE(review): switch to {"type": "json_schema", ...} once the
                # target schema is defined, to enforce the exact structure.
                "text": {"format": {"type": "json_object"}}
            }
        }
        # ensure_ascii=False keeps accented characters readable in the file.
        f.write(json.dumps(request_dict, ensure_ascii=False) + "\n")

print(f"✅ Archivo JSONL creado en {batch_requests_path}")

In [None]:
# -----------------------------
# 7. UPLOAD THE FILE AND CREATE THE BATCH.
# -----------------------------
# The Batch API expects the ID of a previously uploaded file (purpose="batch"),
# not a file object, so the JSONL is uploaded first. The `with` block makes
# sure the local file handle is closed after the upload.
with open(batch_requests_path, "rb") as batch_file:
    batch_input_file = client.files.create(file=batch_file, purpose="batch")

batch_job = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/responses",
    completion_window="24h",  # OpenAI may take up to 24h depending on volume.
)
print("Batch job creado:", batch_job.id)
print("Status inicial:", batch_job.status)

In [None]:
# -----------------------------
# 8. DOWNLOAD RESULTS (once the batch is ready)
# -----------------------------
# Draft left commented out, presumably because the batch may not have finished
# when the notebook is run top to bottom.
# NOTE(review): before enabling, check that status.status == "completed";
# output_file_id is presumably unset until then — confirm against the Batch API docs.
# NOTE(review): in the openai SDK `files.retrieve_content` appears to be
# deprecated and to return `str`, which cannot be written to a file opened in
# "wb" mode; `client.files.content(file_id).content` (bytes) looks like the
# intended call — TODO confirm.
# status = client.batches.retrieve(batch_job.id)
# result_file = client.files.retrieve_content(status.output_file_id)
# with open(project_path + "batch_results.jsonl", "wb") as f_out:
#     f_out.write(result_file)

In [None]:
# -----------------------------
# 9. PARSE THE JSONL AND JOIN WITH THE DATAFRAME
# -----------------------------
# Draft left commented out; it reads the downloaded results file line by line
# and concatenates the extracted features next to df_sample.
# NOTE(review): per the Batch API docs each output line appears to be shaped
# like {"custom_id": ..., "response": {"status_code": ..., "body": {...}},
# "error": ...}; there is no top-level "output" key, so data.get("output", {})
# would presumably collect only empty dicts — TODO confirm the body layout
# for /v1/responses before enabling.
# NOTE(review): output order is not guaranteed to match input order, so join
# on custom_id ("row_{i}") rather than concatenating by position.
# df_results = []
# with open(project_path + "batch_results.jsonl", "r", encoding="utf-8") as f_in:
#     for line in f_in:
#         data = json.loads(line)
#         df_results.append(data.get("output", {}))
# df_features = pd.json_normalize(df_results, sep="__")
# df_final = pd.concat([df_sample.reset_index(drop=True), df_features], axis=1)