Este notebook asume que se está ejecutando en Google Colab, y que el dataset `LLM-Redial` disponible en https://drive.google.com/drive/folders/1TIP4PFm9z0C4R4--KnHoWuiB1uK-dv5m se encuentra descargado en el drive del usuario.

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import shutil
import os

source_path = '/content/gdrive/MyDrive/Proyecto LLMonkeys/Tools.py'
destination_path = '/content/Tools.py'

shutil.copy(source_path, destination_path)

'/content/Tools.py'

In [None]:
import zipfile

zip_path = '/content/gdrive/MyDrive/Proyecto LLMonkeys/LLM_Redial.zip'
extract_path = '/content/LLM_Redial'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [None]:
import Tools as t

# Cargar datos del Dataset

path = "./LLM_Redial/Movie"

final_data_path = '{}/final_data.jsonl'.format(path)
Conversation_path = '{}/Conversation.txt'.format(path)
user_map_path = '{}/user_ids.json'.format(path)
item_map_path = '{}/item_map.json'.format(path)

final_data = t.read_jsonl(final_data_path)
user_map = t.read_json(user_map_path)
item_map = t.read_json(item_map_path)
Conversation = t.read_dialogue(Conversation_path)

**Separación de la información de los usuarios en train y test**


In [None]:
import json
import random

path = './LLM_Redial/Movie/final_data.jsonl'

# Cada entrada de final_data.jsonl se ve así
#{
#  "A30Q8X8B1S3GGT": {
#    "history_interaction": [...],
#    "user_might_like": [...],
#    "Conversation": [...]
#  }
#}


with open(path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file]

# Dividimos los dialogos en 80% & 20% para training y testing.
n_conversations = 10089
train_len = n_conversations * 0.8
test_len = n_conversations * 0.2

train_conv = []
test_conv = []
used_users = []

aux = []

# Integramos el 20% de usuarios a Test
while len(aux) < test_len:
    try:
        convs = []
        # "La semilla random.seed(42) se reinicia dentro del bucle, por lo que no tiene efecto real. Debería estar fuera del bucle para resultados reproducibles."
        random.seed(42)

        # Usando used_users aseguramos que un mismo usuario no aparezca en Train y Test al mismo tiempo
        user_id = random.choice(list(set(user_map.keys())^set(used_users)))
        used_users.append(user_id)

        # Guardamos la info del usuario y sus dialogos
        user_data = next((item[user_id] for item in data if user_id in item), None)
        user_conversations = user_data.get("Conversation", [])

        for i in range(len(user_conversations)):
            selected_conversation = user_conversations[i]
            conversation_details = list(selected_conversation.values())[0]
            conversation_id = conversation_details["conversation_id"]
            if conversation_id != 10088:
                conversation = Conversation[Conversation.index(f"{conversation_id}\n"):Conversation.index(f"{conversation_id+1}\n")]
            else:
                conversation = Conversation[Conversation.index(f"{conversation_id}\n"):]

            convs.append(conversation)
            aux.append(conversation)
        test_conv.append([user_id,convs])
    except ValueError:
        print("ValueError: ", user_id, f"{conversation_id}\n")

# Integramos el resto de los usuarios y su info a Training
for user_id in list(set(user_map.keys()) ^ set(used_users)):
    try:
        convs = []
        user_data = next((item[user_id] for item in data if user_id in item), None)
        user_conversations = user_data.get("Conversation", [])
        for i in range(len(user_conversations)):
            selected_conversation = user_conversations[i]
            conversation_details = list(selected_conversation.values())[0]
            conversation_id = conversation_details["conversation_id"]
            if conversation_id != 10088:
                conversation = Conversation[Conversation.index(f"{conversation_id}\n"):Conversation.index(f"{conversation_id+1}\n")]
            else:
                conversation = Conversation[Conversation.index(f"{conversation_id}\n"):]

            convs.append(conversation)
        train_conv.append([user_id,convs])
    except ValueError:
        print("ValueError: ", user_id, f"{conversation_id}\n")

print("Número de usuarios en conjunto de TEST:", len(test_conv))
print("Número de usuarios en conjunto de TRAIN:", len(train_conv))
print("Total de usuarios en test + train:", len(test_conv) + len(train_conv))
print("Total de conversaciones originales esperadas:", n_conversations)

Número de usuarios en conjunto de TEST: 603
Número de usuarios en conjunto de TRAIN: 2528
Total de usuarios en test + train: 3131
Total de conversaciones originales esperadas: 10089


In [None]:
import re

def format_ans(ans, n):
    answer = "None"
    try:
        answer_list = []
        current_pos = 0

        for j in range(1, n + 1):
            # Busca el patrón del número y el inicio del texto
            pattern_start = f"{j}. "
            start_index = ans.find(pattern_start, current_pos)

            if start_index == -1:
                break  # Si no se encuentra el número, salimos del bucle

            start_text = start_index + len(pattern_start)

            # Busca el final del texto: el inicio del siguiente número O un " - "
            next_number_start = ans.find(f"{j + 1}. ", start_text)
            dash_start = ans.find(" - ", start_text)

            end_text = -1

            # Determina el final del texto basado en la primera ocurrencia
            if next_number_start != -1 and dash_start != -1:
                end_text = min(next_number_start, dash_start)
            elif next_number_start != -1:
                end_text = next_number_start
            elif dash_start != -1:
                end_text = dash_start
            else:
                # Si no hay siguiente número ni " - ", toma hasta el final de la línea o la cadena
                end_line = ans.find("\n", start_text)
                if end_line != -1:
                    end_text = end_line
                else:
                    end_text = len(ans)

            if end_text == -1: # Si no se encontró un final válido, toma hasta el final de la cadena
                end_text = len(ans)

            movie_name = ans[start_text:end_text].strip()
            answer_list.append(movie_name)
            current_pos = end_text # Actualiza la posición para la próxima búsqueda

        if not answer_list: # Si no se encontraron películas
             return "Formato de respuesta incorrecto", i

        answer_str = ", ".join([f'{movie}' for movie in answer_list])
        answer = f"{i}. {answer_str}\n"
        i += 1

    except Exception as e:
        answer = f"Error: {e}"

    finally:
        pattern = r"\(\d{4}\)"
        answer_list = [m.replace('\"', '') for m in answer_list]
        answer_list = [re.sub(pattern, '', m)[:] for m in answer_list]
        return answer_list

### Selección de diálogo a utilizar

In [None]:
num_test_items = 20 # Decisión arbitraria

In [None]:
# Generamos ejemplos de conversaciones al azar

rand_conversations = []

for n in range(num_test_items):
  random.seed(n)
  rand_user = random.choice(test_conv)
  user_id = rand_user[0]
  user_conversation  = random.choice(rand_user[1])

  user_data = next((item[user_id] for item in data if user_id in item), None)
  convs = user_data.get("Conversation", [])

  for i in range(len(rand_user[1])):
    if user_conversation == rand_user[1][i]:
      rand_user_conv_id = i

  dialog = "\n\n".join(user_conversation.split("\n\n")[1:4])
  dialog_id = user_conversation.split("\n\n")[0]
  dialog_ground_truth = list(convs[rand_user_conv_id].values())[0]["rec_item"]

  rand_conversations.append([
      rand_user_conv_id, # 0: Índice de conversación
      dialog, # 1: Texto del diálogo parcial
      user_data, # 2: Info estructurada del usuario
      dialog_id, # 3: ID de conversación
      dialog_ground_truth # 4: Item recomendado como verdad
      ])

print(len(rand_conversations)) # dialog_ground_truth

20


In [None]:
# Appendeamos a rand_conversations las interacciones del usuario para tenerlas a mano

for i in range(num_test_items):
  user_data = rand_conversations[i][2]
  rand_user_interactions = user_data.get("history_interaction", [])

  rand_user_interactions = [item_map[m] for m in rand_user_interactions]
  rand_conversations[i].append(rand_user_interactions)

print(rand_conversations[0][-1])
print( "\n")
print(rand_conversations[0])


['Shogun', ' Transsiberian (Steelbook)', 'Strictly Business VHS', 'Robinson Crusoe on Mars', 'Murder 101', 'Alien: Quadrilogy (Alien / Aliens / Alien 3 / Alien Resurrection)', 'Them VHS', 'Blood Alley VHS', 'Anzio VHS', 'Gold VHS', 'Far Country VHS', 'TCM Spotlight: Errol Flynn Adventures (Desperate Journey / Edge of Darkness 1943 / Northern Pursuit / Uncertain Glory / Objective Burma)', "Mackenna's Gold VHS", 'Tangled (Mandarin Chinese Edition)', 'Journey to the Far Side of the Sun VHS', 'Heaven Knows Mr. Allison VHS', 'The Reef', 'Marooned VHS', 'The Living Daylights The James Bond 007 Collection  VHS', 'Noble House', 'Star Trek Enterprise - The Complete First Season']


[0, 'User: Hi, I\'m Mazon11. I really enjoyed watching "Strictly Business" on VHS. It\'s a wonderful story with many subplots, and it was my introduction to Halle Berry. Halle\'s performance is outstanding!\n\nAgent: That\'s great to hear! I also liked "Strictly Business" on VHS. It\'s not a throwaway movie, despite 

## Preparación de manejo de Few Shot

In [None]:
# Preparamos ejemplos representativos aleatorios para ajustar la información
few_shot_data = []
random.seed(42)
train_users = random.sample(train_conv, 5)

for u in train_users:
  user_data = next((item[u[0]] for item in data if u[0] in item), None)
  user_interactions = user_data.get("history_interaction", [])
  user_interactions = [item_map[m] for m in user_interactions]

  conversation = min(u[1], key=len)
  conversation[conversation.index("User:"):-2]
  convs = user_data.get("Conversation", [])
  for c in convs:
    user_likes = list(c.values())[0]["user_likes"]
    user_dislikes = list(c.values())[0]["user_dislikes"]
    recs= list(c.values())[0]["rec_item"]

    user_likes = [item_map[m] for m in user_likes]
    user_dislikes = [item_map[m] for m in user_dislikes]
    recs = [item_map[m] for m in recs]


  few_shot_data.append({
      # "user_interactions": user_interactions,
      "conversation": conversation,
      # "user_likes": user_likes,
      # "user_dislikes": user_dislikes,
      # "recs": recs
  })

## Modelo GPT 4.1 Nano


## Zero-Shot con interacciones históricas:

In [2]:
from openai import OpenAI
from google.colab import userdata
import time

# Se obtiene la API key del entorno
openai_api_key = userdata.get('OPENAI_API_KEY')

# Cliente de OpenAI
client = OpenAI(api_key=openai_api_key)

k = 1 # Para el avance no aplicaremos sampling al modelo de gpt
outputs_z_s = []
for j in range(num_test_items):
  ctx = (
      "YOU ARE A MOVIE RECOMMENDATION SYSTEM."
      "GENERATE A NUMBERED LIST OF 10 MOVIES. \n"
      "RULES: \n"
      "1. DO NOT write dialogs, explanations nor additional text or information. \n"
      "2. DO NOT repeat movies mentioned in the conversation. \n"
      "3. Response format: \n"
      "1. \"Movie name 1\" \n"
      "2. \"Movie name 2\" \n"
      "... \n"
      "10. \"Movie name 10\" \n"
      "Use ONLY this format and NOTHING else."
  )

  msg = (
      "Based on the following conversation: \n"
      f"{rand_conversations[j][1]} \n\n"
      "And the movies the user has previously interaced with: \n"
      f"{rand_conversations[j][5]}\n\n"
      "Generate a list of 10 recommended movies (JUST NAMES, ONE PER LINE):"
  )

  # Se combina el contexto y el mensaje para hacer el prompt
  prompt = ctx + msg

  for i in range(k):
      try:
          response = client.chat.completions.create(
              model="gpt-4.1-nano",  # Modelo utilizado
              messages=[
                  {"role": "system", "content": "YOU ARE A MOVIE RECOMMENDATION SYSTEM."},
                  {"role": "user", "content": prompt}
              ],
              max_tokens=200,  # Adjust as needed for 10 movie titles and formatting
              temperature=0.4,
              top_p=0.95
          )
          outputs_z_s.append(response.choices[0].message.content)
          # Add a small delay to avoid hitting API rate limits
          time.sleep(1)
      except Exception as e:
          print(f"Error during API call: {e}")
          outputs_z_s.append(f"Error: {e}")
          # Consider adding a longer delay or breaking if there are repeated errors
          time.sleep(5)


# Now outputs_z_s contains the responses from the API calls
# You can process outputs_z_s as needed
# For example, print the responses:
for output in outputs_z_s:
    print(output)

NameError: name 'num_test_items' is not defined

### Formateo de las respuestas de la LLM

In [None]:
formatted_outputs = []
i = 1  # Inicializa el índice para format_ans

for output in outputs_z_s:
  # Llama a format_ans para cada salida
  formatted_output = format_ans(output, 10)
  formatted_outputs.append(formatted_output)

# Ahora formatted_outputs contiene las salidas formateadas
# Puedes imprimir o procesar esta lista
for fo in formatted_outputs:
  print(fo)


print(formatted_outputs)


['Pulp Fiction', 'The Shawshank Redemption', 'Fight Club', 'The Godfather', 'Inception', 'The Dark Knight', 'Forrest Gump', 'Gladiator', 'The Matrix', 'Saving Private Ryan']
['No Country for Old Men', 'The Assassination of Jesse James by the Coward Robert Ford', 'Prisoners', 'The Proposition', 'Cold Mountain', 'The Proposition', 'The Road', 'The Proposition', 'Mystic River', 'The Proposition']
['Hereditary', 'Midsommar', 'The Witch', 'The Babadook', 'Suspiria ', 'The Void', 'The Witching Hour', 'The Invitation', 'The Ritual', 'The Wicker Man']
['The Descent', 'Evil Dead II', 'House of 1000 Corpses', 'Cabin Fever', 'The Void', 'Tucker and Dale vs. Evil', 'The Ritual', 'The Witch', 'Martyrs', 'The Babadook']
['M*A*S*H', 'The Odd Couple', 'The Apartment', 'Some Like It Hot', 'Harold and Maude', 'The Philadelphia Story', 'It’s a Mad, Mad, Mad, Mad World', 'The Great Race', 'The Lady Eve', 'The Fortune Cookie']
['Kung Fu Hustle', 'Tropic Thunder', 'Hot Shots!', 'The Naked Gun', 'Zombieland'

In [None]:
# Guardar en archivo de texto
with open('zeroshotwithinteractions.txt', 'w', encoding='utf-8') as file:
    for i, output in enumerate(outputs_z_s):
        file.write(f"Recomendación {i+1}:\n")
        file.write(output + "\n\n")

print(f"Archivo 'zeroshotwithinteractions.txt' creado con {len(outputs_z_s)} recomendaciones")

Archivo 'zeroshotwithinteractions.txt' creado con 20 recomendaciones


## Few-Shot con interacciones históricas:

In [None]:
from openai import OpenAI
from google.colab import userdata
import time

# Se obtiene la API key del entorno
openai_api_key = userdata.get('OPENAI_API_KEY')

# Cliente de OpenAI
client = OpenAI(api_key=openai_api_key)

k = 1 # Para el avance no aplicaremos sampling al modelo de gpt
outputs_few_shot = []
for j in range(num_test_items):
  ctx = (
      "YOU ARE A MOVIE RECOMMENDATION SYSTEM."
      "GENERATE A NUMBERED LIST OF 10 MOVIES. \n"
      "RULES: \n"
      "1. DO NOT write dialogs, explanations nor additional text or information. \n"
      "2. DO NOT repeat movies mentioned in the conversation. \n"
      "3. Response format: \n"
      "1. \"Movie name 1\" \n"
      "2. \"Movie name 2\" \n"
      "... \n"
      "10. \"Movie name 10\" \n"
      "Use ONLY this format and NOTHING else."
  )

  # Información adicional para completar el few shot (4 conversaciones extra)
  msg = (
      "Based on these 4 examples: \n"
      f"{few_shot_data[0]}\n"
      f"{few_shot_data[1]}\n"
      f"{few_shot_data[2]}\n"
      f"{few_shot_data[3]}\n\n"
      "Based on the following conversation: \n"
      f"{rand_conversations[j][1]} \n\n"
      "And the movies the user has previously interaced with: \n"
      f"{rand_conversations[j][5]}\n\n"
      "Generate a list of 10 recommended movies (JUST NAMES, ONE PER LINE):"
  )



  # Se combina el contexto y el mensaje para hacer el prompt
  prompt = ctx + msg

  for i in range(k):
      try:
          response = client.chat.completions.create(
              model="gpt-4.1-nano",  # Modelo utilizado
              messages=[
                  {"role": "system", "content": "YOU ARE A MOVIE RECOMMENDATION SYSTEM."},
                  {"role": "user", "content": prompt}
              ],
              max_tokens=200,  # Adjust as needed for 10 movie titles and formatting
              temperature=0.4,
              top_p=0.95
          )
          outputs_few_shot.append(response.choices[0].message.content)
          # Add a small delay to avoid hitting API rate limits
          time.sleep(1)
      except Exception as e:
          print(f"Error during API call: {e}")
          outputs_few_shot.append(f"Error: {e}")
          # Consider adding a longer delay or breaking if there are repeated errors
          time.sleep(5)


# Now outputs_few_shot contains the responses from the API calls
# You can process outputs_z_s as needed
# For example, print the responses:
for output in outputs_few_shot:
    print(output)

1. "Pulp Fiction"  
2. "The Shawshank Redemption"  
3. "Fight Club"  
4. "The Godfather"  
5. "Inception"  
6. "The Dark Knight"  
7. "Forrest Gump"  
8. "The Silence of the Lambs"  
9. "Se7en"  
10. "The Matrix"
1. "The Proposition"  
2. "The Hateful Eight"  
3. "No Country for Old Men"  
4. "The Assassination of Jesse James by the Coward Robert Ford"  
5. "The Revenant"  
6. "Django Unchained"  
7. "True Grit"  
8. "Hell or High Water"  
9. "3:10 to Yuma"  
10. "The Pale Rider"
1. "Hereditary"  
2. "Midsommar"  
3. "The Witch"  
4. "Suspiria" (2018)  
5. "The Babadook"  
6. "The Invitation"  
7. "The Witching Hour"  
8. "The Descent"  
9. "Rosemary's Baby"  
10. "The Ritual"
1. "The Descent"  
2. "House of the Devil"  
3. "The Void"  
4. "Martyrs"  
5. "The Witch"  
6. "The Babadook"  
7. "It Follows"  
8. "The Invitation"  
9. "A Quiet Place"  
10. "Hereditary"
1. "The Apartment"  
2. "M*A*S*H"  
3. "Some Like It Hot"  
4. "The Philadelphia Story"  
5. "Tootsie"  
6. "The Odd Couple

In [None]:
# Formatear las respuestas
formatted_outputs_few_shot = []
format_index = 1

for output in outputs_few_shot:
    formatted_output = format_ans(output, 10)
    formatted_outputs_few_shot.append(formatted_output)

print("\n--- Respuestas Formateadas ---")
for fo in formatted_outputs_few_shot:
    print(fo)


--- Respuestas Formateadas ---
['Pulp Fiction', 'The Shawshank Redemption', 'Fight Club', 'The Godfather', 'Inception', 'The Dark Knight', 'Forrest Gump', 'The Silence of the Lambs', 'Se7en', 'The Matrix']
['The Proposition', 'The Hateful Eight', 'No Country for Old Men', 'The Assassination of Jesse James by the Coward Robert Ford', 'The Revenant', 'Django Unchained', 'True Grit', 'Hell or High Water', '3:10 to Yuma', 'The Pale Rider']
['Hereditary', 'Midsommar', 'The Witch', 'Suspiria ', 'The Babadook', 'The Invitation', 'The Witching Hour', 'The Descent', "Rosemary's Baby", 'The Ritual']
['The Descent', 'House of the Devil', 'The Void', 'Martyrs', 'The Witch', 'The Babadook', 'It Follows', 'The Invitation', 'A Quiet Place', 'Hereditary']
['The Apartment', 'M*A*S*H', 'Some Like It Hot', 'The Philadelphia Story', 'Tootsie', 'The Odd Couple', 'His Girl Friday', 'The Birdcage', 'Arsenic and Old Lace', 'The Producers']
['Tropic Thunder', 'Hot Fuzz', 'Monty Python and the Holy Grail', 'Th

In [None]:
# Guardar en archivo de texto
with open('fewshotwithinteractions.txt', 'w', encoding='utf-8') as file:
    for i, output in enumerate(outputs_few_shot):
        file.write(f"Recomendación Few-Shot {i+1}:\n")
        file.write(output + "\n\n")

print(f"\nArchivo 'fewshotwithinteractions.txt' creado con {len(outputs_few_shot)} recomendaciones")


Archivo 'fewshotwithinteractions.txt' creado con 20 recomendaciones


## Evaluación

In [None]:
user_conversation_id = int(user_conversation[:user_conversation.index("User:")-2])
user_data = next((item[rand_user[0]] for item in data if rand_user[0] in item), None)
convs = user_data.get("Conversation", [])
ground_truth = [item_map[m] for m in list(convs[rand_user_conv_id].values())[0]["rec_item"]]


print(user_conversation_id)
print(ground_truth)

2564
['Beyond Borders']


In [None]:
for c in rand_conversations:
  print(c[4])
  ground_truth = [item_map[m] for m in c[4]]
  print(ground_truth)

['B01CPU8ON2']
['Where to Invade Next']
['B01DZQ10M2']
['Criminal 2016']
['B000F3UACU']
['Sick Girl masters Of Horror']
['B00005IC52']
['Black Christmas']
['B0053O8AQE']
['2 Broke Girls: The Complete First Season']
['6302120527']
['Captain Blood VHS']
['B00XRZU2E4']
['Hobbit 3: The Battle of the Five Armies']
['B000FGGE7C']
['Ultraviolet']
['B00005JNT2']
['Man of the House']
['B000WCBVQS']
['Underdog']
['0790729989']
['Seven']
['B000E5KUME']
['Masters of Horror - John Carpenter - Cigarette Burns']
['B00M25EALG']
['Lucy']
['B000WC38B4']
['Element: Yoga for Beginners']
['6300252310']
['Gorgo VHS']
['6301562925']
['Halloween 5 VHS']
['B000NVIGLG']
['Isolation']
['6305599033']
['Tessellations: How to Create Them VHS']
['B016LGTB1A']
['National Treasure']
['B0001AW02A']
['Beyond Borders']


In [None]:
%pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m2.2/3.1 MB[0m [31m71.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [None]:
import numpy as np
import re
from sklearn.metrics import ndcg_score
import html
from rapidfuzz import fuzz
import json

# Funciones de normalización y comparación
def normalizar_titulo(titulo):
    # Decode entidades HTML como &amp;
    titulo = html.unescape(titulo)
    # Minúsculas
    titulo = titulo.lower()
    # Eliminar puntuación excepto letras, números y &
    titulo = re.sub(r"[^a-z0-9& ]+", "", titulo)
    # Eliminar múltiples espacios
    titulo = re.sub(r"\s+", " ", titulo).strip()
    return titulo

def comparar_titulos(t1, t2):
    t1 = normalizar_titulo(t1)
    t2 = normalizar_titulo(t2)
    return fuzz.token_set_ratio(t1, t2)

# Función para extraer nombres de películas de la respuesta del modelo
def extract_movie_names(response_text):
    """
    Extrae los nombres de películas de la respuesta del modelo.
    Maneja diferentes formatos de respuesta.
    """
    movie_names = []

    # Patrón para encontrar líneas numeradas con películas
    # Busca patrones como "1. Movie Name" o "1. "Movie Name""
    pattern = r'\d+\.\s*["\']?([^"\'\n]+?)["\']?(?:\s*[-–—].*)?$'

    lines = response_text.split('\n')
    for line in lines:
        line = line.strip()
        if line and re.match(r'\d+\.', line):
            match = re.match(pattern, line)
            if match:
                movie_name = match.group(1).strip()
                # Limpiar caracteres especiales adicionales
                movie_name = re.sub(r'["\']', '', movie_name)
                movie_names.append(movie_name)

    return movie_names

# Función mejorada para calcular Recall@K usando comparación fuzzy
def calculate_recall_at_k_fuzzy(predicted_items, true_items, k, threshold=80):
    """
    Calcula Recall@K usando comparación fuzzy de títulos
    """
    if len(true_items) == 0:
        return 0.0

    predicted_k = predicted_items[:k]
    hits = 0

    # Para cada item predicho, verificar si coincide con algún ground truth
    for predicted_item in predicted_k:
        for true_item in true_items:
            if comparar_titulos(predicted_item, true_item) > threshold:
                hits += 1
                break  # Una vez que encontramos match, no necesitamos seguir buscando

    # Recall = hits / total_relevant_items
    recall = hits / len(true_items)
    return recall

# Función mejorada para calcular NDCG@K usando comparación fuzzy
def calculate_ndcg_at_k_fuzzy(predicted_items, true_items, k, threshold=80):
    """
    Calcula NDCG@K usando comparación fuzzy de títulos
    """
    if len(true_items) == 0:
        return 0.0

    predicted_k = predicted_items[:k]

    # Crear vector de relevancia usando comparación fuzzy
    relevance_scores = []
    for predicted_item in predicted_k:
        is_relevant = False
        for true_item in true_items:
            if comparar_titulos(predicted_item, true_item) > threshold:
                is_relevant = True
                break
        relevance_scores.append(1 if is_relevant else 0)

    if sum(relevance_scores) == 0:
        return 0.0

    # Calcular NDCG usando sklearn
    relevance_scores = np.array([relevance_scores])
    ideal_relevance = np.array([sorted(relevance_scores[0], reverse=True)])

    ndcg = ndcg_score(ideal_relevance, relevance_scores, k=k)
    return ndcg

# Función principal de evaluación
def evaluate_recommendations(outputs, ground_truths, method_name, threshold=80):
    """
    Evalúa las recomendaciones usando Recall@K y NDCG@K con comparación fuzzy
    """
    results = {
        'recall_5': [],
        'recall_10': [],
        'recall_20': [],
        'ndcg_5': [],
        'ndcg_10': [],
        'ndcg_20': []
    }

    print(f"\n=== Evaluando {method_name} (Threshold: {threshold}) ===")

    for i, (output, ground_truth) in enumerate(zip(outputs, ground_truths)):
        # Extraer nombres de películas de la respuesta
        predicted_movies = extract_movie_names(output)

        print(f"\nConversación {i+1}:")
        print(f"Ground Truth: {ground_truth}")
        print(f"Predicciones: {predicted_movies[:10]}")  # Mostrar solo las primeras 10

        # Verificar matches con threshold para debugging
        matches_found = []
        for pred in predicted_movies[:5]:  # Solo verificar las primeras 5 para debugging
            for gt in ground_truth:
                similarity = comparar_titulos(pred, gt)
                if similarity > threshold:
                    matches_found.append(f"'{pred}' ≈ '{gt}' ({similarity:.1f}%)")
                    break

        if matches_found:
            print(f"Matches encontrados: {matches_found}")
        else:
            print("No se encontraron matches claros")

        # Calcular métricas usando comparación fuzzy
        recall_5 = calculate_recall_at_k_fuzzy(predicted_movies, ground_truth, 5, threshold)
        recall_10 = calculate_recall_at_k_fuzzy(predicted_movies, ground_truth, 10, threshold)
        recall_20 = calculate_recall_at_k_fuzzy(predicted_movies, ground_truth, 20, threshold)

        ndcg_5 = calculate_ndcg_at_k_fuzzy(predicted_movies, ground_truth, 5, threshold)
        ndcg_10 = calculate_ndcg_at_k_fuzzy(predicted_movies, ground_truth, 10, threshold)
        ndcg_20 = calculate_ndcg_at_k_fuzzy(predicted_movies, ground_truth, 20, threshold)

        # Agregar a resultados
        results['recall_5'].append(recall_5)
        results['recall_10'].append(recall_10)
        results['recall_20'].append(recall_20)
        results['ndcg_5'].append(ndcg_5)
        results['ndcg_10'].append(ndcg_10)
        results['ndcg_20'].append(ndcg_20)

        print(f"Recall@5: {recall_5:.4f}, Recall@10: {recall_10:.4f}, Recall@20: {recall_20:.4f}")
        print(f"NDCG@5: {ndcg_5:.4f}, NDCG@10: {ndcg_10:.4f}, NDCG@20: {ndcg_20:.4f}")

    # Calcular promedios
    avg_results = {}
    for metric in results:
        avg_results[f'avg_{metric}'] = np.mean(results[metric])

    print(f"\n=== Resultados Promedio - {method_name} ===")
    print(f"Recall@5: {avg_results['avg_recall_5']:.4f}")
    print(f"Recall@10: {avg_results['avg_recall_10']:.4f}")
    print(f"Recall@20: {avg_results['avg_recall_20']:.4f}")
    print(f"NDCG@5: {avg_results['avg_ndcg_5']:.4f}")
    print(f"NDCG@10: {avg_results['avg_ndcg_10']:.4f}")
    print(f"NDCG@20: {avg_results['avg_ndcg_20']:.4f}")

    return results, avg_results

# Preparar ground truths para evaluación
print("Preparando ground truths...")
ground_truths = []

for i in range(num_test_items):
    # Extraer ground truth de rand_conversations
    ground_truth_items = rand_conversations[i][4]  # Items recomendados como verdad

    # Convertir IDs a nombres de películas usando item_map
    ground_truth_names = [item_map[item_id] for item_id in ground_truth_items if item_id in item_map]
    ground_truths.append(ground_truth_names)

    print(f"Ground Truth {i+1}: {ground_truth_names}")

# Verificar que tenemos la misma cantidad de outputs y ground truths
print(f"\nVerificación de datos:")
print(f"Número de outputs Zero-Shot: {len(outputs_z_s)}")
print(f"Número de outputs Few-Shot: {len(outputs_few_shot)}")
print(f"Número de ground truths: {len(ground_truths)}")

# Verificar que los números coinciden
assert len(outputs_z_s) == len(ground_truths), f"Mismatch: {len(outputs_z_s)} vs {len(ground_truths)}"
assert len(outputs_few_shot) == len(ground_truths), f"Mismatch: {len(outputs_few_shot)} vs {len(ground_truths)}"

# Evaluar con diferentes thresholds para encontrar el óptimo
thresholds = [70, 75, 80, 85]
results_by_threshold = {}

for threshold in thresholds:
    print(f"\n{'='*60}")
    print(f"EVALUACIÓN CON THRESHOLD = {threshold}")
    print(f"{'='*60}")

    # Evaluar Zero-Shot
    zs_results, zs_avg = evaluate_recommendations(outputs_z_s, ground_truths, f"Zero-Shot (T={threshold})", threshold)

    # Evaluar Few-Shot
    fs_results, fs_avg = evaluate_recommendations(outputs_few_shot, ground_truths, f"Few-Shot (T={threshold})", threshold)

    # Guardar resultados
    results_by_threshold[threshold] = {
        'zero_shot': zs_avg,
        'few_shot': fs_avg
    }

    # Comparación para este threshold
    print(f"\n{'='*40}")
    print(f"COMPARACIÓN THRESHOLD = {threshold}")
    print(f"{'='*40}")

    metrics = ['recall_5', 'recall_10', 'recall_20', 'ndcg_5', 'ndcg_10', 'ndcg_20']

    for metric in metrics:
        zs_score = zs_avg[f'avg_{metric}']
        fs_score = fs_avg[f'avg_{metric}']
        improvement = ((fs_score - zs_score) / zs_score * 100) if zs_score > 0 else 0

        print(f"{metric.upper():<12}: ZS={zs_score:.4f} | FS={fs_score:.4f} | Mejora: {improvement:+.2f}%")

# Resumen final con el mejor threshold
print(f"\n{'='*80}")
print("RESUMEN FINAL - TODOS LOS THRESHOLDS")
print(f"{'='*80}")

for threshold in thresholds:
    zs_results = results_by_threshold[threshold]['zero_shot']
    fs_results = results_by_threshold[threshold]['few_shot']

    print(f"\nThreshold {threshold}:")
    print(f"  Zero-Shot: R@10={zs_results['avg_recall_10']:.4f}, NDCG@10={zs_results['avg_ndcg_10']:.4f}")
    print(f"  Few-Shot:  R@10={fs_results['avg_recall_10']:.4f}, NDCG@10={fs_results['avg_ndcg_10']:.4f}")

# Guardar todos los resultados
with open('evaluation_results_fuzzy.json', 'w') as f:
    # Convertir numpy arrays a listas para JSON
    json_results = {}
    for threshold, results in results_by_threshold.items():
        json_results[str(threshold)] = {}
        for method, metrics in results.items():
            json_results[str(threshold)][method] = {k: float(v) for k, v in metrics.items()}

    json.dump(json_results, f, indent=2)

print(f"\nResultados completos guardados en 'evaluation_results_fuzzy.json'")

Preparando ground truths...
Ground Truth 1: ['Where to Invade Next']
Ground Truth 2: ['Criminal 2016']
Ground Truth 3: ['Sick Girl masters Of Horror']
Ground Truth 4: ['Black Christmas']
Ground Truth 5: ['2 Broke Girls: The Complete First Season']
Ground Truth 6: ['Captain Blood VHS']
Ground Truth 7: ['Hobbit 3: The Battle of the Five Armies']
Ground Truth 8: ['Ultraviolet']
Ground Truth 9: ['Man of the House']
Ground Truth 10: ['Underdog']
Ground Truth 11: ['Seven']
Ground Truth 12: ['Masters of Horror - John Carpenter - Cigarette Burns']
Ground Truth 13: ['Lucy']
Ground Truth 14: ['Element: Yoga for Beginners']
Ground Truth 15: ['Gorgo VHS']
Ground Truth 16: ['Halloween 5 VHS']
Ground Truth 17: ['Isolation']
Ground Truth 18: ['Tessellations: How to Create Them VHS']
Ground Truth 19: ['National Treasure']
Ground Truth 20: ['Beyond Borders']

Verificación de datos:
Número de outputs Zero-Shot: 20
Número de outputs Few-Shot: 20
Número de ground truths: 20

EVALUACIÓN CON THRESHOLD = 70


# LLAMADO A LA API

In [None]:
from openai import OpenAI
from google.colab import userdata

# Get the API key from Colab Secrets
openai_api_key = userdata.get('OPENAI_API_KEY')

client = OpenAI(api_key=openai_api_key)

def chat_with_gpt(prompt, model="gpt-4.1-nano"):
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Eres un asistente útil."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=150,
            temperature=0.7
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error: {e}"

# Usar la función
respuesta = chat_with_gpt("Explícame qué es Python")
print(respuesta)

¡Por supuesto! Python es un lenguaje de programación de alto nivel, interpretado y de propósito general. Fue creado por Guido van Rossum y lanzado por primera vez en 1991. Python es conocido por su sintaxis clara y legible, lo que facilita su aprendizaje y uso tanto para principiantes como para programadores experimentados.

Características principales de Python:

1. **Sintaxis sencilla**: Utiliza una sintaxis que favorece la legibilidad del código, con una estructura que permite escribir programas de manera clara y concisa.
2. **Multipropósito**: Se puede usar para desarrollo web, análisis de datos, inteligencia artificial, automatización, scripting, desarrollo de aplicaciones, entre otros.
3. **Gran comunidad y


# Manejo de Uncertainty

In [8]:
from openai import OpenAI
import numpy as np
from collections import Counter
import re
import json
from scipy.stats import entropy
import time
from typing import List, Dict, Any
import os
from google.colab import userdata

# Se obtiene la API key del entorno
openai_api_key = userdata.get('OPENAI_API_KEY')

# Configuración de OpenAI (nueva API)
client = OpenAI(
    api_key=openai_api_key
)

def extract_answer(text, question_text):
    """Extrae la respuesta del texto generado"""
    # Buscar después de "Answer:" o similar
    patterns = [r"Answer:\s*(.+?)(?:\n|$)", r"answer:\s*(.+?)(?:\n|$)", r"Answer is\s*(.+?)(?:\n|$)"]

    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).strip()

    # Si no encuentra patrón, tomar lo que viene después del prompt
    try:
        # Dividir por el texto de la pregunta y tomar la parte después
        parts = text.split("Answer:")
        if len(parts) > 1:
            return parts[-1].strip().split('\n')[0].strip()
    except:
        pass

    return text.strip()

def generate_gpt4_responses(paraphrases: List[str], samples_per_paraphrase: int = 5,
                          model: str = "gpt-4.1-nano", temperature: float = 0.7,
                          max_tokens: int = 50, delay: float = 0.1) -> List[List[str]]:
    """
    Genera respuestas usando GPT-4 con sampling (nueva API de OpenAI)

    Args:
        paraphrases: Lista de paráfrasis de la pregunta
        samples_per_paraphrase: Número de samples por paráfrasis
        model: Modelo de OpenAI a usar
        temperature: Temperatura para sampling
        max_tokens: Máximo número de tokens a generar
        delay: Delay entre llamadas para evitar rate limits

    Returns:
        Lista de listas con las respuestas generadas
    """
    outputs_per_paraphrase = []

    print(f"Generando respuestas con {model}...")

    for i, paraphrase in enumerate(paraphrases):
        print(f"Procesando paráfrasis {i+1}/{len(paraphrases)}: {paraphrase}")

        outputs = []
        ctx = "You are an oracle who only responds with short and concise answers."
        msg = f"Answer the following question: {paraphrase}\nAnswer:"

        for j in range(samples_per_paraphrase):
            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": ctx},
                        {"role": "user", "content": msg}
                    ],
                    max_tokens=max_tokens,
                    temperature=temperature,
                    top_p=0.95,
                    n=1  # Generar una respuesta por llamada
                )

                answer = response.choices[0].message.content.strip()
                outputs.append(answer)

                # Delay para evitar rate limits
                time.sleep(delay)

            except Exception as e:
                print(f"Error en sample {j+1}: {e}")
                outputs.append("")  # Respuesta vacía en caso de error

        outputs_per_paraphrase.append(outputs)

    return outputs_per_paraphrase

def generate_gpt4_with_logprobs(paraphrases: List[str], samples_per_paraphrase: int = 5,
                               model: str = "gpt-4.1-nano", temperature: float = 0.7,
                               max_tokens: int = 50, delay: float = 0.1) -> tuple:
    """
    Genera respuestas con GPT-4 obteniendo log probabilities cuando sea posible (nueva API)

    Nota: Log probabilities solo están disponibles en algunos modelos como gpt-3.5-turbo y gpt-4
    """
    outputs_per_paraphrase = []
    logprobs_per_paraphrase = []

    print(f"Generando respuestas con {model} (intentando obtener logprobs)...")

    for i, paraphrase in enumerate(paraphrases):
        print(f"Procesando paráfrasis {i+1}/{len(paraphrases)}: {paraphrase}")

        outputs = []
        logprobs_list = []
        ctx = "You are an oracle who only responds with short and concise answers."
        msg = f"Answer the following question: {paraphrase}\nAnswer:"

        for j in range(samples_per_paraphrase):
            try:
                # Intentar con logprobs (solo funciona en algunos modelos)
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": ctx},
                        {"role": "user", "content": msg}
                    ],
                    max_tokens=max_tokens,
                    temperature=temperature,
                    top_p=0.95,
                    n=1,
                    logprobs=True,  # Intentar obtener logprobs
                    top_logprobs=5  # Top 5 tokens más probables
                )

                answer = response.choices[0].message.content.strip()
                outputs.append(answer)

                # Extraer logprobs si están disponibles
                if hasattr(response.choices[0], 'logprobs') and response.choices[0].logprobs:
                    logprobs_list.append(response.choices[0].logprobs)
                else:
                    logprobs_list.append(None)

                time.sleep(delay)

            except Exception as e:
                print(f"Error en sample {j+1}: {e}")
                # Si falla con logprobs, intentar sin ellos
                try:
                    response = client.chat.completions.create(
                        model=model,
                        messages=[
                            {"role": "system", "content": ctx},
                            {"role": "user", "content": msg}
                        ],
                        max_tokens=max_tokens,
                        temperature=temperature,
                        top_p=0.95,
                        n=1
                    )
                    answer = response.choices[0].message.content.strip()
                    outputs.append(answer)
                    logprobs_list.append(None)
                    time.sleep(delay)
                except Exception as e2:
                    print(f"Error secundario en sample {j+1}: {e2}")
                    outputs.append("")
                    logprobs_list.append(None)

        outputs_per_paraphrase.append(outputs)
        logprobs_per_paraphrase.append(logprobs_list)

    return outputs_per_paraphrase, logprobs_per_paraphrase

def calculate_uncertainty_metrics_lightweight(outputs_per_paraphrase, paraphrases):
    """
    Calcula métricas de incertidumbre usando Input Clarification Ensembling
    """
    # 1. Extraer respuestas limpias
    all_answers = []
    answers_by_paraphrase = []

    for i, outputs in enumerate(outputs_per_paraphrase):
        paraphrase_answers = []
        for output in outputs:
            answer = extract_answer(output, paraphrases[i])
            paraphrase_answers.append(answer)
            all_answers.append(answer)
        answers_by_paraphrase.append(paraphrase_answers)

    # 2. Calcular frecuencias de respuestas
    answer_counts = Counter(all_answers)
    total_responses = len(all_answers)

    # 3. INCERTIDUMBRE TOTAL (Shannon Entropy de todas las respuestas)
    probs = np.array(list(answer_counts.values())) / total_responses
    total_uncertainty = entropy(probs, base=2)  # bits

    # 4. INCERTIDUMBRE ALEATORIA (promedio de entropías por paráfrasis)
    aleatoric_uncertainties = []
    for answers in answers_by_paraphrase:
        local_counts = Counter(answers)
        local_probs = np.array(list(local_counts.values())) / len(answers)
        if len(local_probs) > 1:
            aleatoric_uncertainties.append(entropy(local_probs, base=2))
        else:
            aleatoric_uncertainties.append(0.0)

    aleatoric_uncertainty = np.mean(aleatoric_uncertainties)

    # 5. INCERTIDUMBRE EPISTÉMICA (diferencia)
    epistemic_uncertainty = total_uncertainty - aleatoric_uncertainty

    # 6. Métricas adicionales
    unique_answers = len(set(all_answers))
    most_common_answer, most_common_count = answer_counts.most_common(1)[0]
    confidence = most_common_count / total_responses

    # 7. Consistencia entre paráfrasis
    consistency_scores = []
    for i in range(len(paraphrases)):
        for j in range(i+1, len(paraphrases)):
            # Comparar respuestas más frecuentes de cada paráfrasis
            answers_i = Counter(answers_by_paraphrase[i])
            answers_j = Counter(answers_by_paraphrase[j])

            most_common_i = answers_i.most_common(1)[0][0] if answers_i else ""
            most_common_j = answers_j.most_common(1)[0][0] if answers_j else ""

            # Similaridad simple (exacta o parcial)
            if most_common_i.lower().strip() == most_common_j.lower().strip():
                consistency_scores.append(1.0)
            else:
                # Similaridad parcial usando tokens comunes
                tokens_i = set(most_common_i.lower().split())
                tokens_j = set(most_common_j.lower().split())
                if tokens_i and tokens_j:
                    jaccard = len(tokens_i.intersection(tokens_j)) / len(tokens_i.union(tokens_j))
                    consistency_scores.append(jaccard)
                else:
                    consistency_scores.append(0.0)

    consistency = np.mean(consistency_scores) if consistency_scores else 0.0

    return {
        'total_uncertainty': float(total_uncertainty),
        'aleatoric_uncertainty': float(aleatoric_uncertainty),
        'epistemic_uncertainty': float(epistemic_uncertainty),
        'confidence': float(confidence),
        'unique_answers': int(unique_answers),
        'most_common_answer': str(most_common_answer),
        'consistency_across_paraphrases': float(consistency),
        'answer_distribution': {str(k): int(v) for k, v in answer_counts.items()},
        'num_paraphrases': len(paraphrases),
        'samples_per_paraphrase': len(outputs_per_paraphrase[0]) if outputs_per_paraphrase else 0
    }

def calculate_logprobs_uncertainty(logprobs_per_paraphrase):
    """
    Calcula incertidumbre basada en log probabilities de OpenAI (nueva API)
    """
    if not logprobs_per_paraphrase or not any(any(lp for lp in paraphrase_logprobs if lp)
                                            for paraphrase_logprobs in logprobs_per_paraphrase):
        return {
            'logprobs_available': False,
            'message': 'No se pudieron obtener log probabilities del modelo'
        }

    uncertainties_per_paraphrase = []

    for paraphrase_logprobs in logprobs_per_paraphrase:
        paraphrase_uncertainties = []

        for logprobs_data in paraphrase_logprobs:
            if logprobs_data and hasattr(logprobs_data, 'content') and logprobs_data.content:
                # Extraer probabilidades del primer token generado
                if len(logprobs_data.content) > 0 and logprobs_data.content[0].top_logprobs:
                    first_token_logprobs = logprobs_data.content[0].top_logprobs

                    # Convertir logprobs a probabilidades
                    logprobs_values = [token.logprob for token in first_token_logprobs]
                    probs = np.exp(logprobs_values)
                    probs = probs / np.sum(probs)  # Normalizar

                    # Calcular entropía
                    uncertainty = entropy(probs, base=2)
                    paraphrase_uncertainties.append(uncertainty)

        if paraphrase_uncertainties:
            uncertainties_per_paraphrase.append(np.mean(paraphrase_uncertainties))
        else:
            uncertainties_per_paraphrase.append(0.0)

    # Calcular estadísticas
    mean_uncertainty = np.mean(uncertainties_per_paraphrase)
    std_uncertainty = np.std(uncertainties_per_paraphrase)

    return {
        'logprobs_available': True,
        'logprob_uncertainties_per_paraphrase': uncertainties_per_paraphrase,
        'mean_logprob_uncertainty': float(mean_uncertainty),
        'std_logprob_uncertainty': float(std_uncertainty)
    }

# FUNCIÓN PRINCIPAL PARA GPT-4 (Nueva API)
def analyze_uncertainty_gpt4(paraphrases: List[str], samples_per_paraphrase: int = 5,
                           model: str = "gpt-4.1-nano", temperature: float = 0.7,
                           include_logprobs: bool = True, delay: float = 0.1):
    """
    Analiza incertidumbre completa para GPT-4 usando la nueva API de OpenAI
    """

    # 1. Generar respuestas
    if include_logprobs:
        try:
            outputs_per_paraphrase, logprobs_per_paraphrase = generate_gpt4_with_logprobs(
                paraphrases, samples_per_paraphrase, model, temperature, delay=delay
            )
        except Exception as e:
            print(f"Error con logprobs, fallback a modo básico: {e}")
            outputs_per_paraphrase = generate_gpt4_responses(
                paraphrases, samples_per_paraphrase, model, temperature, delay=delay
            )
            logprobs_per_paraphrase = None
    else:
        outputs_per_paraphrase = generate_gpt4_responses(
            paraphrases, samples_per_paraphrase, model, temperature, delay=delay
        )
        logprobs_per_paraphrase = None

    print("="*50)
    print("CALCULANDO INCERTIDUMBRE PARA GPT-4.1-nano...")
    print("="*50)

    # 2. Calcular métricas básicas de incertidumbre
    uncertainty_metrics = calculate_uncertainty_metrics_lightweight(outputs_per_paraphrase, paraphrases)

    # 3. Calcular métricas de logprobs si están disponibles
    logprob_metrics = None
    if include_logprobs and logprobs_per_paraphrase:
        logprob_metrics = calculate_logprobs_uncertainty(logprobs_per_paraphrase)

    # 4. Mostrar resultados
    print(f"\n📊 MÉTRICAS DE INCERTIDUMBRE:")
    print(f"├── Incertidumbre Total: {uncertainty_metrics['total_uncertainty']:.3f} bits")
    print(f"├── Incertidumbre Aleatoria: {uncertainty_metrics['aleatoric_uncertainty']:.3f} bits")
    print(f"├── Incertidumbre Epistémica: {uncertainty_metrics['epistemic_uncertainty']:.3f} bits")
    print(f"├── Confianza: {uncertainty_metrics['confidence']:.3f}")
    print(f"├── Respuestas únicas: {uncertainty_metrics['unique_answers']}")
    print(f"├── Consistencia entre paráfrasis: {uncertainty_metrics['consistency_across_paraphrases']:.3f}")
    print(f"└── Respuesta más común: '{uncertainty_metrics['most_common_answer']}'")

    if logprob_metrics and logprob_metrics.get('logprobs_available', False):
        print(f"\n🔢 MÉTRICAS DE LOG PROBABILITIES:")
        print(f"├── Incertidumbre promedio: {logprob_metrics['mean_logprob_uncertainty']:.3f} bits")
        print(f"└── Desviación estándar: {logprob_metrics['std_logprob_uncertainty']:.3f} bits")
    elif logprob_metrics:
        print(f"\n⚠️  LOG PROBABILITIES: {logprob_metrics['message']}")

    # 5. Interpretación
    print(f"\n🧠 INTERPRETACIÓN:")
    if uncertainty_metrics['epistemic_uncertainty'] > uncertainty_metrics['aleatoric_uncertainty']:
        print("├── El modelo tiene más incertidumbre sobre QUÉ responder")
        print("└── → Sugiere falta de conocimiento específico")
    else:
        print("├── El modelo tiene más incertidumbre sobre CÓMO responder")
        print("└── → Sugiere ambigüedad inherente en la pregunta")

    if uncertainty_metrics['consistency_across_paraphrases'] > 0.8:
        print("├── Alta consistencia entre paráfrasis")
    elif uncertainty_metrics['consistency_across_paraphrases'] > 0.5:
        print("├── Consistencia moderada entre paráfrasis")
    else:
        print("├── Baja consistencia - posible confusión del modelo")

    return {
        'uncertainty_metrics': uncertainty_metrics,
        'logprob_metrics': logprob_metrics,
        'model_used': model,
        'analysis_params': {
            'num_paraphrases': len(paraphrases),
            'samples_per_paraphrase': samples_per_paraphrase,
            'temperature': temperature,
            'included_logprobs': include_logprobs
        }
    }

# EJEMPLO DE USO
if __name__ == "__main__":

    # Paráfrasis de la pregunta
    paraphrases = [
        "What year did the Berlin Wall fall?",
        "When did the Berlin Wall come down?",
        "In which year was the Berlin Wall demolished?",
        "What year did the fall of the Berlin Wall occur?",
        "When was the Berlin Wall brought down?",
        "In what year did the Berlin Wall collapse?",
        "What year did they tear down the Berlin Wall?",
        "When did the destruction of the Berlin Wall happen?",
        "In which year did the Berlin Wall get demolished?",
        "What year marked the fall of the Berlin Wall?"
    ]

    # Ejecutar análisis
    results = analyze_uncertainty_gpt4(
        paraphrases=paraphrases,
        samples_per_paraphrase=5,
        model="gpt-4.1-nano",
        temperature=0.7,
        include_logprobs=True,
        delay=0.1  # 100ms entre llamadas
    )

    print("\n" + "="*50)
    print("RESULTADOS FINALES:")
    print("="*50)
    print(json.dumps(results, indent=2, ensure_ascii=False))

Generando respuestas con gpt-4.1-nano (intentando obtener logprobs)...
Procesando paráfrasis 1/10: What year did the Berlin Wall fall?
Procesando paráfrasis 2/10: When did the Berlin Wall come down?
Procesando paráfrasis 3/10: In which year was the Berlin Wall demolished?
Procesando paráfrasis 4/10: What year did the fall of the Berlin Wall occur?
Procesando paráfrasis 5/10: When was the Berlin Wall brought down?
Procesando paráfrasis 6/10: In what year did the Berlin Wall collapse?
Procesando paráfrasis 7/10: What year did they tear down the Berlin Wall?
Procesando paráfrasis 8/10: When did the destruction of the Berlin Wall happen?
Procesando paráfrasis 9/10: In which year did the Berlin Wall get demolished?
Procesando paráfrasis 10/10: What year marked the fall of the Berlin Wall?
CALCULANDO INCERTIDUMBRE PARA GPT-4.1-nano...

📊 MÉTRICAS DE INCERTIDUMBRE:
├── Incertidumbre Total: 1.180 bits
├── Incertidumbre Aleatoria: 0.241 bits
├── Incertidumbre Epistémica: 0.939 bits
├── Confianz