### G-Drive Setup

Este notebook asume que se está ejecutando en Google Colab, y que el dataset `LLM-Redial` disponible en https://drive.google.com/drive/folders/1TIP4PFm9z0C4R4--KnHoWuiB1uK-dv5m se encuentra descargado en el drive del usuario.

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Instalaciones necesarias

In [None]:
%pip install unsloth[colab-new] xformers trl peft accelerate bitsandbytes rapidfuzz



In [None]:
import shutil
import os

source_path = '/content/gdrive/MyDrive/Proyecto LLMonkeys/Tools.py'
destination_path = '/content/Tools.py'

shutil.copy(source_path, destination_path)

'/content/Tools.py'

In [None]:
import zipfile

zip_path = '/content/gdrive/MyDrive/Proyecto LLMonkeys/LLM_Redial.zip'
extract_path = '/content/LLM_Redial'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [None]:
from transformers import AutoTokenizer

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [None]:
# Cargar datos del Dataset

import Tools as t

path = "./LLM_Redial/Movie"
final_data_path = '{}/final_data.jsonl'.format(path)
Conversation_path = '{}/Conversation.txt'.format(path)
user_map_path = '{}/user_ids.json'.format(path)
item_map_path = '{}/item_map.json'.format(path)

final_data = t.read_jsonl(final_data_path)
user_map = t.read_json(user_map_path)
item_map = t.read_json(item_map_path)
Conversation = t.read_dialogue(Conversation_path)

### Para limpiar el entorno y guardar las variables relevantes

In [None]:
import gc
import torch
import dill

def cleanup_for_dill_serialization():
    """
    Limpia todas las variables pesadas que pueden causar problemas con dill
    Mantiene solo los outputs y variables esenciales
    """
    # Variables que debes eliminar ANTES de dill.dump_session()
    variables_to_delete = [
        'model', 'base_model', 'config', 'pipe',

        # Tensores y objetos de PyTorch
        'inputs', 'output_ids', 'outputs',

        # Variables temporales del loop
        'decoded', 'prompt', 'messages', 'ctx', 'msg',

        # Indices y variables de control
        'i', 'n', 'k',
    ]

    # Lista de variables que SÍ quieres mantener
    variables_to_keep = [
        'item_map',
        'Conversation',
        'model_name',
        'output_r'
        'outputs_mp'
        'outputs_z_s',
        'outputs_f_s',
        'outputs_ft_s',
        'outputs_z_n',
        'outputs_f_n',
        'outputs_ft_n',
        'rand_conversations',
        'num_test_items',
        'train_conv',
        'test_conv',
        'few_shot_users',
        'num_test_items',
        'few_shot_data'
    ]

    # Obtener todas las variables globales
    global_vars = list(globals().keys())

    # Eliminar variables específicamente problemáticas
    for var_name in variables_to_delete:
        if var_name in globals():
            print(f"   ├── Eliminando: {var_name}")
            try:
                del globals()[var_name]
            except:
                print(f"No se pudo eliminar {var_name}")

    vars_to_remove = []
    for var_name in global_vars:
        if var_name.startswith('_'):  # Variables privadas
            continue

        if var_name in variables_to_keep:  # No eliminar variables importantes
            continue

        try:
            var_obj = globals().get(var_name)
            var_type = str(type(var_obj))

            # Detectar objetos problemáticos
            problematic_types = [
                'transformers',
                'peft',
                'torch.nn',
                'pipeline',
                'PreTrainedModel',
                'PreTrainedTokenizer',
                'PeftModel',
                'Tensor'
            ]

            if any(prob_type in var_type for prob_type in problematic_types):
                vars_to_remove.append(var_name)

        except Exception as e:
            vars_to_remove.append(var_name)

    # Eliminar variables problemáticas detectadas
    for var_name in vars_to_remove:
        print(f"   ├── Eliminando: {var_name}")
        try:
            del globals()[var_name]
        except:
            print(f"No se pudo eliminar: {var_name}")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    gc.collect()

    # Verificar tamaño de variables mantenidas
    print("\n📊 Variables mantenidas:")
    for var_name in variables_to_keep:
        if var_name in globals():
            var_obj = globals()[var_name]
            try:
                if hasattr(var_obj, '__len__'):
                    print(f"   ├── {var_name}: {len(var_obj)} elementos")
                else:
                    print(f"   ├── {var_name}: {type(var_obj)}")
            except:
                print(f"   ├── {var_name}: (no se puede medir)")

    return True

# FUNCIÓN PRINCIPAL PARA TU CASO
def cleanup_after_lora_generation():
    """
    Limpieza específica después de generar con modelo LoRA
    """
    print("🎯 Limpieza específica para modelo LoRA...")

    # Variables específicas de tu código LoRA
    lora_specific_vars = [
        'peft_model_path',
        'config',           # PeftConfig
        'base_model',       # Modelo base
        'model',           # PeftModel final
        'tokenizer',       # Tokenizer
        'pipe',           # Pipeline
        'inputs',          # Tensors de input
        'output_ids',      # Tensors de output IDs
        'prompt',          # Prompt generado
        'messages',        # Mensajes del chat template
        'ctx',            # Context string
        'msg',            # Message string
        'decoded',        # String decodificado
        'outputs',        # Lista temporal (no outputs_ft_s)
    ]

    for var_name in lora_specific_vars:
        if var_name in globals():
            print(f"   ├── Eliminando: {var_name}")
            try:
                del globals()[var_name]
            except Exception as e:
                print(f"   │   └── Error: {e}")

    # Limpieza general
    cleanup_for_dill_serialization()

def limpiar_y_guardar():
  cleanup_after_lora_generation()
  path = '/content/gdrive/MyDrive/Proyecto LLMonkeys/sessions/TinyLlama/TinyLlama_notebook_env.db'
  with open(path, 'wb') as f:
      dill.dump_session(f)

### Para retornar la respuesta del modelo como una lista de nombres

In [None]:
import re
def format_ans(ans, n):
    answer = "None"
    try:
        answer_list = []
        current_pos = 0

        for j in range(1, n + 1):
            # Busca el patrón del número y el inicio del texto
            patterns = [
                f"{j}. ",
                f"Movie {j}: ",
                f"Movie name {j}: ",
                f"Movie Name {j}: ",
                f"[{j}] ",
            ]
            for pattern in patterns:
                pattern_start = pattern
                start_index = ans.find(pattern_start, current_pos)
                if start_index != -1:
                    break

            if start_index == -1:
                break  # Si no se encuentra el número, salimos del bucle

            start_text = start_index + len(pattern_start)

            # Busca el final del texto: el inicio del siguiente número O un " - "
            next_number_start = ans.find(f"{j + 1}. ", start_text)
            dash_start = ans.find(" - ", start_text)

            end_text = -1

            # Determina el final del texto basado en la primera ocurrencia
            if next_number_start != -1 and dash_start != -1:
                end_text = min(next_number_start, dash_start)
            elif next_number_start != -1:
                end_text = next_number_start
            elif dash_start != -1:
                end_text = dash_start
            else:
                # Si no hay siguiente número ni " - ", toma hasta el final de la línea o la cadena
                end_line = ans.find("\n", start_text)
                if end_line != -1:
                    end_text = end_line
                else:
                    end_text = len(ans)

            if end_text == -1: # Si no se encontró un final válido, toma hasta el final de la cadena
                end_text = len(ans)

            movie_name = ans[start_text:end_text].strip()
            answer_list.append(movie_name)
            current_pos = end_text # Actualiza la posición para la próxima búsqueda

        if not answer_list: # Si no se encontraron películas
             return "Formato de respuesta incorrecto", i

    except Exception as e:
        answer = f"Error: {e}"

    finally:
        pattern = r"\(\d{4}\)"
        answer_list = [m.replace('\"', '') for m in answer_list]
        answer_list = [re.sub(pattern, '', m).strip() for m in answer_list]
        return answer_list

### Para guardar y cargar las respuestas del modelo

In [None]:
import json

def guardar_datos_json(lista_de_listas, nombre_archivo):
  """
  Guarda una lista de listas en un archivo de texto en formato JSON.

  Args:
    lista_de_listas: La lista de listas a guardar.
    nombre_archivo: El nombre del archivo donde se guardará.
  """
  try:
    with open(nombre_archivo, 'w', encoding='utf-8') as f:
      json.dump(lista_de_listas, f, ensure_ascii=False, indent=4)
    print(f"Outputs guardados exitosamente en '{nombre_archivo}'")
  except Exception as e:
    print(f"Error al guardar outputs: {e}")

def cargar_datos_json(nombre_archivo):
  """
  Carga una lista de listas desde un archivo de texto en formato JSON.

  Args:
    nombre_archivo: El nombre del archivo desde donde se cargará.

  Returns:
    La lista de listas cargada, o None si hay un error.
  """
  try:
    with open(nombre_archivo, 'r', encoding='utf-8') as f:
      lista_de_listas = json.load(f)
    print(f"Outputs cargados exitosamente desde '{nombre_archivo}'")
    return lista_de_listas
  except FileNotFoundError:
    print(f"Error: El archivo '{nombre_archivo}' no fue encontrado.")
    return None
  except json.JSONDecodeError:
    print(f"Error: El archivo '{nombre_archivo}' no es un JSON válido.")
    return None
  except Exception as e:
    print(f"Error al cargar outputs: {e}")
    return None

### Cargar las conversaciones a utilizar para el entrenamiento, validación y testeo

In [None]:
def load_conversations(conv_path):

    all_conversations = {}
    current_conv_id = None
    current_conv_lines = []

    with open(conv_path, 'r', encoding='utf-8') as file:
        for line in file:
            line_stripped = line.strip()

            # Si es un número (ID de conversación)
            if line_stripped.isdigit():
                # Guardar conversación anterior si existe
                if current_conv_id is not None:
                    all_conversations[current_conv_id] = "".join(current_conv_lines[2:])

                # Iniciar nueva conversación
                current_conv_id = int(line_stripped)
                current_conv_lines = [line]
            else:
                # Agregar línea a la conversación actual
                if current_conv_id is not None:
                    current_conv_lines.append(line)

    if current_conv_id is not None:
        all_conversations[current_conv_id] = current_conv_lines


    all_conversations[len(all_conversations)-1] = "".join(all_conversations[len(all_conversations)-1][2:])

    return all_conversations

all_conversations = load_conversations(Conversation_path)
n_conversations = len(all_conversations)
print(n_conversations)

### Separación de diálogos en train, test y val, cuidando que todas las conversaciones de un usuario en particular se encuentren en sólo train o sólo en test


In [None]:
import random
import json

path = './LLM_Redial/Movie/final_data.jsonl'

# Cada entrada de final_data.jsonl se ve así
#{
#  "A30Q8X8B1S3GGT": {
#    "history_interaction": [...],
#    "user_might_like": [...],
#    "Conversation": [...]
#  }
#}

with open(path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file]

train_len = n_conversations * 0.8
test_val_len = n_conversations * 0.1

train_conv = []
test_conv = []
val_conv = []
used_users = []

user_data_map = {user_id: user_info for item in data for user_id, user_info in item.items()}

aux = []

while len(aux) < test_val_len:
    try:
        convs = []
        random.seed(42)

        # Usando used_users aseguramos que un mismo usuario no aparezca en Train y Test al mismo tiempo
        user_id = random.choice(list(set(user_map.keys())^set(used_users)))
        used_users.append(user_id)

        # Guardamos la info del usuario y sus dialogos
        user_data = user_data_map.get(user_id)
        user_conversations = user_data.get("Conversation", [])

        for i in range(len(user_conversations)):
            selected_conversation = user_conversations[i]
            conversation_details = list(selected_conversation.values())[0]
            conversation_id = conversation_details["conversation_id"]
            conversation = all_conversations[conversation_id]

            convs.append(conversation)
            aux.append(conversation)
        test_conv.append([user_id,convs])
    except ValueError:
        print("ValueError: ", user_id, f"{conversation_id}\n")

aux = []

while len(aux) < test_val_len:
    try:
        convs = []
        random.seed(42)
        user_id = random.choice(list(set(user_map.keys())^set(used_users)))
        used_users.append(user_id)
        user_data = user_data_map.get(user_id)
        user_conversations = user_data.get("Conversation", [])
        for i in range(len(user_conversations)):
            selected_conversation = user_conversations[i]
            conversation_details = list(selected_conversation.values())[0]
            conversation_id = conversation_details["conversation_id"]
            conversation = all_conversations[conversation_id]

            convs.append(conversation)
            aux.append(conversation)
        val_conv.append([user_id,convs])
    except ValueError:
        print("ValueError: ", user_id, f"{conversation_id}\n")

for user_id in list(set(user_map.keys()) ^ set(used_users)):
    try:
        convs = []
        user_data = user_data_map.get(user_id)
        user_conversations = user_data.get("Conversation", [])
        for i in range(len(user_conversations)):
            selected_conversation = user_conversations[i]
            conversation_details = list(selected_conversation.values())[0]
            conversation_id = conversation_details["conversation_id"]
            conversation = all_conversations[conversation_id]

            convs.append(conversation)
        train_conv.append([user_id,convs])
    except ValueError:
        print("ValueError: ", user_id, f"{conversation_id}\n")

print(len(test_conv), len(val_conv), len(train_conv), len(test_conv) + len(train_conv) + len(val_conv), n_conversations)

316 289 2526 3131 10089


### Elegimos si generamos respuestas nuevas o si cargamos datos ya generados

In [None]:
eleccion = "1"
while eleccion not in ["1", "2"]:
  eleccion = input("Generar respuestas nuevas (1) o cargar respuestas anteriores (2)? ")
  if eleccion not in ["1", "2"]:
    print("Opción inválida. Por favor, elige 1 o 2.\n")

### Selección aleatoria de diálogos para testear

In [None]:
num_test_items = 100

In [None]:
import random
def extraer_dialogos(conversations, num_conversations):
  rand_conversations = []
  all = []
  used_convs = []
  for conv in conversations:
    all.extend([f"{conv[0]}:::{c}" for c in conv[1]])

  while len(rand_conversations) < num_conversations:

    rand_user = random.choice(list(set(all)^set(used_convs)))
    used_convs.append(rand_user)
    rand_user = rand_user.split(":::")
    user_id = rand_user[0]
    user_conversation  = rand_user[1]
    user_data = user_data_map[user_id]
    convs = user_data.get("Conversation", [])
    for i in range(len(convs)):
      conv_details = list(convs[i].values())[0]
      conversation = all_conversations[conv_details["conversation_id"]]
      # print(conversation)
      # print(user_conversation)
      if user_conversation == conversation:
        rand_user_conv_id = i
    dialog = "\n\n".join(user_conversation.split("\n\n")[:3])
    dialog_id = list(convs[rand_user_conv_id].values())[0]["conversation_id"]
    dialog_ground_truth = list(convs[rand_user_conv_id].values())[0]["rec_item"]
    rand_user_interactions = user_data.get("history_interaction", [])
    rand_user_interactions = [item_map[m] for m in rand_user_interactions]
    rand_conversations.append([
        rand_user_conv_id,      # index 0
        dialog,                 # index 1
        user_data,              # index 2
        dialog_id,              # index 3
        dialog_ground_truth,    # index 4
        rand_user_interactions  # index 5
    ])
  return rand_conversations

In [None]:
if eleccion == "1":
  rand_conversations = extraer_dialogos(test_conv, num_test_items)
  path = "/content/gdrive/MyDrive/Proyecto LLMonkeys/outputs/TinyLlama/rand_conversations.json"
  guardar_datos_json(rand_conversations, path)
elif eleccion == "2":
  path = "/content/gdrive/MyDrive/Proyecto LLMonkeys/outputs/TinyLlama/rand_conversations.json"
  rand_conversations = cargar_datos_json(path)

Outputs guardados exitosamente en '/content/gdrive/MyDrive/Proyecto LLMonkeys/outputs/TinyLlama/rand_conversations.json'


### Seleccionamos aleatoriamente los datos de Few shot desde train

In [None]:
# ESTO QUEDA POR ARREGLAR, PASAR DATOS FEW-SHOT CON EL MISMO FORMATO DEL INPUT-RESPUESTA

few_shot_data = []
random.seed(42)

sorted_train_conv = []
for user, texts in train_conv:
  ordered_text = sorted(texts, key=len)
  sorted_train_conv.append([user, ordered_text])

sorted_train_conv.sort(key=lambda x: len(x[1][0]))

few_shot_users = random.sample(sorted_train_conv[:500], 5)

for u in few_shot_users:
  user_data = next((item[u[0]] for item in data if u[0] in item), None)
  user_interactions = user_data.get("history_interaction", [])
  user_interactions = [item_map[m] for m in user_interactions]

  conversation = min(u[1], key=len)
  conversation[conversation.index("User:"):-2]
  convs = user_data.get("Conversation", [])
  for c in convs:
    user_likes = list(c.values())[0]["user_likes"]
    user_dislikes = list(c.values())[0]["user_dislikes"]
    recs= list(c.values())[0]["rec_item"]

    user_likes = [item_map[m] for m in user_likes]
    user_dislikes = [item_map[m] for m in user_dislikes]
    recs = [item_map[m] for m in recs]
  few_shot_data.append({
      # "user_interactions": user_interactions,
      "conversation": conversation,
      "user_likes": user_likes,
      "user_dislikes": user_dislikes,
      "recs": recs
  })

In [None]:
def contar_tokens(texto):
  print(len(tokenizer.encode(texto)))

# Generación de 20 listas de recomendación de 10 películas

### Zero-Shot con interacciones históricas:

In [1]:
import torch
from transformers import pipeline
import random
import time

if eleccion == "1":

  # MAJOR OPTIMIZATION: Initialize pipeline ONCE outside the loop
  pipe = pipeline(
      "text-generation",
      model=model_name,
      device_map="auto",
      torch_dtype=torch.float16,
  )

  # Pre-compile the base context to avoid string concatenation in loop
  base_ctx = (
      "You are a Movie Recommendation System."
      "Generate a numbered list of 10 Movies. \n"
      "RULES: \n"
      "a) DO NOT write dialogs, explanations nor additional text or information. \n"
      "b) DO NOT recommend movies already mentioned in the conversation. \n"
      "c) You MUST recommend 10 movies, nothing more, nothing less. \n"
      "d) The movies MUST be numbered from 1 to 10, with one movie name per line. \n"
      "\nFailure to follow the rules will result in incorrect output and be discarded by the system."
  )

  outputs_z_s = []
  k = 20

  # Generation parameters (moved outside loop)
  gen_kwargs = {
      "max_new_tokens": 200,
      "do_sample": True,
      "temperature": 0.7,
      "top_k": 50,
      "top_p": 0.95,
      "pad_token_id": pipe.tokenizer.eos_token_id  # Prevents warnings
  }
  total_start_time = time.time()
  for n in range(num_test_items):
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
      if n%5 == 0:
        start_time = time.time()
      print(f"Generating for test item {n+1}...")
      outputs = []

      # Build message once per test item
      msg = (
          "\nBased on the following conversation: \n"
          f"{rand_conversations[n][1]} \n\n"
          "And the movies the user has previously interacted with: \n"
          f"{random.sample(rand_conversations[n][5], min(10, len(rand_conversations[n][5])))}\n\n"
          "Generate a list of 10 recommended movies (JUST NAMES, ONE PER LINE):"
      )

      messages = [{"role": "user", "content": base_ctx + msg}]
      prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

      # Tokenize once per test item, not per generation
      inputs = pipe.tokenizer(prompt, return_tensors="pt").to("cuda")

      # Batch generation option (much faster if memory allows)
      with torch.no_grad():
          iter_start_time = time.time()
          output_ids = pipe.model.generate(
              **inputs,
              num_return_sequences=k,
              **gen_kwargs
          )
          for output_id in output_ids:
              decoded = pipe.tokenizer.decode(output_id, skip_special_tokens=True)
              outputs.append(decoded)
          # print(f"Iteration {i} of test item {n+1} generated in {time.time() - iter_start_time} seconds.")

      outputs_z_s.append(outputs)
      if (n+1)%5 == 0:
        print(f"Test items {n-3}-{n+1} generated in {(time.time() - start_time):.3f} seconds.\n")
  total = time.time() - total_start_time
  print(f"Total generation time: {int(total//60)} minutes and {int(total - total//60*60)} seconds.\n")
  path = "/content/gdrive/MyDrive/Proyecto LLMonkeys/outputs/TinyLlama/outputs_zero_shot.json"
  guardar_datos_json(outputs_z_s, path)

elif eleccion == "2":
  path = "/content/gdrive/MyDrive/Proyecto LLMonkeys/outputs/TinyLlama/outputs_zero_shot.json"
  outputs_z_s = cargar_datos_json(path)


NameError: name 'eleccion' is not defined

### Few-Shot con interacción histórica

In [None]:
import torch
from transformers import pipeline
import random
import time

if eleccion == "1":
  # MAJOR OPTIMIZATION: Initialize pipeline ONCE outside the loop
  pipe = pipeline(
      "text-generation",
      model=model_name,
      device_map="auto",
      torch_dtype=torch.float16,
  )

  # Pre-compile the base context to avoid string concatenation in loop
  base_ctx = (
      "You are a Movie Recommendation System."
      "Generate a numbered list of 10 Movies. \n"
      "RULES: \n"
      "a) DO NOT write dialogs, explanations nor additional text or information. \n"
      "b) DO NOT recommend movies already mentioned in the conversation. \n"
      "c) You MUST recommend 10 movies. \n"
      "d) The movies MUST be numbered from 1 to 10, with one movie name per line. \n"
      "\nFailure to follow the rules will result in incorrect output and be discarded by the system."
  )

  outputs_f_s = []
  k = 20

  # Generation parameters (moved outside loop)
  gen_kwargs = {
      "max_new_tokens": 180,
      "do_sample": True,
      "temperature": 0.7,
      "top_k": 50,
      "top_p": 0.95,
      "pad_token_id": pipe.tokenizer.eos_token_id  # Prevents warnings
  }
  total_start_time = time.time()
  for n in range(num_test_items):
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
      if n%5 == 0:
        start_time = time.time()
      print(f"Generating for test item {n+1}...")
      outputs = []

      # Build message once per test item
      msg = (
        "\nBased on these 4 examples: \n"
        f"{few_shot_data[0]}\n"
        f"{few_shot_data[1]}\n"
        f"{few_shot_data[2]}\n"
        f"{few_shot_data[3]}\n\n"
        "And based on the following conversation: \n"
        f"{rand_conversations[n][1]} \n\n"
        "And the movies the user has previously interacted with: \n"
        f"{random.sample(rand_conversations[n][5], min(10, len(rand_conversations[n][5])))}\n\n"
        "Generate a list of 10 recommended movies (JUST NAMES, ONE PER LINE):"
    )

      messages = [{"role": "user", "content": base_ctx + msg}]
      prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

      # Tokenize once per test item, not per generation
      inputs = pipe.tokenizer(prompt, return_tensors="pt").to("cuda")

      # Batch generation option (much faster if memory allows)
      with torch.no_grad():
          iter_start_time = time.time()
          output_ids = pipe.model.generate(
              **inputs,
              num_return_sequences=k,
              **gen_kwargs
          )
          for output_id in output_ids:
              decoded = pipe.tokenizer.decode(output_id, skip_special_tokens=True)
              outputs.append(decoded)
          # print(f"Iteration {i} of test item {n+1} generated in {time.time() - iter_start_time} seconds.")

      outputs_f_s.append(outputs)
      if (n+1)%5 == 0:
        print(f"Test items {n-3}-{n+1} generated in {(time.time() - start_time):.3f} seconds.\n")
  total = time.time() - total_start_time
  print(f"Total generation time: {int(total//60)} minutes and {int(total - total//60*60)} seconds.\n")
  path = "/content/gdrive/MyDrive/Proyecto LLMonkeys/outputs/TinyLlama/outputs_few_shot.json"
  guardar_datos_json(outputs_f_s, path)

elif eleccion == "2":
  path = "/content/gdrive/MyDrive/Proyecto LLMonkeys/outputs/TinyLlama/outputs_few_shot.json"
  outputs_f_s = cargar_datos_json(path)

Device set to use cuda:0


Generating for test item 1...
Generating for test item 2...
Generating for test item 3...
Generating for test item 4...
Generating for test item 5...
Test items 1-5 generated in 73.995 seconds.

Generating for test item 6...
Generating for test item 7...
Generating for test item 8...
Generating for test item 9...
Generating for test item 10...
Test items 6-10 generated in 74.225 seconds.

Generating for test item 11...
Generating for test item 12...
Generating for test item 13...
Generating for test item 14...
Generating for test item 15...
Test items 11-15 generated in 73.807 seconds.

Generating for test item 16...
Generating for test item 17...
Generating for test item 18...
Generating for test item 19...
Generating for test item 20...
Test items 16-20 generated in 76.282 seconds.

Generating for test item 21...
Generating for test item 22...
Generating for test item 23...
Generating for test item 24...
Generating for test item 25...
Test items 21-25 generated in 74.038 seconds.

Ge

### Fine-Tuning

In [None]:
from huggingface_hub import login
login(token="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")

In [None]:
movie_likes = {}
for u in train_conv:
  user = u[0]
  user_data = user_data_map[user]
  convs = user_data.get("Conversation", [])
  for c in convs:
    user_likes = list(c.values())[0]["user_likes"]
    for m in user_likes:
      try:
        movie_likes[m] += 1
      except:
        movie_likes[m] = 1

sorted_movie_likes = dict(sorted(movie_likes.items(), key=lambda item: item[1], reverse=True))
top_20_movies = dict(list(sorted_movie_likes.items())[:20])
top_20_movie_names = [item_map[m] for m in list(top_20_movies.keys())]

In [None]:
import random
def preparar_datos_fine_tuning(rand_conversations):

    fine_tune_data = []

    base_ctx = (
        "You are a Movie Recommendation System."
        "Generate a numbered list of 10 Movies. \n"
        "RULES: \n"
        "a) DO NOT write dialogs, explanations nor additional text or information. \n"
        "b) DO NOT recommend movies already mentioned in the conversation. \n"
        "c) You MUST recommend 10 movies, nothing more, nothing less. \n"
        "d) The movies MUST be numbered from 1 to 10, with one movie name per line. \n"
        "\nFailure to follow the rules will result in incorrect output and be discarded by the system."
    )

    for conversation_data in rand_conversations:
      msg = (
          "\nBased on the following conversation: \n"
          f"{conversation_data[1]} \n\n"
          "Generate a list of 10 recommended movies (JUST NAMES, ONE PER LINE):"
      )

      input_text = base_ctx + msg

      top_10_recommendations = []

      top_10_recommendations.append(item_map[conversation_data[4][0]]) # ground truth numero 1

      user_convs = list(conversation_data[2].get("Conversation", []))
      user_conv_id = conversation_data[0]
      user_likes = list(user_convs[user_conv_id].values())[0]["user_likes"]
      user_likes = [item_map[m] for m in user_likes]

      # agregamos 3 películas que le gustan al usuario que no se mencionan en el extracto
      for m in user_likes:
        if m not in conversation_data[1] and len(top_10_recommendations) < 4:
          top_10_recommendations.append(m)

      # agregamos 4 películas de sus interacciones
      interactions = random.sample(conversation_data[5], min(10, len(conversation_data[5])))
      top_10_recommendations.extend(interactions[:4])

      # rellenamos con películas al azar de las 20 más populares
      while len(top_10_recommendations) < 10:
        random_movie = random.choice(top_20_movie_names)
        if random_movie not in top_10_recommendations:
          top_10_recommendations.append(random_movie)

      output_text = "\n".join([f"{i+1}. {movie}" for i, movie in enumerate(top_10_recommendations)])
      fine_tune_data.append({"text": f"""{input_text}{output_text}<|endoftext|>"""})

    return fine_tune_data


In [None]:
entrenar = "2"
while entrenar not in ["1", "2"]:
  entrenar = input("Entrenar desde cero (1) o cargar modelo pre-entrenado (2)? ")

  if entrenar not in ["1", "2"]:
    print("Opción inválida. Por favor, elige 1 o 2.\n")

In [None]:
if entrenar == "1":
  fine_tune_convs = extraer_dialogos(train_conv, 4000)
  fine_tune_convs_val = extraer_dialogos(val_conv, min(1000, len(val_conv)))

In [None]:
if entrenar == "1":
  fine_tune_data = preparar_datos_fine_tuning(fine_tune_convs)
  fine_tune_data_val = preparar_datos_fine_tuning(fine_tune_convs_val)

In [None]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset

if entrenar == "1":
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
      max_seq_length = 2048,
      dtype = None,
      load_in_4bit = True,
      trust_remote_code = True,
  )

  model = FastLanguageModel.get_peft_model(
      model,
      r = 16,
      target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
      lora_alpha = 16,
      lora_dropout = 0.1,
      bias = "none",
      use_gradient_checkpointing = True,
      random_state = 1234,
  )


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import Dataset

if entrenar == "1":
  train_dataset = Dataset.from_list(fine_tune_data)
  val_dataset = Dataset.from_list(fine_tune_data_val)

  data_collator = DataCollatorForLanguageModeling(
      tokenizer=tokenizer,
      mlm=False,  # False para modelos como TinyLlama (causal LM)
      pad_to_multiple_of=8,  # Optimización para tensor cores
      return_tensors="pt"
  )

  training_args = TrainingArguments(
      output_dir="/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-3",
      per_device_train_batch_size=4,        # batch por GPU
      gradient_accumulation_steps=8,        # acumular gradientes para simular un batch mayor
      warmup_steps = 50,
      max_steps = 500,                    # ~2-3 epochs para 4000 ejemplos
      learning_rate = 2e-4,               # Relativamente alto para LoRA
      bf16 = True,                        # Crucial para eficiencia
      logging_steps = 25,
      optim = "adamw_8bit",               # Optimizador eficiente
      weight_decay = 0.01,
      lr_scheduler_type = "cosine",
      seed = 1234,
      save_steps = 100,
      eval_steps = 100,
      eval_strategy = "steps",
      load_best_model_at_end = True,
      metric_for_best_model = "eval_loss",
      greater_is_better = False,
  )

  trainer = SFTTrainer(
      model = model,
      tokenizer = tokenizer,
      train_dataset = train_dataset,
      eval_dataset = val_dataset,
      data_collator=data_collator,
      dataset_text_field = "text",
      max_seq_length = 2048,
      dataset_num_proc = 2,
      args = training_args,
  )

In [None]:
if entrenar == "1":
  trainer.train()

  model.save_pretrained("/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-3")
  tokenizer.save_pretrained("/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-3")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
from unsloth import FastLanguageModel
import torch

ft_model_path = "/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-3"

ft_model, ft_tokenizer = FastLanguageModel.from_pretrained(
    model_name = ft_model_path,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(ft_model)

==((====))==  Unsloth 2025.6.2: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/762M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

unsloth/tinyllama-chat-bnb-4bit does not have a padding token! Will use pad_token = <unk>.


Unsloth 2025.6.2 patched 22 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048, padding_idx=0)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              

In [None]:
from transformers import pipeline

if eleccion == "1":

  # Pre-compile the base context to avoid string concatenation in loop
  base_ctx = (
      "You are a Movie Recommendation System."
      "Generate a numbered list of 10 Movies. \n"
      "RULES: \n"
      "a) DO NOT write dialogs, explanations nor additional text or information. \n"
      "b) DO NOT recommend movies already mentioned in the conversation. \n"
      "c) You MUST recommend 10 movies, nothing more, nothing less. \n"
      "d) The movies MUST be numbered from 1 to 10, with one movie name per line. \n"
      "\nFailure to follow the rules will result in incorrect output and be discarded by the system."
  )

  outputs_ft_s = []
  k = 20

  # Generation parameters (moved outside loop)
  gen_kwargs = {
      "max_new_tokens": 200,
      "do_sample": True,
      "temperature": 0.7,
      "top_k": 50,
      "top_p": 0.95,
      "pad_token_id": ft_tokenizer.eos_token_id,
      "eos_token_id": ft_tokenizer.eos_token_id
    }

  total_start_time = time.time()
  for n in range(num_test_items):
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
      if n%5 == 0:
        start_time = time.time()
      print(f"Generating for test item {n+1}...")
      outputs = []

      # Build message once per test item
      msg = (
          "\nBased on the following conversation: \n"
          f"{rand_conversations[n][1]} \n\n"
          # "And the movies the user has previously interacted with: \n"
          # f"{random.sample(rand_conversations[n][5], min(10, len(rand_conversations[n][5])))}\n\n"
          "Generate a list of 10 recommended movies (JUST NAMES, ONE PER LINE):"
      )

      messages = [{"role": "user", "content": base_ctx + msg}]
      prompt = ft_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

      # Tokenize once per test item, not per generation
      inputs = ft_tokenizer(prompt, return_tensors="pt").to("cuda")

      # Batch generation option (much faster if memory allows)
      with torch.no_grad():
          iter_start_time = time.time()
          output_ids = ft_model.generate(
              **inputs,
              num_return_sequences=k,
              **gen_kwargs
          )
          for output_id in output_ids:
              decoded = ft_tokenizer.decode(output_id, skip_special_tokens=True)
              outputs.append(decoded)
          # print(f"Iteration {i} of test item {n+1} generated in {time.time() - iter_start_time} seconds.")

      outputs_ft_s.append(outputs)
      if (n+1)%5 == 0:
        print(f"Test items {n-3}-{n+1} generated in {(time.time() - start_time):.3f} seconds.\n")
  total = time.time() - total_start_time
  print(f"Total generation time: {int(total//60)} minutes and {int(total - total//60*60)} seconds.\n")
  path = "/content/gdrive/MyDrive/Proyecto LLMonkeys/outputs/TinyLlama/outputs_fine_tuned.json"
  guardar_datos_json(outputs_ft_s, path)

elif eleccion == "2":
  path = "/content/gdrive/MyDrive/Proyecto LLMonkeys/outputs/TinyLlama/outputs_fine_tuned.json"
  outputs_ft_s = cargar_datos_json(path)


Generating for test item 1...
Generating for test item 2...
Generating for test item 3...
Generating for test item 4...
Generating for test item 5...
Test items 1-5 generated in 84.408 seconds.

Generating for test item 6...
Generating for test item 7...
Generating for test item 8...
Generating for test item 9...
Generating for test item 10...
Test items 6-10 generated in 80.938 seconds.

Generating for test item 11...
Generating for test item 12...
Generating for test item 13...
Generating for test item 14...
Generating for test item 15...
Test items 11-15 generated in 80.824 seconds.

Generating for test item 16...
Generating for test item 17...
Generating for test item 18...
Generating for test item 19...
Generating for test item 20...
Test items 16-20 generated in 85.460 seconds.

Generating for test item 21...
Generating for test item 22...
Generating for test item 23...
Generating for test item 24...
Generating for test item 25...
Test items 21-25 generated in 80.751 seconds.

Ge

## Evaluación de los modelos:

In [None]:
import numpy as np
import re
from sklearn.metrics import ndcg_score
import re
import html
from rapidfuzz import fuzz

def normalizar_titulo(titulo):
    # Decode entidades HTML como &amp;
    titulo = html.unescape(titulo)
    # Minúsculas
    titulo = titulo.lower()
    # Eliminar puntuación excepto letras, números y &
    titulo = re.sub(r"[^a-z0-9& ]+", "", titulo)
    # Eliminar múltiples espacios
    titulo = re.sub(r"\s+", " ", titulo).strip()
    return titulo

def comparar_titulos(t1, t2):
    t1 = normalizar_titulo(t1)
    t2 = normalizar_titulo(t2)
    return fuzz.token_set_ratio(t1, t2)

# Funciones generadas por DeepSeek
def recall_at_k(generated_recommendations, ground_truth, k=10):
    hits = 0
    # Tomar las primeras K recomendaciones generadas
    top_k = generated_recommendations[:k]
    for e in top_k:
      if comparar_titulos(e, ground_truth[0]) > 80:
        hits = 1

    # Evitar división por cero
    return hits

def ndcg_at_k(generated_recommendations, ground_truth, k=10):
    # Crear una lista binaria de relevancia (1 si está en ground truth, 0 si no)
    relevance = [1 if item in ground_truth else 0 for item in generated_recommendations[:k]]

    # Crear el "ideal ranking" (todas las relevantes primero)
    ideal_relevance = sorted(relevance, reverse=True)

    # Calcular NDCG
    return ndcg_score([relevance], [ideal_relevance])


### Recall@5 y NDCG@5:

In [None]:
z_s_rec_lists = []
i=1
for outputs in outputs_z_s:
  rec_lists = []
  for out in outputs:
      ans = format_ans(out[out.index("<|assistant|>"):],10)
      rec_lists.append(format_ans(out[out.index("<|assistant|>"):],10))

  # print(rec_lists)
  z_s_rec_lists.append(rec_lists)

In [None]:
recall_zs_1_5 = 0
ndcg_zs_1_5 = 0
best_recall_zs_5 = 0.0
best_ndcg_zs_5 = 0.0

for i in range(num_test_items):
  # Zero-shot sin sampling
  # print(i)
  if len(z_s_rec_lists[i][0]) > 1:
    recall_zs_1_5 += recall_at_k(z_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)
    ndcg_zs_1_5 += ndcg_at_k(z_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)

  # Zero-shot con sampling
  best_r = 0.0
  best_n = 0.0

  for l in z_s_rec_lists[i]:
    if len(l) > 1:
      recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
      ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_zs_5 += best_r
  best_ndcg_zs_5 += best_n
print(recall_zs_1_5/num_test_items, ndcg_zs_1_5/num_test_items)
print(best_recall_zs_5/num_test_items, best_ndcg_zs_5/num_test_items)

0.12 0.03079388872450849
0.17 0.06764274720732359


In [None]:
f_s_rec_lists = []
i=1
for outputs in outputs_f_s:
  rec_lists = []
  for out in outputs:
      ans = format_ans(out[out.index("<|assistant|>"):],10)
      rec_lists.append(format_ans(out[out.index("<|assistant|>"):],10))

  # print(rec_lists)
  f_s_rec_lists.append(rec_lists)

In [None]:
recall_fs_1_5 = 0
ndcg_fs_1_5 = 0
best_recall_fs_5 = 0.0
best_ndcg_fs_5 = 0.0

for i in range(num_test_items):
  # Few-shot sin sampling
  # print(f"\n{f_s_rec_lists[i][0]}")
  # print([item_map[m] for m in rand_conversations[i][4]])
  if len(f_s_rec_lists[i][0]) > 1:
    recall_fs_1_5 += recall_at_k(f_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)
    ndcg_fs_1_5 += ndcg_at_k(f_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)

  # Few-shot con sampling
  best_r = 0.0
  best_n = 0.0
  recall = 0
  ndcg = 0

  for l in f_s_rec_lists[i]:
    if len(l) > 1:
      recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
      ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_fs_5 += best_r
  best_ndcg_fs_5 += best_n
print(recall_fs_1_5/num_test_items, ndcg_fs_1_5/num_test_items)
print(best_recall_fs_5/num_test_items, best_ndcg_fs_5/num_test_items)

0.0 0.0
0.12 0.05079388872450849


In [None]:
ft_s_rec_lists = []
i=1
for outputs in outputs_ft_s:
  rec_lists = []
  for out in outputs:
      ans = format_ans(out[out.index("<|assistant|>"):],10)
      rec_lists.append(format_ans(out[out.index("<|assistant|>"):],10))

  # print(rec_lists)
  ft_s_rec_lists.append(rec_lists)

In [None]:
recall_fts_1_5 = 0
ndcg_fts_1_5 = 0
best_recall_fts_5 = 0.0
best_ndcg_fts_5 = 0.0

for i in range(num_test_items):
  # Few-shot sin sampling
  # print(f"\n{ft_s_rec_lists[i][0]}")
  # print([item_map[m] for m in rand_conversations[i][4]])
  if len(ft_s_rec_lists[i][0]) > 1:
    recall_fts_1_5 += recall_at_k(ft_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)
    ndcg_fts_1_5 += ndcg_at_k(ft_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)

  # Few-shot con sampling
  best_r = 0.0
  best_n = 0.0
  recall = 0
  ndcg = 0

  for l in ft_s_rec_lists[i]:
    # print(l)
    if len(l) > 1:
      recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
      ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_fts_5 += best_r
  best_ndcg_fts_5 += best_n
print(recall_fts_1_5/num_test_items, ndcg_fts_1_5/num_test_items)
print(best_recall_fts_5/num_test_items, best_ndcg_fts_5/num_test_items)

0.11 0.08743509370014568
0.2 0.15230624149734415


### Recall@10 y NDCG@10:

Zero-Shot:

In [None]:
recall_zs_1_10 = 0
ndcg_zs_1_10 = 0
best_recall_zs_10 = 0.0
best_ndcg_zs_10 = 0.0

for i in range(num_test_items):
  # Zero-shot sin sampling
  if len(z_s_rec_lists[i][0]) > 1:
    recall_zs_1_10 += recall_at_k(z_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)
    ndcg_zs_1_10 += ndcg_at_k(z_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)

  # Zero-shot con sampling
  best_r = 0.0
  best_n = 0.0
  recall = 0
  ndcg = 0

  for l in z_s_rec_lists[i]:
    if len(l) > 1:
      recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
      ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_zs_10 += best_r
  best_ndcg_zs_10 += best_n
print(recall_zs_1_10/num_test_items, ndcg_zs_1_10/num_test_items)
print(best_recall_zs_10/num_test_items, best_ndcg_zs_10/num_test_items)

0.14 0.031455116214975126
0.22 0.07299769145246339


Few-Shot

In [None]:
recall_fs_1_10 = 0
ndcg_fs_1_10 = 0
best_recall_fs_10 = 0.0
best_ndcg_fs_10 = 0.0

for i in range(num_test_items):
  # Few-shot sin sampling
  if len(f_s_rec_lists[i][0]) > 1:
    recall_fs_1_10 += recall_at_k(f_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)
    ndcg_fs_1_10 += ndcg_at_k(f_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)

  # Few-shot con sampling
  best_r = 0.0
  best_n = 0.0
  recall = 0
  ndcg = 0

  for l in f_s_rec_lists[i]:
    if len(l) > 1:
      recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
      ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_fs_10 += best_r
  best_ndcg_fs_10 += best_n
print(recall_fs_1_10/num_test_items, ndcg_fs_1_10/num_test_items)
print(best_recall_fs_10/num_test_items, best_ndcg_fs_10/num_test_items)

0.06 0.015749152613725975
0.24 0.07094383904259266


In [None]:
recall_fts_1_10 = 0
ndcg_fts_1_10 = 0
best_recall_fts_10 = 0.0
best_ndcg_fts_10 = 0.0

for i in range(num_test_items):
  # Fine-tuned sin sampling
  if len(ft_s_rec_lists[i][0]) > 1:
    recall_fts_1_10 += recall_at_k(ft_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)
    ndcg_fts_1_10 += ndcg_at_k(ft_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)

  # Fine-tuned con sampling
  best_r = 0.0
  best_n = 0.0
  recall = 0
  ndcg = 0

  for l in ft_s_rec_lists[i]:
    if len(l) > 1:
      recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
      ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_fts_10 += best_r
  best_ndcg_fts_10 += best_n
print(recall_fts_1_10/num_test_items, ndcg_fts_1_10/num_test_items)
print(best_recall_fts_10/num_test_items, best_ndcg_fts_10/num_test_items)

0.12 0.08444275460530175
0.24 0.15678200754544513


In [None]:
print("\nZero-Shot con interacciones históricas:")
print(f"Sin sampling: Recall@5: {recall_zs_1_5/num_test_items:.3f}, NDCG@5: {ndcg_zs_1_5/num_test_items:.3f}, Recall@10: {recall_zs_1_10/num_test_items:.3f}, NDCG@10: {ndcg_zs_1_10/num_test_items:.3f}")
print(f"Con sampling: Recall@5: {best_recall_zs_5/num_test_items:.3f}, NDCG@5: {best_ndcg_zs_5/num_test_items:.3f}, Recall@10: {best_recall_zs_10/num_test_items:.3f}, NDCG@10: {best_ndcg_zs_10/num_test_items:.3f}")

# print("\nFew-Shot con interacciones históricas:")
# print(f"Sin sampling: Recall@5: {recall_fs_1_5/num_test_items:.3f}, NDCG@5: {ndcg_fs_1_5/num_test_items:.3f}, Recall@10: {recall_fs_1_10/num_test_items:.3f}, NDCG@10: {ndcg_fs_1_10/num_test_items:.3f}")
# print(f"Con sampling: Recall@5: {best_recall_fs_5/num_test_items:.3f}, NDCG@5: {best_ndcg_fs_5/num_test_items:.3f}, Recall@10: {best_recall_fs_10/num_test_items:.3f}, NDCG@10: {best_ndcg_fs_10/num_test_items:.3f}")

print("\nFine-tuned sin interacciones históricas:")
print(f"Sin sampling: Recall@5: {recall_fts_1_5/num_test_items:.3f}, NDCG@5: {ndcg_fts_1_5/num_test_items:.3f}, Recall@10: {recall_fts_1_10/num_test_items:.3f}, NDCG@10: {ndcg_fts_1_10/num_test_items:.3f}")
print(f"Con sampling: Recall@5: {best_recall_fts_5/num_test_items:.3f}, NDCG@5: {best_ndcg_fts_5/num_test_items:.3f}, Recall@10: {best_recall_fts_10/num_test_items:.3f}, NDCG@10: {best_ndcg_fts_10/num_test_items:.3f}")



Zero-Shot con interacciones históricas:
Sin sampling: Recall@5: 0.120, NDCG@5: 0.031, Recall@10: 0.140, NDCG@10: 0.031
Con sampling: Recall@5: 0.170, NDCG@5: 0.068, Recall@10: 0.220, NDCG@10: 0.073

Fine-tuned sin interacciones históricas:
Sin sampling: Recall@5: 0.110, NDCG@5: 0.087, Recall@10: 0.120, NDCG@10: 0.084
Con sampling: Recall@5: 0.200, NDCG@5: 0.152, Recall@10: 0.240, NDCG@10: 0.157


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn.functional as F
import random
import numpy as np

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ahora con AutoModelForCausal a diferencia de la primera instancia
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [None]:
import torch
import numpy as np
from collections import Counter
import re
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
from scipy.stats import entropy
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

def extract_answer(text, question_text):
    """Extrae la respuesta del texto generado"""
    # Buscar después de "Answer:" o similar
    patterns = [r"Answer:\s*(.+?)(?:\n|$)", r"answer:\s*(.+?)(?:\n|$)", r"Answer is\s*(.+?)(?:\n|$)"]

    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).strip()

    # Si no encuentra patrón, tomar lo que viene después del prompt
    try:
        # Dividir por el texto de la pregunta y tomar la parte después
        parts = text.split("Answer:")
        if len(parts) > 1:
            return parts[-1].strip().split('\n')[0].strip()
    except:
        pass

    return text.strip()

def calculate_uncertainty_metrics_lightweight(outputs_per_paraphrase, paraphrases):
    """
    Calcula métricas de incertidumbre usando Input Clarification Ensembling

    Args:
        outputs_per_paraphrase: Lista de listas, cada sublista contiene outputs para una paráfrasis
        paraphrases: Lista de paráfrasis usadas

    Returns:
        dict con métricas de incertidumbre
    """

    # 1. Extraer respuestas limpias
    all_answers = []
    answers_by_paraphrase = []

    for i, outputs in enumerate(outputs_per_paraphrase):
        paraphrase_answers = []
        for output in outputs:
            answer = extract_answer(output, paraphrases[i])
            paraphrase_answers.append(answer)
            all_answers.append(answer)
        answers_by_paraphrase.append(paraphrase_answers)

    # 2. Calcular frecuencias de respuestas
    answer_counts = Counter(all_answers)
    total_responses = len(all_answers)

    # 3. INCERTIDUMBRE TOTAL (Shannon Entropy de todas las respuestas)
    probs = np.array(list(answer_counts.values())) / total_responses
    total_uncertainty = entropy(probs, base=2)  # bits

    # 4. INCERTIDUMBRE ALEATORIA (promedio de entropías por paráfrasis)
    aleatoric_uncertainties = []
    for answers in answers_by_paraphrase:
        local_counts = Counter(answers)
        local_probs = np.array(list(local_counts.values())) / len(answers)
        if len(local_probs) > 1:
            aleatoric_uncertainties.append(entropy(local_probs, base=2))
        else:
            aleatoric_uncertainties.append(0.0)

    aleatoric_uncertainty = np.mean(aleatoric_uncertainties)

    # 5. INCERTIDUMBRE EPISTÉMICA (diferencia)
    epistemic_uncertainty = total_uncertainty - aleatoric_uncertainty

    # 6. Métricas adicionales
    unique_answers = len(set(all_answers))
    most_common_answer, most_common_count = answer_counts.most_common(1)[0]
    confidence = most_common_count / total_responses

    # 7. Consistencia entre paráfrasis
    consistency_scores = []
    for i in range(len(paraphrases)):
        for j in range(i+1, len(paraphrases)):
            # Comparar respuestas más frecuentes de cada paráfrasis
            answers_i = Counter(answers_by_paraphrase[i])
            answers_j = Counter(answers_by_paraphrase[j])

            most_common_i = answers_i.most_common(1)[0][0] if answers_i else ""
            most_common_j = answers_j.most_common(1)[0][0] if answers_j else ""

            # Similaridad simple (exacta o parcial)
            if most_common_i.lower().strip() == most_common_j.lower().strip():
                consistency_scores.append(1.0)
            else:
                # Similaridad parcial usando tokens comunes
                tokens_i = set(most_common_i.lower().split())
                tokens_j = set(most_common_j.lower().split())
                if tokens_i and tokens_j:
                    jaccard = len(tokens_i.intersection(tokens_j)) / len(tokens_i.union(tokens_j))
                    consistency_scores.append(jaccard)
                else:
                    consistency_scores.append(0.0)

    consistency = np.mean(consistency_scores) if consistency_scores else 0.0

    return {
        'total_uncertainty': float(total_uncertainty),
        'aleatoric_uncertainty': float(aleatoric_uncertainty),
        'epistemic_uncertainty': float(epistemic_uncertainty),
        'confidence': float(confidence),
        'unique_answers': int(unique_answers),
        'most_common_answer': str(most_common_answer),
        'consistency_across_paraphrases': float(consistency),
        'answer_distribution': {str(k): int(v) for k, v in answer_counts.items()},
        'num_paraphrases': len(paraphrases),
        'samples_per_paraphrase': len(outputs_per_paraphrase[0]) if outputs_per_paraphrase else 0
    }

def calculate_logit_uncertainty_lightweight(model, tokenizer, paraphrases, device="cuda", cleanup=True):
    """
    Versión optimizada que limpia memoria agresivamente
    """
    print("Calculando incertidumbre por logits (versión ligera)...")

    uncertainties_per_paraphrase = []
    logits_stats = []

    for i, paraphrase in enumerate(paraphrases):
        print(f"Procesando logits para paráfrasis {i+1}/{len(paraphrases)}")

        ctx = "You are an oracle who only responds with short and concise answers."
        msg = f"Answer the following question: {paraphrase}\nAnswer:"
        messages = [{"role": "user", "content": ctx + msg}]

        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            next_token_logits = outputs.logits[0, -1, :].cpu()  # Mover a CPU inmediatamente

            # Calcular probabilidades y entropía
            probs = torch.softmax(next_token_logits, dim=-1)
            uncertainty = entropy(probs.numpy(), base=2)
            uncertainties_per_paraphrase.append(float(uncertainty))

            # Guardar solo estadísticas básicas, no los logits completos
            logits_stats.append({
                'mean': float(next_token_logits.mean()),
                'std': float(next_token_logits.std()),
                'max': float(next_token_logits.max()),
                'min': float(next_token_logits.min())
            })

            # Limpiar memoria inmediatamente
            del outputs, next_token_logits, probs, inputs
            if cleanup:
                torch.cuda.empty_cache()
                gc.collect()

    # Calcular métricas finales sin guardar arrays grandes
    mean_uncertainty = np.mean(uncertainties_per_paraphrase)
    std_uncertainty = np.std(uncertainties_per_paraphrase)

    return {
        'logit_uncertainties_per_paraphrase': uncertainties_per_paraphrase,
        'mean_logit_uncertainty': float(mean_uncertainty),
        'std_logit_uncertainty': float(std_uncertainty),
        'logits_stats_summary': {
            'mean_of_means': float(np.mean([s['mean'] for s in logits_stats])),
            'mean_of_stds': float(np.mean([s['std'] for s in logits_stats])),
        }
    }


In [None]:

# Configuración
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # o el modelo que uses
device = "cuda" if torch.cuda.is_available() else "cpu"

# Cargar modelo y tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Paráfrasis de la pregunta
paraphrases = [
    "What year did the Berlin Wall fall?",
    "When did the Berlin Wall come down?",
    "In which year was the Berlin Wall demolished?",
    "What year did the fall of the Berlin Wall occur?",
    "When was the Berlin Wall brought down?",
    "In what year did the Berlin Wall collapse?",
    "What year did they tear down the Berlin Wall?",
    "When did the destruction of the Berlin Wall happen?",
    "In which year did the Berlin Wall get demolished?",
    "What year marked the fall of the Berlin Wall?"
]

# Generar respuestas con sampling
outputs_per_paraphrase = []
k = 5  # Número de samples por paráfrasis

print("Generando respuestas...")
for i, paraphrase in enumerate(paraphrases):
    print(f"Procesando paráfrasis {i+1}/{len(paraphrases)}: {paraphrase}")

    outputs = []
    ctx = "You are an oracle who only responds with short and concise answers."
    msg = f"Answer the following question: {paraphrase}\nAnswer:"
    messages = [{"role": "user", "content": ctx + msg}]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    for j in range(k):
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                pad_token_id=tokenizer.eos_token_id
            )

            # Decodificar solo la parte nueva (sin el prompt)
            new_tokens = output_ids[0][inputs['input_ids'].shape[1]:]
            decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)
            outputs.append(decoded)

    outputs_per_paraphrase.append(outputs)



Generando respuestas...
Procesando paráfrasis 1/10: What year did the Berlin Wall fall?
Procesando paráfrasis 2/10: When did the Berlin Wall come down?
Procesando paráfrasis 3/10: In which year was the Berlin Wall demolished?
Procesando paráfrasis 4/10: What year did the fall of the Berlin Wall occur?
Procesando paráfrasis 5/10: When was the Berlin Wall brought down?
Procesando paráfrasis 6/10: In what year did the Berlin Wall collapse?
Procesando paráfrasis 7/10: What year did they tear down the Berlin Wall?
Procesando paráfrasis 8/10: When did the destruction of the Berlin Wall happen?
Procesando paráfrasis 9/10: In which year did the Berlin Wall get demolished?
Procesando paráfrasis 10/10: What year marked the fall of the Berlin Wall?


In [None]:
include_logits = True
cleanup_model = True

In [None]:
print("="*50)
print("CALCULANDO INCERTIDUMBRE (VERSIÓN OPTIMIZADA)...")
print("="*50)

# 1. Calcular métricas básicas de incertidumbre
uncertainty_metrics = calculate_uncertainty_metrics_lightweight(outputs_per_paraphrase, paraphrases)

# 2. Calcular métricas de logits si se solicita
logit_metrics = None
if include_logits and model is not None and tokenizer is not None:
    logit_metrics = calculate_logit_uncertainty_lightweight(
        model, tokenizer, paraphrases, device, cleanup=True
    )

# 3. LIMPIAR MODELO DE MEMORIA SI SE SOLICITA
if cleanup_model and model is not None:
    print("Limpiando modelo de memoria...")
    del model
    if tokenizer is not None:
        del tokenizer
    torch.cuda.empty_cache()
    gc.collect()
    print("✓ Modelo removido de memoria")

# 4. Mostrar resultados
print(f"\n📊 MÉTRICAS DE INCERTIDUMBRE:")
print(f"├── Incertidumbre Total: {uncertainty_metrics['total_uncertainty']:.3f} bits")
print(f"├── Incertidumbre Aleatoria: {uncertainty_metrics['aleatoric_uncertainty']:.3f} bits")
print(f"├── Incertidumbre Epistémica: {uncertainty_metrics['epistemic_uncertainty']:.3f} bits")
print(f"├── Confianza: {uncertainty_metrics['confidence']:.3f}")
print(f"├── Respuestas únicas: {uncertainty_metrics['unique_answers']}")
print(f"├── Consistencia entre paráfrasis: {uncertainty_metrics['consistency_across_paraphrases']:.3f}")
print(f"└── Respuesta más común: '{uncertainty_metrics['most_common_answer']}'")

if logit_metrics:
    print(f"\n🔢 MÉTRICAS DE LOGITS:")
    print(f"├── Incertidumbre promedio: {logit_metrics['mean_logit_uncertainty']:.3f} bits")
    print(f"├── Desviación estándar: {logit_metrics['std_logit_uncertainty']:.3f} bits")
    print(f"└── Media de logits: {logit_metrics['logits_stats_summary']['mean_of_means']:.3f}")

# 5. Interpretación
print(f"\n🧠 INTERPRETACIÓN:")
if uncertainty_metrics['epistemic_uncertainty'] > uncertainty_metrics['aleatoric_uncertainty']:
    print("├── El modelo tiene más incertidumbre sobre QUÉ responder")
    print("└── → Sugiere falta de conocimiento específico")
else:
    print("├── El modelo tiene más incertidumbre sobre CÓMO responder")
    print("└── → Sugiere ambigüedad inherente en la pregunta")

if uncertainty_metrics['consistency_across_paraphrases'] > 0.8:
    print("├── Alta consistencia entre paráfrasis")
elif uncertainty_metrics['consistency_across_paraphrases'] > 0.5:
    print("├── Consistencia moderada entre paráfrasis")
else:
    print("├── Baja consistencia - posible confusión del modelo")

# DEVOLVER SOLO RESULTADOS LIGEROS
results = {
    'uncertainty_metrics': uncertainty_metrics,
    'logit_metrics': logit_metrics,
    'analysis_params': {
        'num_paraphrases': len(paraphrases),
        'samples_per_paraphrase': len(outputs_per_paraphrase[0]) if outputs_per_paraphrase else 0,
        'included_logits': include_logits
    }
}

print(results)

CALCULANDO INCERTIDUMBRE (VERSIÓN OPTIMIZADA)...
Respuestas extraídas:
Paráfrasis 1: ["The Berlin Wall fell in 1989, on November 9, 1989, when East Germany's government announced the end of the wall separating East and West Berlin.", 'The Berlin Wall fell on November 9, 1989, marking the end of the Cold War and the division of East and West Germany.', 'The Berlin Wall fell on November 9, 1989, marking the end of the Cold War and the collapse of the Soviet Union.', 'The Berlin Wall fell on November 9, 1989, which was 24 years ago.', 'The Berlin Wall fell on November 9, 1989, at 03:15 a.m. CET (Central European Time) in the early morning hours of November 9, 1989.']
Paráfrasis 2: ['The Berlin Wall came down on November 9, 1989, marking the end of the Cold War and the collapse of the Soviet Union.', "The Berlin Wall came down on November 9, 1989, on the evening of that day. The Soviet Union's fall from power and the subsequent peaceful reunification of Germany marked a historic turning po

In [None]:

# Configuración
device = "cuda" if torch.cuda.is_available() else "cpu"
ft_model = ft_model.to(device)
if ft_tokenizer.pad_token is None:
    ft_tokenizer.pad_token = ft_tokenizer.eos_token

# Paráfrasis de la pregunta
paraphrases = [
    "What year did the Berlin Wall fall?",
    "When did the Berlin Wall come down?",
    "In which year was the Berlin Wall demolished?",
    "What year did the fall of the Berlin Wall occur?",
    "When was the Berlin Wall brought down?",
    "In what year did the Berlin Wall collapse?",
    "What year did they tear down the Berlin Wall?",
    "When did the destruction of the Berlin Wall happen?",
    "In which year did the Berlin Wall get demolished?",
    "What year marked the fall of the Berlin Wall?"
]

# Generar respuestas con sampling
outputs_per_paraphrase = []
k = 5  # Número de samples por paráfrasis

print("Generando respuestas...")
for i, paraphrase in enumerate(paraphrases):
    print(f"Procesando paráfrasis {i+1}/{len(paraphrases)}: {paraphrase}")

    outputs = []
    ctx = "You are an oracle who only responds with short and concise answers."
    msg = f"Answer the following question: {paraphrase}\nAnswer:"
    messages = [{"role": "user", "content": ctx + msg}]

    prompt = ft_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = ft_tokenizer(prompt, return_tensors="pt").to(device)

    for j in range(k):
        with torch.no_grad():
            output_ids = ft_model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                pad_token_id=ft_tokenizer.eos_token_id
            )

            # Decodificar solo la parte nueva (sin el prompt)
            new_tokens = output_ids[0][inputs['input_ids'].shape[1]:]
            decoded = ft_tokenizer.decode(new_tokens, skip_special_tokens=True)
            outputs.append(decoded)

    outputs_per_paraphrase.append(outputs)



Generando respuestas...
Procesando paráfrasis 1/10: What year did the Berlin Wall fall?
Procesando paráfrasis 2/10: When did the Berlin Wall come down?
Procesando paráfrasis 3/10: In which year was the Berlin Wall demolished?
Procesando paráfrasis 4/10: What year did the fall of the Berlin Wall occur?
Procesando paráfrasis 5/10: When was the Berlin Wall brought down?
Procesando paráfrasis 6/10: In what year did the Berlin Wall collapse?
Procesando paráfrasis 7/10: What year did they tear down the Berlin Wall?
Procesando paráfrasis 8/10: When did the destruction of the Berlin Wall happen?
Procesando paráfrasis 9/10: In which year did the Berlin Wall get demolished?
Procesando paráfrasis 10/10: What year marked the fall of the Berlin Wall?


In [None]:
print("="*50)
print("CALCULANDO INCERTIDUMBRE (VERSIÓN OPTIMIZADA)...")
print("="*50)

# 1. Calcular métricas básicas de incertidumbre
uncertainty_metrics = calculate_uncertainty_metrics_lightweight(outputs_per_paraphrase, paraphrases)

# 2. Calcular métricas de logits si se solicita
logit_metrics = None
if include_logits and ft_model is not None and tokenizer is not None:
    logit_metrics = calculate_logit_uncertainty_lightweight(
        ft_model, tokenizer, paraphrases, device, cleanup=True
    )

# 3. LIMPIAR MO.DELO DE MEMORIA SI SE SOLICITA
if cleanup_model and ft_model is not None:
    print("Limpiando modelo de memoria...")
    del ft_model
    if tokenizer is not None:
        del tokenizer
    torch.cuda.empty_cache()
    gc.collect()
    print("✓ Modelo removido de memoria")

# 4. Mostrar resultados
print(f"\n📊 MÉTRICAS DE INCERTIDUMBRE:")
print(f"├── Incertidumbre Total: {uncertainty_metrics['total_uncertainty']:.3f} bits")
print(f"├── Incertidumbre Aleatoria: {uncertainty_metrics['aleatoric_uncertainty']:.3f} bits")
print(f"├── Incertidumbre Epistémica: {uncertainty_metrics['epistemic_uncertainty']:.3f} bits")
print(f"├── Confianza: {uncertainty_metrics['confidence']:.3f}")
print(f"├── Respuestas únicas: {uncertainty_metrics['unique_answers']}")
print(f"├── Consistencia entre paráfrasis: {uncertainty_metrics['consistency_across_paraphrases']:.3f}")
print(f"└── Respuesta más común: '{uncertainty_metrics['most_common_answer']}'")

if logit_metrics:
    print(f"\n🔢 MÉTRICAS DE LOGITS:")
    print(f"├── Incertidumbre promedio: {logit_metrics['mean_logit_uncertainty']:.3f} bits")
    print(f"├── Desviación estándar: {logit_metrics['std_logit_uncertainty']:.3f} bits")
    print(f"└── Media de logits: {logit_metrics['logits_stats_summary']['mean_of_means']:.3f}")

# 5. Interpretación
print(f"\n🧠 INTERPRETACIÓN:")
if uncertainty_metrics['epistemic_uncertainty'] > uncertainty_metrics['aleatoric_uncertainty']:
    print("├── El modelo tiene más incertidumbre sobre QUÉ responder")
    print("└── → Sugiere falta de conocimiento específico")
else:
    print("├── El modelo tiene más incertidumbre sobre CÓMO responder")
    print("└── → Sugiere ambigüedad inherente en la pregunta")

if uncertainty_metrics['consistency_across_paraphrases'] > 0.8:
    print("├── Alta consistencia entre paráfrasis")
elif uncertainty_metrics['consistency_across_paraphrases'] > 0.5:
    print("├── Consistencia moderada entre paráfrasis")
else:
    print("├── Baja consistencia - posible confusión del modelo")

# DEVOLVER SOLO RESULTADOS LIGEROS
results = {
    'uncertainty_metrics': uncertainty_metrics,
    'logit_metrics': logit_metrics,
    'analysis_params': {
        'num_paraphrases': len(paraphrases),
        'samples_per_paraphrase': len(outputs_per_paraphrase[0]) if outputs_per_paraphrase else 0,
        'included_logits': include_logits
    }
}

print(results)

CALCULANDO INCERTIDUMBRE (VERSIÓN OPTIMIZADA)...
Calculando incertidumbre por logits (versión ligera)...
Procesando logits para paráfrasis 1/10
Procesando logits para paráfrasis 2/10
Procesando logits para paráfrasis 3/10
Procesando logits para paráfrasis 4/10
Procesando logits para paráfrasis 5/10
Procesando logits para paráfrasis 6/10
Procesando logits para paráfrasis 7/10
Procesando logits para paráfrasis 8/10
Procesando logits para paráfrasis 9/10
Procesando logits para paráfrasis 10/10
Limpiando modelo de memoria...
✓ Modelo removido de memoria

📊 MÉTRICAS DE INCERTIDUMBRE:
├── Incertidumbre Total: 5.644 bits
├── Incertidumbre Aleatoria: 2.322 bits
├── Incertidumbre Epistémica: 3.322 bits
├── Confianza: 0.020
├── Respuestas únicas: 50
├── Consistencia entre paráfrasis: 0.179
└── Respuesta más común: 'I don't have access to the latest news. However, I'm glad you asked. The Berlin Wall fell on August 14, 1961, and it became a symbol of the Cold War between the Soviet Union and'

🔢 M

In [None]:
limpiar_y_guardar()

🎯 Limpieza específica para modelo LoRA...
   ├── Eliminando: tokenizer
   ├── Eliminando: pipe
   ├── Eliminando: inputs
   ├── Eliminando: output_ids
   ├── Eliminando: prompt
   ├── Eliminando: messages
   ├── Eliminando: ctx
   ├── Eliminando: msg
   ├── Eliminando: decoded
   ├── Eliminando: outputs

📊 Variables mantenidas:
   ├── item_map: 9687 elementos
   ├── Conversation: 16935069 elementos
   ├── model_name: 34 elementos
   ├── outputs_f_s: 20 elementos
   ├── outputs_ft_s: 20 elementos
   ├── rand_conversations: 20 elementos
   ├── num_test_items: <class 'int'>
   ├── train_conv: 2512 elementos
   ├── test_conv: 619 elementos
   ├── num_test_items: <class 'int'>
   ├── few_shot_data: 5 elementos
