# Propuesta de proyecto: Uso de muestreo repetitivo para mejorar el rendimiento de una LLM en recomendación conversacional

Este notebook asume que se está ejecutando en Google Colab, y que el dataset `LLM-Redial` disponible en https://drive.google.com/drive/folders/1TIP4PFm9z0C4R4--KnHoWuiB1uK-dv5m se encuentra descargado en el drive del usuario.

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import shutil
import os

source_path = '/content/gdrive/MyDrive/Proyecto LLMonkeys/Tools.py'
destination_path = '/content/Tools.py'

shutil.copy(source_path, destination_path)

'/content/Tools.py'

In [3]:
import zipfile

zip_path = '/content/gdrive/MyDrive/Proyecto LLMonkeys/LLM_Redial.zip'
extract_path = '/content/LLM_Redial'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [4]:
import dill

try:
    dill.load_session('/content/gdrive/MyDrive/Proyecto LLMonkeys/sessions/TinyLlama/TinyLlama_notebook_env.db')
except FileNotFoundError:
    print("El archivo no existe")
except dill.UnpicklingError:
    print("Archivo no válido")
except EOFError:
    print('EOFError')
except Exception as e:
    print(e)

In [5]:
from transformers import AutoTokenizer

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [6]:
import Tools as t

path = "./LLM_Redial/Movie"
final_data_path = '{}/final_data.jsonl'.format(path)
Conversation_path = '{}/Conversation.txt'.format(path)
user_map_path = '{}/user_ids.json'.format(path)
item_map_path = '{}/item_map.json'.format(path)

final_data = t.read_jsonl(final_data_path)
user_map = t.read_json(user_map_path)
item_map = t.read_json(item_map_path)
Conversation = t.read_dialogue(Conversation_path)

### Para limpiar el entorno y guardar las variables relevantes

In [16]:
import gc
import torch
import dill

def cleanup_for_dill_serialization():
    """
    Limpia todas las variables pesadas que pueden causar problemas con dill
    Mantiene solo los outputs y variables esenciales
    """
    # Variables que debes eliminar ANTES de dill.dump_session()
    variables_to_delete = [
        'model', 'base_model', 'config', 'pipe',

        # Tensores y objetos de PyTorch
        'inputs', 'output_ids', 'outputs',

        # Variables temporales del loop
        'decoded', 'prompt', 'messages', 'ctx', 'msg',

        # Indices y variables de control
        'i', 'n', 'k',
    ]

    # Lista de variables que SÍ quieres mantener
    variables_to_keep = [
        'item_map',
        'Conversation',
        'model_name',
        'output_r'
        'outputs_mp'
        'outputs_z_s',
        'outputs_f_s',
        'outputs_ft_s',
        'outputs_z_n',
        'outputs_f_n',
        'outputs_ft_n',
        'rand_conversations',
        'num_test_items',
        'train_conv',
        'test_conv',
        'few_shot_users',
        'num_test_items',
        'few_shot_data'
    ]

    # Obtener todas las variables globales
    global_vars = list(globals().keys())

    # Eliminar variables específicamente problemáticas
    for var_name in variables_to_delete:
        if var_name in globals():
            print(f"   ├── Eliminando: {var_name}")
            try:
                del globals()[var_name]
            except:
                print(f"No se pudo eliminar {var_name}")

    vars_to_remove = []
    for var_name in global_vars:
        if var_name.startswith('_'):  # Variables privadas
            continue

        if var_name in variables_to_keep:  # No eliminar variables importantes
            continue

        try:
            var_obj = globals().get(var_name)
            var_type = str(type(var_obj))

            # Detectar objetos problemáticos
            problematic_types = [
                'transformers',
                'peft',
                'torch.nn',
                'pipeline',
                'PreTrainedModel',
                'PreTrainedTokenizer',
                'PeftModel',
                'Tensor'
            ]

            if any(prob_type in var_type for prob_type in problematic_types):
                vars_to_remove.append(var_name)

        except Exception as e:
            vars_to_remove.append(var_name)

    # Eliminar variables problemáticas detectadas
    for var_name in vars_to_remove:
        print(f"   ├── Eliminando: {var_name}")
        try:
            del globals()[var_name]
        except:
            print(f"No se pudo eliminar: {var_name}")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    gc.collect()

    # Verificar tamaño de variables mantenidas
    print("\n📊 Variables mantenidas:")
    for var_name in variables_to_keep:
        if var_name in globals():
            var_obj = globals()[var_name]
            try:
                if hasattr(var_obj, '__len__'):
                    print(f"   ├── {var_name}: {len(var_obj)} elementos")
                else:
                    print(f"   ├── {var_name}: {type(var_obj)}")
            except:
                print(f"   ├── {var_name}: (no se puede medir)")

    return True

# FUNCIÓN PRINCIPAL PARA TU CASO
def cleanup_after_lora_generation():
    """
    Limpieza específica después de generar con modelo LoRA
    """
    print("🎯 Limpieza específica para modelo LoRA...")

    # Variables específicas de tu código LoRA
    lora_specific_vars = [
        'peft_model_path',
        'config',           # PeftConfig
        'base_model',       # Modelo base
        'model',           # PeftModel final
        'tokenizer',       # Tokenizer
        'pipe',           # Pipeline
        'inputs',          # Tensors de input
        'output_ids',      # Tensors de output IDs
        'prompt',          # Prompt generado
        'messages',        # Mensajes del chat template
        'ctx',            # Context string
        'msg',            # Message string
        'decoded',        # String decodificado
        'outputs',        # Lista temporal (no outputs_ft_s)
    ]

    for var_name in lora_specific_vars:
        if var_name in globals():
            print(f"   ├── Eliminando: {var_name}")
            try:
                del globals()[var_name]
            except Exception as e:
                print(f"   │   └── Error: {e}")

    # Limpieza general
    cleanup_for_dill_serialization()

def limpiar_y_guardar():
  cleanup_after_lora_generation()
  path = '/content/gdrive/MyDrive/Proyecto LLMonkeys/sessions/TinyLlama/TinyLlama_notebook_env.db'
  with open(path, 'wb') as f:
      dill.dump_session(f)

### Para retornar la respuesta del modelo como una lista de nombres

In [8]:
import re
def format_ans(ans, n):
    answer = "None"
    try:
        answer_list = []
        current_pos = 0

        for j in range(1, n + 1):
            # Busca el patrón del número y el inicio del texto
            pattern_start = f"{j}. "
            start_index = ans.find(pattern_start, current_pos)

            if start_index == -1:
                break  # Si no se encuentra el número, salimos del bucle

            start_text = start_index + len(pattern_start)

            # Busca el final del texto: el inicio del siguiente número O un " - "
            next_number_start = ans.find(f"{j + 1}. ", start_text)
            dash_start = ans.find(" - ", start_text)

            end_text = -1

            # Determina el final del texto basado en la primera ocurrencia
            if next_number_start != -1 and dash_start != -1:
                end_text = min(next_number_start, dash_start)
            elif next_number_start != -1:
                end_text = next_number_start
            elif dash_start != -1:
                end_text = dash_start
            else:
                # Si no hay siguiente número ni " - ", toma hasta el final de la línea o la cadena
                end_line = ans.find("\n", start_text)
                if end_line != -1:
                    end_text = end_line
                else:
                    end_text = len(ans)

            if end_text == -1: # Si no se encontró un final válido, toma hasta el final de la cadena
                end_text = len(ans)

            movie_name = ans[start_text:end_text].strip()
            answer_list.append(movie_name)
            current_pos = end_text # Actualiza la posición para la próxima búsqueda

        if not answer_list: # Si no se encontraron películas
             return "Formato de respuesta incorrecto", i

        answer_str = ", ".join([f'{movie}' for movie in answer_list])
        answer = f"{i}. {answer_str}\n"
        i += 1

    except Exception as e:
        answer = f"Error: {e}"

    finally:
        pattern = r"\(\d{4}\)"
        answer_list = [m.replace('\"', '') for m in answer_list]
        answer_list = [re.sub(pattern, '', m)[:-1] for m in answer_list]
        return answer_list

### Separación de diálogos en train y test, cuidando que todas las conversaciones de un usuario en particular se encuentren en sólo train o sólo en test


In [12]:
import json
import random

path = './LLM_Redial/Movie/final_data.jsonl'

with open(path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file]
n_conversations = 10089
train_len = n_conversations * 0.8
test_len = n_conversations * 0.2

train_conv = []
test_conv = []
used_users = []

aux = []

while len(aux) < test_len:
    try:
        convs = []
        random.seed(42)
        user_id = random.choice(list(set(user_map.keys())^set(used_users)))
        # print(user_id)
        used_users.append(user_id)
        user_data = next((item[user_id] for item in data if user_id in item), None)
        user_conversations = user_data.get("Conversation", [])
        for i in range(len(user_conversations)):
            selected_conversation = user_conversations[i]
            conversation_details = list(selected_conversation.values())[0]
            conversation_id = conversation_details["conversation_id"]
            if conversation_id != 10088:
                conversation = Conversation[Conversation.index(f"{conversation_id}\n"):Conversation.index(f"{conversation_id+1}\n")]
            else:
                conversation = Conversation[Conversation.index(f"{conversation_id}\n"):]

            convs.append(conversation)
            aux.append(conversation)
        test_conv.append([user_id,convs])
    except ValueError:
        print("ValueError: ", user_id, f"{conversation_id}\n")

for user_id in list(set(user_map.keys()) ^ set(used_users)):
    try:
        convs = []
        user_data = next((item[user_id] for item in data if user_id in item), None)
        user_conversations = user_data.get("Conversation", [])
        for i in range(len(user_conversations)):
            selected_conversation = user_conversations[i]
            conversation_details = list(selected_conversation.values())[0]
            conversation_id = conversation_details["conversation_id"]
            if conversation_id != 10088:
                conversation = Conversation[Conversation.index(f"{conversation_id}\n"):Conversation.index(f"{conversation_id+1}\n")]
            else:
                conversation = Conversation[Conversation.index(f"{conversation_id}\n"):]

            convs.append(conversation)
        train_conv.append([user_id,convs])
    except ValueError:
        print("ValueError: ", user_id, f"{conversation_id}\n")

print(len(test_conv), len(train_conv), len(test_conv) + len(train_conv), n_conversations)

658 2473 3131 10089


### Selección aleatoria de diálogos para testear

In [9]:
num_test_items = 20

In [13]:
import random
rand_conversations = []
for n in range(num_test_items):

  random.seed(n)
  rand_user = random.choice(test_conv)
  user_id = rand_user[0]
  user_conversation  = random.choice(rand_user[1])

  user_data = next((item[user_id] for item in data if user_id in item), None)
  convs = user_data.get("Conversation", [])
  for i in range(len(rand_user[1])):
    if user_conversation == rand_user[1][i]:
      rand_user_conv_id = i
  dialog = "\n\n".join(user_conversation.split("\n\n")[1:4])
  dialog_id = user_conversation.split("\n\n")[0]
  dialog_ground_truth = list(convs[rand_user_conv_id].values())[0]["rec_item"]
  rand_conversations.append([rand_user_conv_id, dialog, user_data, dialog_id, dialog_ground_truth])
  # print(user_conversation[user_conversation.index("User:"):-2])
  # print(list(convs[rand_user_conv_id].values())[0]["rec_item"])
  # print(list(convs[rand_user_conv_id].values())[0]["rec_item"])
  # print(dialog)

### Agregamos las interacciones históricas a la lista

In [14]:
for i in range(num_test_items):
  user_data = rand_conversations[i][2]
  rand_user_interactions = user_data.get("history_interaction", [])
  rand_user_interactions = [item_map[m] for m in rand_user_interactions]
  rand_conversations[i].append(rand_user_interactions)

### Seleccionamos aleatoriamente los datos de Few shot desde train

In [15]:
few_shot_data = []
random.seed(42)
few_shot_users = random.sample(train_conv, 5)

for u in few_shot_users:
  user_data = next((item[u[0]] for item in data if u[0] in item), None)
  user_interactions = user_data.get("history_interaction", [])
  user_interactions = [item_map[m] for m in user_interactions]

  conversation = min(u[1], key=len)
  conversation[conversation.index("User:"):-2]
  convs = user_data.get("Conversation", [])
  for c in convs:
    user_likes = list(c.values())[0]["user_likes"]
    user_dislikes = list(c.values())[0]["user_dislikes"]
    recs= list(c.values())[0]["rec_item"]

    user_likes = [item_map[m] for m in user_likes]
    user_dislikes = [item_map[m] for m in user_dislikes]
    recs = [item_map[m] for m in recs]
  few_shot_data.append({
      # "user_interactions": user_interactions,
      "conversation": conversation,
      # "user_likes": user_likes,
      # "user_dislikes": user_dislikes,
      # "recs": recs
  })

# Generación de 10 listas de recomendación de 10 películas en 3 modelos distintos, Random, Most Popular y LLM TinyLlama:

## Random:

In [17]:
import random

random.seed(44)
all_movies = list(item_map.values())

output_r = []
for i in range(num_test_items):
  output = []
  for _ in range(10):
      current_list = []
      while len(current_list) < 10:
          movie = random.choice(all_movies)

          if movie not in current_list:
              current_list.append(movie)

      output.append(current_list)
  output_r.append(output)

for output in output_r:
  print(output)

[['Into the Abyss', "TYLER PERRY'S A MADEA CHRISTMAS", 'The Trip to Italy (original uncut version) [UK import, Region 2 PAL format]', "We're No Angels VHS", 'Les Choristes (2004) / The Chorus', 'Person of Interest: Complete Season 1', 'Shaolin Soccer', 'Louise Brooks - Looking for Lulu', 'Tucker &amp; Dale vs. Evil', 'Ken Burns: Prohibition'], ["We're No Angels VHS", 'Found', 'Wall Street', 'Return of Frank James VHS', 'Dick VHS', 'Winning Team VHS', 'Big Business VHS', 'The Uninvited', 'Weimar Republic', 'The Curse of the Jade Scorpion VHS'], ['Godsend', 'Ghost Rider: Spirit of Vengeance', 'Conqueror Worm VHS', 'Haywire', 'Scott &amp; Bailey: Season 1 Regions 2 &amp; 4', 'Who Framed Roger Rabbit', 'Autumn in New York VHS', 'Charmed: Season 7', "Marvin's Room", 'Okami-san and Her Seven Companions: Complete Collection'], ['The Best of Friends, Vol. 1-2 VHS', 'Teenage Mutant Ninja Turtles: (Teenage Mutant Ninja Turtles / Secret of the Ooze / Turtles in Time / TMNT)', 'Grim VHS', 'The Lov

## Most Popular

In [18]:
movie_likes = {}
for u in train_conv:
  user = u[0]
  user_data = next((item[user] for item in data if user in item), None)
  convs = user_data.get("Conversation", [])
  for c in convs:
    user_likes = list(c.values())[0]["user_likes"]
    for m in user_likes:
      try:
        movie_likes[m] += 1
      except:
        movie_likes[m] = 1

sorted_movie_likes = dict(sorted(movie_likes.items(), key=lambda item: item[1], reverse=True))
top_10_movies = dict(list(sorted_movie_likes.items())[:10])

outputs_mp = []
for i in range(num_test_items):
  outputs = []
  for _ in range(10):
    outputs.append([item_map[m] for m in list(top_10_movies.keys())])
  outputs_mp.append(outputs)

for output in outputs_mp:
  print(output)

[['The Lord of the Rings: The Fellowship of the Ring', 'The Sixth Sense VHS', 'The Lord of the Rings: The Return of the King', 'Terminator, The', 'Batman Begins', 'The Silence of the Lambs VHS', 'Finding Nemo (Mandarin Chinese Edition) [2 DVDs]', 'Spider-Man 2', 'Gladiator VHS', 'Pirates of the Caribbean: The Curse of the Black Pearl'], ['The Lord of the Rings: The Fellowship of the Ring', 'The Sixth Sense VHS', 'The Lord of the Rings: The Return of the King', 'Terminator, The', 'Batman Begins', 'The Silence of the Lambs VHS', 'Finding Nemo (Mandarin Chinese Edition) [2 DVDs]', 'Spider-Man 2', 'Gladiator VHS', 'Pirates of the Caribbean: The Curse of the Black Pearl'], ['The Lord of the Rings: The Fellowship of the Ring', 'The Sixth Sense VHS', 'The Lord of the Rings: The Return of the King', 'Terminator, The', 'Batman Begins', 'The Silence of the Lambs VHS', 'Finding Nemo (Mandarin Chinese Edition) [2 DVDs]', 'Spider-Man 2', 'Gladiator VHS', 'Pirates of the Caribbean: The Curse of the 

In [15]:
import torch

print("GPU disponible:", torch.cuda.is_available())

GPU disponible: True


## TinyLlama:

### Zero-Shot con interacciones históricas:

In [23]:
from transformers import pipeline

outputs_z_s = []
for n in range(num_test_items):
  k = 10
  outputs = []
  # Pipeline Initialization
  pipe = pipeline(
      "text-generation",
      model=model_name,
      device_map="auto",
      torch_dtype=torch.float16,
  )

  ctx = (
      "YOU ARE A MOVIE RECOMMENDATION SYSTEM."
      "GENERATE A NUMBERED LIST OF 10 MOVIES. \n"
      "RULES: \n"
      "1. DO NOT write dialogs, explanations nor additional text or information. \n"
      "2. DO NOT repeat movies mentioned in the conversation. \n"
      "3. Response format: \n"
      "1. \"Movie name 1\" \n"
      "2. \"Movie name 2\" \n"
      "... \n"
      "10. \"Movie name 10\" \n"
      "Use ONLY this format and NOTHING else."
  )

  msg = (
      "Based on the following conversation: \n"
      f"{rand_conversations[n][1]} \n\n"
      "And the movies the user has previously interaced with: \n"
      f"{random.sample(rand_conversations[n][5], min(10, len(rand_conversations[n][5])))}\n\n"
      "Generate a list of 10 recommended movies (JUST NAMES, ONE PER LINE):"
  )

  messages = [{"role": "user", "content": ctx + msg}]

  prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
  inputs = pipe.tokenizer(prompt, return_tensors="pt").to("cuda")

  with torch.no_grad():
      for i in range(k):
          output_ids = pipe.model.generate(
              **inputs,
              max_new_tokens=200,
              do_sample=True,
              temperature=0.7,
              top_k=50,
              top_p=0.95
          )
          decoded = pipe.tokenizer.decode(output_ids[0], skip_special_tokens=True)
          outputs.append(decoded)
  outputs_z_s.append(outputs)

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


### Few-Shot con interacción histórica

In [24]:
from transformers import pipeline
k = 10
outputs_f_s = []
for n in range(num_test_items):
  outputs = []
  pipe = pipeline(
      "text-generation",
      model=model_name,
      device_map="auto",
      torch_dtype=torch.float16,
  )

  ctx = (
      "YOU ARE A MOVIE RECOMMENDATION SYSTEM."
      "GENERATE A NUMBERED LIST OF 10 MOVIES. \n"
      "RULES: \n"
      "1. DO NOT write dialogs, explanations nor additional text or information. \n"
      "2. DO NOT repeat movies mentioned in the conversation. \n"
      "3. Response format: \n"
      "1. \"Movie name 1\" \n"
      "2. \"Movie name 2\" \n"
      "... \n"
      "10. \"Movie name 10\" \n"
      "Use ONLY this format and NOTHING else."
  )

  msg = (
      "Based on these 4 examples: \n"
      f"{few_shot_data[0]}\n"
      f"{few_shot_data[1]}\n"
      f"{few_shot_data[2]}\n"
      f"{few_shot_data[3]}\n\n"
      "And based on the following conversation: \n"
      f"{rand_conversations[n][1]} \n\n"
      "And the movies the user has previously interaced with: \n"
      f"{random.sample(rand_conversations[n][5], min(10, len(rand_conversations[n][5])))}\n\n"
      "Generate a list of 10 recommended movies (JUST NAMES, ONE PER LINE):"
  )

  prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
  inputs = pipe.tokenizer(prompt, return_tensors="pt").to("cuda")

  with torch.no_grad():
      for i in range(k):
          output_ids = pipe.model.generate(
              **inputs,
              max_new_tokens=200,
              do_sample=True,
              temperature=0.7,
              top_k=50,
              top_p=0.95
          )
          decoded = pipe.tokenizer.decode(output_ids[0], skip_special_tokens=True)
          outputs.append(decoded)
  outputs_f_s.append(outputs)

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


In [12]:
import dill
path = '/content/gdrive/MyDrive/Proyecto LLMonkeys/sessions/TinyLlama/TinyLlama_notebook_env_fewshot.db'
with open(path, 'wb') as f:
    dill.dump_session(f)

### Fine-Tuning

In [5]:
%pip install transformers datasets accelerate peft bitsandbytes trl



In [6]:
from huggingface_hub import login
login(token="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")

In [7]:
all_convs = []
for u in train_conv:
  all_convs.extend(u[1])

In [8]:
fine_tune_convs = random.sample(all_convs, 4000)

In [9]:
def convertir_dialogo(texto):
    salida = ""
    for linea in texto.strip().splitlines():
        if linea.startswith("User:"):
            salida += "<|user|> " + linea[6:].strip() + "\n"
        elif linea.startswith("Agent:"):
            salida += "<|assistant|> " + linea[7:].strip() + "\n"
    return {"text": salida.strip()}

In [10]:
fine_tune_convs = [convertir_dialogo(c) for c in fine_tune_convs]

In [11]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None
)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
from datasets import Dataset
def tokenize_fn(example):
    # Tokenizamos, truncamos a 512 tokens como máximo y rellenamos
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=512
    )

dataset = Dataset.from_list(fine_tune_convs)

tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]  # ya no necesitamos el campo "text" crudo
)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [19]:
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune",
    per_device_train_batch_size=4,        # batch por GPU
    gradient_accumulation_steps=8,        # acumular gradientes para simular un batch mayor
    num_train_epochs=2,
    learning_rate=2e-4,
    fp16=True,                            # entrenar en FP16
    optim="paged_adamw_8bit",             # usar optimizador 8-bit de bitsandbytes
    logging_steps=20,
    save_steps=500,
    save_total_limit=3,
    report_to="none",                     # desactivar logs externos
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [21]:
trainer.train()

model.save_pretrained("/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-2")
tokenizer.save_pretrained("/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-2")

Step,Training Loss
20,1.1534
40,1.1035
60,1.0649
80,1.0649
100,1.0554
120,1.0461
140,1.0183
160,1.0324
180,1.0199
200,1.025


('/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-2/tokenizer_config.json',
 '/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-2/special_tokens_map.json',
 '/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-2/chat_template.jinja',
 '/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-2/tokenizer.model',
 '/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-2/added_tokens.json',
 '/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune-2/tokenizer.json')

In [22]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# Paso 1: Cargar la configuración PEFT
peft_model_path = "/content/gdrive/MyDrive/Proyecto LLMonkeys/TinyLlama-fine-tune"
config = PeftConfig.from_pretrained(peft_model_path)

# Paso 2: Cargar el modelo base
base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)

# Paso 3: Aplicar los pesos LoRA al modelo base
ft_model = PeftModel.from_pretrained(base_model, peft_model_path)

# Paso 4: Cargar el tokenizer
ft_tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

In [23]:
from transformers import pipeline

outputs_ft_s = []
for n in range(num_test_items):
  k = 10
  outputs = []
  # Pipeline Initialization
  pipe = pipeline(
      "text-generation",
      model=model_name,
      device_map="auto",
      torch_dtype=torch.float16,
  )

  ctx = (
      "YOU ARE A MOVIE RECOMMENDATION SYSTEM."
      "GENERATE A NUMBERED LIST OF 10 MOVIES. \n"
      "RULES: \n"
      "1. DO NOT write dialogs, explanations nor additional text or information. \n"
      "2. DO NOT repeat movies mentioned in the conversation. \n"
      "3. Response format: \n"
      "1. \"Movie name 1\" \n"
      "2. \"Movie name 2\" \n"
      "... \n"
      "10. \"Movie name 10\" \n"
      "Use ONLY this format and NOTHING else."
  )

  msg = (
      "Based on the following conversation: \n"
      f"{rand_conversations[n][1]} \n\n"
      "And the movies the user has previously interaced with: \n"
      f"{random.sample(rand_conversations[n][5], min(10, len(rand_conversations[n][5])))}\n\n"
      "Generate a list of 10 recommended movies (JUST NAMES, ONE PER LINE):"
  )

  messages = [{"role": "user", "content": ctx + msg}]

  prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
  inputs = pipe.tokenizer(prompt, return_tensors="pt").to("cuda")

  with torch.no_grad():
      for i in range(k):
          output_ids = pipe.model.generate(
              **inputs,
              max_new_tokens=200,
              do_sample=True,
              temperature=0.7,
              top_k=50,
              top_p=0.95
          )
          decoded = pipe.tokenizer.decode(output_ids[0], skip_special_tokens=True)
          outputs.append(decoded)
  outputs_ft_s.append(outputs)

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


## Evaluación de los modelos:

In [24]:
%pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m156.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [25]:
import numpy as np
import re
from sklearn.metrics import ndcg_score
import re
import html
from rapidfuzz import fuzz

def normalizar_titulo(titulo):
    # Decode entidades HTML como &amp;
    titulo = html.unescape(titulo)
    # Minúsculas
    titulo = titulo.lower()
    # Eliminar puntuación excepto letras, números y &
    titulo = re.sub(r"[^a-z0-9& ]+", "", titulo)
    # Eliminar múltiples espacios
    titulo = re.sub(r"\s+", " ", titulo).strip()
    return titulo

def comparar_titulos(t1, t2):
    t1 = normalizar_titulo(t1)
    t2 = normalizar_titulo(t2)
    return fuzz.token_set_ratio(t1, t2)

# Funciones generadas por DeepSeek
def recall_at_k(generated_recommendations, ground_truth, k=10):
    hits = 0
    # Tomar las primeras K recomendaciones generadas
    top_k = generated_recommendations[:k]
    for e in top_k:
      if comparar_titulos(e, ground_truth[0]) > 80:
        hits = 1

    # Evitar división por cero
    return hits

def ndcg_at_k(generated_recommendations, ground_truth, k=10):
    # Crear una lista binaria de relevancia (1 si está en ground truth, 0 si no)
    relevance = [1 if item in ground_truth else 0 for item in generated_recommendations[:k]]

    # Crear el "ideal ranking" (todas las relevantes primero)
    ideal_relevance = sorted(relevance, reverse=True)

    # Calcular NDCG
    return ndcg_score([relevance], [ideal_relevance])


### Recall@5 y NDCG@5:

### Random:

In [26]:
recall_r_1_5 = 0
ndcg_r_1_5 = 0
best_recall_r_5 = 0
best_ndcg_r_5 = 0

for i in range(num_test_items):
  # Random sin sampling
  recall_r_1_5 += recall_at_k(output_r[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)
  ndcg_r_1_5 += ndcg_at_k(output_r[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)

  # Random con sampling
  best_r = 0.0
  best_n = 0.0
  for l in output_r[i]:
    recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
    ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg
  best_recall_r_5 += best_r
  best_ndcg_r_5 += best_n

print(recall_r_1_5/num_test_items, ndcg_r_1_5/num_test_items)
print(best_recall_r_5/num_test_items, best_ndcg_r_5/num_test_items)

0.0 0.0
0.0 0.0


### Most Popular:

In [27]:
recall_mp_1_5 = 0
ndcg_mp_1_5 = 0
best_recall_mp_5 = 0
best_ndcg_mp_5 = 0

for i in range(num_test_items):
  # Most Popular sin sampling
  recall_mp_1_5 += recall_at_k(outputs_mp[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)
  ndcg_mp_1_5 += ndcg_at_k(outputs_mp[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)

  # Most Popular con sampling
  best_r = 0.0
  best_n = 0.0
  for l in outputs_mp[i]:
    recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
    ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg
  best_recall_mp_5 += best_r
  best_ndcg_mp_5 += best_n

print(recall_mp_1_5/num_test_items, ndcg_mp_1_5/num_test_items)
print(best_recall_mp_5/num_test_items, best_ndcg_mp_5/num_test_items)

0.0 0.0
0.0 0.0


### TinyLlama:

In [58]:
z_s_rec_lists = []
i=1
for outputs in outputs_z_s:
  rec_lists = []
  for out in outputs:
      ans = format_ans(out[out.index("<|assistant|>"):],10)
      rec_lists.append(format_ans(out[out.index("<|assistant|>"):],10))

  # print(rec_lists)
  z_s_rec_lists.append(rec_lists)

In [61]:
recall_zs_1_5 = 0
ndcg_zs_1_5 = 0
best_recall_zs_5 = 0.0
best_ndcg_zs_5 = 0.0

for i in range(num_test_items):
  # Zero-shot sin sampling
  # print(i)
  recall_zs_1_5 += recall_at_k(z_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)
  if len(z_s_rec_lists[i][0]) > 0:
    ndcg_zs_1_5 += ndcg_at_k(z_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)

  # Zero-shot con sampling
  best_r = 0.0
  best_n = 0.0

  for l in z_s_rec_lists[i]:
    if len(l) > 0:
      recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
      ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_zs_5 += best_r
  best_ndcg_zs_5 += best_n
print(recall_zs_1_5/num_test_items, ndcg_zs_1_5/num_test_items)
print(best_recall_zs_5/num_test_items, best_ndcg_zs_5/num_test_items)

0.15 0.024355738985992397
0.3 0.050382510846739906


In [69]:
f_s_rec_lists = []
i=1
for outputs in outputs_f_s:
  rec_lists = []
  for out in outputs:
      ans = format_ans(out[out.index("<|assistant|>"):],10)
      rec_lists.append(format_ans(out[out.index("<|assistant|>"):],10))

  # print(rec_lists)
  f_s_rec_lists.append(rec_lists)

In [82]:
for e in f_s_rec_lists:
  print(len(e))

10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10


In [71]:
recall_fs_1_5 = 0
ndcg_fs_1_5 = 0
best_recall_fs_5 = 0.0
best_ndcg_fs_5 = 0.0

for i in range(num_test_items):
  # Few-shot sin sampling
  # print(f"\n{f_s_rec_lists[i][0]}")
  # print([item_map[m] for m in rand_conversations[i][4]])
  recall_fs_1_5 += recall_at_k(f_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)
  if len(f_s_rec_lists[i][0]) > 0:
    ndcg_fs_1_5 += ndcg_at_k(f_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)

  # Few-shot con sampling
  best_r = 0.0
  best_n = 0.0

  for l in f_s_rec_lists[i]:
    if len(l) > 0:
      recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
      ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_fs_5 += best_r
  best_ndcg_fs_5 += best_n
print(recall_fs_1_5/num_test_items, ndcg_fs_1_5/num_test_items)
print(best_recall_fs_5/num_test_items, best_ndcg_fs_5/num_test_items)

0.0 0.0
0.0 0.0


In [72]:
ft_s_rec_lists = []
i=1
for outputs in outputs_ft_s:
  rec_lists = []
  for out in outputs:
      ans = format_ans(out[out.index("<|assistant|>"):],10)
      if len(ans) >= 10:
        rec_lists.append(format_ans(out[out.index("<|assistant|>"):],10))
        # print(rec_lists)

  # print(rec_lists)
  ft_s_rec_lists.append(rec_lists)

In [73]:
recall_fts_1_5 = 0
ndcg_fts_1_5 = 0
best_recall_fts_5 = 0.0
best_ndcg_fts_5 = 0.0

for i in range(num_test_items):
  # Few-shot sin sampling
  # print(f"\n{ft_s_rec_lists[i][0]}")
  # print([item_map[m] for m in rand_conversations[i][4]])
  recall_fts_1_5 += recall_at_k(ft_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)
  ndcg_fts_1_5 += ndcg_at_k(ft_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=5)

  # Few-shot con sampling
  best_r = 0.0
  best_n = 0.0

  for l in ft_s_rec_lists[i]:
    # print(l)
    recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
    ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=5)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_fts_5 += best_r
  best_ndcg_fts_5 += best_n
print(recall_fts_1_5/num_test_items, ndcg_fts_1_5/num_test_items)
print(best_recall_fts_5/num_test_items, best_ndcg_fts_5/num_test_items)

0.2 0.0
0.3 0.03846399052874361


### Recall@10 y NDCG@10:

### Random:

In [74]:
recall_r_1_10 = 0
ndcg_r_1_10 = 0
best_recall_r_10 = 0
best_ndcg_r_10 = 0

for i in range(num_test_items):
  # Random sin sampling
  recall_r_1_10 += recall_at_k(output_r[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)
  ndcg_r_1_10 += ndcg_at_k(output_r[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)

  # Random con sampling
  best_r = 0.0
  best_n = 0.0
  for l in output_r[i]:
    recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
    ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg
  best_recall_r_10 += best_r
  best_ndcg_r_10 += best_n

print(recall_r_1_10/num_test_items, ndcg_r_1_10/num_test_items)
print(best_recall_r_10/num_test_items, best_ndcg_r_10/num_test_items)

0.0 0.0
0.05 0.0


### Most Popular

In [75]:
recall_mp_1_10 = 0
ndcg_mp_1_10 = 0
best_recall_mp_10 = 0
best_ndcg_mp_10 = 0

for i in range(num_test_items):
  # Most Popular sin sampling
  recall_mp_1_10 += recall_at_k(outputs_mp[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)
  ndcg_mp_1_10 += ndcg_at_k(outputs_mp[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)

  # Most Popular con sampling
  best_r = 0.0
  best_n = 0.0
  for l in outputs_mp[i]:
    recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
    ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg
  best_recall_mp_10 += best_r
  best_ndcg_mp_10 += best_n

print(recall_mp_1_10/num_test_items, ndcg_mp_1_10/num_test_items)
print(best_recall_mp_10/num_test_items, best_ndcg_mp_10/num_test_items)

0.0 0.0
0.0 0.0


### TinyLlama

Zero-Shot:

In [77]:
recall_zs_1_10 = 0
ndcg_zs_1_10 = 0
best_recall_zs_10 = 0.0
best_ndcg_zs_10 = 0.0

for i in range(num_test_items):
  # Zero-shot sin sampling
  if len(z_s_rec_lists[i][0]) > 0:
    recall_zs_1_10 += recall_at_k(z_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)
    ndcg_zs_1_10 += ndcg_at_k(z_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)

  # Zero-shot con sampling
  best_r = 0.0
  best_n = 0.0

  for l in z_s_rec_lists[i]:
    if len(l) > 0:
      recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
      ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_zs_10 += best_r
  best_ndcg_zs_10 += best_n
print(recall_zs_1_10/num_test_items, ndcg_zs_1_10/num_test_items)
print(best_recall_zs_10/num_test_items, best_ndcg_zs_10/num_test_items)

0.15 0.01968644076715747
0.3 0.050382510846739906


Few-Shot

In [78]:
recall_fs_1_10 = 0
ndcg_fs_1_10 = 0
best_recall_fs_10 = 0.0
best_ndcg_fs_10 = 0.0

for i in range(num_test_items):
  # Few-shot sin sampling
  # print(f"\n{f_s_rec_lists[i][0]}")
  # print([item_map[m] for m in rand_conversations[i][4]])
  if len(f_s_rec_lists[i][0]) > 0:
    recall_fs_1_10 += recall_at_k(f_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)
    ndcg_fs_1_10 += ndcg_at_k(f_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)

  # Few-shot con sampling
  best_r = 0.0
  best_n = 0.0

  for l in f_s_rec_lists[i]:
    if len(l) > 0:
      recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
      ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_fs_10 += best_r
  best_ndcg_fs_10 += best_n
print(recall_fs_1_10/num_test_items, ndcg_fs_1_10/num_test_items)
print(best_recall_fs_10/num_test_items, best_ndcg_fs_10/num_test_items)

0.0 0.0
0.05 0.0


In [79]:
recall_fts_1_10 = 0
ndcg_fts_1_10 = 0
best_recall_fts_10 = 0.0
best_ndcg_fts_10 = 0.0

for i in range(num_test_items):
  # Few-shot sin sampling
  # print(f"\n{ft_s_rec_lists[i][0]}")
  # print([item_map[m] for m in rand_conversations[i][4]])
  recall_fts_1_10 += recall_at_k(ft_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)
  ndcg_fts_1_10 += ndcg_at_k(ft_s_rec_lists[i][0], [item_map[m] for m in rand_conversations[i][4]], k=10)

  # Few-shot con sampling
  best_r = 0.0
  best_n = 0.0

  for l in ft_s_rec_lists[i]:
    recall = recall_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
    ndcg = ndcg_at_k(l, [item_map[m] for m in rand_conversations[i][4]], k=10)
    if recall > best_r:
      best_r = recall
    if ndcg > best_n:
      best_n = ndcg

  best_recall_fts_10 += best_r
  best_ndcg_fts_10 += best_n
print(recall_fts_1_10/num_test_items, ndcg_fts_1_10/num_test_items)
print(best_recall_fts_10/num_test_items, best_ndcg_fts_10/num_test_items)

0.2 0.0
0.3 0.036161691583200956


In [81]:
print("Random:")
print(f"Sin sampling: Recall@5: {recall_r_1_5/num_test_items}, NDCG@5: {ndcg_r_1_5/num_test_items}, Recall@10: {recall_r_1_10/num_test_items}, NDCG@10: {ndcg_r_1_10/num_test_items}")
print(f"Con sampling: Recall@5: {best_recall_r_5/num_test_items}, NDCG@5: {best_ndcg_r_5/num_test_items}, Recall@10: {best_recall_r_10/num_test_items}, NDCG@10: {best_ndcg_r_10/num_test_items}")

print("\nMost Popular:")
print(f"Sin sampling: Recall@5: {recall_mp_1_5/num_test_items}, NDCG@5: {ndcg_mp_1_5/num_test_items}, Recall@10: {recall_mp_1_10/num_test_items}, NDCG@10: {ndcg_mp_1_10/num_test_items}")
print(f"Con sampling: Recall@5: {best_recall_mp_5/num_test_items}, NDCG@5: {best_ndcg_mp_5/num_test_items}, Recall@10: {best_recall_mp_10/num_test_items}, NDCG@10: {best_ndcg_mp_10/num_test_items}")

# print("\nZero-Shot sin interacciones históricas:")
# print(f"Sin sampling: Recall@5: {recall_zn_1_5/num_test_items}, NDCG@5: {ndcg_zn_1_5/num_test_items}, Recall@10: {recall_zn_1_10/num_test_items}, NDCG@10: {ndcg_zn_1_10/num_test_items}")
# print(f"Con sampling: Recall@5: {best_recall_zn_5/num_test_items}, NDCG@5: {best_ndcg_zn_5/num_test_items}, Recall@10: {best_recall_zn_10/num_test_items}, NDCG@10: {best_ndcg_zn_10/num_test_items}")

print("\nZero-Shot con interacciones históricas:")
print(f"Sin sampling: Recall@5: {recall_zs_1_5/num_test_items}, NDCG@5: {ndcg_zs_1_5/num_test_items}, Recall@10: {recall_zs_1_10/num_test_items}, NDCG@10: {ndcg_zs_1_10/num_test_items}")
print(f"Con sampling: Recall@5: {best_recall_zs_5/num_test_items}, NDCG@5: {best_ndcg_zs_5/num_test_items}, Recall@10: {best_recall_zs_10/num_test_items}, NDCG@10: {best_ndcg_zs_10/num_test_items}")

# print("\nFew-Shot sin interacciones históricas:")
# print(f"Sin sampling: Recall@5: {recall_fn_1_5/num_test_items}, NDCG@5: {ndcg_fn_1_5/num_test_items}, Recall@10: {recall_fn_1_10/num_test_items}, NDCG@10: {ndcg_fn_1_10/num_test_items}")
# print(f"Con sampling: Recall@5: {best_recall_fn_5/num_test_items}, NDCG@5: {best_ndcg_fn_5/num_test_items}, Recall@10: {best_recall_fn_10/num_test_items}, NDCG@10: {best_ndcg_fn_10/num_test_items}")

print("\nFew-Shot con interacciones históricas:")
print(f"Sin sampling: Recall@5: {recall_fs_1_5/num_test_items}, NDCG@5: {ndcg_fs_1_5/num_test_items}, Recall@10: {recall_fs_1_10/num_test_items}, NDCG@10: {ndcg_fs_1_10/num_test_items}")
print(f"Con sampling: Recall@5: {best_recall_fs_5/num_test_items}, NDCG@5: {best_ndcg_fs_5/num_test_items}, Recall@10: {best_recall_fs_10/num_test_items}, NDCG@10: {best_ndcg_fs_10/num_test_items}")

print("\nFine-tuned con interacciones históricas:")
print(f"Sin sampling: Recall@5: {recall_fts_1_5/num_test_items}, NDCG@5: {ndcg_fts_1_5/num_test_items}, Recall@10: {recall_fts_1_10/num_test_items}, NDCG@10: {ndcg_fts_1_10/num_test_items}")
print(f"Con sampling: Recall@5: {best_recall_fts_5/num_test_items}, NDCG@5: {best_ndcg_fts_5/num_test_items}, Recall@10: {best_recall_fts_10/num_test_items}, NDCG@10: {best_ndcg_fts_10/num_test_items}")


Random:
Sin sampling: Recall@5: 0.0, NDCG@5: 0.0, Recall@10: 0.0, NDCG@10: 0.0
Con sampling: Recall@5: 0.0, NDCG@5: 0.0, Recall@10: 0.05, NDCG@10: 0.0

Most Popular:
Sin sampling: Recall@5: 0.0, NDCG@5: 0.0, Recall@10: 0.0, NDCG@10: 0.0
Con sampling: Recall@5: 0.0, NDCG@5: 0.0, Recall@10: 0.0, NDCG@10: 0.0

Zero-Shot con interacciones históricas:
Sin sampling: Recall@5: 0.15, NDCG@5: 0.024355738985992397, Recall@10: 0.15, NDCG@10: 0.01968644076715747
Con sampling: Recall@5: 0.3, NDCG@5: 0.050382510846739906, Recall@10: 0.3, NDCG@10: 0.050382510846739906

Few-Shot con interacciones históricas:
Sin sampling: Recall@5: 0.0, NDCG@5: 0.0, Recall@10: 0.0, NDCG@10: 0.0
Con sampling: Recall@5: 0.0, NDCG@5: 0.0, Recall@10: 0.05, NDCG@10: 0.0

Fine-tuned con interacciones históricas:
Sin sampling: Recall@5: 0.2, NDCG@5: 0.0, Recall@10: 0.2, NDCG@10: 0.0
Con sampling: Recall@5: 0.3, NDCG@5: 0.03846399052874361, Recall@10: 0.3, NDCG@10: 0.036161691583200956


In [36]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn.functional as F
import random
import numpy as np

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [44]:
import torch
import numpy as np
from collections import Counter
import re
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
from scipy.stats import entropy
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

def extract_answer(text, question_text):
    """Extrae la respuesta del texto generado"""
    # Buscar después de "Answer:" o similar
    patterns = [r"Answer:\s*(.+?)(?:\n|$)", r"answer:\s*(.+?)(?:\n|$)", r"Answer is\s*(.+?)(?:\n|$)"]

    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).strip()

    # Si no encuentra patrón, tomar lo que viene después del prompt
    try:
        # Dividir por el texto de la pregunta y tomar la parte después
        parts = text.split("Answer:")
        if len(parts) > 1:
            return parts[-1].strip().split('\n')[0].strip()
    except:
        pass

    return text.strip()

def calculate_uncertainty_metrics_lightweight(outputs_per_paraphrase, paraphrases):
    """
    Calcula métricas de incertidumbre usando Input Clarification Ensembling

    Args:
        outputs_per_paraphrase: Lista de listas, cada sublista contiene outputs para una paráfrasis
        paraphrases: Lista de paráfrasis usadas

    Returns:
        dict con métricas de incertidumbre
    """

    # 1. Extraer respuestas limpias
    all_answers = []
    answers_by_paraphrase = []

    for i, outputs in enumerate(outputs_per_paraphrase):
        paraphrase_answers = []
        for output in outputs:
            answer = extract_answer(output, paraphrases[i])
            paraphrase_answers.append(answer)
            all_answers.append(answer)
        answers_by_paraphrase.append(paraphrase_answers)

    # 2. Calcular frecuencias de respuestas
    answer_counts = Counter(all_answers)
    total_responses = len(all_answers)

    # 3. INCERTIDUMBRE TOTAL (Shannon Entropy de todas las respuestas)
    probs = np.array(list(answer_counts.values())) / total_responses
    total_uncertainty = entropy(probs, base=2)  # bits

    # 4. INCERTIDUMBRE ALEATORIA (promedio de entropías por paráfrasis)
    aleatoric_uncertainties = []
    for answers in answers_by_paraphrase:
        local_counts = Counter(answers)
        local_probs = np.array(list(local_counts.values())) / len(answers)
        if len(local_probs) > 1:
            aleatoric_uncertainties.append(entropy(local_probs, base=2))
        else:
            aleatoric_uncertainties.append(0.0)

    aleatoric_uncertainty = np.mean(aleatoric_uncertainties)

    # 5. INCERTIDUMBRE EPISTÉMICA (diferencia)
    epistemic_uncertainty = total_uncertainty - aleatoric_uncertainty

    # 6. Métricas adicionales
    unique_answers = len(set(all_answers))
    most_common_answer, most_common_count = answer_counts.most_common(1)[0]
    confidence = most_common_count / total_responses

    # 7. Consistencia entre paráfrasis
    consistency_scores = []
    for i in range(len(paraphrases)):
        for j in range(i+1, len(paraphrases)):
            # Comparar respuestas más frecuentes de cada paráfrasis
            answers_i = Counter(answers_by_paraphrase[i])
            answers_j = Counter(answers_by_paraphrase[j])

            most_common_i = answers_i.most_common(1)[0][0] if answers_i else ""
            most_common_j = answers_j.most_common(1)[0][0] if answers_j else ""

            # Similaridad simple (exacta o parcial)
            if most_common_i.lower().strip() == most_common_j.lower().strip():
                consistency_scores.append(1.0)
            else:
                # Similaridad parcial usando tokens comunes
                tokens_i = set(most_common_i.lower().split())
                tokens_j = set(most_common_j.lower().split())
                if tokens_i and tokens_j:
                    jaccard = len(tokens_i.intersection(tokens_j)) / len(tokens_i.union(tokens_j))
                    consistency_scores.append(jaccard)
                else:
                    consistency_scores.append(0.0)

    consistency = np.mean(consistency_scores) if consistency_scores else 0.0

    return {
        'total_uncertainty': float(total_uncertainty),
        'aleatoric_uncertainty': float(aleatoric_uncertainty),
        'epistemic_uncertainty': float(epistemic_uncertainty),
        'confidence': float(confidence),
        'unique_answers': int(unique_answers),
        'most_common_answer': str(most_common_answer),
        'consistency_across_paraphrases': float(consistency),
        'answer_distribution': {str(k): int(v) for k, v in answer_counts.items()},
        'num_paraphrases': len(paraphrases),
        'samples_per_paraphrase': len(outputs_per_paraphrase[0]) if outputs_per_paraphrase else 0
    }

def calculate_logit_uncertainty_lightweight(model, tokenizer, paraphrases, device="cuda", cleanup=True):
    """
    Versión optimizada que limpia memoria agresivamente
    """
    print("Calculando incertidumbre por logits (versión ligera)...")

    uncertainties_per_paraphrase = []
    logits_stats = []

    for i, paraphrase in enumerate(paraphrases):
        print(f"Procesando logits para paráfrasis {i+1}/{len(paraphrases)}")

        ctx = "You are an oracle who only responds with short and concise answers."
        msg = f"Answer the following question: {paraphrase}\nAnswer:"
        messages = [{"role": "user", "content": ctx + msg}]

        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            next_token_logits = outputs.logits[0, -1, :].cpu()  # Mover a CPU inmediatamente

            # Calcular probabilidades y entropía
            probs = torch.softmax(next_token_logits, dim=-1)
            uncertainty = entropy(probs.numpy(), base=2)
            uncertainties_per_paraphrase.append(float(uncertainty))

            # Guardar solo estadísticas básicas, no los logits completos
            logits_stats.append({
                'mean': float(next_token_logits.mean()),
                'std': float(next_token_logits.std()),
                'max': float(next_token_logits.max()),
                'min': float(next_token_logits.min())
            })

            # Limpiar memoria inmediatamente
            del outputs, next_token_logits, probs, inputs
            if cleanup:
                torch.cuda.empty_cache()
                gc.collect()

    # Calcular métricas finales sin guardar arrays grandes
    mean_uncertainty = np.mean(uncertainties_per_paraphrase)
    std_uncertainty = np.std(uncertainties_per_paraphrase)

    return {
        'logit_uncertainties_per_paraphrase': uncertainties_per_paraphrase,
        'mean_logit_uncertainty': float(mean_uncertainty),
        'std_logit_uncertainty': float(std_uncertainty),
        'logits_stats_summary': {
            'mean_of_means': float(np.mean([s['mean'] for s in logits_stats])),
            'mean_of_stds': float(np.mean([s['std'] for s in logits_stats])),
        }
    }


In [40]:

# Configuración
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # o el modelo que uses
device = "cuda" if torch.cuda.is_available() else "cpu"

# Cargar modelo y tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Paráfrasis de la pregunta
paraphrases = [
    "What year did the Berlin Wall fall?",
    "When did the Berlin Wall come down?",
    "In which year was the Berlin Wall demolished?",
    "What year did the fall of the Berlin Wall occur?",
    "When was the Berlin Wall brought down?",
    "In what year did the Berlin Wall collapse?",
    "What year did they tear down the Berlin Wall?",
    "When did the destruction of the Berlin Wall happen?",
    "In which year did the Berlin Wall get demolished?",
    "What year marked the fall of the Berlin Wall?"
]

# Generar respuestas con sampling
outputs_per_paraphrase = []
k = 5  # Número de samples por paráfrasis

print("Generando respuestas...")
for i, paraphrase in enumerate(paraphrases):
    print(f"Procesando paráfrasis {i+1}/{len(paraphrases)}: {paraphrase}")

    outputs = []
    ctx = "You are an oracle who only responds with short and concise answers."
    msg = f"Answer the following question: {paraphrase}\nAnswer:"
    messages = [{"role": "user", "content": ctx + msg}]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    for j in range(k):
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                pad_token_id=tokenizer.eos_token_id
            )

            # Decodificar solo la parte nueva (sin el prompt)
            new_tokens = output_ids[0][inputs['input_ids'].shape[1]:]
            decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)
            outputs.append(decoded)

    outputs_per_paraphrase.append(outputs)



Generando respuestas...
Procesando paráfrasis 1/10: What year did the Berlin Wall fall?
Procesando paráfrasis 2/10: When did the Berlin Wall come down?
Procesando paráfrasis 3/10: In which year was the Berlin Wall demolished?
Procesando paráfrasis 4/10: What year did the fall of the Berlin Wall occur?
Procesando paráfrasis 5/10: When was the Berlin Wall brought down?
Procesando paráfrasis 6/10: In what year did the Berlin Wall collapse?
Procesando paráfrasis 7/10: What year did they tear down the Berlin Wall?
Procesando paráfrasis 8/10: When did the destruction of the Berlin Wall happen?
Procesando paráfrasis 9/10: In which year did the Berlin Wall get demolished?
Procesando paráfrasis 10/10: What year marked the fall of the Berlin Wall?


In [41]:
include_logits = True
cleanup_model = True

In [42]:
print("="*50)
print("CALCULANDO INCERTIDUMBRE (VERSIÓN OPTIMIZADA)...")
print("="*50)

# 1. Calcular métricas básicas de incertidumbre
uncertainty_metrics = calculate_uncertainty_metrics_lightweight(outputs_per_paraphrase, paraphrases)

# 2. Calcular métricas de logits si se solicita
logit_metrics = None
if include_logits and model is not None and tokenizer is not None:
    logit_metrics = calculate_logit_uncertainty_lightweight(
        model, tokenizer, paraphrases, device, cleanup=True
    )

# 3. LIMPIAR MODELO DE MEMORIA SI SE SOLICITA
if cleanup_model and model is not None:
    print("Limpiando modelo de memoria...")
    del model
    if tokenizer is not None:
        del tokenizer
    torch.cuda.empty_cache()
    gc.collect()
    print("✓ Modelo removido de memoria")

# 4. Mostrar resultados
print(f"\n📊 MÉTRICAS DE INCERTIDUMBRE:")
print(f"├── Incertidumbre Total: {uncertainty_metrics['total_uncertainty']:.3f} bits")
print(f"├── Incertidumbre Aleatoria: {uncertainty_metrics['aleatoric_uncertainty']:.3f} bits")
print(f"├── Incertidumbre Epistémica: {uncertainty_metrics['epistemic_uncertainty']:.3f} bits")
print(f"├── Confianza: {uncertainty_metrics['confidence']:.3f}")
print(f"├── Respuestas únicas: {uncertainty_metrics['unique_answers']}")
print(f"├── Consistencia entre paráfrasis: {uncertainty_metrics['consistency_across_paraphrases']:.3f}")
print(f"└── Respuesta más común: '{uncertainty_metrics['most_common_answer']}'")

if logit_metrics:
    print(f"\n🔢 MÉTRICAS DE LOGITS:")
    print(f"├── Incertidumbre promedio: {logit_metrics['mean_logit_uncertainty']:.3f} bits")
    print(f"├── Desviación estándar: {logit_metrics['std_logit_uncertainty']:.3f} bits")
    print(f"└── Media de logits: {logit_metrics['logits_stats_summary']['mean_of_means']:.3f}")

# 5. Interpretación
print(f"\n🧠 INTERPRETACIÓN:")
if uncertainty_metrics['epistemic_uncertainty'] > uncertainty_metrics['aleatoric_uncertainty']:
    print("├── El modelo tiene más incertidumbre sobre QUÉ responder")
    print("└── → Sugiere falta de conocimiento específico")
else:
    print("├── El modelo tiene más incertidumbre sobre CÓMO responder")
    print("└── → Sugiere ambigüedad inherente en la pregunta")

if uncertainty_metrics['consistency_across_paraphrases'] > 0.8:
    print("├── Alta consistencia entre paráfrasis")
elif uncertainty_metrics['consistency_across_paraphrases'] > 0.5:
    print("├── Consistencia moderada entre paráfrasis")
else:
    print("├── Baja consistencia - posible confusión del modelo")

# DEVOLVER SOLO RESULTADOS LIGEROS
results = {
    'uncertainty_metrics': uncertainty_metrics,
    'logit_metrics': logit_metrics,
    'analysis_params': {
        'num_paraphrases': len(paraphrases),
        'samples_per_paraphrase': len(outputs_per_paraphrase[0]) if outputs_per_paraphrase else 0,
        'included_logits': include_logits
    }
}

print(results)

CALCULANDO INCERTIDUMBRE (VERSIÓN OPTIMIZADA)...
Respuestas extraídas:
Paráfrasis 1: ["The Berlin Wall fell in 1989, on November 9, 1989, when East Germany's government announced the end of the wall separating East and West Berlin.", 'The Berlin Wall fell on November 9, 1989, marking the end of the Cold War and the division of East and West Germany.', 'The Berlin Wall fell on November 9, 1989, marking the end of the Cold War and the collapse of the Soviet Union.', 'The Berlin Wall fell on November 9, 1989, which was 24 years ago.', 'The Berlin Wall fell on November 9, 1989, at 03:15 a.m. CET (Central European Time) in the early morning hours of November 9, 1989.']
Paráfrasis 2: ['The Berlin Wall came down on November 9, 1989, marking the end of the Cold War and the collapse of the Soviet Union.', "The Berlin Wall came down on November 9, 1989, on the evening of that day. The Soviet Union's fall from power and the subsequent peaceful reunification of Germany marked a historic turning po

In [54]:

# Configuración
device = "cuda" if torch.cuda.is_available() else "cpu"
ft_model = ft_model.to(device)
if ft_tokenizer.pad_token is None:
    ft_tokenizer.pad_token = ft_tokenizer.eos_token

# Paráfrasis de la pregunta
paraphrases = [
    "What year did the Berlin Wall fall?",
    "When did the Berlin Wall come down?",
    "In which year was the Berlin Wall demolished?",
    "What year did the fall of the Berlin Wall occur?",
    "When was the Berlin Wall brought down?",
    "In what year did the Berlin Wall collapse?",
    "What year did they tear down the Berlin Wall?",
    "When did the destruction of the Berlin Wall happen?",
    "In which year did the Berlin Wall get demolished?",
    "What year marked the fall of the Berlin Wall?"
]

# Generar respuestas con sampling
outputs_per_paraphrase = []
k = 5  # Número de samples por paráfrasis

print("Generando respuestas...")
for i, paraphrase in enumerate(paraphrases):
    print(f"Procesando paráfrasis {i+1}/{len(paraphrases)}: {paraphrase}")

    outputs = []
    ctx = "You are an oracle who only responds with short and concise answers."
    msg = f"Answer the following question: {paraphrase}\nAnswer:"
    messages = [{"role": "user", "content": ctx + msg}]

    prompt = ft_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = ft_tokenizer(prompt, return_tensors="pt").to(device)

    for j in range(k):
        with torch.no_grad():
            output_ids = ft_model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                pad_token_id=ft_tokenizer.eos_token_id
            )

            # Decodificar solo la parte nueva (sin el prompt)
            new_tokens = output_ids[0][inputs['input_ids'].shape[1]:]
            decoded = ft_tokenizer.decode(new_tokens, skip_special_tokens=True)
            outputs.append(decoded)

    outputs_per_paraphrase.append(outputs)



Generando respuestas...
Procesando paráfrasis 1/10: What year did the Berlin Wall fall?
Procesando paráfrasis 2/10: When did the Berlin Wall come down?
Procesando paráfrasis 3/10: In which year was the Berlin Wall demolished?
Procesando paráfrasis 4/10: What year did the fall of the Berlin Wall occur?
Procesando paráfrasis 5/10: When was the Berlin Wall brought down?
Procesando paráfrasis 6/10: In what year did the Berlin Wall collapse?
Procesando paráfrasis 7/10: What year did they tear down the Berlin Wall?
Procesando paráfrasis 8/10: When did the destruction of the Berlin Wall happen?
Procesando paráfrasis 9/10: In which year did the Berlin Wall get demolished?
Procesando paráfrasis 10/10: What year marked the fall of the Berlin Wall?


In [55]:
print("="*50)
print("CALCULANDO INCERTIDUMBRE (VERSIÓN OPTIMIZADA)...")
print("="*50)

# 1. Calcular métricas básicas de incertidumbre
uncertainty_metrics = calculate_uncertainty_metrics_lightweight(outputs_per_paraphrase, paraphrases)

# 2. Calcular métricas de logits si se solicita
logit_metrics = None
if include_logits and ft_model is not None and tokenizer is not None:
    logit_metrics = calculate_logit_uncertainty_lightweight(
        ft_model, tokenizer, paraphrases, device, cleanup=True
    )

# 3. LIMPIAR MO.DELO DE MEMORIA SI SE SOLICITA
if cleanup_model and ft_model is not None:
    print("Limpiando modelo de memoria...")
    del ft_model
    if tokenizer is not None:
        del tokenizer
    torch.cuda.empty_cache()
    gc.collect()
    print("✓ Modelo removido de memoria")

# 4. Mostrar resultados
print(f"\n📊 MÉTRICAS DE INCERTIDUMBRE:")
print(f"├── Incertidumbre Total: {uncertainty_metrics['total_uncertainty']:.3f} bits")
print(f"├── Incertidumbre Aleatoria: {uncertainty_metrics['aleatoric_uncertainty']:.3f} bits")
print(f"├── Incertidumbre Epistémica: {uncertainty_metrics['epistemic_uncertainty']:.3f} bits")
print(f"├── Confianza: {uncertainty_metrics['confidence']:.3f}")
print(f"├── Respuestas únicas: {uncertainty_metrics['unique_answers']}")
print(f"├── Consistencia entre paráfrasis: {uncertainty_metrics['consistency_across_paraphrases']:.3f}")
print(f"└── Respuesta más común: '{uncertainty_metrics['most_common_answer']}'")

if logit_metrics:
    print(f"\n🔢 MÉTRICAS DE LOGITS:")
    print(f"├── Incertidumbre promedio: {logit_metrics['mean_logit_uncertainty']:.3f} bits")
    print(f"├── Desviación estándar: {logit_metrics['std_logit_uncertainty']:.3f} bits")
    print(f"└── Media de logits: {logit_metrics['logits_stats_summary']['mean_of_means']:.3f}")

# 5. Interpretación
print(f"\n🧠 INTERPRETACIÓN:")
if uncertainty_metrics['epistemic_uncertainty'] > uncertainty_metrics['aleatoric_uncertainty']:
    print("├── El modelo tiene más incertidumbre sobre QUÉ responder")
    print("└── → Sugiere falta de conocimiento específico")
else:
    print("├── El modelo tiene más incertidumbre sobre CÓMO responder")
    print("└── → Sugiere ambigüedad inherente en la pregunta")

if uncertainty_metrics['consistency_across_paraphrases'] > 0.8:
    print("├── Alta consistencia entre paráfrasis")
elif uncertainty_metrics['consistency_across_paraphrases'] > 0.5:
    print("├── Consistencia moderada entre paráfrasis")
else:
    print("├── Baja consistencia - posible confusión del modelo")

# DEVOLVER SOLO RESULTADOS LIGEROS
results = {
    'uncertainty_metrics': uncertainty_metrics,
    'logit_metrics': logit_metrics,
    'analysis_params': {
        'num_paraphrases': len(paraphrases),
        'samples_per_paraphrase': len(outputs_per_paraphrase[0]) if outputs_per_paraphrase else 0,
        'included_logits': include_logits
    }
}

print(results)

CALCULANDO INCERTIDUMBRE (VERSIÓN OPTIMIZADA)...
Calculando incertidumbre por logits (versión ligera)...
Procesando logits para paráfrasis 1/10
Procesando logits para paráfrasis 2/10
Procesando logits para paráfrasis 3/10
Procesando logits para paráfrasis 4/10
Procesando logits para paráfrasis 5/10
Procesando logits para paráfrasis 6/10
Procesando logits para paráfrasis 7/10
Procesando logits para paráfrasis 8/10
Procesando logits para paráfrasis 9/10
Procesando logits para paráfrasis 10/10
Limpiando modelo de memoria...
✓ Modelo removido de memoria

📊 MÉTRICAS DE INCERTIDUMBRE:
├── Incertidumbre Total: 5.644 bits
├── Incertidumbre Aleatoria: 2.322 bits
├── Incertidumbre Epistémica: 3.322 bits
├── Confianza: 0.020
├── Respuestas únicas: 50
├── Consistencia entre paráfrasis: 0.179
└── Respuesta más común: 'I don't have access to the latest news. However, I'm glad you asked. The Berlin Wall fell on August 14, 1961, and it became a symbol of the Cold War between the Soviet Union and'

🔢 M

In [65]:
limpiar_y_guardar()

🎯 Limpieza específica para modelo LoRA...
   ├── Eliminando: tokenizer
   ├── Eliminando: pipe
   ├── Eliminando: inputs
   ├── Eliminando: output_ids
   ├── Eliminando: prompt
   ├── Eliminando: messages
   ├── Eliminando: ctx
   ├── Eliminando: msg
   ├── Eliminando: decoded
   ├── Eliminando: outputs

📊 Variables mantenidas:
   ├── item_map: 9687 elementos
   ├── Conversation: 16935069 elementos
   ├── model_name: 34 elementos
   ├── outputs_f_s: 20 elementos
   ├── outputs_ft_s: 20 elementos
   ├── rand_conversations: 20 elementos
   ├── num_test_items: <class 'int'>
   ├── train_conv: 2512 elementos
   ├── test_conv: 619 elementos
   ├── num_test_items: <class 'int'>
   ├── few_shot_data: 5 elementos
