# Propuesta de proyecto: Uso de muestreo repetitivo para mejorar el rendimiento de una LLM en recomendación conversacional

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub identifiers of the two candidate language models.
model_name_1 = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_name_2 = "distilgpt2"

# Only the tokenizers are instantiated in this cell; the corresponding model
# loads are kept commented out:
#   model_1 = AutoModelForCausalLM.from_pretrained(model_name_1)
#   model_2 = AutoModelForCausalLM.from_pretrained(model_name_2)
tokenizer_1 = AutoTokenizer.from_pretrained(model_name_1)
tokenizer_2 = AutoTokenizer.from_pretrained(model_name_2)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import Tools as t

# Root folder of the "Movie" subset; every input file below lives under it.
path = "./LLM_Redial/Movie"

# Derive each file location from the shared root.
final_data_path = f"{path}/final_data.jsonl"
Conversation_path = f"{path}/Conversation.txt"
user_map_path = f"{path}/user_ids.json"
item_map_path = f"{path}/item_map.json"

# Load the four resources with the readers exposed by the `Tools` module.
final_data = t.read_jsonl(final_data_path)
user_map = t.read_json(user_map_path)
item_map = t.read_json(item_map_path)
Conversation = t.read_dialogue(Conversation_path)

In [None]:
import json
import random

# Input files (relative to the notebook's working directory).
final_path = './LLM_Redial/Movie/final_data.jsonl'
movie_map_path = './LLM_Redial/Movie/item_map.json'  # not read in this cell

# Tunables for the per-user split.
TRAIN_FRACTION = 0.7  # share of the liked items assigned to "train" (70/30 split)
SPLIT_SEED = 42       # fixed seed so Restart & Run All reproduces the same split

# Read the JSONL file: one JSON object per line; blank lines are skipped.
with open(final_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

user_id = "AQP1VPK16SVWM"
# Each record is expected to be a dict keyed by user id; take the first match,
# or None when no record mentions this user.
user_data = next((item[user_id] for item in data if user_id in item), None)

# Defaults, so these names always exist and never carry stale kernel state
# from a previous run when the user is not found.
user_conversation = []
combined_user_likes = []
user_likes = []
train_likes = []
test_likes = []
split_data = {"train": train_likes, "test": test_likes}

# The lookup result is checked BEFORE it is dereferenced: a missing user now
# reaches the message below instead of raising AttributeError on None.
if user_data:
    user_conversation = user_data.get("Conversation", [])

    # Collect the liked items across every turn of every conversation.
    liked_items = set()
    for conversation in user_conversation:
        for turn in conversation.values():
            liked_items.update(turn.get("user_likes", []))

    # Sort the de-duplicated items: set iteration order for strings changes
    # between interpreter runs (hash randomisation), which would otherwise
    # make the seeded shuffle below non-reproducible.
    combined_user_likes = sorted(liked_items)

    # Shuffle a copy with a private, seeded generator (global RNG untouched).
    user_likes = list(combined_user_likes)
    random.Random(SPLIT_SEED).shuffle(user_likes)

    # Cut the shuffled list at TRAIN_FRACTION.
    split_index = int(len(user_likes) * TRAIN_FRACTION)
    train_likes = user_likes[:split_index]
    test_likes = user_likes[split_index:]

    # Format and print the result.
    split_data = {
        "train": train_likes,
        "test": test_likes
    }
    print(json.dumps(split_data, indent=4))
else:
    print(f"No se encontraron datos para el usuario {user_id}.")


{
    "train": [
        "0784013470",
        "B0001XAPWE",
        "6304632444",
        "6300216691",
        "B000006DAL",
        "B0000639GO"
    ],
    "test": [
        "6302309506",
        "7883704591",
        "B004ZBA9P6"
    ]
}
['0784013470', 'B0001XAPWE', '6304632444', '6300216691', 'B000006DAL', 'B0000639GO']


In [49]:
# Inspect the training split that is later embedded in the prompt.
# NOTE(review): the saved output of this cell shows movie titles, whereas the
# split cell stores item ids in `train_likes`; the id -> title mapping step is
# not visible in this notebook — confirm it exists so Restart & Run All
# reproduces this output.
print(train_likes)

['Stir Of Echoes', 'The Barbarian Invasions', 'Quatermass &amp; The Pit VHS', 'Islands in the Stream VHS', 'The Three Musketeers/The Four Musketeers VHS', 'And Now the Screaming Starts']


In [None]:
import html

from transformers import pipeline, set_seed

k = 1  # number of independent samples to draw from the model
GENERATION_SEED = 42  # seeds the sampler so a re-run reproduces the same k outputs

# Pipeline initialization: the model is loaded here from its name.
pipe = pipeline(
    "text-generation",
    model=model_name_1,
    device_map="auto",
)

# System prompt describing the assistant's role.
# (The stray "which is" fragment that made the sentence ungrammatical was removed.)
ctx = (
    "You are a movie recommendation system. Your task is to provide personalized movie "
    "recommendations based on the user's preferences. The user has provided a list of movies "
    "they like, and you need to suggest similar movies they might enjoy."
)

# The titles carry HTML entities (e.g. "&amp;"); decode them so the model
# receives plain text instead of markup residue.
liked_titles = [html.unescape(str(title)) for title in train_likes]

# User prompt: the liked titles plus the formatting constraints for the answer.
msg = (
    "Based on the following movies the user likes: " + str(liked_titles) +
    ". Suggest 10 similar movies that the user might enjoy. "
    "Do NOT include any of the listed movies in your response. "
    "List only the movie titles, one per line. The titles MUST be different from the given ones."
)

messages = [
    {
        "role": "system",
        "content": ctx,
    },
    {"role": "user", "content": msg}
]

# Render the chat messages into the model's prompt format, ending with the
# marker that asks the assistant to start its turn.
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Seed once before the loop: the k samples still differ from each other, but
# the whole sequence is reproducible across kernel restarts.
set_seed(GENERATION_SEED)

# NOTE(review): the saved sample output is cut off after the second title with
# max_new_tokens=100 — consider raising the limit if full 10-item lists are needed.
outputs = []
for _ in range(k):
    output = pipe(
        prompt,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
    )
    outputs.append(output)

Some parameters are on the meta device because they were offloaded to the disk and cpu.
Device set to use cpu


In [47]:
# Print the text of the first returned sequence for every sampling round.
for sampled in outputs:
    first_sequence = sampled[0]
    print(first_sequence['generated_text'])

<|system|>
You are a movie recommendation system. Your task is to provide personalized movie recommendations based on the user's preferences. The user has provided a list of movies they like, and you need to suggest similar movies they might enjoy.</s>
<|user|>
Give me a list of 10 movie recommendations based on the following movies: ['Stir Of Echoes', 'The Barbarian Invasions', 'Quatermass &amp; The Pit VHS', 'Islands in the Stream VHS', 'The Three Musketeers/The Four Musketeers VHS', 'And Now the Screaming Starts'] excluding the given movies. Please provide the movie names only, without any additional information.</s>
<|assistant|>
Here are 10 movie recommendations based on the given movies:

1. "Stir of Echoes" (1999) - Directed by David Koepp, starring Mark Wahlberg, Kathryn Morris, and Kiefer Sutherland

2. "The Barbarian Invasions" (1983) - Directed by James Frawley, starring Robert Urich, Robert Foxworth, and Susan
