In [5]:
###1. Data Preparation
import pandas as pd #para manejar el dataset de películas (leer CSV, limpiar, procesar datos).
from transformers import AutoTokenizer, AutoModel #permiten cargar modelos pre-entrenados y sus tokenizadores
from typing import List #para anotaciones de tipo (listas).
from langchain_community.vectorstores import FAISS #base de datos vectorial (similarity search). Sirve para buscar embeddings parecidos (ej. películas con tramas similares).
from langchain_huggingface import HuggingFaceEmbeddings #wrapper de LangChain que genera embeddings usando modelos de Hugging Face.
from tqdm import tqdm ##librería que muestra la barra de progreso en los loops. 
from langchain.prompts import PromptTemplate #define plantillas de prompts para el LLM.
from langchain_huggingface import HuggingFacePipeline #conecta un modelo de Hugging Face como un LLM en LangChain.
from langchain.chains import RetrievalQA #construye un pipeline de pregunta–respuesta con recuperación de contexto (RAG).

# Load IMDB Top 1000 dataset
movies = pd.read_csv('imdb_top_1000.csv')

# Preprocess data
movies['Genre'] = movies['Genre'].apply(lambda x: ', '.join(x.split(','))) #"Drama,Action,Thriller" → "Drama, Action, Thriller"
movies['MetaText'] = movies.apply(lambda row: 
    f"Title: {row['Series_Title']}\n"
    f"Director: {row['Director']}\n"
    f"Genre: {row['Genre']}\n"
    f"Plot: {row['Overview']}\n"
    f"Stars: {row['Star1']}, {row['Star2']}\n"
    f"Year: {row['Released_Year']}\n"
    f"Rating: {row['IMDB_Rating']}", axis=1)

#Ejemplo para Inception:
    #Title: Inception
    #Director: Christopher Nolan
    #Genre: Action, Adventure, Sci-Fi
    #Plot: A thief who steals corporate secrets ...
    #Stars: Leonardo DiCaprio, Joseph Gordon-Levitt
    #Year: 2010
    #Rating: 8.8

# Create a movie_id column
movies['movie_id'] = movies.index.astype(str) #FAISS necesita un ID único para cada documento.

In [6]:
###The E5 multilingual embedding model is at the heart of our system:
class E5EmbeddingWrapper:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small")#convierte texto → tokens (números que entiende la red).
        self.model = AutoModel.from_pretrained("intfloat/multilingual-e5-small")#carga el modelo neuronal (aquí: "intfloat/multilingual-e5-small", que entiende varios idiomas).
        
    def embed_documents(self, texts: List[str]) -> List[List[float]]:#Itera por cada texto y llama a embed_query
        return [self.embed_query(text) for text in texts]#cada embedding es un vector List[float]
    
    def embed_query(self, text: str) -> List[float]:
        inputs = self.tokenizer(#Convierte el texto en tensores PyTorch (input_ids, attention_mask), con padding y truncado.
            text, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.no_grad():##Se asegura de que el modelo no calcule gradientes
            outputs = self.model(**inputs) #Pasa el texto por el modelo → devuelve representaciones vectoriales
        return outputs.last_hidden_state[:, 0, :].cpu().numpy()[0].tolist() #Toma solo el primer token ([CLS]) de la secuencia.
             ##Convierte el tensor → NumPy → lista de floats → lista Python.
        
    # Make the class callable as required by LangChain
    def __call__(self, text: str) -> List[float]:#Hace que el objeto sea callable como una función.
        return self.embed_query(text)

In [10]:
# Ejemplo usando embeddings de HuggingFace
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-small")

def batch_embed(texts, #lista de strings (ejemplo: todos los plots de las películas).
                batch_size=16):#cuántos textos procesar por cada iteración
    embedded_vectors = []#Inicializa una lista vacía para ir guardando los embeddings resultantes.
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding texts"):
        batch = texts[i:i+batch_size]
        embedded_vectors.extend(embeddings.embed_documents(batch))
    return embedded_vectors #Devuelve la lista completa con todos los embeddings (uno por cada texto de entrada).

movie_descriptions = movies['MetaText'].tolist()#"Title: Inception\nDirector: Nolan\nGenre: Action, Sci-Fi\nPlot: ...",
movie_embeddings = batch_embed(movie_descriptions)#  [0.12, -0.03, 0.88, ...],  # vector para Inception

Embedding texts: 100%|██████████████████████████| 63/63 [00:05<00:00, 12.13it/s]


In [11]:
###We use FAISS to efficiently store and query our movie embeddings:

# Crear el vector store
vector_store = FAISS.from_embeddings( #Eso le dice a FAISS: "este vector pertenece a esta película".
    embedding=embeddings,
    text_embeddings=list(zip(movies['movie_id'].tolist(), #lista de IDs únicos de las películas.
                             movie_embeddings)),#lista de vectores (ya calculados con tu batch_embed).
    metadatas=movies.to_dict('records')
)

# Guardar en disco
vector_store.save_local("imdb_e5_index")


In [18]:
###5. RAG Implementation with LangChain
prompt_template = """Analyze this movie recommendation context:
{context}

Generate a personalized recommendation explaining:
1. Genre alignment with "{query}"
2. Director/style connections
3. Star actor relevance
4. Plot similarities"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "query"]
)

# Create the LLM
llm = HuggingFacePipeline.from_model_id(
    model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    task="text-generation",
    model_kwargs={"temperature": 0.4, "max_length": 512}
)

# Crear la cadena de RAG con RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 1}),
    return_source_documents=True,
    #chain_type_kwargs={"prompt": PROMPT}
)



Device set to use mps:0


In [19]:
###6. Advanced Hybrid Recommender
class AdvancedRecommender:
    def __init__(self, vector_store, qa_chain):
        self.store = vector_store #Recibe el vector store (FAISS con embeddings de películas) 
        self.qa_chain = qa_chain #y el qa_chain (el RAG que genera explicaciones).
        self.rating_weights = {
            'IMDB_Rating': 0.6,
            'Meta_score': 0.3,
            'No_of_Votes': 0.1
        }
    
    def _hybrid_score(self, movie):
        # Handle potential missing columns or NaN values
        score = 0
        for col, weight in self.rating_weights.items():
            if col in movie and pd.notna(movie[col]):
                score += movie[col] * weight
        return score
        #example:
            #IMDB_Rating = 8.5 → 8.5*0.6 = 5.1
            #Meta_score = 70 → 70*0.3 = 21
            #No_of_Votes = 200000 → 200000*0.1 = 20000
    
    def recommend(self, query, top_n=5):
        # First, get a larger set of candidates through similarity search
        content_results = self.store.similarity_search(query, k=top_n*2)#Busca en FAISS las películas más cercanas al query.
        #Ejemplo: si pides 5 recomendaciones, primero busca 10.
        
        # Apply popularity/quality boost and sort
        sorted_results = sorted( #Reranking con calidad
            content_results,
            key=lambda x: self._hybrid_score(x.metadata),
            reverse=True
        )[:top_n]
        
        # Generate explanations using RetrievalQA
        explanations = [] #Generar explicaciones personalizadas (RAG)
        for doc in sorted_results:
            # Create a specific query for this movie
            movie_query = f"{query} - specifically for '{doc.metadata['Series_Title']}'"
            
            # Run the RetrievalQA chain
            result = self.qa_chain({"query": movie_query})
            
            # Extract the explanation
            explanation = result['result']
            
            explanations.append({
                'title': doc.metadata['Series_Title'],
                'year': doc.metadata['Released_Year'],
                'rating': doc.metadata['IMDB_Rating'],
                'explanation': explanation
            })
        
        return explanations


In [20]:
recommender = AdvancedRecommender(vector_store, qa_chain)
recommendations = recommender.recommend("Psychological thrillers with twist endings", top_n=3)

print("Top Recommendations:")
for i, rec in enumerate(recommendations):
    print(f"\n{i+1}. {rec['title']} ({rec['year']}) - IMDB {rec['rating']}")
    print(f"Explanation: {rec['explanation']}")

Top Recommendations:

1. Psycho (1960) - IMDB 8.5
Explanation: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

49

Question: Psychological thrillers with twist endings - specifically for 'Psycho'
Helpful Answer: "Psycho" is a classic psychological thriller that has been remade several times, but the original is still widely regarded as one of the best. The film follows the story of a wealthy family who moves into a secluded mansion in the hills of California, only to find that the house is haunted by the ghost of the murdered woman who once lived there.

The twist ending is that the killer is not the family's own daughter, but a psychiatrist who has been hired to help the family cope with their grief. The twist is that the psychiatrist is actually the killer, and the family is in on the scheme all along.

The film is known for its tense and suspenseful atmosphere, as well