In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
from typing import List

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('movies.csv')
movies = df.copy()
movies

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,Breakfast at Tiffany's,1961,A,115 min,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,Giant,1956,G,201 min,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,From Here to Eternity,1953,Passed,118 min,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30500000
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,


In [3]:
#preprocess dATA
movies['Genre'] = movies['Genre'].apply(lambda x: ', '.join(x.split(',')))

#We create a rich MetaText field that combines all relevant movie information in a structured way, which helps our embedding model capture the full context of each movie.
movies['Metatext'] = movies.apply(
    lambda row:(
    f"Title: {row['Series_Title']}\n",
    f"Director: {row['Director']}\n",
    f"Genre: {row['Genre']}\n",
    f"Plot: {row['Overview']}\n",
    f"Stars: {row['Star1']}, {row['Star2']}\n",
    f"Year: {row['Released_Year']}\n",
    f"Rating: {row['IMDB_Rating']}\n"), axis=1
)

#create a movie_id column
movies['movie_id'] = movies.index.astype(str)

In [4]:
#E5 embedding implementation
#The E5 multilingual embedding model is at the heart of our system: This wrapper makes the E5 model compatible with LangChain’s API expectations. I’ve specifically chosen the “multilingual-e5-small” model for its efficient balance between performance and resource usage.

class E5EmbeddingWrapper:
    def __init__(self, model_name: str = "intfloat/multilingual-e5-small"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
    
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # For documents, prefix with "passage: "
        return [self.embed_query("passage: " + text) for text in texts]
    
    def embed_query(self, text: str) -> List[float]:
        # For queries, you’d pass "query: ..."
        inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Mean pooling (better than just CLS for E5)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings[0].cpu().numpy().tolist()
    
    # Make the class callable (LangChain expects this sometimes)
    def __call__(self, text: str) -> List[float]:
        return self.embed_query("query: " + text)


In [5]:
#batch embedding
#To process our entire dataset efficiently, we implement batch processing: This approach optimizes memory usage and takes advantage of GPU acceleration when available.

def batch_embed(texts, batch_size = 16):
    embedded_vectors = []
    
    for i in tqdm(range(0, len(texts), batch_size), desc = "Embedding Texts"):
        batch = texts[i:i + batch_size]
        embedded_vectors.extend(embeddings.embed_documents(batch))
    return embedded_vectors

embeddings = E5EmbeddingWrapper()

movie_descriptions = [str(desc) for desc in movies['Metatext'].tolist()]
movie_embeddings = batch_embed(movie_descriptions)


Embedding Texts: 100%|██████████| 63/63 [02:01<00:00,  1.93s/it]


In [6]:
#Vector Storage with FAISS
#We use FAISS to efficiently store and query our movie embeddings: FAISS allows us to perform similarity searches extremely efficiently, and saving the index locally means we can reuse it without recomputing embeddings. For more details on FAISS, refer here.

vector_store = FAISS.from_embeddings(
    embedding=embeddings,
    text_embeddings=list(zip(movies['movie_id'].tolist(), movie_embeddings)),
    metadatas=movies.to_dict('records')
)
vector_store.save_local("imdb_e5_index")

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [9]:
#  RAG Implementation with LangChain

prompt_template = """Analyse this movie recommendation context:
{context}

Generate a personalized recommendation explaining:
1. Genre alignment with "{question}"
2. Director/style connections
3. Star actor relevance
4. Plot Similarities"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Create the LLM
llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-base",
    task="text2text-generation",   # ✅ Correct task for T5
    model_kwargs={"temperature": 0.4},   # stays here
    pipeline_kwargs={"max_new_tokens": 128}  # ✅ move here
)

# Create the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 1}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Device set to use cpu


In [10]:
#Advanced Hybrid Recommender
#The final piece combines content-based search with quality metrics:

# This recommender implements a sophisticated two-stage process:
# Semantic search using E5 embeddings finds content-relevant candidates
# Re-ranking based on a weighted quality score (IMDB rating, Metascore, and number of votes)
# Generating personalized, structured explanations for each recommendation

class AdvancedRecommender:
    def __init__(self, vector_store, qa_chain):
        self.store = vector_store
        self.qa_chain = qa_chain
        self.rating_weights = {
            'IMDB_Rating': 0.6,
            'Meta_score': 0.3,
            'No_of_Votes': 0.1
        }
        
    def _hybrid_score(self, movie):
        # Handle potential missing columns or NaN values
        score = 0
        for col, weight in self.rating_weights.items():
            if col in movie and pd.notna(movie[col]):
                score += movie[col] * weight
        return score
    
    def recommend(self, query, top_n=5):
        # First, get a larger set of candidates through similarity search
        content_results = self.store.similarity_search(query, k=top_n*2)
        
        # Apply popularity/quality boost and sort
        sorted_results = sorted(
            content_results,
            key=lambda x: self._hybrid_score(x.metadata),
            reverse=True
        )[:top_n]
        
        # Generate explanations using RetrievalQA
        explanations = []
        for doc in sorted_results:
            # Create a specific query for this movie
            movie_query = f"{query} - specifically for '{doc.metadata['Series_Title']}'"
            
            # Run the RetrievalQA chain
            result = self.qa_chain(movie_query)
            
            # Extract the explanation
            explanation = result['result']
            
            explanations.append({
                'title': doc.metadata['Series_Title'],
                'year': doc.metadata['Released_Year'],
                'rating': doc.metadata['IMDB_Rating'],
                'explanation': explanation
            })
        
        return explanations


In [11]:
recommender = AdvancedRecommender(vector_store, qa_chain)
recommendations = recommender.recommend("Psychological thrillers with twist endings", top_n=3)

print("Top Recommendations:")
for i, rec in enumerate(recommendations):
    print(f"\n{i+1}. {rec['title']} ({rec['year']}) - IMDB {rec['rating']}")
    print(f"Explanation: {rec['explanation']}")

  result = self.qa_chain(movie_query)


Top Recommendations:

1. Psycho (1960) - IMDB 8.5
Explanation: 1. 'Psycho' is a psychological thriller with twist endings - specifically for 'Psycho' 2. director/style connections 3. star actor relevance 4. plot

2. End of Watch (2012) - IMDB 7.6
Explanation: 1. End of Watch is a psychological thriller with twist endings. 2. Director/style connections. 3. Star actor relevance. 4. Plot similarities.

3. Misery (1990) - IMDB 7.8
Explanation: 1. Psychological thrillers with twist endings - specifically for 'Misery' 2. director/style connections 3. Star actor relevance 4. plot


In [12]:
recommender = AdvancedRecommender(vector_store, qa_chain)
recommendations = recommender.recommend("Crime Thrillers with serial killer", top_n=3)

print("Top Recommendations:")
for i, rec in enumerate(recommendations):
    print(f"\n{i+1}. {rec['title']} ({rec['year']}) - IMDB {rec['rating']}")
    print(f"Explanation: {rec['explanation']}")

Top Recommendations:

1. Se7en (1995) - IMDB 8.6
Explanation: 1. Se7en is a crime thriller with a serial killer. 2. It has a director/style connection. 3. It has a star actor. 4. It has a plot similar to Se7en.

2. Saw (2004) - IMDB 7.6
Explanation: 1. 'Saw' is a crime thriller with a serial killer in it. 2. It has a director/style connection. 3. It has a star actor. 4. It has a plot similar to 'Saw'.

3. Ang-ma-reul bo-at-da (2010) - IMDB 7.8
Explanation: 1. Ang-ma-reul Bo-at-da 2. director/style connections 3. star actor relevance 4. plot


In [13]:
recommender = AdvancedRecommender(vector_store, qa_chain)
recommendations = recommender.recommend("Romance with Drama", top_n=3)

print("Top Recommendations:")
for i, rec in enumerate(recommendations):
    print(f"\n{i+1}. {rec['title']} ({rec['year']}) - IMDB {rec['rating']}")
    print(f"Explanation: {rec['explanation']}")

Top Recommendations:

1. Gone with the Wind (1939) - IMDB 8.1
Explanation: Gone with the Wind is a great movie. It's a good movie with a lot of action. It's a good movie with a lot of action. It's a good movie with a lot of action. It's a good movie with a lot of action. It's a good movie with a lot of action. It's a good movie with a lot of action. It's a good movie with a lot of action. It's a good movie with a lot of action. It's a

2. Pride & Prejudice (2005) - IMDB 7.8
Explanation: 1. Pride & Prejudice 2. Director/style connections 3. Star actor relevance 4. Plot Similarities

3. Once (2007) - IMDB 7.8
Explanation: 1. 'Once' is a good movie. 2. It's a good movie. 3. It's a good movie. 4. It's a good movie. 5. It's a good movie. 6. It's a good movie. 7. It's a good movie. 8. It's a good movie. 9. It's a good movie. 10. It's a good movie. 11. It's a good movie. 12. It's a good movie. 13. It's a good movie. 14. It's a good movie
