In [None]:
import json
import numpy as np
import pandas as pd

from elasticsearch import Elasticsearch
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# RAG Flow

## 1. Load dataset

In [2]:
# Load the movie dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("rotten_tomatoes/movies_at_home.csv")

In [3]:
# Preview the first two rows to check which columns were loaded.
data.head(2)

Unnamed: 0,title,synopsis,consensus,critics_score,audience_score,director,producer,screenwriter,distributor,production_co,...,genre,original_language,release_date_theaters,rerelease_date_theaters,release_date_streaming,box_office,runtime,sound_mix,aspect_ratio,rating_details
0,The Substance,Have you ever dreamt of a better version of yo...,"Audaciously gross, wickedly clever, and possib...",90%,74%,Coralie Fargeat,"Coralie Fargeat, Eric Fellner, Tim Bevan",Coralie Fargeat,MUBI,Working Title Films,...,"Horror, Drama",English,"Sep 20, 2024, Wide",,"Oct 31, 2024",$15.5M,2h 21m,Dolby Digital,Digital 2.39:1,"Graphic Nudity, Gore, Language, Strong Violent..."
1,The Wild Robot,The epic adventure follows the journey of a ro...,"A simple tale told with great sophistication, ...",98%,98%,Christopher Sanders,Jeff Hermann,Christopher Sanders,Universal Pictures,DreamWorks Animation,...,"Kids & Family, Adventure, Animation",English,"Sep 27, 2024, Wide",,"Oct 15, 2024",,1h 42m,Dolby Digital,Scope (2.35:1),"Thematic Elements, Action/Peril"


In [4]:
# Remove columns that the retrieval flow below never reads.
# `errors="ignore"` keeps this cell idempotent: because the result is assigned back to
# `data`, running the cell a second time would otherwise raise a KeyError for columns
# that were already dropped.
data = data.drop(
    columns=[
        "distributor",
        "production_co",
        "rerelease_date_theaters",
        "sound_mix",
        "aspect_ratio",
        "rating_details",
    ],
    errors="ignore",
)

## 2. Setting up the embedding model

In [5]:
# Sentence-transformers model used to embed both the documents and the user questions.
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")

# Sample question, encoded once here and reused by the search cells further down.
user_question = "Which movies Peter Jackson directed?"
encoded_question = embedding_model.encode(user_question)

## 3. Creating embeddings

In [6]:
# Round-trip the frame through JSON to get a list of plain dicts, one per movie.
# NOTE(review): missing values presumably come back as None after this round-trip — confirm.
records = json.loads(data.to_json(orient='records'))

In [7]:
# Inspect the first record to confirm the structure of each document.
records[0]

{'title': 'The Substance',
 'synopsis': "Have you ever dreamt of a better version of yourself? You, only better in every way. You should try this new product, it's called The Substance. IT CHANGED MY LIFE. With The Substance, you can generate another you: younger, more beautiful, more perfect. You just have to share time -- one week for one, one week for the other. A perfect balance of seven days each... Easy right? If you respect the balance... What could possibly go wrong?",
 'consensus': "Audaciously gross, wickedly clever, and possibly Demi Moore's finest hour, The Substance is a gasp-inducing feat from writer-director Coralie Fargeat.",
 'critics_score': '90%',
 'audience_score': '74%',
 'director': 'Coralie Fargeat',
 'producer': 'Coralie Fargeat, Eric Fellner, Tim Bevan',
 'screenwriter': 'Coralie Fargeat',
 'rating': 'R ',
 'genre': 'Horror, Drama',
 'original_language': 'English',
 'release_date_theaters': 'Sep 20, 2024, Wide',
 'release_date_streaming': 'Oct 31, 2024',
 'box_

In [8]:
# One embedding per record, built from synopsis + consensus + director + genre.
# The list keeps the same order as `records`, so position i belongs to records[i].
embeddings = []

for document in tqdm(records):
    # Join only the fields that are present: formatting a missing value (None) into an
    # f-string would embed the literal word "None" as if it were part of the movie text.
    parts = (document[key] for key in ("synopsis", "consensus", "director", "genre"))
    syn_con_dir_gen = " ".join(str(part) for part in parts if part is not None)
    embeddings.append(embedding_model.encode(syn_con_dir_gen))

100%|██████████| 9969/9969 [00:30<00:00, 324.40it/s]


## 4. Search test

In [9]:
# Stack the per-document embeddings into a matrix and score every document
# against the encoded sample question with a dot product.
X = np.array(embeddings)
scores = np.dot(X, encoded_question)
max_score = scores.max()

# Position of the highest-scoring document (last element of the ascending sort).
best_index = np.argsort(scores)[-1]

print("Max score:", max_score)
print("Index:", best_index)

Max score: 0.5363765
Index: 1758


In [10]:
# Show the best-scoring record. The index is derived from `scores` rather than
# copied by hand from the previous output, so the cell stays correct if the
# dataset or the embedding model changes.
records[int(np.argmax(scores))]

{'title': 'King Kong',
 'synopsis': "Peter Jackson's expansive remake of the 1933 classic follows director Carl Denham (Jack Black) and his crew on a journey from New York City to the ominous Skull Island to film a new movie. Accompanying him are playwright Jack Driscoll (Adrien Brody) and actress Ann Darrow (Naomi Watts), who is whisked away by the monstrous ape, Kong, after they reach the island. The crew encounters dinosaurs and other creatures as they race to rescue Ann, while the actress forms a bond with her simian captor.",
 'consensus': "Featuring state-of-the-art special effects, terrific performances, and a majestic sense of spectacle, Peter Jackson's remake of King Kong is a potent epic that's faithful to the spirit of the 1933 original.",
 'critics_score': '84%',
 'audience_score': '50%',
 'director': 'Peter Jackson',
 'producer': 'Jan Blenkin, Carolynne Cunningham, Fran Walsh',
 'screenwriter': 'Merian C. Cooper, Edgar Wallace, Fran Walsh, Peter Jackson, Philippa Boyens',


In [11]:
class VectorSearchEngine():
    """In-memory vector search over a list of documents.

    Documents are ranked by the dot product between their embedding and the
    query vector, highest score first.
    """

    def __init__(self, documents, embeddings):
        """Store the documents and their embeddings.

        Args:
            documents: Sequence of documents; ``documents[i]`` corresponds to row ``i``
                of ``embeddings``.
            embeddings: 2-D array-like with one row per document. Plain Python
                sequences (e.g. a list of vectors) are converted to a NumPy array;
                objects that already provide ``.dot`` are stored unchanged.
        """
        self.documents = documents
        # A plain list has no `.dot`, so `search` would fail with AttributeError;
        # coerce such inputs once here instead of on every query.
        if not hasattr(embeddings, "dot"):
            embeddings = np.asarray(embeddings)
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return the ``num_results`` documents with the highest dot-product score.

        Args:
            v_query: Query vector with the same dimension as the stored embeddings.
            num_results: Maximum number of documents to return.

        Returns:
            A list of documents ordered from highest to lowest score.
        """
        scores = self.embeddings.dot(v_query)
        # Negate so that the ascending argsort yields the best scores first.
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

# Build the in-memory engine over all records and show the top 5 matches
# for the sample question encoded earlier.
search_engine = VectorSearchEngine(documents=records, embeddings=X)
search_engine.search(encoded_question, num_results=5)

[{'title': 'King Kong',
  'synopsis': "Peter Jackson's expansive remake of the 1933 classic follows director Carl Denham (Jack Black) and his crew on a journey from New York City to the ominous Skull Island to film a new movie. Accompanying him are playwright Jack Driscoll (Adrien Brody) and actress Ann Darrow (Naomi Watts), who is whisked away by the monstrous ape, Kong, after they reach the island. The crew encounters dinosaurs and other creatures as they race to rescue Ann, while the actress forms a bond with her simian captor.",
  'consensus': "Featuring state-of-the-art special effects, terrific performances, and a majestic sense of spectacle, Peter Jackson's remake of King Kong is a potent epic that's faithful to the spirit of the 1933 original.",
  'critics_score': '84%',
  'audience_score': '50%',
  'director': 'Peter Jackson',
  'producer': 'Jan Blenkin, Carolynne Cunningham, Fran Walsh',
  'screenwriter': 'Merian C. Cooper, Edgar Wallace, Fran Walsh, Peter Jackson, Philippa B

## 5. Indexing

In [12]:
# Client for the Elasticsearch instance expected at localhost:9200.
es_client = Elasticsearch("http://localhost:9200")

In [13]:
index_name = "movies_at_home"

# Shared mapping for every vector field. The vector size is taken from the embedding
# model instead of being hard-coded, so the mapping stays correct if `embedding_model`
# is swapped for a model with a different output dimension.
dense_vector_type = {
    "type": "dense_vector",
    "dims": embedding_model.get_sentence_embedding_dimension(),
    "index": True,
    "similarity": "cosine",
}

# Index definition: the record fields are mapped as text, plus one dense vector
# per embedded field and one for the combined synopsis/consensus/director/genre text.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "synopsis": {"type": "text"},
            "consensus": {"type": "text"},
            "critics_score": {"type": "text"},
            "audience_score": {"type": "text"},
            "director": {"type": "text"},
            "producer": {"type": "text"},
            "screenwriter": {"type": "text"},
            "rating": {"type": "text"},
            "genre": {"type": "text"},
            "original_language": {"type": "text"},
            "release_date_theaters": {"type": "text"},
            "release_date_streaming": {"type": "text"},
            "box_office": {"type": "text"},
            "id": {"type": "text"},
            "title_vector": dense_vector_type,
            "synopsis_vector": dense_vector_type,
            "consensus_vector": dense_vector_type,
            "director_vector": dense_vector_type,
            "genre_vector": dense_vector_type,
            "syn_con_dir_gen_vector": dense_vector_type,
        }
    }
}

# Drop any existing index first so that re-running the cell starts from an empty index.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies_at_home'})

In [15]:
def encode(text: str):
    """Embed ``text`` with ``embedding_model``.

    If the model raises ``TypeError`` for the given value, the placeholder
    string "No entry" is embedded and returned instead.
    """
    placeholder = "No entry"
    try:
        vector = embedding_model.encode(text)
    except TypeError:
        vector = embedding_model.encode(placeholder)
    return vector


# Enrich every record with an id and its field-level embeddings, then index it.
# Note: this adds the new keys to the dicts in `records` in place.
for i, record in enumerate(tqdm(records)):
    # Zero-padded, 1-based identifier (e.g. "000001").
    record["id"] = f"{(i+1):06d}"
    record["title_vector"] = encode(record["title"])
    record["synopsis_vector"] = encode(record["synopsis"])
    record["consensus_vector"] = encode(record["consensus"])
    record["director_vector"] = encode(record["director"])
    record["genre_vector"] = encode(record["genre"])
    # Reuse the combined embedding computed earlier; row i of X belongs to records[i].
    record["syn_con_dir_gen_vector"] = X[i]

    # Use the record id as the Elasticsearch document id so that re-running this cell
    # overwrites the existing documents instead of indexing duplicates under new
    # auto-generated ids.
    es_client.index(index=index_name, id=record["id"], document=record)

100%|██████████| 9969/9969 [05:09<00:00, 32.17it/s]


In [16]:
# Number of records that went through the indexing loop.
len(records)

9969

## 6. Vector search with Elasticsearch

In [39]:
def elastic_knn_search(vector: np.ndarray, field="syn_con_dir_gen_vector", k=5, num_candidates=10000):
    """Run a kNN search against the Elasticsearch index and return the matching documents.

    Args:
        vector: Query embedding to search with.
        field: Name of the dense-vector field to search on.
        k: Number of nearest neighbours to return (default 5, as before).
        num_candidates: Passed through as the kNN ``num_candidates`` option
            (default 10000, as before).

    Returns:
        A list with the ``_source`` dict of each hit, in the order returned by
        Elasticsearch.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
    }

    es_results = es_client.search(
        index=index_name,
        knn=knn,
        # Request as many hits as neighbours, so a `k` larger than the default
        # page size is not silently truncated.
        size=k,
        source=["title", "synopsis", "consensus", "director", "genre", "critics_score", "audience_score"],
    )

    return [hit["_source"] for hit in es_results["hits"]["hits"]]

In [37]:
# Run the Elasticsearch kNN search for the sample question and display the hits.
es_results = elastic_knn_search(encoded_question)
es_results

[{'title': 'King Kong',
  'synopsis': "Peter Jackson's expansive remake of the 1933 classic follows director Carl Denham (Jack Black) and his crew on a journey from New York City to the ominous Skull Island to film a new movie. Accompanying him are playwright Jack Driscoll (Adrien Brody) and actress Ann Darrow (Naomi Watts), who is whisked away by the monstrous ape, Kong, after they reach the island. The crew encounters dinosaurs and other creatures as they race to rescue Ann, while the actress forms a bond with her simian captor.",
  'consensus': "Featuring state-of-the-art special effects, terrific performances, and a majestic sense of spectacle, Peter Jackson's remake of King Kong is a potent epic that's faithful to the spirit of the 1933 original.",
  'critics_score': '84%',
  'audience_score': '50%',
  'director': 'Peter Jackson',
  'genre': 'Adventure, Action, Fantasy'},
 {'title': 'Hook',
  'synopsis': 'When his young children are abducted by his old nemesis, Capt. Hook (Dustin 

## 7. Generating an answer with an LLM

In [45]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` using the retrieved documents as context.

    Args:
        query: The user's question text.
        search_results: Iterable of dicts providing the keys ``title``, ``synopsis``,
            ``consensus``, ``director``, ``genre``, ``critics_score`` and
            ``audience_score``.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    template = """
You are a movie expert. Using your knowledge and based on the CONTEXT provided
below, do your best to give movie recommendations that will satisfy the person
that is asking. Make sure your recommendations are not biased by gender, ethnicity,
age, religion and so on. Consider critics and audience scores when giving your
recommendations.

QUESTION: {question}

CONTEXT: {context}
""".strip()

    # One text block per document; each block ends with a blank line.
    entries = [
        f"title: {doc['title']}\n"
        f"synopsis: {doc['synopsis']}\n"
        f"critics consensus: {doc['consensus']}\n"
        f"director: {doc['director']}\n"
        f"genre: {doc['genre']}\n"
        f"critics score: {doc['critics_score']}\n"
        f"audience score: {doc['audience_score']}\n\n"
        for doc in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    return filled.strip()


# OpenAI-compatible client pointed at a local server on port 11434.
# NOTE(review): the api_key value looks like a placeholder for a local Ollama server — confirm.
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")


def run_llm(prompt, model="llama3.2"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send.
        model: Name of the chat model to request from ``client``.

    Returns:
        The message content of the first choice in the completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content


def search(query):
    """Embed ``query["question"]`` and return the Elasticsearch kNN hits for it."""
    question_text = query["question"]
    question_vector = embedding_model.encode(question_text)
    return elastic_knn_search(question_vector)


def rag(query: dict, model="llama3.2") -> str:
    """Answer ``query["question"]`` with retrieval-augmented generation.

    Retrieves documents with ``search``, builds the prompt with ``build_prompt``
    and returns the text produced by ``run_llm`` for the given ``model``.
    """
    retrieved = search(query)
    return run_llm(build_prompt(query["question"], retrieved), model=model)

In [48]:
# End-to-end check of the RAG flow with a sample recommendation request.
print(rag({'question': 'recommend me action movies with lots of violence like john wick'}))

Based on your interest in action movies with lots of violence like John Wick and The Punisher, I'd like to recommend the following films:

1. **Atomic Blonde** (2017) - With a 69% critics score and an audience score of 87%, this spy thriller starring Charlize Theron has plenty of intense hand-to-hand combat sequences and high-stakes action.
2. **The Accountant** (2016) - Despite having a relatively low budget, this action movie stars Ben Affleck as a socially awkward accountant who moonlights as a deadly assassin. It boasts an 83% audience score and is known for its graphic violence.
3. **Taken** (2008) - This high-octane action film starring Liam Neeson follows a former CIA operative as he battles kidnappers and corrupt officials to rescue his daughter. With a 74% critical consensus and an impressive 95% audience score, it's a must-watch for fans of intense action.
4. **The Raid: Redemption** (2011) - This Indonesian martial arts film was praised by critics for its non-stop action seq