In [3]:
import warnings
warnings.filterwarnings('ignore')

import os
from dotenv import load_dotenv, find_dotenv
from datasets import load_dataset
import pandas as pd
from typing import List, Optional
from pydantic import BaseModel
from datetime import datetime
from pymongo.mongo_client import MongoClient
import openai
import time

_ = load_dotenv("./.env")
MONGO_URI = os.environ.get("MONGO_URI")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

In [4]:
dataset = load_dataset("Pablinho/movies-dataset", streaming=True, split="train")
dataset = dataset.take(200)  # 200 movies for the sake of simplicity
dataset_df = pd.DataFrame(dataset)

dataset_df

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...
...,...,...,...,...,...,...,...,...,...
195,2016-02-09,Deadpool,Deadpool tells the origin story of former Spec...,189.206,26390,7.6,en,"Action, Adventure, Comedy",https://image.tmdb.org/t/p/original/3E53WEZJqP...
196,2021-10-22,The Harder They Fall,"Gunning for revenge, outlaw Nat Love saddles u...",188.865,611,6.7,en,Western,https://image.tmdb.org/t/p/original/su9WzL7lwU...
197,2001-11-01,"Monsters, Inc.","James Sullivan and Mike Wazowski are monsters,...",188.759,15255,7.8,en,"Animation, Comedy, Family",https://image.tmdb.org/t/p/original/sgheSKxZkt...
198,2021-02-09,Ainbo: Spirit of the Amazon,An epic journey of a young hero and her Spirit...,188.484,285,7.1,en,"Adventure, Animation, Family, Fantasy",https://image.tmdb.org/t/p/original/l8HyObVj8f...


## Modeling the Data with Pydantic

In [5]:
class Movie(BaseModel):
    Release_Date: Optional[str]
    Title: str
    Overview: str
    Popularity: float
    Vote_Count: int
    Vote_Average: float
    Original_Language: str
    Genre: List[str]
    Poster_Url: str
    text_embeddings: List[float]


In [18]:
def get_embedding(text):
    if not text or not isinstance(text, str):
        return None
    try:
        embedding = openai.embeddings.create(
            input=text,
            model="text-embedding-3-small", dimensions=1536).data[0].embedding
        print(embedding)
        return embedding
    except Exception as e:
        print(f"Error in get_embedding: {e}")
        return None

In [7]:
def process_and_embed_record(record):
    for key, value in record.items():
        if pd.isnull(value):
            record[key] = None

    if record['Genre']:
        record['Genre'] = record['Genre'].split(', ')
    else:
        record['Genre'] = []

    text_to_embed = f"{record['Title']} {record['Overview']}"
    embedding = get_embedding(text_to_embed)
    record['text_embeddings'] = embedding
    return record

records = [process_and_embed_record(record) for record in dataset_df.to_dict(orient='records')]


In [8]:
def get_mongo_client(mongo_uri):
    client = MongoClient(mongo_uri, appname="pmr.movie.python")
    print("Connection to MongoDB successful")
    return client

mongo_client = get_mongo_client(MONGO_URI)
database_name = "movies_dataset"
collection_name = "movies"
db = mongo_client.get_database(database_name)
collection = db.get_collection(collection_name)

collection.delete_many({})


Connection to MongoDB successful


DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000014'), 'opTime': {'ts': Timestamp(1728039461, 5), 't': 20}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1728039461, 5), 'signature': {'hash': b'"\xdf\x87\xa6\x00MV[\xaa\xa8\xd8C5R\r\x93\xb9U\x7f#', 'keyId': 7384892070118293507}}, 'operationTime': Timestamp(1728039461, 5)}, acknowledged=True)

In [11]:
movies = [Movie(**record).model_dump() for record in records]
collection.insert_many(movies)

InsertManyResult([ObjectId('66ffca2c4e8bf58059ca8d4b'), ObjectId('66ffca2c4e8bf58059ca8d4c'), ObjectId('66ffca2c4e8bf58059ca8d4d'), ObjectId('66ffca2c4e8bf58059ca8d4e'), ObjectId('66ffca2c4e8bf58059ca8d4f'), ObjectId('66ffca2c4e8bf58059ca8d50'), ObjectId('66ffca2c4e8bf58059ca8d51'), ObjectId('66ffca2c4e8bf58059ca8d52'), ObjectId('66ffca2c4e8bf58059ca8d53'), ObjectId('66ffca2c4e8bf58059ca8d54'), ObjectId('66ffca2c4e8bf58059ca8d55'), ObjectId('66ffca2c4e8bf58059ca8d56'), ObjectId('66ffca2c4e8bf58059ca8d57'), ObjectId('66ffca2c4e8bf58059ca8d58'), ObjectId('66ffca2c4e8bf58059ca8d59'), ObjectId('66ffca2c4e8bf58059ca8d5a'), ObjectId('66ffca2c4e8bf58059ca8d5b'), ObjectId('66ffca2c4e8bf58059ca8d5c'), ObjectId('66ffca2c4e8bf58059ca8d5d'), ObjectId('66ffca2c4e8bf58059ca8d5e'), ObjectId('66ffca2c4e8bf58059ca8d5f'), ObjectId('66ffca2c4e8bf58059ca8d60'), ObjectId('66ffca2c4e8bf58059ca8d61'), ObjectId('66ffca2c4e8bf58059ca8d62'), ObjectId('66ffca2c4e8bf58059ca8d63'), ObjectId('66ffca2c4e8bf58059ca8d

## Implementing Vector search

In [35]:
def vector_search(user_query, db, collection, vector_index="vector_index_text", max_retries=3):
    query_embedding = get_embedding(user_query)
    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    vector_search_stage = {
        "$vectorSearch": {
            "index": vector_index,
            "queryVector": query_embedding,
            "path": "text_embeddings",
            "numCandidates": 150,
            "limit": 20
        }
    }

    pipeline = [vector_search_stage]

    for attempt in range(max_retries):
        try:
            results = list(collection.aggregate(pipeline))
            if results:
                explain_query_execution = db.command(
                    'explain', {
                        'aggregate': collection.name,
                        'pipeline': pipeline,
                        'cursor': {}
                    },
                    verbosity='executionStats')
                vector_search_explain = explain_query_execution['stages'][0]['$vectorSearch']
                millis_elapsed = vector_search_explain['explain']['collectStats']['allCollectorStats']['millisElapsed']
                print(f"Total time for the execution to complete on the database server: {millis_elapsed} milliseconds")
                return results
            else:
                print(f"No results found on attempt {attempt + 1}. Retrying...")
                time.sleep(2)
        except Exception as e:
            print(f"Error on attempt {attempt + 1}: {str(e)}")
            print(e)
            time.sleep(2)
    
    return "Failed to retrieve results after multiple attempts."

In [27]:
class SearchResultItem(BaseModel):
    Title: str
    Overview: str
    Genre: List[str]
    Vote_Average: float
    Popularity: float

def handle_user_query(query, db, collection):
    get_knowledge = vector_search(query, db, collection)

    if isinstance(get_knowledge, str):
        return get_knowledge, "No source information available."
        
    search_results_models = [SearchResultItem(**result) for result in get_knowledge]
    search_results_df = pd.DataFrame([item.model_dump() for item in search_results_models])

    completion = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a movie recommendation system."},
            {"role": "user", "content": f"Answer this user query: {query} with the following context:\n{search_results_df}"}
        ]
    )

    system_response = completion.choices[0].message.content

    print(f"- User Question:\n{query}\n")
    print(f"- System Response:\n{system_response}\n")

    return system_response

In [37]:
query = """
I'm in the mood for a highly-rated romance movie. Can you recommend something popular?
Include a reason for your recommendation.
"""
handle_user_query(query, db, collection)

[-0.01474519819021225, 0.002958740573376417, -0.06615937501192093, -0.010998268611729145, 0.017218898981809616, -0.0019583466928452253, -0.05083213001489639, 0.007445354945957661, 0.0017749411053955555, -0.02805953100323677, -0.015266615897417068, 0.01695212721824646, -0.04069480672478676, 0.0032558271195739508, -0.0019750199280679226, 0.009118741378188133, 0.012186615727841854, 0.006723858881741762, 0.01832236349582672, 0.02529480680823326, 0.021244727075099945, 0.04879496619105339, -0.03957921639084816, -0.02182677388191223, 0.04328976944088936, 0.02305149845778942, -0.06615937501192093, 0.04818866774439812, 0.011495434679090977, -0.008009213022887707, 0.0019265159498900175, -0.02185102552175522, 0.012792915105819702, -0.013750867918133736, -0.05548850819468498, -0.05883528292179108, 0.008130473084747791, 0.0005092914216220379, 0.04440535977482796, -0.04115559533238411, 0.042052917182445526, 0.02444598637521267, 0.04113134369254112, -0.05015307664871216, -0.019922994077205658, -0.000

'Based on your mood for a highly-rated romance movie, I recommend "Through My Window" with a vote average of 7.8. This movie is highly rated in the romance genre and is quite popular, as evident from its popularity score of 659.105. The film offers a compelling story about Raquel\'s long-standing crush on her next-door neighbor, making it a captivating watch filled with romantic tension and drama. If you enjoy stories where deeply held emotions unfold, "Through My Window" could be an excellent choice for you.'