In [None]:
# Make sure to install necessary dependencies
! pip install chromadb
! pip install sentence_transformers 
! pip install --upgrade huggingface_hub==0.26.0
! pip install numpy , pandas
! pip install nltk

In [None]:
import pandas as pd
import os
import numpy as np
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb import Client
import timeit
from clean_genres import clean_genres
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
class RecommendMovies:
    '''
    Class to recommend movies based on a given query.
    '''
    def __init__(self,movies_df_path,num_movies=5,num_rows=None):
        '''
        Initialize the RecommendMovies class.
        Parameters:
            movies_df_path (str): Path to the movies dataframe.
            num_movies (int): Number of movies to recommend. Deafult 5
            num_rows (int): (Dataset size) Number of rows to sample from the dataframe. Default None. Specifed by the user
            query_string (str): Query string to search for movies.
            model (SentenceTransformer): SentenceTransformer model to use for embeddings.
            client (chromadb.Client): ChromaDB client to use for database operations.
            collection (chromadb.Collection): ChromaDB collection to use for database operations.
            movie_embeddings (np.ndarray): Movie embeddings for the movies in the dataframe.
            query_embedding (np.ndarray): Query embedding for the query string.
        Returns:
            None
        '''
        assert movies_df_path is not None, f'movies_df_path should not be None'
        assert os.path.exists(movies_df_path), f'movies_df_path {movies_df_path} does not exist'

        assert num_movies is not None, f'num_movies should not be None'
        assert isinstance(num_movies, int), f'num_movies should be an integer, but got {type(num_movies)}'
        assert num_movies > 0, f'num_movies should be greater than 0, but got {num_movies}'

        self.movies_path = movies_df_path
        self.num_rows = num_rows
        self.random_state = 42
        self.movies_df = pd.read_csv(self.movies_path)
        if self.num_rows is not None:
            assert isinstance(self.num_rows, int), f'num_rows should be an integer, but got {type(self.num_rows)}'
            assert self.num_rows > 0, f'num_rows should be greater than 0, but got {self.num_rows}'
            assert self.num_rows <= len(self.movies_df), f'Number of rows {self.num_rows} is greater than the number of rows in the dataframe {len(self.movies_df)}'
            self.movies_df = self.movies_df.sample(n=self.num_rows, random_state=self.random_state)
            self.movies_df = self.movies_df.reset_index(drop=True)
        self.query_string = None
        self.num_movies = num_movies
        self.batch_size = 64

        self.processed_df = None
        self.all_genres = set()
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.movie_embeddings = None
        self.query_embedding = None

        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.tokenized_dictionary = {}
        self.processed_genres = []
        self.vectorizer = TfidfVectorizer()
        
        
        print('Starting data preprocessing')
        preprocessing_time = timeit.timeit(self.preprocess_data, number=1)
        print(f'Data Preprocessing completed in {preprocessing_time:.4f} seconds')

        print(f'Starting database Engine')
        self.client = chromadb.Client()

        
        existing_collections = self.client.list_collections()
        if "movie_recommender" in existing_collections:
            print(f'Collection already exists')
            print(f'Loading existing collection')
            self.collection = self.client.get_collection(name="movie_recommender")
        else:
            print(f'Creating a new collection')
            self.collection = self.client.create_collection(name="movie_recommender")
            print('Starting Vector database creation')
            vector_data = timeit.timeit(self.create_embeddings, number=1)
            print(f'Vector database creation completed in {vector_data:.4f} seconds')

    def get_tokenized_text(self,text):
        '''
        This function is used to tokenize the text and remove the stop words.
        The function uses the nltk library to tokenize the text and remove the stop words.
        Parameters:
            text (str): Text to tokenize
        Returns:
            (str): Tokenized text
        '''

        tokens = word_tokenize(text)
        processed_tokens = []

        for word in tokens:
            word_lower = word.lower()
            if word_lower not in self.stop_words and word.isalnum(): # Check if word is alphanumeric
                stemmed_word = self.stemmer.stem(word_lower)
                lemmatized_word = self.lemmatizer.lemmatize(stemmed_word)  # Lemmatize after stemming
                processed_tokens.append(lemmatized_word)

        return ' '.join(processed_tokens)

    def update_all_genres(self):
        '''
        This function is used to update the all_genres set with the tokenized text.
        The function uses the get_tokenized_text function to tokenize the text and remove the stop words.
        Parameters:
            None
        Returns:
            None
        '''
        for word in self.all_genres:
            token= self.get_tokenized_text(word)
            self.tokenized_dictionary[token] = word
            self.processed_genres.append(token)
        
        self.vectorizer.fit(self.processed_genres)
            

    def preprocess_data(self):
        '''
        The dataset used here is wikipedia movie plots dataset (https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots/data).
        Our columns of interest are Title, Genre and Plot.
        This function is used to preprocess the data. The preprocessing steps include:
        1. Selecting the columns of interest
        2. Dropping the rows with missing values
        3. Dropping the duplicate rows
        4. Cleaning the genres : The clean_genres function is used to clean the genres. The genres are cleaned by removing the unwanted characters and splitting the genres into a list of known genres.
        5. Creating a set of valid genres : The valid genres are created by splitting the genres into a list of known genres. The valid genres are used to filter the movies based on the genre.
        Parameters:
            None
        Returns:
            None
        '''
        self.processed_df = self.movies_df[['Title', 'Genre', 'Plot']]
        self.processed_df = self.processed_df.dropna()
        self.processed_df = self.processed_df.drop_duplicates()
        self.processed_df['Genre'] = clean_genres(self.processed_df)
        self.processed_df['Genre'].apply(lambda x: self.all_genres.update([i.strip() for i in x.split('|') if i.strip() not in self.stop_words]))
        
        self.update_all_genres()
        
    def create_metadata(self,df_list):
        '''
        This function is used for filtering the entire database. To make the retrieval faster, we will create a metadata dictionary for each movie.
        The metadata dictionary will contain the genre of the movie as key value pairs. As a result we query only the movies that are in the same genre as the query.
        Parameters:
            df_list (list): List of genres for each movie obtained as a pipe-separated string from dataset
        Returns:
            (dict) : Dictionary with key value pairs indicating Genre of the movie
        '''
        metadata = []

        for i in range(len(df_list)):
            genres = df_list[i].split('|')
            metadata_dict = {'genre_combined': df_list[i]}  
        
            for genre in genres:
                metadata_dict[f'{genre.strip().lower()}'] = True
            
            metadata.append(metadata_dict)

        return metadata
    
    def create_embeddings(self):
        '''
        This is the function that is used for storing the movies into the vector database.
        The movie summaries are first encoded using a sentence transformer model to create embeddings that capture semantic meaning.
        The embeddings are then stored in the vector database along with the metadata and ids of the movies.
        The metadata is used for filtering the movies based on the genre.
        Parameters:
            None
        Returns:
            None
        '''
        self.movie_embeddings = self.model.encode(self.processed_df['Plot'].tolist())
        print(f'Generated Vector embeddings for {len(self.movie_embeddings)} movies')

        num_rows = len(self.processed_df)
        for start_idx in range(0, num_rows, self.batch_size):
            end_idx = min(start_idx + self.batch_size, num_rows)
            batch_documents = self.processed_df['Plot'].tolist()[start_idx:end_idx]
            batch_embeddings = self.movie_embeddings[start_idx:end_idx]
            batch_metadatas = self.create_metadata(self.processed_df['Genre'][start_idx:end_idx].tolist())
            batch_ids = self.processed_df.index.astype(str).tolist()[start_idx:end_idx]
            
            self.collection.upsert(
                documents=batch_documents,
                embeddings=batch_embeddings,
                metadatas=batch_metadatas,
                ids=batch_ids
            )
            # print(f'\tInserted batch of documents:',{start_idx}, 'to', {end_idx})

    
    def analyze_genre_from_query(self, query):
        '''
        This function is used to extract the genres that user has mentioned in the query.
        The genres are extracted by splitting the query into words and checking if the words are in the valid genres.

        Parameters:
            query (str): Query string to analyze for genres
        Returns:
            (list): List of genres from the query string compatible with chromadb query method for faster query
            raises ValueError if no valid genres are found in the query.
        '''

        tokenized_query = [ self.get_tokenized_text(word) for word in query.split() if word.lower() not in self.stop_words]
        vectorized_query = self.vectorizer.transform(tokenized_query)
        vectorized_genres = self.vectorizer.transform(self.processed_genres)
        cosine_similarities = cosine_similarity(vectorized_query, vectorized_genres)
        extracted_genres = set()
        top_indices = np.argsort(cosine_similarities.flatten())[::-1]

        top_pairs = [np.unravel_index(index, cosine_similarities.shape) for index in top_indices[:3]]
        for i, j in top_pairs:
            # print(f'Query word: {tokenized_query[i]} - Genre: {self.processed_genres[j]} - Similarity: {cosine_similarities[i, j]}')
            extracted_genres.add(self.tokenized_dictionary[self.processed_genres[j]])

        # print(f'Extracted genres from query: {extracted_genres}')

        if not extracted_genres:
            print(f'No valid genres found in the query. Please check the query and try again. Defaulting to "Comedy" genre')
            # raise ValueError(f'No valid genres found in the query. Please check the query and try again')
        
        # Create a query filter dictionary
        genre_filter = {
            'genre_combined': '|'.join(sorted(list(extracted_genres)))
        }
        
        # Add individual genre flags
        for genre in extracted_genres:
            genre_filter[f'{genre.strip().lower()}'] = True
        
        return genre_filter

    def delete_collection(self):
        '''
        This function is used to delete the collection from the database.
        The function first checks if the collection exists and then deletes the collection.
        Parameters:
            None
        Returns:
            None
        '''
        if self.client.get_collection(name="movie_recommender") is not None:
            self.client.delete_collection(name="movie_recommender")
            print('Collection deleted')
        else:
            print('Collection does not exist')

        
    def recommend_movies(self,query,num_movies=5):
        '''
        This function is used as an abstrcation to recommend movies based on the query provided by the user.
        The function first checks if the query is valid and then creates a query embedding using the sentence transformer model.
        The query embedding is then used to query the vector database for the most similar movies.
        The query is filtered based on the genres that are present in the query.
        Parameters:
            query (str): Query string to search for movies.
            num_movies (int): Number of movies to recommend. Default 5
        Returns:
            (pd.DataFrame): DataFrame containing the recommended movies and their scores.
        '''

        assert query is not None, f'query should not be None'
        assert isinstance(query, str), f'query should be a string, but got {type(query)}'
        assert num_movies is not None, f'num_movies should not be None'
        assert isinstance(num_movies, int), f'num_movies should be an integer, but got {type(num_movies)}'
        assert num_movies > 0, f'num_movies should be greater than 0, but got {num_movies}'

        self.num_movies = num_movies
        self.query_string = query
        self.query_embedding = self.model.encode([self.query_string])
        genre_filter = self.analyze_genre_from_query(self.query_string)
        
        or_conditions = []
    
        for key, value in genre_filter.items():
            if key != 'genre_combined' and value is True:
                or_conditions.append({key: True})
        
        if or_conditions:
            results = self.collection.query(
                query_embeddings=self.query_embedding,
                n_results=self.num_movies,
                where={"$or": or_conditions}
            )
        else:
            results = self.collection.query(
                query_embeddings=self.query_embedding,
                n_results=self.num_movies
            )
        
        ids = [int(i) for i in results['ids'][0]  ]
        returned_df = self.processed_df.loc[ids]
        returned_df['Score'] = results['distances'][0]
        returned_df.drop(columns=['Plot'],inplace=True)
        return returned_df

# Step 1
## Dataset Download
The Dataset [Link]('https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots') needs to be downloaded prior to running the script. Manual download is the quickest and easiest option. Otherwise downloading from kagglehub requires authentication which is not feasible if there is no kaggle account. \
Download and extract the csv file to the location where the script exists or make sure to have the location of the file which is needed in step 2

**Note**: If the link doesn't work copy paste the URL in the browser.

# Step 2
## Class Instantiation
Create an instance of the Movie recommender class \
Arguments :
1) **movies_df_path** - Path to the movies csv file. Must use the 
2) **num_rows (optional)** - This parameter specifies the no. of data points we want our vector database to hold. \
\
When the class is instantiated a vector database is created using the num of samples specified by the num_rows arguments by converting randomly sampled data points from the csv file. \
**Note**:
This process takes some time depending on the size of the data we want to store in our vector database. The larger data we want to process the more time it takes to convert the text information into vector database. But once done querying the database is extremely fast and robust. 

In [4]:
recommender = RecommendMovies(movies_df_path='wiki_movie_plots_deduped.csv',num_rows=1000)

Starting data preprocessing
Data Preprocessing completed in 5.0011 seconds
Starting database Engine
Creating a new collection
Starting Vector database creation
Generated Vector embeddings for 1000 movies
Vector database creation completed in 74.9106 seconds


# Step 3:
## Inference:
We use the recommend_movies function the class to obtain the set of recommended movies. \
Arguments:
1) **query** : A query string from the user to indicate which types of movies the user want. A typical query contains ceratin keyworsd related to the genre the user wants \
    **Ex: "I like to watch romantic movies with subtle comedy"**
2) **num_movies** : Number of movies the user wants us to recommend


In [5]:
recommender.recommend_movies(query='I love thrilling action movies set in space, with a comedic twist', num_movies=20)

Unnamed: 0,Title,Genre,Score
632,Sex & Fury,action,1.373893
427,Hotel California,action|comedy,1.417168
831,Chhalia,action,1.468545
467,Hiraasat,action|crime|drama,1.477725
272,Adhurs,action|comedy,1.482263
123,Circus,action|romance,1.527919
45,"Cold Light of Day, TheThe Cold Light of Day",action|thriller,1.536551
994,The Spy Next Door,action|comedy|family,1.556574
309,Asuravithu (അസുരവിത്ത്‌),action,1.566815
757,Pirates of the Caribbean: On Stranger Tides,action|adventure|family,1.605405


In [6]:
recommender.recommend_movies(query='Spy investigation', num_movies=15)

Unnamed: 0,Title,Genre,Score
866,Crime Story,action|crime,1.199918
644,Love and Bullets,crime,1.246174
829,Shamus,crime|drama,1.277492
502,Absolute Power,crime|drama,1.282111
11,After Tonight,spy,1.321421
941,The Long Dark Hall,crime,1.350478
936,Veronica Guerin,biography|crime,1.358506
598,The Mole Song: Hong Kong Capriccio,action|comedy|drama|science_fiction|spy|superhero,1.365715
320,The Diamond,crime,1.377871
811,Rent-A-Cop,crime|drama,1.378564


In [7]:
recommender.recommend_movies(query='I prefer war documentaries', num_movies=5)

Unnamed: 0,Title,Genre,Score
705,Dunkirk,war,1.290211
746,The Eagle and the Hawk,drama|war,1.366564
899,Al Franken: God Spoke,documentary,1.43053
737,Desperate Journey,war,1.477732
666,Hell Boats,war,1.529561


In [8]:
recommender.recommend_movies(query='comedy or rom-com or romance', num_movies=10)

Unnamed: 0,Title,Genre,Score
447,Insomnia Lover,comedy|romance,1.31518
969,Lunch Hour,comedy,1.337848
917,Gas-s-s-s,comedy,1.443974
153,Her Minor Thing,comedy,1.504155
507,Happy Though Married,comedy,1.5174
722,How to Murder Your Wife,comedy,1.518565
50,Angst,comedy,1.524497
943,Butterflies Are Free,comedy,1.527551
427,Hotel California,action|comedy,1.543629
18,Star of Midnight,comedy|mystery,1.545021


In [9]:
recommender.recommend_movies(query='horror', num_movies=10)

Unnamed: 0,Title,Genre,Score
208,The Avenging Conscience,drama|horror,1.371714
875,Halloween II,horror,1.413355
622,Mastan,romance,1.44787
245,Black Water Vampire,horror,1.448597
40,Vampire in Brooklyn,comedy|horror,1.487868
645,Aftershock,horror|thriller,1.494793
109,The Mirror,horror,1.507972
461,The Sender,horror,1.515146
464,Intruder,horror,1.526382
493,Voices,horror|thriller,1.528132


In [10]:
recommender.recommend_movies(query='Animation', num_movies=10)

Unnamed: 0,Title,Genre,Score
39,Who Killed Who?,animation|short,1.45507
841,Yeh Khula Aasmaan,romance,1.480498
647,The Magic Roundabout,animation,1.536697
254,Holiday,romance,1.566293
615,Iti Srikanto,romance,1.622743
480,Crayon Shin-chan: The Storm Called: Operation ...,animation,1.639123
197,That's My Mommy,animation,1.64474
789,Typhoon Noruda,animation,1.65315
761,Millennium Actress,animation|drama,1.681293
548,Sahara Hare,animation|short,1.685677


In [11]:
recommender.recommend_movies(query='I watch movies', num_movies=10)

Unnamed: 0,Title,Genre,Score
292,8 Thottakkal,crime|thriller,1.535536
467,Hiraasat,action|crime|drama,1.626542
323,The Hit,crime|drama,1.683899
578,Three Steps in the Dark,crime,1.694323
515,Cry of the Hunted,crime|thriller,1.694747
941,The Long Dark Hall,crime,1.695734
555,Once a Thief,action|comedy|crime,1.706638
810,Undoing,crime|drama,1.741588
560,Bin Bulaye Baraati,crime,1.744
522,Shinjuku Incident,crime|thriller,1.751114


In [27]:
recommender.recommend_movies(query='I am a geek of sports documentaries', num_movies=10)

Unnamed: 0,Title,Genre,Score
125,Borat! Cultural Learnings of America for Make ...,documentary,1.582455
814,Idol of the Crowds,drama|sport,1.612937
210,Chalk,documentary,1.617091
899,Al Franken: God Spoke,documentary,1.756013
923,Hurricane on the Bayou,documentary,1.837991


# Explanation
The choice of vector databases is not arbitrary. The same problem can be tackled with a simple cosine similarity based tf-idf and count vectorizer that naively maps the given query by tokenizing the words and compare with the database text. However, it is not an efficient approach both interms of computation and similarity.
1) The tf-idf based approach computes the similarity score for every vector of the text from the database which is slow and in-efficient.
2) Tokenization using nltk with stop words is helpful upto a point, but extracting similarity with just vectorized tokens is doesn't capture semantic meanings.
3) Also the cosine similarity is not a good metric to compare two vectors for semantic similarity because cosine similarity only gives the direction in which the vector is pointing not how the meaning is structured in the sentences. Different vectors pointing in same direction have higher similarity which might indicate high similarity score for two different sentences.


### References
1) The follwoing [Medium]('https://arupnanda.medium.com/lowdown-on-vector-databases-part-4-56840110babb') article has a decent explanation of vector databases and its implementation details.
2) Dataset [Link]('https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots/data')