# 1. Install necessary requirements

In [None]:
%pip install -r requirements.txt

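**Dataset:** [Wikipedia Movie Plots (Kaggle)](https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots). The notebook reads the data from `movies.csv` in the working directory.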

**TODO:** Prepare the deliverables in the morning.

# 2. Import necessary libraries

In [6]:
import sys
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 3. Implement Content-Based Recommendation Systems

**Simple Implementation**: Use TF-IDF vectorization built into sklearn alongside a cosine similarity metric to recommend movies.

- **Pros**: Simple, quick, gets the job done.
- **Cons**: TF-IDF vectorization is a bag-of-words model; it doesn't take context into account!

To address these cons, contextualized sentence embeddings such as [SBERT](https://sbert.net/) could be used instead. However, such models are more computationally expensive than `TfidfVectorizer`.

In [66]:
def vectorize(text: str, vectorizer: TfidfVectorizer):
    '''
    Vectorizes a single text with an already-fitted vectorizer.

    :param text: Text to be vectorized.
    :type text: str
    :param vectorizer: Fitted vectorizer to use.
    :type vectorizer: TfidfVectorizer
    :returns: Whatever ``vectorizer.transform`` produces for the one-document
        batch ``[text]``; for a TfidfVectorizer this is a single-row sparse matrix.
    :rtype: scipy.sparse matrix of shape (1, n_features)
    '''
    # transform() expects an iterable of documents, so wrap the text in a list.
    return vectorizer.transform([text])

def compute_similarity(a, b):
    '''
    Computes the cosine similarity between vectorized items. The cosine similarity formula is as follows:

    (a . b) / (||a|| * ||b||)

    The computation is delegated to sklearn's ``cosine_similarity``. It lives in its own
    function so the similarity metric can be customized in one place.

    :param a: Vectorized first item(s), e.g. the output of ``vectorize`` (not raw text).
    :type a: array-like or sparse matrix of shape (n_samples_a, n_features)
    :param b: Vectorized second item(s), with the same number of features as ``a``.
    :type b: array-like or sparse matrix of shape (n_samples_b, n_features)
    :returns: Pairwise similarity matrix; entry (i, j) compares row i of ``a`` with row j of ``b``.
        Cosine similarity lies in [-1, 1], and in [0, 1] for non-negative vectors such as TF-IDF weights.
    :rtype: ndarray of shape (n_samples_a, n_samples_b)
    '''
    return cosine_similarity(a, b)
    
def recommend_movies(df: pd.DataFrame, review: str, vectorizer: TfidfVectorizer, top_k=5):
    '''
    Given a user review, recommend movies based on the cosine similarity
    between the review and each movie plot.

    :param df: DataFrame containing movies and corresponding summaries; must have
        ``Title`` and ``Plot`` columns.
    :type df: pd.DataFrame
    :param review: User sentiment on movies.
    :type review: str
    :param vectorizer: Trained vectorizer object to transform datapoints.
    :type vectorizer: TfidfVectorizer
    :param top_k: Top K recommendations and scores to return to the user; by default 5.
        Applied as a slice bound, so fewer results are returned when fewer movies exist.
    :type top_k: int
    :returns: Top k recommended movies based on a user's review, best match first.
        Ties keep the order in which the movies appear in ``df``.
    :rtype: list of ((movie, plot), similarity score) tuples, where each score is an
        ndarray of shape (1, 1).
    '''
    # Nothing to rank; also avoids handing a zero-row matrix to the similarity metric.
    if df.empty:
        return []

    # Identical (title, plot) rows would score identically; keep the first
    # occurrence only so the same movie is never recommended twice.
    movies = df.drop_duplicates(subset=['Title', 'Plot'])

    vectorized_review = vectorize(review, vectorizer)
    # Transform every plot in a single call rather than once per row.
    vectorized_plots = vectorizer.transform(movies['Plot'])

    # One similarity call for the whole catalogue; row 0 belongs to the review.
    scores = np.asarray(compute_similarity(vectorized_review, vectorized_plots))[0]

    # Stable sort on the negated scores: descending order, ties in DataFrame order.
    ranking = np.argsort(-scores, kind='stable')[:top_k]

    titles = movies['Title'].to_numpy()
    plots = movies['Plot'].to_numpy()
    # Each score is wrapped as a (1, 1) array to keep the shape callers already expect.
    return [((titles[i], plots[i]), np.array([[scores[i]]])) for i in ranking]

In [67]:
def main(csv_path='movies.csv', sample_size=500, top_k=5):
    '''
    Runs the interactive recommender: loads movie plots, asks the user for a
    review and prints the most similar movies.

    :param csv_path: CSV file with at least ``Title`` and ``Plot`` columns; by default 'movies.csv'.
    :type csv_path: str
    :param sample_size: Maximum number of movies to randomly subsample; by default 500.
    :type sample_size: int
    :param top_k: Number of recommendations to print; by default 5.
    :type top_k: int
    :returns: None. Results are printed to standard output.
    '''
    # Only the title and plot are used, so the remaining columns are not loaded.
    movies = pd.read_csv(csv_path, usecols=['Title', 'Plot'])

    # Take a reproducible random subsample, capped at the dataset size so that
    # sample() does not fail on files with fewer rows than requested.
    df = movies.sample(n=min(sample_size, len(movies)), random_state=21)

    # Obtain user query
    user_review = input('Enter user review here:')

    # Create and fit vectorizer on given data; only the fitted state is needed,
    # so fit() is used instead of fit_transform().
    vectorizer = TfidfVectorizer()
    vectorizer.fit(df['Plot'])

    # Recommend users movies. Iterate over what was actually returned, since
    # there may be fewer than top_k results.
    user_recommendations = recommend_movies(df, user_review, vectorizer, top_k=top_k)
    for rank, ((title, _plot), score) in enumerate(user_recommendations, start=1):
        print(f'{rank}. {title} | Similarity Score: {np.ravel(score)[0]}')

In [64]:
# Entry point: run the interactive recommender when this cell/module is executed directly.
if __name__ == '__main__':
    main()

Enter user review here: I like to jump.


1. Thutturi | Similarity Score: 0.07066675571981461
2. The Falcon and the Co-eds | Similarity Score: 0.06031695632127222
3. Daruchini Dwip | Similarity Score: 0.05966847683344727
4. Deiva Magan | Similarity Score: 0.05549725812908711
5. Dhoom 3 | Similarity Score: 0.05535900403883018
