In [37]:
from typing import Union

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
# Retrieved from https://grouplens.org/datasets/movielens/

movies = pd.read_csv("dataset/ml-latest/movies.csv")
ratings = pd.read_csv("dataset/ml-latest/ratings.csv")

In [39]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [40]:
import re

def clean_text(text: str, re_pattern: str ="[^a-zA-Z0-9 ]", replace_string="") -> str:
    """
    This method is used to replace the given regular expression pattern with the replace_string
    :param text: the input text
    :param re_pattern: regular expression pattern
    :param replace_string: string to be used for replacement
    :return: text after replacement
    """
    text = re.sub(re_pattern, replace_string, text)
    return text

In [41]:
def find_match(text: str, re_pattern: str) -> Union[str, None]:
    """
    This method is used to find the substring that matches the given regular expression pattern
    :param text: input text
    :param re_pattern: regular expression pattern
    :return: The substring or None
    """
    match = re.search(re_pattern, text)
    if match:
        return match.group(1)
    else:
        return None

In [42]:
def concatenate_columns(df: pd.DataFrame, cols_list: list) -> pd.Series:
    """
    This method is used to concatenate the values in multiple columns into a new column
    :param df: input dataframe
    :param cols_list: list of columns to be concatenated
    :return: the new column with concatenated values (pandas series)
    """
    return df[cols_list].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [43]:
def clean_title(text: str, re_pattern: str) -> str:
    """
    This method is used to clean the input text by replacing the regular expression with empty string
    and removing unnecessary spaces
    :param text: input text
    :param re_pattern: regular expression
    :return: cleaned text
    """
    text = clean_text(text=text, re_pattern=re_pattern)
    text = text.strip()
    return text

In [44]:
def find_movie(title: str, year: Union[str, int]) -> pd.DataFrame:
    """
    This method is used to find a movie record in the movies dataset based on the given movie title and year values
    :param title: movie title
    :param year: year
    :return: the identified movie record
    """
    title = title.lower()
    year = str(year)
    movies_copy = movies.copy()
    movies_copy["title"] = movies_copy["title"].apply(lambda text: text.lower())
    movie_record = movies_copy[
        (movies_copy['title'] == title) &
        (movies_copy['year'] == year)
        ]
    return movie_record

In [45]:
movies["genres"] = movies["genres"].apply(lambda text: clean_text(text=text, replace_string=" "))

year_re_pattern = r'\((\d{4})\)'

movies["year"] = movies["title"].apply(lambda text: find_match(text=text, re_pattern=year_re_pattern))

movies["title"] = movies["title"].apply(lambda text: clean_title(text=text, re_pattern=year_re_pattern))

movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995
1,2,Jumanji,Adventure Children Fantasy,1995
2,3,Grumpier Old Men,Comedy Romance,1995
3,4,Waiting to Exhale,Comedy Drama Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [46]:
def content_based_filtering(title: str, year: Union[int, str]) -> pd.DataFrame:
    """
    This method is used to perform content based filtering on the movies dataset to find 10 movies
    that are similar to the input movie
    :param movies:
    :param year:
    :param title: movie title
    :return:
    """

    movie_record = find_movie(title, year)
    genres = movie_record["genres"].iloc[0]

    print(genres)

    # concatenate title and genres into a single column for better results
    # movies["title_genres"] = concatenate_columns(movies, ["title", "genres"])

    # Initialize TfidfVectorizer
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    tfidf = vectorizer.fit_transform(movies["genres"])

    # title = clean_text(title)
    query_vector = vectorizer.transform([genres])
    similarity = cosine_similarity(query_vector, tfidf).flatten()
    indices = np.argpartition(similarity, -10)[-10: ]
    result = movies.iloc[indices].iloc[::-1]
    return result

In [47]:
def collaborative_filtering(title: str, year: Union[int, str]) -> pd.DataFrame:
    """
    This method is used to perform collaborative filtering on the movies and ratings dataset
    to find 10 movies that are similar to the given movie
    :param title: movie title
    :param year: movie release year
    :return: dataframe with 10 recommended movies
    """
    good_rating_threshold = 2.5
    movies_liked_by_similar_users_percentage = 0.1
    movie_record = find_movie(title, year)
    movie_id = movie_record["movieId"].iloc[0]

    # Find the other users who liked the given movie. Call them similar_users
    similar_users = ratings[
        (ratings["movieId"] == movie_id) &
        (ratings['rating']> good_rating_threshold)
    ]["userId"].unique()

    # Find the other movies liked by similar_users - call them similar_users_records
    similar_users_records = ratings[
        (ratings["userId"].isin(similar_users)) &
        (ratings["rating"] > good_rating_threshold)
    ]["movieId"]
    
    # Calculate the percentage of how many users in similar_users liked each movie
    # Get the number of users liked each movie.
    # Divide the number by the number of users.
    similar_users_records = similar_users_records.value_counts() / len(similar_users)

    # Find the movies liked by more than 10 percentage of the similar users
    similar_users_records = similar_users_records[similar_users_records > movies_liked_by_similar_users_percentage]

    # Find the other users who liked the movies that are liked by similar_users - call them all_users
    all_users = ratings[
        (ratings["movieId"].isin(similar_users_records.index)) &
        (ratings["rating"] > good_rating_threshold)
    ]

    # Calculate the percentage of how many users in the whole dataset liked each movie
    all_users_records = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    records_percentages = pd.concat([similar_users_records, all_users_records], axis=1)

    records_percentages.columns = ["similar", "all"]

    records_percentages["score"] = records_percentages["similar"]/records_percentages["all"]

    records_percentages = records_percentages.sort_values("score", ascending=False)

    recommendations = records_percentages.head(100).merge(movies, left_index=True, right_on="movieId")

    recommendations = recommendations.loc[:, ["title", "year", "genres"]]
    return recommendations

In [48]:
recommended_movies = collaborative_filtering("sherlock holmes", 2009)
print(recommended_movies.head(10))

                                     title  year  \
14108                      Sherlock Holmes  2009   
17476   Sherlock Holmes: A Game of Shadows  2011   
14737  Prince of Persia: The Sands of Time  2010   
15057                     Expendables, The  2010   
14811                          A-Team, The  2010   
20028                       Wolverine, The  2013   
15414                                  Red  2010   
17183                           Real Steel  2011   
19027                         Jack Reacher  2012   
25125                    X-Men: Apocalypse  2016   

                                               genres  
14108                   Action Crime Mystery Thriller  
17476  Action Adventure Comedy Crime Mystery Thriller  
14737           Action Adventure Fantasy Romance IMAX  
15057                       Action Adventure Thriller  
14811                          Action Comedy Thriller  
20028                 Action Adventure Fantasy Sci Fi  
15414                              

In [49]:
content_based_filtering("sherlock holmes", 2009)

Action Crime Mystery Thriller


Unnamed: 0,movieId,title,genres,year
29597,133923,Bulldog Drummond Comes Back,Action Crime Mystery Thriller,1937
62162,204602,Fly By Night,Action Crime Mystery Thriller,2019
71458,229961,Last Three Days,Action Crime Mystery Thriller,2020
81388,273129,Memory,Action Crime Mystery Thriller,2022
34356,144592,Overheard 3,Action Crime Mystery Thriller,2014
84353,281828,Paradise City,Action Crime Mystery Thriller,2022
18709,97742,Alex Cross,Action Crime Mystery Thriller,2012
17465,91509,Fire of Conscience (For lung),Action Crime Mystery Thriller,2010
33256,142162,The Payoff,Action Crime Mystery Thriller,1942
61746,203683,Killers Anonymous,Action Crime Mystery Thriller,2019
