In [90]:
import pandas as pd
# Data set taken from https://files.grouplens.org/datasets/movielens/ml-25m.zip
# movies.csv is read from the notebook's working directory (relative path).
movies = pd.read_csv("movies.csv")

In [91]:
# Movie titles contain many special characters.
# When searching for a movie we don't usually type any special characters,
# so let's remove them and keep the result in a separate column named clean_title.
import re

def clean_title(title):
    """Return ``title`` stripped of everything except ASCII letters, digits and spaces.

    Characters outside ``a-z``, ``A-Z``, ``0-9`` and the space character are
    deleted (not replaced), so the remaining characters keep their order.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [92]:
# Add a clean_title column to the movies dataset by applying clean_title to every title.
movies["clean_title"] = movies["title"].apply(clean_title)

In [93]:
# Now that the movies dataset is ready, let's start processing it.
# First, build the TF-IDF matrix from the dataset.
# This requires the scikit-learn, scipy and matplotlib packages:
# pip3 install -U scikit-learn scipy matplotlib
# pip install -U scikit-learn scipy matplotlib

from sklearn.feature_extraction.text import TfidfVectorizer
# Use unigrams and bigrams (ngram_range=(1,2)) as the search features.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit the vectorizer on the clean_title column and transform those titles
# into the TF-IDF matrix that the search compares queries against.
tfidf_matrix = vectorizer.fit_transform(movies["clean_title"])

In [94]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# This function uses cosine similarity to measure how closely each movie title matches the search term.
def search_movie(title, num_results):
    """Return the rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared against ``tfidf_matrix`` using cosine
    similarity. Rows are returned best match first.

    Parameters
    ----------
    title : str
        Free-text search term.
    num_results : int
        Maximum number of rows to return. Values larger than the number of
        indexed titles are clamped; values <= 0 yield an empty frame.

    Returns
    -------
    pandas.DataFrame
        Up to ``num_results`` rows of ``movies``, ordered by descending
        similarity to the query.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Clamp the request: argpartition raises when k exceeds the array size,
    # and a slice of [-0:] would silently return every row.
    k = min(num_results, similarity.size)
    if k <= 0:
        return movies.iloc[0:0]

    # argpartition only guarantees that the k largest scores sit in the last
    # k slots, in no particular order, so rank that small block explicitly.
    top = np.argpartition(similarity, -k)[-k:]
    ranked = top[np.argsort(similarity[top])[::-1]]

    # Positional lookup: row i of tfidf_matrix was built from row i of movies.
    return movies.iloc[ranked]

In [97]:
# for ui rendering purposes inside jupyterlab
# pip install ipywidgets
# jupyter labextension install @jupyter-widgets/jupyterlab-manager

import ipywidgets as widgets
from IPython.display import display

# Text box for the search term, pre-filled with 'Toy Story'.
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results when the title text box changes.

    ``data`` is the change notification delivered by ``observe``; only its
    ``"new"`` entry (the current text) is read. The output area is always
    cleared first; results are shown only for inputs longer than five
    characters.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        # Inputs of five characters or fewer leave the panel cleared.
        if len(query) <= 5:
            return
        display(search_movie(query, 5))

# Call on_type whenever the text box's 'value' trait changes.
movie_input.observe(on_type, names='value')


# Render the text box together with the results output area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [98]:
# Load the ratings table; the recommender below reads its userId, movieId and rating columns.
ratings = pd.read_csv("ratings.csv")

In [99]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10,
                        ratings_df=None, movies_df=None):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    "Fans" are users who rated ``movie_id`` strictly above ``min_rating``.
    For each movie the fans also rated above ``min_rating``, the share of fans
    who liked it is divided by the share of all users (among those who liked
    any candidate movie) who liked it. Higher scores mean the movie is more
    specific to the fans.

    Parameters
    ----------
    movie_id
        Value to look up in the ``movieId`` column of the ratings table.
    min_rating : number, default 4
        A rating counts as a "like" only when it is strictly greater than this.
    min_share : float, default 0.10
        Candidate movies must be liked by more than this fraction of the fans.
    top_n : int, default 10
        Number of highest-scoring movies to return.
    ratings_df : pandas.DataFrame, optional
        Table with ``userId``, ``movieId`` and ``rating`` columns. Defaults to
        the notebook-level ``ratings`` frame.
    movies_df : pandas.DataFrame, optional
        Table with ``movieId``, ``title`` and ``genres`` columns. Defaults to
        the notebook-level ``movies`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for up to ``top_n``
        movies, ordered by descending score.
    """
    # Fall back to the notebook-level frames so one-argument calls keep working.
    if ratings_df is None:
        ratings_df = ratings
    if movies_df is None:
        movies_df = movies

    # Computed once and reused by all three filters below.
    liked = ratings_df["rating"] > min_rating

    # Users who liked the movie we are searching for.
    similar_users = ratings_df[(ratings_df["movieId"] == movie_id) & liked]["userId"].unique()

    # Share of those users who liked each other movie.
    similar_user_recs = ratings_df[ratings_df["userId"].isin(similar_users) & liked]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    # Keep only movies liked by more than min_share of the similar users.
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # All "like" ratings of the candidate movies, by any user.
    all_users = ratings_df[ratings_df["movieId"].isin(similar_user_recs.index) & liked]

    # Share of those users who liked each candidate movie.
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Align both shares on movieId (the index of each Series).
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # Score = how much more often similar users like the movie than users overall.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Attach title/genres for the top_n movies; the left index holds the movieId values.
    top = rec_percentages.head(top_n)
    return top.merge(movies_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [100]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the movie to base recommendations on, pre-filled with 'Toy Story'.
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that the recommendation table is rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendations when the movie title text box changes.

    ``data`` is the change notification delivered by ``observe``; only its
    ``"new"`` entry (the current text) is read. The output area is always
    cleared first. For inputs longer than five characters, the first row
    returned by ``search_movie`` supplies the ``movieId`` passed to
    ``find_similar_movies``.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        # Inputs of five characters or fewer leave the panel cleared.
        if len(query) <= 5:
            return
        matches = search_movie(query, 5)
        first_match = matches.iloc[0]
        display(find_similar_movies(first_match["movieId"]))

# Call on_type whenever the text box's 'value' trait changes.
movie_name_input.observe(on_type, names='value')

# Render the text box together with the recommendations output area.
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()