# Movie Recommendation Engine

Started as a follow-along project with [Dataquest's walkthrough](https://www.youtube.com/watch?v=eyEabQRBMQA), and grew into a more fleshed-out personal project.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Directory holding the CSV files read below; change this one constant to
# point the notebook at a different copy of the data.
DATA_DIR = "data"

# Core tables: the movie catalogue and the user ratings.
movies = pd.read_csv(f"{DATA_DIR}/movies.csv")
ratings = pd.read_csv(f"{DATA_DIR}/ratings.csv")

# Supplementary tables: links, user-supplied tags, and the tag-genome
# relevance scores together with their tag lookup table.
links = pd.read_csv(f"{DATA_DIR}/links.csv")
tags = pd.read_csv(f"{DATA_DIR}/tags.csv")
genome_scores = pd.read_csv(f"{DATA_DIR}/genome-scores.csv")
genome_tags = pd.read_csv(f"{DATA_DIR}/genome-tags.csv")

In [3]:
def clean_title(title: str) -> str:
    '''
    Strip punctuation and symbols from a movie title.

    Every character other than an ASCII letter, an ASCII digit or a plain
    space is deleted (not replaced by a space), so "Bug's Life, A (1998)"
    becomes "Bugs Life A 1998".
    '''
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [4]:
# Searchable version of each title: punctuation stripped by clean_title.
movies['cleaned_title'] = movies['title'].map(clean_title)

In [5]:
# Preview the catalogue, now including the cleaned_title column.
movies

Unnamed: 0,movieId,title,genres,cleaned_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


# Search Engine

Converting words into numbers:

1. Term frequency: get frequency of words in title
1. Inverse document frequency: helps the search engine favor distinctive terms ("the" appears in a lot of titles, but "harry" does not)
1. Tf\*Idf: Get a vector for each movie

Search:

1. Convert the search text into numbers like above
1. Find row with the most number of matches via similarity

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# ngram_range=(1, 2): index single words and pairs of adjacent words
# ex: Toy Story 1995: "toy", "story", "1995"; "toy story", "story 1995"
# NOTE(review): with the vectorizer's default tokenizer settings, one-character
# tokens (e.g. the "2" in "Toy Story 2") are presumably ignored - confirm
# against the TfidfVectorizer docs if sequels need to be told apart.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# One TF-IDF row per cleaned movie title.
tfidf = vectorizer.fit_transform(movies['cleaned_title'])

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

def search(title: str) -> pd.DataFrame:
    '''
    Find the catalogue titles most similar to a free-text query.

    The query is cleaned with `clean_title`, projected into the fitted
    TF-IDF space and compared with every movie title by cosine similarity.

    Parameters
    ----------
    title : str
        Raw search text, e.g. "Toy Story".

    Returns
    -------
    pd.DataFrame
        Up to 5 rows of `movies` plus a `similarity` column, ordered from
        most to least similar (so `.iloc[0]` is the best match).
    '''
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    # One cosine-similarity score per row of `movies`.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition only guarantees that the k largest scores end up in the
    # last k slots - not that those k are ordered - so rank them explicitly.
    # Clamping k keeps this working on catalogues with fewer than 5 titles.
    top_k = min(5, similarity.size)
    candidates = np.argpartition(similarity, -top_k)[-top_k:]
    # Stable sort on the negated scores: descending, deterministic on ties.
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]

    # Attach scores to the selected rows only, rather than copying the whole
    # `movies` frame on every query.
    return movies.iloc[ranked].assign(similarity=similarity[ranked])

## Search Box

In [9]:
import ipywidgets as widgets
from IPython.display import display

In [10]:
# Output area for the results, and a text box pre-filled with an example query.
movie_list = widgets.Output()
movie_input = widgets.Text(
    description="Movie Title:",
    value="Toy Story",
    disabled=False,
)


def on_type(data):
    """Refresh the result table whenever the text box's value changes."""
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        # Queries of 5 characters or fewer are skipped.
        if len(query) <= 5:
            return
        display(search(query))


movie_input.observe(on_type, names="value")
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

# Recommendation Engine

1. Find all users who liked the search title
1. Find the other movies that those users liked

In [11]:
# Preview the ratings table: one row per (userId, movieId) rating.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [12]:
# Check the column dtypes before filtering on movieId and rating.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [13]:
# Worked example: movieId 1 is "Toy Story (1995)" in `movies`.
movie_id = 1
# A rating at or above this value counts as "liked".
high_rating = 4

In [14]:
# Rows where the rating is high enough to count as "liked".
liked = ratings['rating'] >= high_rating

# Users who liked the movie being searched for.
similar_users = ratings.loc[liked & (ratings['movieId'] == movie_id), 'userId'].unique()

# Every movie those users liked (one entry per rating, so ids repeat).
similar_user_recs = ratings.loc[liked & ratings['userId'].isin(similar_users), 'movieId']

In [15]:
# movieIds liked by the similar users: one entry per rating, so ids repeat.
similar_user_recs

254              1
255             29
256             32
257             50
258            111
             ...  
24999332    166643
24999342    171763
24999348    177593
24999351    177765
24999378    198609
Name: movieId, Length: 5101989, dtype: int64

In [16]:
# Share of similar users who liked each movie; keep only the movies liked
# by more than 10% of them (a share threshold, not a "top 10%" cut).
liked_share = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = liked_share[liked_share > 0.1]
similar_user_recs

1       1.000000
318     0.549604
260     0.531518
356     0.517224
296     0.495744
          ...   
235     0.101249
1242    0.100931
1907    0.100772
3527    0.100613
2761    0.100135
Name: movieId, Length: 273, dtype: float64

In [17]:
# Next: find what share of *all* users liked these same movies.
# If users in general like a movie about as often as the "similar users" do,
# it is probably just popular with everyone rather than specific to our movie.

In [18]:
# Rows of `ratings` where any user gave one of the candidate movies
# (the index of similar_user_recs) a rating of at least `high_rating`.
all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] >= high_rating)]

In [19]:
# `all_users` (built in the previous cell) already holds every rating of at
# least `high_rating` for a candidate movie, so the filter over the full
# ratings table is not repeated here.
# Result: for each candidate movie, the fraction of those users who liked it.
all_user_recs = all_users['movieId'].value_counts() / all_users['userId'].nunique()

We need to find a movie where:

1. Everyone who liked search_term, also liked rec_term
1. Only X% (40%, ex) of those who liked rec_term also liked search_term

In [20]:
# For each candidate movie: fraction (0-1) of the users in `all_users`
# who rated it at least `high_rating`.
all_user_recs

318     0.440215
296     0.389659
356     0.367553
593     0.361897
2571    0.347994
          ...   
3175    0.049325
2081    0.047128
1282    0.044712
2761    0.039855
1907    0.039805
Name: movieId, Length: 273, dtype: float64

In [21]:
# Line the two shares up per movieId, naming the columns as they are joined.
rec_percentages = pd.concat(
    [similar_user_recs, all_user_recs],
    axis=1,
    keys=['similar_ppl', 'all_ppl'],
)

In [22]:
# Side by side: share among similar users vs. share among all users who
# liked at least one candidate movie.
rec_percentages

Unnamed: 0,similar_ppl,all_ppl
1,1.000000,0.235415
2,0.105598,0.051667
6,0.162879,0.097377
10,0.122623,0.072924
11,0.101408,0.058952
...,...,...
91529,0.120422,0.085416
99114,0.112732,0.091209
109487,0.117426,0.102603
112852,0.102681,0.067698


In [23]:
# score = how much more often similar users liked a movie than users overall
# (ratio of the two shares); best-scoring movies first.
rec_percentages = rec_percentages.assign(
    score=rec_percentages['similar_ppl'] / rec_percentages['all_ppl']
).sort_values('score', ascending=False)

In [24]:
# Attach title and genres to the ten best-scoring movies; the index of
# rec_percentages holds the movieId values.
top_recs = rec_percentages.head(10)
top_recs.merge(movies, left_index=True, right_on='movieId')

Unnamed: 0,similar_ppl,all_ppl,score,movieId,title,genres,cleaned_title
0,1.0,0.235415,4.247819,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.328914,0.102241,3.217054,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,0.161924,0.05771,2.80584,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
2264,0.191095,0.068978,2.770367,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
1992,0.120714,0.047128,2.561408,2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance,Little Mermaid The 1989
1818,0.100772,0.039805,2.531636,1907,Mulan (1998),Adventure|Animation|Children|Comedy|Drama|Musi...,Mulan 1998
2669,0.100135,0.039855,2.512494,2761,"Iron Giant, The (1999)",Adventure|Animation|Children|Drama|Sci-Fi,Iron Giant The 1999
1005,0.12806,0.054719,2.340299,1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical,Mary Poppins 1964
1047,0.231801,0.099113,2.338762,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
1249,0.103636,0.044712,2.317855,1282,Fantasia (1940),Animation|Children|Fantasy|Musical,Fantasia 1940


Not bad! For `Toy Story` we recommended the Toy Story series, A Bug's Life, etc.

In [25]:
def find_similar_movies(
    movie_id: int,
    high_rating: int = 4,
    min_share: float = 0.1,
    top_n: int = 10,
) -> pd.DataFrame:
    '''
    Recommend movies that fans of `movie_id` like unusually often.

    "Similar users" are those who rated `movie_id` at least `high_rating`.
    Each candidate movie is scored as the share of similar users who liked
    it divided by the share of all users (who liked any candidate) who liked
    it, so generally popular movies are not favored just for being popular.

    Parameters
    ----------
    movie_id : int
        movieId of the movie to find recommendations for.
    high_rating : int, default 4
        Minimum rating that counts as "liked".
    min_share : float, default 0.1
        A movie is a candidate only if more than this fraction of similar
        users liked it.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pd.DataFrame
        Columns `score`, `title`, `genres`, best score first. The searched
        movie itself is normally included. Empty if nobody liked `movie_id`.
    '''
    # Build the "liked" mask once and reuse it for every filter below.
    liked = ratings['rating'] >= high_rating

    # Users who liked the movie being searched for.
    similar_users = ratings.loc[liked & (ratings['movieId'] == movie_id), 'userId'].unique()
    if len(similar_users) == 0:
        # No fans (or unknown id): nothing to base a recommendation on, and
        # the shares below would be divisions by zero.
        return pd.DataFrame(columns=['score', 'title', 'genres'])

    # Share of similar users who liked each movie, thresholded at min_share.
    similar_user_recs = ratings.loc[liked & ratings['userId'].isin(similar_users), 'movieId']
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of all users (who liked any candidate) who liked each candidate.
    all_users = ratings[liked & ratings['movieId'].isin(similar_user_recs.index)]
    all_user_recs = all_users['movieId'].value_counts() / all_users['userId'].nunique()

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ['similar_ppl', 'all_ppl']

    # score = ratio of similar:avg users who liked movie
    rec_percentages['score'] = rec_percentages['similar_ppl'] / rec_percentages['all_ppl']
    rec_percentages = rec_percentages.sort_values('score', ascending=False)

    # The index of rec_percentages is the movieId.
    top_recs = rec_percentages.head(top_n).merge(movies, left_index=True, right_on='movieId')
    return top_recs[['score', 'title', 'genres']]

## Widget

In [26]:
# `value` is the keyword that sets the text box's initial content. The
# previous `initial_value` keyword was ignored, so the box rendered empty.
movie_input_name = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)

In [27]:
recommendation_list = widgets.Output()

def on_type(data):
    '''
    Show recommendations for the best title match of the typed text.

    Runs on every change of the text box; queries of 5 characters or fewer
    are ignored.
    '''
    with recommendation_list:
        recommendation_list.clear_output()
        title = data['new']
        if len(title) > 5:
            results = search(title)
            # Pick the best match by its similarity score instead of relying
            # on the row order of the search results.
            scores = results['similarity'].to_numpy()
            best_pos = scores.argmax()
            if scores[best_pos] <= 0:
                # Nothing in the catalogue shares a term with the query, so
                # any "match" would be arbitrary - show no recommendations.
                return
            movie_id = results.iloc[best_pos]['movieId']
            display(find_similar_movies(movie_id))

movie_input_name.observe(on_type, names='value')
display(movie_input_name, recommendation_list)

Text(value='', description='Movie Title:')

Output()

# Adding Tags as a Feature

In [3]:
# Attach the genome tagId to each user tag by exact tag text. The join key is
# spelled out rather than inferred from shared column names, and any tagId
# column left by an earlier run of this cell is dropped first so that
# re-running the cell produces the same frame.
tags = tags.drop(columns='tagId', errors='ignore').merge(genome_tags, how='left', on='tag')
tags.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1093360 entries, 0 to 1093359
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   userId     1093360 non-null  int64  
 1   movieId    1093360 non-null  int64  
 2   tag        1093344 non-null  object 
 3   timestamp  1093360 non-null  int64  
 4   tagId      501955 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 50.1+ MB


In [4]:
# Distinct movies with any user tag vs. distinct movies with at least one
# tag that matched a genome tag (tagId not missing after the merge).
num_movies = tags['movieId'].nunique()
num_real_movies = tags.loc[tags['tagId'].notna(), 'movieId'].nunique()

By only using movies that have at least one tag with a matching `tagId`, we lose {{ num_movies - num_real_movies }} movies
and are left with {{ num_real_movies }} movies to analyze.

In [5]:
def pick_highest_relevant_tag_score(sub_df: pd.DataFrame) -> pd.Series:
    '''
    Return the row of `sub_df` that has the highest `relevance` value.

    Intended for use with `groupby(...).apply(...)`. The first column is
    left out of the returned row (the movieId column, since the groupby
    makes that the index anyway) - this assumes movieId is the first column.

    Parameters
    ----------
    sub_df : pd.DataFrame
        The rows of one group; must contain a `relevance` column.

    Returns
    -------
    pd.Series
        The winning row without its first column. On ties the first row
        with the maximum relevance is returned.
    '''
    # A single pass for the maximum replaces sorting the whole group.
    top_row = sub_df['relevance'].argmax()
    return sub_df.iloc[top_row, 1:]
    

In [6]:
# For every movie keep only its single most relevant genome tag, then turn
# the movieId group key back into an ordinary column.
most_relevant_tag = (
    genome_scores
    .groupby('movieId')
    .apply(pick_highest_relevant_tag_score)
    .reset_index()
)

In [7]:
# Sanity check: every tagged movie has exactly as many score rows as there
# are distinct tags.
num_unique_tags = genome_scores['tagId'].nunique()
num_movies_tagged = genome_scores['movieId'].nunique()
scores_per_movie = genome_scores['movieId'].value_counts()
scores_per_movie.eq(num_unique_tags).sum() == num_movies_tagged

True

In [41]:
# First row (in table order) whose relevance is at least 0.5, as a Series.
relevant_scores = genome_scores.loc[genome_scores['relevance'] >= 0.5]
relevant_scores.iloc[0]

movieId       1.00000
tagId        11.00000
relevance     0.58025
Name: 10, dtype: float64

In [42]:
# All (movie, tag) rows whose relevance is at least 0.5.
genome_scores.loc[genome_scores['relevance'].ge(0.5)]

Unnamed: 0,movieId,tagId,relevance
10,1,11,0.58025
18,1,19,0.66250
28,1,29,0.89375
29,1,30,0.67625
60,1,61,0.61750
...,...,...,...
15584291,206499,972,0.60600
15584311,206499,992,0.51225
15584327,206499,1008,0.52500
15584333,206499,1014,0.59775


In [36]:
# Summary statistics for the genome scores, rounded to two decimals.
genome_scores.describe().round(2)

Unnamed: 0,movieId,tagId,relevance
count,15584448.0,15584448.0,15584448.0
mean,46022.49,564.5,0.12
std,55352.21,325.63,0.15
min,1.0,1.0,0.0
25%,3853.75,282.75,0.02
50%,8575.5,564.5,0.06
75%,80186.5,846.25,0.14
max,206499.0,1128.0,1.0


In [34]:
# Every genome score joined to its movie's details, listed from the highest
# movieId down and, within a movie, from most to least relevant tag.
scores_with_titles = pd.merge(genome_scores, movies, on='movieId')
scores_with_titles.sort_values(by=['movieId', 'relevance'], ascending=[False, False])

Unnamed: 0,movieId,tagId,relevance,title,genres
15584061,206499,742,0.88600,Between Two Ferns: The Movie (2019),Comedy
15584096,206499,777,0.80825,Between Two Ferns: The Movie (2019),Comedy
15583922,206499,603,0.76725,Between Two Ferns: The Movie (2019),Comedy
15584174,206499,855,0.75850,Between Two Ferns: The Movie (2019),Comedy
15584120,206499,801,0.74475,Between Two Ferns: The Movie (2019),Comedy
...,...,...,...,...,...
118,1,119,0.00375,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
975,1,976,0.00325,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
488,1,489,0.00275,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1001,1,1002,0.00275,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
