# Movie Recommendation Engine

Started as a follow-along project with [Dataquest's walkthrough](https://www.youtube.com/watch?v=eyEabQRBMQA), and ended up being a more fleshed out personal project.

In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as sq

import matplotlib.pyplot as plt
import seaborn as sns

import re

In [2]:
def load_data(table_name: str) -> pd.DataFrame:
    '''
    Loads table from the movie_data sqlite database
    '''
    engine = sq.create_engine("sqlite:///data/movie_data.sql")
    with engine.connect() as cnx:
        return pd.read_sql(table_name, cnx)

In [3]:
def save_data(dataset: pd.DataFrame, table_name: str, if_exists: str = 'fail', index: bool = False) -> bool:
    '''
    Saves dataframe to the movie_data sqlite database
    '''
    engine = sq.create_engine("sqlite:///data/movie_data.sql")
    try:
        with engine.connect() as cnx:
            dataset.to_sql(name=table_name, con=cnx, if_exists=if_exists, index=index)
            return True
    except:
        print("Does this table already exist in movie_data.sql?")
        return False

In [4]:
def clean_title(title: str) -> str:
    '''
    Keeps only letters, numbers, whitespaces. 
    Replaces everything else with nothing.
    '''
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [5]:
def clean_genre(genre: str) -> list:
    '''
    Cleans the movies dataset genre columns so we can get a list of unique genres
    '''
    if '(' not in genre:
        genre_list = genre.split('|')
        return genre_list
    else:
        return ['None']

In [6]:
# separate list of genres into dummy variables
def col_lists_to_dummies(dataframe: pd.DataFrame, col_of_lists: str, identifier: str) -> pd.DataFrame:
    '''
    Turns a series of lists into dummy variables, then merges the resulting df
    with the original df.

    input
    -----
    col_of_lists: name of column where each value is a list of strings
                  this column will be turned into dummy variables
    identifier: name of column where each value uniquely identifies the row
                this column will be used to merge on
    '''
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()

    # create an array of 1s and 0s representing whether or not each row belongs in the var
    dummy_array = mlb.fit_transform(dataframe[col_of_lists])
    # grab the unique vars to create new column names
    unique_vars_from_col_of_lists = mlb.classes_
    identifier_array = dataframe[identifier].values
    # create dummy dataframe
    dummy_df = pd.DataFrame(dummy_array, columns=unique_vars_from_col_of_lists, index=identifier_array).reset_index()
    dummy_df = dummy_df.rename({'index':identifier}, axis=1)

    return dummy_df, unique_vars_from_col_of_lists

In [7]:
movies = load_data('movies')
ratings = pd.read_csv("data/ratings.csv")

links = load_data('links')
tags = load_data('tags')
genome_scores = pd.read_csv('data/genome-scores.csv')
genome_tags = load_data('genome_tags')

In [13]:
unique_genres

['Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Fantasy',
 'Romance',
 'Drama',
 'Action',
 'Crime',
 'Thriller',
 'Horror',
 'Mystery',
 'Sci-Fi',
 'IMAX',
 'Documentary',
 'War',
 'Musical',
 'Western',
 'Film-Noir']

In [None]:
movies['cleaned_title'] = movies['title'].apply(clean_title)
movies['genres_lists'] = movies['genres'].apply(clean_genre)
movies.head()

## Genre

In [None]:
# create dummy_df and get a unique list of genres
genre_dummies, unique_genres = col_lists_to_dummies(movies, 'genres_lists', 'movieId')
# mnerge dummy dataframe with original dataframe
movies_with_dummies = movies.merge(genre_dummies, on='movieId')
movies_with_dummies.head()

In [None]:
movies_per_genre = movies_with_dummies[unique_genres.tolist() + ['movieId']].melt('movieId').groupby('variable').sum()
movies_per_genre = movies_per_genre.sort_values('value', ascending=False)[['value']].reset_index()
movies_per_genre.head()

In [None]:
sns.catplot(x='variable', y='value', data=movies_per_genre, kind='bar')
plt.xticks(horizontalalignment='right', rotation=45)
plt.ylabel('Number of movies')
plt.xlabel('Genres')
plt.title('Number of movies that belong to each genre')

## Tags

# Search Engine

Converting words into numbers:

1. Term frequency: get frequency of words in title
1. Inverse document frequency: helps search engine find unique terms (the is in a lot of title, but harry is not)
1. Tf\*Idf: Get a vector for each movie

Search:

1. Convert search text into into numbers like above
1. Find row with the most number of matches via similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tags = tags.dropna()

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(tags['tag'])

In [None]:
tfidf

In [None]:
# ngrams: number of words to search for together
# ex: Toy Story 1995: "toy", "story", "1995"; "toy story", "story 1995"
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies['cleaned_title'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def search(title: str, dataframe: pd.DataFrame, tfidf) -> pd.DataFrame:
    '''
    Turns search term into a vectory, resulting top 5 results
    '''
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    # returns how similar search title is to each title
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    dataframe_cp = dataframe.copy()
    # find the 5 most similar titles by idx, in asc order
    # kinda like df['col'].sort_values(ascending=False).loc[:-5]
    indices = np.argpartition(similarity, -5)[-5:]
    dataframe_cp['similarity'] = similarity
    results = dataframe_cp.iloc[indices][::-1] # return with most similar up top
    return results

In [None]:
def check_genre(value: str, genre_list: list) -> bool:
    '''
    Returns true if any of genre from list is also in the given string.
    '''
    bool_list = [True if genre in value else False for genre in genre_list]
    if sum(bool_list) > 0:
        return True
    else: 
        return False

In [None]:
unique_genres

In [35]:
movies[~movies['genres'].apply(check_genre, genre_list = list(unique_genres))]

Unnamed: 0,movieId,title,genres,cleaned_title,genres_lists
15881,83773,Away with Words (San tiao ren) (1999),(no genres listed),Away with Words San tiao ren 1999,[None]
16060,84768,Glitterbug (1994),(no genres listed),Glitterbug 1994,[None]
16351,86493,"Age of the Earth, The (A Idade da Terra) (1980)",(no genres listed),Age of the Earth The A Idade da Terra 1980,[None]
16491,87061,Trails (Veredas) (1978),(no genres listed),Trails Veredas 1978,[None]
17404,91246,Milky Way (Tejút) (2007),(no genres listed),Milky Way Tejt 2007,[None]
...,...,...,...,...,...
62400,209101,Hua yang de nian hua (2001),(no genres listed),Hua yang de nian hua 2001,[None]
62401,209103,Tsar Ivan the Terrible (1991),(no genres listed),Tsar Ivan the Terrible 1991,[None]
62407,209133,The Riot and the Dance (2018),(no genres listed),The Riot and the Dance 2018,[None]
62415,209151,Mao Zedong 1949 (2019),(no genres listed),Mao Zedong 1949 2019,[None]


In [None]:
search('dogs talking', tags, tfidf)

## Search Box

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type,names='value')
display(movie_input, movie_list)

# Recommendation Engine

1. Find all users who liked the search title
1. Find the other movies that those users liked

In [None]:
ratings

In [None]:
ratings.dtypes

In [None]:
movie_id = 1
high_rating = 4

In [None]:
# find the users that liked the movie being searched for
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings['rating']>=high_rating)]['userId'].unique()

# find the movies that similar users liked
similar_user_recs = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating']>=high_rating)]['movieId']

In [None]:
similar_user_recs

In [None]:
# find the top 10% of movies
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > 0.1]
similar_user_recs

In [None]:
# find what percent of people not similar to us liked these movies
# if normies like what i like, its highly likely the "similar users" liked my
# movie because everyone liked it

In [None]:
# all users who watched the movie recommended to us
all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] >= high_rating)]

In [None]:
# all users who watched the movie recommended to us
all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] >= high_rating)]
all_user_recs = all_users['movieId'].value_counts()/len(all_users['userId'].unique())

We need to find a movie where:

1. Everyone who liked search_term, also liked rec_term
1. Only X% (40%, ex) of those who liked rec_term also liked search_term

In [None]:
# percent of all users who liked the given movie
all_user_recs

In [None]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ['similar_ppl','all_ppl']

In [None]:
rec_percentages

In [None]:
# score = ratio of similar:avg users who liked movie
rec_percentages['score'] = rec_percentages['similar_ppl'] / rec_percentages['all_ppl']
rec_percentages = rec_percentages.sort_values('score', ascending=False)

In [None]:
# left_index is the movieId
rec_percentages.head(10).merge(movies, left_index=True, right_on='movieId')

Not bad! For `Toy Story` we recommended the Toy Story series, Bugs Life, etc.

In [None]:
def find_similar_movies(movie_id: int, high_rating: int = 4) -> pd.DataFrame:
    # find the users that liked the movie being searched for
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings['rating']>=high_rating)]['userId'].unique()
    # find the movies that similar users liked
    similar_user_recs = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating']>=high_rating)]['movieId']

    # find the top 10% of movies
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > 0.1]
    
    # all users who watched the movie recommended to us
    all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] >= high_rating)]
    all_user_recs = all_users['movieId'].value_counts()/len(all_users['userId'].unique())
    
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ['similar_ppl','all_ppl']

    # score = ratio of similar:avg users who liked movie
    rec_percentages['score'] = rec_percentages['similar_ppl'] / rec_percentages['all_ppl']
    rec_percentages = rec_percentages.sort_values('score', ascending=False)
    
    # left_index is the movieId
    return rec_percentages.head(10).merge(movies, left_index=True, right_on='movieId')[['score','title','genres']]

## Widget

In [None]:
movie_input_name = widgets.Text(
    initial_value = 'Toy Story',
    description = 'Movie Title:',
    disabled=False
)

In [None]:
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data['new']
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]['movieId']
            display(find_similar_movies(movie_id))

movie_input_name.observe(on_type, names='value')
display(movie_input_name, recommendation_list)

# Adding Tags as a Feature

In [None]:
tags = tags.merge(genome_tags, how='left')
tags.info()

In [None]:
num_movies = len(tags['movieId'].unique())
num_real_movies = len(tags[~tags['tagId'].isna()]['movieId'].unique())

By only using movies that are tagged with a tagId, we lose {{ num_movies - num_real_movies }} movies
And analyze {{ num_real_movies }} movies

In [None]:
def pick_highest_relevant_tag_score(sub_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Used for groupby function, returns the row that contains the highest
    relevance value (minus movieId column, since groupby function will 
    make that the index anyways)
    '''
    sub_df = sub_df.sort_values('relevance', ascending=False)
    return sub_df.iloc[0,1:]
    

In [None]:
most_relevant_tag = genome_scores.groupby('movieId').apply(pick_highest_relevant_tag_score)
most_relevant_tag = most_relevant_tag.reset_index()

In [None]:
num_unique_tags = len(genome_scores['tagId'].unique())
num_movies_tagged = len(genome_scores['movieId'].unique())
(genome_scores['movieId'].value_counts() == num_unique_tags).sum() == num_movies_tagged

In [None]:
genome_scores[genome_scores['relevance'] >= 0.5].iloc[0,:]

In [None]:
genome_scores[genome_scores['relevance'] >= 0.5]

In [None]:
genome_scores.describe().round(2)

In [None]:
genome_scores.merge(movies).sort_values(['movieId', 'relevance'], ascending=False)