### DEVELOPING A MOVIE RECOMMENDATION SYSTEM  BY MOVIE GENRE

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
##datasets to be used
file = "C:/Users/user/Documents/DATA ANALYSIS FILES/Videos/Dataquest/movie recommendation/movies.csv"
file1 = "C:/Users/user/Documents/DATA ANALYSIS FILES/Videos/Dataquest/movie recommendation/ratings.csv"

movies = pd.read_csv(file)
ratings = pd.read_csv(file1)

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


###### USING REGEX TO CLEAN MOVIE TITLES

In [4]:
def clean_titles (title):
    return re.sub("[^a-zA-Z0-9 ]", "", title) ##removes all characters that are letters, numbers or spaces

In [5]:
movies['clean_title'] = movies['title'].apply(clean_titles)
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


###### CREATING A TFIDF TABLE TO CONVERT OUR GENRES TO NUMBERS

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [79]:
#initializing vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [8]:
tfidf_genre = vectorizer.fit_transform(movies['genres'])

###### CREATING A SEARCH FUNCTION TO COME UP WITH GENRES AND COMPARE THEM WITH OUR TFIDF_GENRES

In [9]:
from sklearn.metrics.pairwise import cosine_similarity ##library to perform the comparison

In [10]:
def search_genre (genre):
    genre = clean_titles(genre) #cleaning the genre
    q_vec = vectorizer.transform([genre]) #converting our genre to digits
    similarity = cosine_similarity(q_vec, tfidf_genre).flatten() #comparing our searched genre to our tfidf table of genres
    indices = np.argpartition(similarity, -5)[-5:] #picks the 5 most similar figures based on our search
    results = movies.iloc[indices]
    return results

###### CREATING WIDGETS TO IMPLEMENT OUR SEARCH FUNCTION

In [11]:
import ipywidgets as widgets
from IPython.display import display

In [12]:
##creating an input widget

input_widget = widgets.Text(
    value = "", #setting the initial value of the input widget
    description = "Movie Genre:",#describing the widget
    disabled = False
)

## creating an output widget

output_widget = widgets.Output()

##a function to link the widgets amd the search_movie function

def on_search_genre (data):
    with output_widget:
        output_widget.clear_output()
        genre = data['new']
        if len(genre) > 5:
            display(search_genre(genre))
            
input_widget.observe(on_search_genre, names = 'value')

display(input_widget, output_widget)

Text(value='', description='Movie Genre:')

Output()

###### DEVELOPING THE RECOMMENDATION SYSTEM

In [68]:
genre = 'Comedy'

In [74]:
#finding users that watch movies with the same genres as us

#highly rated movies with the genre
similar_movies = ratings[(movies['genres'] == genre) & (ratings['rating'] > 4)]['movieId']
# similar_movies

#users who rated them high
similar_users = ratings[(ratings['movieId'].isin(similar_movies)) & (movies['genres'] == genre)]['userId'].unique()
# similar_users

# #percentage of similar users that like movies with the genre
similar_perc = (similar_movies.value_counts() / len(similar_users)) * 100
# similar_perc = similar_perc[similar_perc > 10]
# similar_perc

# #percentage of all users who rated those movies high
all_perc = ratings[(ratings['movieId'].isin(similar_perc.index)) & (ratings['rating'] > 4)]
all_perc_users = (all_perc['movieId'].value_counts() / len(all_perc['userId'].unique())) * 100
# all_perc_users

# #joining all percentages
perc_table = pd.concat([similar_perc, all_perc_users], axis = 1)
perc_table.columns = ['similar', 'all']
# perc_table

#score to know ratio of similar viewers to all viewers
perc_table['score'] = perc_table['similar'] / perc_table['all']
perc_table = perc_table.sort_values('score', ascending = False)
# perc_table


#joining with our movies dataset the top 10 suggestions
perc_table.head(10).merge(movies, on = 'movieId')

Unnamed: 0,movieId,similar,all,score,title,genres,clean_title
0,146654,0.224215,0.001263,177.528027,Nous trois ou rien (2015),Comedy|Drama,Nous trois ou rien 2015
1,167172,0.224215,0.001263,177.528027,The Bunker (2015),Comedy|Drama|Horror,The Bunker 2015
2,31998,0.224215,0.003157,71.011211,"Hole in My Heart, A (H책l i mitt hj채rta, Ett) (...",Drama,Hole in My Heart A Hl i mitt hjrta Ett 2004
3,1316,0.224215,0.00442,50.722293,Anna (1996),Drama,Anna 1996
4,34111,0.224215,0.006946,32.277823,"God Who Wasn't There, The (2005)",Documentary,God Who Wasnt There The 2005
5,51418,0.224215,0.010104,22.191003,Breaking and Entering (2006),Drama,Breaking and Entering 2006
6,99957,0.224215,0.010735,20.88565,Broken City (2013),Crime|Drama|Thriller,Broken City 2013
7,984,0.224215,0.011367,19.725336,"Pompatus of Love, The (1996)",Comedy|Drama,Pompatus of Love The 1996
8,71484,0.224215,0.011998,18.687161,Metropia (2009),Animation|Sci-Fi,Metropia 2009
9,6304,0.224215,0.013261,16.907431,Brainscan (1994),Comedy|Horror|Sci-Fi|Thriller,Brainscan 1994


###### TURNING ALL THIS INTO A FUNCTION

In [77]:
def finding_genre(genre):
    #highly rated movies with the genre
    similar_movies = ratings[(movies['genres'] == genre) & (ratings['rating'] > 4)]['movieId']
    # similar_movies

    #users who rated them high
    similar_users = ratings[(ratings['movieId'].isin(similar_movies)) & (movies['genres'] == genre)]['userId'].unique()
    # similar_users

    # #percentage of similar users that like movies with the genre
    similar_perc = (similar_movies.value_counts() / len(similar_users)) * 100
    # similar_perc = similar_perc[similar_perc > 10]
    # similar_perc

    # #percentage of all users who rated those movies high
    all_perc = ratings[(ratings['movieId'].isin(similar_perc.index)) & (ratings['rating'] > 4)]
    all_perc_users = (all_perc['movieId'].value_counts() / len(all_perc['userId'].unique())) * 100
    # all_perc_users

    # #joining all percentages
    perc_table = pd.concat([similar_perc, all_perc_users], axis = 1)
    perc_table.columns = ['similar', 'all']
    # perc_table

    #score to know ratio of similar viewers to all viewers
    perc_table['score'] = perc_table['similar'] / perc_table['all']
    perc_table = perc_table.sort_values('score', ascending = False)
    # perc_table


    #joining with our movies dataset the top 10 suggestions
    return perc_table.head(10).merge(movies, on = 'movieId')[['movieId', 'clean_title', 'genres', 'score']]

###### CREATING WIDGETS FOR OUR RECCOMMENDATION

In [78]:
#input widget
movie_search = widgets.Text(
    value = "",
    description = "Movie Genre:",
    disabled = False
)

#output widget
recommendation = widgets.Output()

def recommend(data):
    with recommendation:
        recommendation.clear_output()
        genre = data['new']
        if len(genre) > 5:
            results = search_genre(genre)
            movie_genre = results.iloc[0]['genres']
            display(finding_genre(movie_genre))
            
#observe
movie_search.observe(recommend, names = 'value')

#display
display(movie_search, recommendation)

Text(value='', description='Movie Genre:')

Output()