#### BUILDING A MOVIE RECOMMENDATION SYSTEM BASED ON RATING AND SIMILAR USERS

#### WE'LL BE USING A MOVIE DATASET WHICH HAS A LIST OF MOVIES AND A RATINGS DATASET WHICH HAS THE RATINGS OF THE MOVIES BY VARIOUS VIEWERS

In [1]:
import pandas as pd

##### LOADING THE MOVIE DATASET

In [2]:
# Absolute, machine-specific location of the movies CSV; change it when running on another machine.
file = "C:/Users/user/Documents/DATA ANALYSIS FILES/Videos/Dataquest/movie recommendation/movies.csv"
# Read the CSV into the `movie` DataFrame used throughout the notebook.
movie = pd.read_csv(file)
# Preview the first rows (columns shown below: movieId, title, genres).
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


###### CLEANING MOVIE NAMES WITH REGEX

In [3]:
import re ##regex

###creating a function to clean the movie titles

def clean_title(title):
    """Return `title` with every character removed that is not an ASCII letter, a digit or a space.

    Example: "Toy Story (1995)" becomes "Toy Story 1995".
    """
    # Negated character class: matches anything outside a-z, A-Z, 0-9 and the space.
    unwanted = "[^a-zA-Z0-9 ]"
    cleaned = re.sub(unwanted, "", title)
    return cleaned

In [4]:
# Store the cleaned version of every `title` in a new `new_titles` column;
# the TF-IDF step and the search work on this column.

movie["new_titles"] = movie["title"].map(clean_title)

In [5]:
# Preview the frame again to check the freshly added `new_titles` column.
movie.head()

Unnamed: 0,movieId,title,genres,new_titles
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


##### CREATING A TFIDF TABLE (THIS CONVERTS OUR TITLES INTO NUMBERS TO ENABLE OUR MOVIE SEARCHING) USING A LIBRARY FROM SCIKIT LEARN

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

#initializing the vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2)) ##ngram_range=(1,2): single words and two-word sequences both become features

###learning the vocabulary from the cleaned titles and turning them into a TF-IDF matrix (kept in `tfidf` for the search)
tfidf = vectorizer.fit_transform(movie['new_titles'])

##### COMPUTING THE SIMILARITIES BETWEEN A MOVIE WE WANT TO SEARCH FOR AND ALL THE MOVIES IN OUR DATASET USING COSINE SIMILARITY (FROM SCIKIT LEARN)

In [7]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
###creating a function that takes our searched movie and compares it with the dataset to get the most similar movies to our search

def search(title, top_n=5):
    """Return the movies whose cleaned title is most similar to `title`, best match first.

    Parameters
    ----------
    title : str
        Free-text query; it is cleaned with `clean_title` before being vectorised.
    top_n : int, default 5
        Maximum number of rows to return. It is clamped to the number of movies
        available, so small catalogues no longer raise.

    Returns
    -------
    pandas.DataFrame
        Rows of `movie`, ordered by descending cosine similarity to the query.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])  # query in the same TF-IDF space as the catalogue
    similarity = cosine_similarity(query_vec, tfidf).flatten()  # one score per movie

    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movie.iloc[0:0]  # empty frame with the usual columns

    # argpartition only guarantees WHICH entries are the top_n largest, not their
    # order, so the candidates are sorted by score explicitly afterwards.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movie.iloc[ranked]

##### BUILDING AN INTERACTIVE SEARCH BOX

In [9]:
import ipywidgets as widgets
from IPython.display import display

In [10]:
###creating the text box the user types a movie title into
input_widget = widgets.Text(
    value = "", #initial content of the text box (empty)
    description = "Movie Title:", #label displayed next to the text box
    disabled = False
)

###creating the output area the search results are rendered into

output_widget = widgets.Output()

##linking our input and output widgets by creating a function that displays all movies we search

def on_search(data):
    """Widget callback: show the search results for the text currently typed.

    `data` is the change notification delivered by `observe`; the new text is
    read from data["new"]. Nothing is displayed unless the text is longer than
    5 characters.
    """
    with output_widget:
        output_widget.clear_output() # drop what the previous change displayed
        typed = data["new"]
        if not len(typed) > 5:
            return
        display(search(typed))
            
##calling on_search every time the `value` of the text box changes
input_widget.observe(on_search, names = 'value')

##showing the text box followed by the results area
display(input_widget, output_widget)

Text(value='', description='Movie Title:')

Output()

##### IMPORTING OUR RATINGS DATASET TO BUILD OUR RECOMMENDATION SYSTEM

In [11]:
# Read the ratings CSV from an absolute, machine-specific path; change it when running on another machine.
ratings = pd.read_csv("C:/Users/user/Documents/DATA ANALYSIS FILES/Videos/Dataquest/movie recommendation/ratings.csv")
# Preview the first rows (columns shown below: userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [12]:
# Example movie to build recommendations for; movieId 1 is "Toy Story (1995)" in the movies preview above.
movie_id = 1

In [13]:
##finding users with the same movie taste as ours

##ids of everyone who rated the chosen movie (movie_id) above 4
similar_users = ratings[(ratings['movieId'] == movie_id) & (ratings['rating'] > 4)]['userId'].unique()

##ids of every movie those users rated above 4 (one entry per rating, so ids repeat)
similar_users_movies = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] > 4)]['movieId']
# similar_users_movies

##percentage of the similar users that rated each movie above 4
majority_users = (similar_users_movies.value_counts()/len(similar_users)) * 100 ##converting to percentage
majority_users = majority_users[majority_users > 10] #keep movies that more than 10% of the similar users liked
# majority_users

In [14]:
###finding out how popular the same candidate movies are among ALL users
###all_users: every rating above 4, by any user, for the movies kept in majority_users
all_users = ratings[(ratings['movieId'].isin(majority_users.index)) & (ratings['rating'] > 4)]
###per movie: number of such ratings divided by the number of distinct users in all_users, as a percentage
majority_all_users = (all_users['movieId'].value_counts() / len(all_users['userId'].unique())) *100
# majority_all_users

In [15]:
###putting both percentages side by side (aligned on the shared movie-id index)

percentages = pd.concat([majority_users, majority_all_users], axis = 1)
###'similar' = share among users similar to us, 'all' = share among all users
percentages.columns = ['similar', 'all']
# percentages

In [16]:
###finding the ratio of the percentages: a high score means the movie is liked
###far more often by users similar to us than by users in general
percentages['score'] = percentages['similar'] / percentages['all']
###best scores first
percentages = percentages.sort_values('score', ascending=False)
# percentages.head(10)

In [17]:
###taking our top ten scores and joining them to the movie dataset to get the movie name
###('movieId' refers to the index of percentages and to the column of the same name in movie)

percentages.head(10).merge(movie, on = 'movieId')

Unnamed: 0,movieId,similar,all,score,title,genres,new_titles
0,1,100.0,12.472849,8.017414,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,3114,28.064773,5.370576,5.225654,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2,2355,11.053889,2.509139,4.405452,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
3,78499,15.295992,3.513059,4.354038,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4,4886,23.514733,7.081082,3.320783,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
5,588,21.6618,6.751298,3.208539,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6,6377,22.81391,7.226769,3.156862,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
7,595,17.940005,5.997695,2.99115,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8,8961,20.350411,6.845333,2.972889,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
9,364,25.34112,8.576367,2.954762,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


##### PUTTING ALL OF THIS INTO A FUNCTION

In [18]:
def find_similar_movie(movie_id, min_rating=4, min_share=10, top_n=10):
    """Recommend movies liked by the users who also liked `movie_id`.

    A user "likes" a movie when their rating is strictly greater than
    `min_rating`. For every movie liked by more than `min_share` percent of the
    users who liked `movie_id`, the score is

        share among those similar users / share among all users

    so movies that are popular with everybody are pushed down and movies that
    are specific to this audience are pushed up.

    Parameters
    ----------
    movie_id : int
        `movieId` of the movie to start from.
    min_rating : float, default 4
        A rating must be strictly above this value to count as a like.
    min_share : float, default 10
        Minimum percentage of similar users that must like a movie for it to
        be considered.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns `movieId`, `score`, `new_titles`, best score first. Empty when
        nobody liked `movie_id`.
    """
    # Every step below only looks at "liked" ratings, so filter once.
    liked = ratings[ratings['rating'] > min_rating]

    ##users with the same taste as ours: everyone who liked the chosen movie
    similar_users = liked.loc[liked['movieId'] == movie_id, 'userId'].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: nothing to recommend (and avoids dividing by zero).
        return pd.DataFrame(columns=['movieId', 'score', 'new_titles'])

    ##movies those users also liked (one entry per rating, so ids repeat)
    similar_users_movies = liked.loc[liked['userId'].isin(similar_users), 'movieId']

    ##percentage of similar users that liked each movie, keeping the common ones
    majority_users = (similar_users_movies.value_counts() / len(similar_users)) * 100
    majority_users = majority_users[majority_users > min_share]

    ###how popular the same movies are among all users
    all_users = liked[liked['movieId'].isin(majority_users.index)]
    majority_all_users = (all_users['movieId'].value_counts() / len(all_users['userId'].unique())) * 100

    ###joining both percentages (aligned on the movie-id index)
    percentages = pd.concat([majority_users, majority_all_users], axis=1)
    percentages.columns = ['similar', 'all']

    ###ratio of the percentages, best first
    percentages['score'] = percentages['similar'] / percentages['all']
    percentages = percentages.sort_values('score', ascending=False)

    ###joining the top scores to the movie dataset to get the movie name.
    # value_counts() only names its index after the source column on newer
    # pandas versions, so the index is named and turned into a column
    # explicitly instead of relying on merge finding an index level 'movieId'.
    top = percentages.head(top_n).rename_axis('movieId').reset_index()
    return top.merge(movie, on='movieId')[['movieId', 'score', 'new_titles']]

##### CREATING WIDGETS FOR OUR RECOMMENDATION

In [20]:
#creating the text box the user types a movie title into
movie_search = widgets.Text(
    value = " ", #initial content of the text box (a single space)
    description = 'Movie Title:', #label displayed next to the text box
    disabled = False
)

#creating the output area the recommendations are rendered into

recommendation = widgets.Output()

###function to recommend movies based on search

def recommend(data):
    """Widget callback: recommend movies for the title currently typed.

    `data` is the change notification delivered by `observe`; the new text is
    read from data['new']. When the text is longer than 5 characters, the
    title is searched, the search hit closest to the typed text is taken as
    the seed movie, and the output of `find_similar_movie` for it is displayed.
    """
    with recommendation:
        recommendation.clear_output()  # drop what the previous change displayed
        title = data['new']
        if len(title) <= 5:
            return

        results = search(title)
        if results.empty:
            return  # no candidate to seed the recommendations from

        # Seed from the best match rather than a fixed row position (the
        # original used the third row). The returned rows are scored against
        # the query here so the choice does not depend on how search() happens
        # to order them.
        query_vec = vectorizer.transform([clean_title(title)])
        scores = cosine_similarity(query_vec, vectorizer.transform(results['new_titles'])).flatten()
        movie_id = results.iloc[int(scores.argmax())]['movieId']

        display(find_similar_movie(movie_id))

#call recommend every time the `value` of the text box changes
movie_search.observe(recommend, names = 'value')

#show the text box followed by the recommendations area

display(movie_search, recommendation)

Text(value=' ', description='Movie Title:')

Output()