### DEVELOPING A MOVIE RECOMMENDATION SYSTEM

In [1]:
import pandas as pd
import numpy as np
import re

##### LOADING THE MOVIES DATASET

In [2]:
file1 = "C:/Users/user/Documents/DATA ANALYSIS FILES/Videos/Dataquest/movie recommendation/movies.csv"
movies = pd.read_csv(file1)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


##### CLEANING THE MOVIES DATASET

In [3]:
##using regex to clean the movie title

def clean_titles (title):
    return re.sub("[^a-zA-Z0-9 ]", "", title) ##removes all characters that are letters, numbers or spaces

In [4]:
movies['clean_title'] = movies['title'].apply(clean_titles)
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


#### IMPORTING OUR RATINGS AND TAGS DATASET TO AID OUR RECOMMENDATION

In [6]:
file2 = "C:/Users/user/Documents/DATA ANALYSIS FILES/Videos/Dataquest/movie recommendation/ratings.csv"
file3 = "C:/Users/user/Documents/DATA ANALYSIS FILES/Videos/Dataquest/movie recommendation/tags.csv"

In [7]:
ratings = pd.read_csv(file2)
tags = pd.read_csv(file3)

In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [9]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


#### DEVELOPING A RECOMMENDATION SYSTEM BASED ON TAGS

##### CLEANING THE DATASET

In [10]:
tags.isna().sum()

userId        0
movieId       0
tag          16
timestamp     0
dtype: int64

In [11]:
##filling the missing values in tags with none
nan = tags.isna().sum().reset_index() #reseting index of our values to give it columns
nan.columns = ['variables', 'counts']
missing = nan[nan['counts'] > 0] #every column or variable with na
missing_variables = np.array(missing['variables']) #turning them into an array

for i in missing_variables:
    tags[i] = tags[i].fillna('None')

In [12]:
tags.isna().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [13]:
##using regex to clean the tags

tags['clean_tag'] = tags['tag'].apply(clean_titles)
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp,clean_tag
0,3,260,classic,1439472355,classic
1,3,260,sci-fi,1439472256,scifi
2,4,1732,dark comedy,1573943598,dark comedy
3,4,1732,great dialogue,1573943604,great dialogue
4,4,7569,so bad it's good,1573943455,so bad its good


In [14]:
tags = tags.drop(['timestamp'], axis = 1)
tags.head()

Unnamed: 0,userId,movieId,tag,clean_tag
0,3,260,classic,classic
1,3,260,sci-fi,scifi
2,4,1732,dark comedy,dark comedy
3,4,1732,great dialogue,great dialogue
4,4,7569,so bad it's good,so bad its good


#### CREATING A TFIDF TABLE FOR OUR TAGS AS WE DID FOR OUR MOVIE TITLES

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

#initializing vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [16]:
tfidf_tags = vectorizer.fit_transform(tags['clean_tag'])

##### CREATING  A SEARCH FUNCTION FOR OUR TAGS TO COMPARE THEM 

In [17]:
from sklearn.metrics.pairwise import cosine_similarity ##library to perform the comparison

In [18]:
def search_tag(tag):
    tag = clean_titles(tag)
    q_vec2 = vectorizer.transform([tag])
    similarity2 = cosine_similarity(q_vec2, tfidf_tags).flatten()
    indices = np.argpartition(similarity2, -5)[-5:]
    results = tags.iloc[indices]
    return results

In [19]:
search_tag('classic')

Unnamed: 0,userId,movieId,tag,clean_tag
1078394,160540,8507,classic,classic
1036480,155146,527,classic,classic
803556,109697,40629,classic,classic
14957,3448,2959,classic,classic
0,3,260,classic,classic


##### GETTING SIMILAR USERS BASED ON OUR TAGS

In [20]:
tag_word = 'classic'

In [32]:
##find all movies with the same tag-word
similar_tags = ratings[(tags['clean_tag'] == tag_word) & (ratings['rating'] > 4)]['movieId']
# similar_tags

# #users that rated those movies high
similar_users = ratings[(ratings['movieId'].isin(similar_tags)) & (tags['clean_tag'] == tag_word)]['userId'].unique()
# similar_users

# ###percentage of users that rated movies with that tag high
maj_tag = (similar_tags.value_counts() / len(similar_users)) * 100
# maj_tag = maj_tag[maj_tag > 10]
# maj_tag

# # #what perentage of all users actually rated movies with that tag high
all_tags = ratings[(tags['movieId'].isin(maj_tag.index)) & (ratings['rating'] > 4)]
perc_all_tags = (all_tags['movieId'].value_counts() / len(all_tags['userId'].unique())) * 100
# perc_all_tags

# ###combining the percentages
tags_perc = pd.concat([maj_tag, perc_all_tags], axis = 1)
tags_perc.columns = ['similar', 'all']
# # tags_perc

# ##developina a score 
tags_perc['score'] = tags_perc['similar']/tags_perc['all']
tags_perc = tags_perc.sort_values('score', ascending = False)
tags_perc

# ##joining our top 10 tags_perc with our movies data
tags_perc.head(10).merge(movies, on= 'movieId')


Unnamed: 0,movieId,similar,all,score,title,genres,clean_title
0,31851,0.112867,0.020513,5.502257,Sons of the Desert (1933),Comedy,Sons of the Desert 1933
1,118880,0.112867,0.020513,5.502257,"Girl Walks Home Alone at Night, A (2014)",Horror|Romance|Thriller,Girl Walks Home Alone at Night A 2014
2,47382,0.112867,0.020513,5.502257,Step Up (2006),Drama|Romance,Step Up 2006
3,879,0.112867,0.020513,5.502257,"Relic, The (1997)",Horror|Thriller,Relic The 1997
4,153766,0.112867,0.020513,5.502257,Welcome (2007),Action|Comedy|Drama|Romance,Welcome 2007
5,3074,0.112867,0.020513,5.502257,Jeremiah Johnson (1972),Western,Jeremiah Johnson 1972
6,66130,0.112867,0.020513,5.502257,Chocolate (2008),Action|Drama,Chocolate 2008
7,26156,0.112867,0.020513,5.502257,Dragon Gate Inn (Dragon Inn) (Long men kezhan)...,Action|Adventure,Dragon Gate Inn Dragon Inn Long men kezhan 1967
8,3566,0.112867,0.020513,5.502257,"Big Kahuna, The (2000)",Comedy|Drama,Big Kahuna The 2000
9,5794,0.112867,0.020513,5.502257,Good Work (Beau travail) (1999),Drama,Good Work Beau travail 1999


##### PUTTING IT ALL IN A FUNCTION

In [33]:
def tag_recommend(tag_word):
    ##find all movies with the same tag-word
    similar_tags = ratings[(tags['clean_tag'] == tag_word) & (ratings['rating'] > 4)]['movieId']
    # similar_tags

    # #users that rated those movies high
    similar_users = ratings[(ratings['movieId'].isin(similar_tags)) & (tags['clean_tag'] == tag_word)]['userId'].unique()
    # similar_users

    # ###percentage of users that rated movies with that tag high
    maj_tag = (similar_tags.value_counts() / len(similar_users)) * 100
    # maj_tag = maj_tag[maj_tag > 10]
    # maj_tag

    # # #what perentage of all users actually rated movies with that tag high
    all_tags = ratings[(tags['movieId'].isin(maj_tag.index)) & (ratings['rating'] > 4)]
    perc_all_tags = (all_tags['movieId'].value_counts() / len(all_tags['userId'].unique())) * 100
    # perc_all_tags

    # ###combining the percentages
    tags_perc = pd.concat([maj_tag, perc_all_tags], axis = 1)
    tags_perc.columns = ['similar', 'all']
    # # tags_perc

    # ##developina a score 
    tags_perc['score'] = tags_perc['similar']/tags_perc['all']
    tags_perc = tags_perc.sort_values('score', ascending = False)
    # tags_perc

    # ##joining our top 10 tags_perc with our movies data
    return tags_perc.head(10).merge(movies, on= 'movieId')[['movieId', 'clean_title', 'score', 'genres']]


##### CREATING WIDGETS FOR OUR TAG RECOMMENDATION

In [29]:
import ipywidgets as widgets
from IPython.display import display

In [34]:
#input widget
tag_search = widgets.Text(
    value = "",
    description = "Movie Tag:",
    disabled = False
)

#output widget

tag_recommendation = widgets.Output()

def recommend_movie_by_tag(data):
    with tag_recommendation:
        tag_recommendation.clear_output()
        tag = data['new']
        if len(tag) > 5:
            result = search_tag(tag)
            tag_word = result.iloc[0]['tag']
            display(tag_recommend(tag_word))
            
tag_search.observe(recommend_movie_by_tag, names = 'value')

display(tag_search, tag_recommendation)

Text(value='', description='Movie Tag:')

Output()