In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import ipywidgets as widgets
from IPython.display import display

In [2]:
movies = pd.read_csv("/kaggle/input/movielens-20m-dataset/movie.csv")

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [4]:
"""Let do some cleaning on the title"""

# let's clean the movie title - and only take this pattern by applying regex:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "",title) 

In [5]:
movies['clean_title'] = movies['title'].apply(clean_title) # Applying clean_title function on the title col

In [6]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy,Kein Bund frs Leben 2007
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,Feuer Eis Dosenbier 2002
27275,131258,The Pirates (2014),Adventure,The Pirates 2014
27276,131260,Rentun Ruusu (2001),(no genres listed),Rentun Ruusu 2001


### Creating TF-IDF Matrix

In [7]:
"""we are going to use TF-IDF to create a search engine"""

vectorizer = TfidfVectorizer(ngram_range =(1,2)) # we're considering both unigrams and bigrams for searching.

tfidf = vectorizer.fit_transform(movies['clean_title'])

In [8]:
# converting the sparse matrix into dense matrix, to pandas DataFrame for visualization
tfidf_dense = tfidf.toarray()

# Get the feature names from the vectorizer
feature_names = vectorizer.get_feature_names_out()

tfidf_df = pd.DataFrame(tfidf_dense, columns=feature_names)
tfidf_df.head(3)


Unnamed: 0,000,000 timmar,007,007 2012,008,008 1996,009,009 re,01,01 1973,...,zwierze,zwierze 2000,zwischenflle,zwischenflle 1996,zycia,zycia 2008,zycie,zycie jako,zyciu,zyciu 2004
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
"""Now we are going to apply cosine similarity"""

def search(title):
    title = clean_title(title)
    quey_vector = vectorizer.transform([title])
    similarity = cosine_similarity(quey_vector,tfidf).flatten()
    
    indicies = np.argpartition(similarity, -5)[-5:] # we are doing partial sorting here and taking the 5 largest values rtom that 1D array
    result = movies.iloc[indicies][::-1]
    return result

## Now we are going to create an interactive search box 

In [10]:

"""let's use widgets to create an interactive search box"""
movie_input = widgets.Text(

    value = 'Jumanji 1995',
    description ='Movie Title',
    disabled = False
)

movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title)>5:
            display(search(title))

movie_input.observe(on_type, names = 'value')
display(movie_input,movie_list)

Text(value='Jumanji 1995', description='Movie Title')

Output()

# We Have Completed The Interactive Search Box.

# Now let's start work with Movie Recommendation

In [11]:
ratings = pd.read_csv("/kaggle/input/movielens-20m-dataset/rating.csv")

In [12]:
ratings.head(4)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07


In [13]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 610.4+ MB


In [14]:
ratings.shape

(20000263, 4)

## Finding users who liked the same movies as us

In [15]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16


In [16]:
movie_id = 1

In [17]:
ratings['rating'].mean()

3.5255285642993797

In [18]:
# we are finding similar people using ratings

similar_users = ratings[(ratings['movieId']==movie_id) & (ratings['rating']>=.54)]['userId'].unique()

In [19]:
similar_users

array([     3,      6,      8, ..., 138488, 138491, 138493])

In [20]:
# now we'll find out the other movies that those people liked
similar_users_likes = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating']>=4.5)]['movieId']

In [21]:
similar_users_likes

239            50
242           175
244           223
245           260
246           316
            ...  
20000256    66762
20000257    68319
20000258    68954
20000259    69526
20000261    70286
Name: movieId, Length: 2505338, dtype: int64

In [22]:
"""we'll find only those movies that is > then 10% of the users who are similar to us liked
    meaning 10% of the people who are similar to us also like that movie"""

# how many times each movies appears in our dataset
# similar users recommended
similar_users_likes = similar_users_likes.value_counts()/len(similar_users)
similar_users_likes = similar_users_likes[similar_users_likes>.1]

In [23]:
similar_users_likes 

movieId
318     0.372821
296     0.335158
1       0.326312
260     0.325666
2571    0.276551
          ...   
2918    0.103197
590     0.102914
6874    0.102490
8961    0.101238
2324    0.100773
Name: count, Length: 77, dtype: float64

In [24]:
""" Now we will go through some filtering process"""

# find out how all users liked the movie i liked
all_users = ratings[(ratings['movieId'].isin(similar_users_likes.index)) & (ratings['rating']>=4.5)]

In [25]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
30,1,1196,4.5,2005-04-02 23:32:22
31,1,1198,4.5,2005-04-02 23:30:24
131,1,4993,5.0,2005-04-02 23:31:22
142,1,5952,5.0,2005-04-02 23:30:19
158,1,7153,5.0,2005-04-02 23:30:33
...,...,...,...,...
20000106,138493,4973,5.0,2009-10-17 19:04:58
20000138,138493,6377,5.0,2009-10-28 17:20:00
20000142,138493,6539,4.5,2009-10-17 20:17:37
20000150,138493,6874,5.0,2009-10-17 19:08:23


In [26]:
# all users recommendations 

all_users_likes = all_users['movieId'].value_counts()/ len(all_users['userId'].unique())

In [27]:
all_users_likes

movieId
318     0.324075
296     0.284613
593     0.233696
527     0.224286
356     0.222033
          ...   
6377    0.061973
4886    0.058905
1073    0.058458
3114    0.056445
8961    0.055454
Name: count, Length: 77, dtype: float64

In [28]:
like_percentage = pd.concat([similar_users_likes,all_users_likes],axis = 1)
like_percentage.columns = ['similar','all']

In [29]:
like_percentage

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
318,0.372821,0.324075
296,0.335158,0.284613
1,0.326312,0.129074
260,0.325666,0.220699
2571,0.276551,0.213326
...,...,...
2918,0.103197,0.063251
590,0.102914,0.089348
6874,0.102490,0.068196
8961,0.101238,0.055454


In [30]:
like_percentage['score'] = like_percentage['similar']/like_percentage['all']

In [31]:
like_percentage = like_percentage.sort_values('score', ascending = False)

In [32]:
like_percentage

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.326312,0.129074,2.528101
3114,0.112931,0.056445,2.000728
8961,0.101238,0.055454,1.825608
4886,0.106368,0.058905,1.805738
6377,0.111840,0.061973,1.804669
...,...,...,...
457,0.136054,0.117715,1.155797
590,0.102914,0.089348,1.151829
318,0.372821,0.324075,1.150417
527,0.257810,0.224286,1.149473


In [33]:
like_percentage.head(10).merge(movies, left_index=True,right_on ='movieId')

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,0.326312,0.129074,2.528101,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3027,0.112931,0.056445,2.000728,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
8278,0.101238,0.055454,1.825608,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
4790,0.106368,0.058905,1.805738,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
6271,0.11184,0.061973,1.804669,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
1052,0.105378,0.058458,1.802628,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
1237,0.129067,0.076519,1.686721,1265,Groundhog Day (1993),Comedy|Fantasy|Romance,Groundhog Day 1993
2832,0.103197,0.063251,1.631548,2918,Ferris Bueller's Day Off (1986),Comedy,Ferris Buellers Day Off 1986
4211,0.126724,0.078101,1.622567,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,Shrek 2001
6429,0.108205,0.066981,1.615452,6539,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy,Pirates of the Caribbean The Curse of the Blac...


In [34]:
def movie_recommendation(movie_id):
    # find users similar to us
    similar_users = ratings[(ratings['movieId']==movie_id) & (ratings['rating']>=.54)]['userId'].unique() #find similar users
    similar_users_likes = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating']>=4.5)]['movieId'] # movies that similar users likes
    
    similar_users_likes = similar_users_likes.value_counts()/len(similar_users)
    similar_users_likes = similar_users_likes[similar_users_likes>.1] # filtering the movies of the silimilar users choice

    #find all user's recommendation - like how common is that movie on general level
    all_users = ratings[(ratings['movieId'].isin(similar_users_likes.index)) & (ratings['rating']>=4.5)]
    all_users_likes = all_users['movieId'].value_counts()/ len(all_users['userId'].unique())


    #create score for the df
    like_percentage = pd.concat([similar_users_likes,all_users_likes],axis = 1)
    like_percentage.columns = ['similar','all']
    
    like_percentage['score'] = like_percentage['similar']/like_percentage['all']
    like_percentage = like_percentage.sort_values('score', ascending = False)

    return like_percentage.head(10).merge(movies, left_index=True,right_on ='movieId')[['score','title','genres']]

In [35]:
movie_name_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title",
    disable = False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data['new']
        if len(title)>5:
            result = search(title)
            movie_id = result.iloc[0]['movieId']
            display(movie_recommendation(movie_id))

movie_name_input.observe(on_type, names = 'value')
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title')

Output()

# Well that's the end