In [2]:
import pandas as pd

movies = pd.read_csv("C:/Users/DELL/Downloads/ml-25m/ml-25m/movies.csv")

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


### creating movie title with regex

In [4]:

#This function remove any special char other than numbs or alphabets 
import re

def clean_title(title):
    return re.sub("[^a-zA-Z0-9]", " ",title)


In [5]:
movies["cleaned_title"]= movies["title"].apply(clean_title)

In [6]:
movies

Unnamed: 0,movieId,title,genres,cleaned_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [7]:
movies.isna().sum()

movieId          0
title            0
genres           0
cleaned_title    0
dtype: int64

### Creating a TFIDF matrix

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer =TfidfVectorizer(ngram_range=(1,2))
#it makes search two words together taken into considration

tfidf = vectorizer.fit_transform(movies["cleaned_title"])

In [9]:
tfidf

<62423x168482 sparse matrix of type '<class 'numpy.float64'>'
	with 448766 stored elements in Compressed Sparse Row format>

### Creating a search function

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# def search(title):
title="Jumanji"
title =clean_title(title)
query_vec= vectorizer.transform([title])
similarity = cosine_similarity(query_vec,tfidf).flatten()

indices = np.argpartition(similarity,-5)[-5:] #gives top 5 indices of most similar movies
results= movies.iloc[indices][::-1] # puts it in reverse order

In [11]:
query_vec

<1x168482 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [12]:
similarity

array([0.        , 0.65562261, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [13]:
indices

array([20804, 20803, 20807, 49687,     1], dtype=int64)

In [14]:
results

Unnamed: 0,movieId,title,genres,cleaned_title
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
49687,179401,Jumanji: Welcome to the Jungle (2017),Action|Adventure|Children,Jumanji Welcome to the Jungle 2017
20807,107565,"Fuck You, Goethe (Fack Ju Göhte) (2013)",Comedy,Fuck You Goethe Fack Ju G hte 2013
20803,107548,Ice Quake (2010),Action|Sci-Fi|Thriller,Ice Quake 2010
20804,107557,Fun Size (2012),Comedy,Fun Size 2012


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    
    title =clean_title(title)
    query_vec= vectorizer.transform([title])
    similarity = cosine_similarity(query_vec,tfidf).flatten()

    indices = np.argpartition(similarity,-5)[-5:] #gives top 5 indices of most similar movies
    results= movies.iloc[indices][::-1] # puts it in reverse order
    
    return results

### Building an interactive search box

In [16]:
import ipywidgets as widgets
from IPython.display import display

movie_input= widgets.Text(
    value="Jumanji",
    description= "Movie Title",
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names ="value")

display(movie_input, movie_list)


Text(value='Jumanji', description='Movie Title')

Output()

### Reading in movie ratings data

In [17]:
ratings = pd.read_csv("C:/Users/DELL/Downloads/ml-25m/ml-25m/ratings.csv")

In [18]:
ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [19]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

### Finding the user who liked the same movie

In [20]:
movie_id = 1

In [21]:
similar_users = ratings[(ratings["movieId"] == movie_id)& (ratings["rating"]>=5)]["userId"].unique()

In [22]:
ratings[ratings["movieId"] == movie_id]

Unnamed: 0,userId,movieId,rating,timestamp
70,2,1,3.5,1141415820
254,3,1,4.0,1439472215
910,4,1,3.0,1573944252
1152,5,1,4.0,858625949
1304,8,1,4.0,890492517
...,...,...,...,...
24997918,162529,1,2.0,888181499
24998300,162530,1,5.0,989808332
24998525,162533,1,4.5,1280920369
24998892,162534,1,4.0,1526714137


In [23]:
similar_users # this gives us a set of users who liked same movie

array([    36,     75,     86, ..., 162518, 162519, 162530], dtype=int64)

In [24]:
similar_users_recs=ratings[(ratings["userId"].isin(similar_users))& (ratings["rating"]>=5)]["movieId"]

In [25]:
similar_users_recs #bunch of movies(ID) that users who liked same movie as us

5101           1
5105          34
5111         110
5114         150
5127         260
            ... 
24998388    3706
24998389    3735
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 678648, dtype: int64

In [26]:
similar_users_recs.value_counts()

1         13506
318        4967
260        4839
296        4040
356        3947
          ...  
4435          1
188477        1
179439        1
179427        1
97957         1
Name: movieId, Length: 14179, dtype: int64

In [27]:
similar_user_recs=similar_users_recs.value_counts() / len(similar_users)

similar_user_recs= similar_user_recs[similar_user_recs > 0.1]

In [28]:
similar_user_recs

1        1.000000
318      0.367762
260      0.358285
296      0.299126
356      0.292240
           ...   
1089     0.105064
590      0.104620
780      0.102991
78499    0.101436
750      0.100992
Name: movieId, Length: 64, dtype: float64

### Finding how much all users like movies

In [29]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] >=5)]

In [30]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
79,2,318,5.0,1141417181
85,2,457,5.0,1141416618
...,...,...,...,...
25000018,162541,2858,5.0,1240950804
25000020,162541,2959,5.0,1240953488
25000057,162541,4993,5.0,1240952610
25000065,162541,5952,5.0,1240952617


In [31]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [32]:
all_user_recs  # what % of all users recommend each of the movie

318      0.293673
296      0.238848
356      0.192436
260      0.191589
2571     0.189198
           ...   
6377     0.039507
1073     0.039218
1148     0.039099
8961     0.037465
78499    0.020982
Name: movieId, Length: 64, dtype: float64

### Creating recommendation scores

In [33]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [34]:
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.100279
32,0.129720,0.072993
34,0.143418,0.051157
47,0.161336,0.103479
50,0.222272,0.160264
...,...,...
6377,0.131423,0.039507
7153,0.152303,0.124580
8961,0.112469,0.037465
58559,0.117577,0.098468


In [35]:
rec_percentages["score"]= rec_percentages["similar"] / rec_percentages["all"]

In [36]:
rec_percentages =  rec_percentages.sort_values("score", ascending = False)

In [37]:
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.100279,9.972161
3114,0.236043,0.039849,5.923467
78499,0.101436,0.020982,4.834345
4886,0.149119,0.041111,3.627223
588,0.193988,0.055753,3.479433
...,...,...,...
858,0.222049,0.181298,1.224773
7153,0.152303,0.124580,1.222524
2858,0.154376,0.126645,1.218969
58559,0.117577,0.098468,1.194073


In [38]:
rec_percentages.head(10)

Unnamed: 0,similar,all,score
1,1.0,0.100279,9.972161
3114,0.236043,0.039849,5.923467
78499,0.101436,0.020982,4.834345
4886,0.149119,0.041111,3.627223
588,0.193988,0.055753,3.479433
6377,0.131423,0.039507,3.326552
595,0.170295,0.053496,3.183341
364,0.211832,0.069229,3.059883
1073,0.119502,0.039218,3.047154
8961,0.112469,0.037465,3.001925


In [39]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,cleaned_title
0,1.0,0.100279,9.972161,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.236043,0.039849,5.923467,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,0.101436,0.020982,4.834345,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.149119,0.041111,3.627223,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.193988,0.055753,3.479433,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.131423,0.039507,3.326552,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.170295,0.053496,3.183341,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
359,0.211832,0.069229,3.059883,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994
1047,0.119502,0.039218,3.047154,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
8246,0.112469,0.037465,3.001925,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004


### Creating a function to sum up all

In [40]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id)& (ratings["rating"]>=5)]["userId"].unique()
    similar_users_recs=ratings[(ratings["userId"].isin(similar_users))& (ratings["rating"]>=5)]["movieId"]
    
    similar_user_recs=similar_users_recs.value_counts() / len(similar_users)
    similar_user_recs= similar_user_recs[similar_user_recs > 0.1]
    
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] >=5)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"]= rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages =  rec_percentages.sort_values("score", ascending = False)
    
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score","title","genres"]]

### Creating an interactive recommendattion widget

In [41]:
movie_name_input = widgets.Text(
    value= "Toy Story",
    description = "Movie Title",
    disabled= False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title= data["new"]
        if len(title)>5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))
            
movie_name_input.observe(on_type , names= "value")

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title')

Output()

In [42]:
find_similar_movies(1)

Unnamed: 0,score,title,genres
0,9.972161,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3021,5.923467,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
14813,4.834345,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
4780,3.627223,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
580,3.479433,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
6258,3.326552,Finding Nemo (2003),Adventure|Animation|Children|Comedy
587,3.183341,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
359,3.059883,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
1047,3.047154,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical
8246,3.001925,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy


In [43]:
import pickle

In [43]:
pickle.dump(movies,open("movies.pkl","wb"))

In [50]:
movies["title"]

0                          Toy Story (1995)
1                            Jumanji (1995)
2                   Grumpier Old Men (1995)
3                  Waiting to Exhale (1995)
4        Father of the Bride Part II (1995)
                        ...                
62418                             We (2018)
62419             Window of the Soul (2001)
62420                      Bad Poems (2018)
62421                   A Girl Thing (2001)
62422        Women of Devil's Island (1962)
Name: title, Length: 62423, dtype: object

In [51]:
movies.to_dict()

{'movieId': {0: 1,
  1: 2,
  2: 3,
  3: 4,
  4: 5,
  5: 6,
  6: 7,
  7: 8,
  8: 9,
  9: 10,
  10: 11,
  11: 12,
  12: 13,
  13: 14,
  14: 15,
  15: 16,
  16: 17,
  17: 18,
  18: 19,
  19: 20,
  20: 21,
  21: 22,
  22: 23,
  23: 24,
  24: 25,
  25: 26,
  26: 27,
  27: 28,
  28: 29,
  29: 30,
  30: 31,
  31: 32,
  32: 33,
  33: 34,
  34: 35,
  35: 36,
  36: 37,
  37: 38,
  38: 39,
  39: 40,
  40: 41,
  41: 42,
  42: 43,
  43: 44,
  44: 45,
  45: 46,
  46: 47,
  47: 48,
  48: 49,
  49: 50,
  50: 51,
  51: 52,
  52: 53,
  53: 54,
  54: 55,
  55: 56,
  56: 57,
  57: 58,
  58: 59,
  59: 60,
  60: 61,
  61: 62,
  62: 63,
  63: 64,
  64: 65,
  65: 66,
  66: 67,
  67: 68,
  68: 69,
  69: 70,
  70: 71,
  71: 72,
  72: 73,
  73: 74,
  74: 75,
  75: 76,
  76: 77,
  77: 78,
  78: 79,
  79: 80,
  80: 81,
  81: 82,
  82: 83,
  83: 84,
  84: 85,
  85: 86,
  86: 87,
  87: 88,
  88: 89,
  89: 90,
  90: 92,
  91: 93,
  92: 94,
  93: 95,
  94: 96,
  95: 97,
  96: 98,
  97: 99,
  98: 100,
  99: 101,
  100:

In [52]:
pickle.dump(movies.to_dict(),open("movies_dict.pkl","wb"))

In [54]:
pickle.dump(ratings, open("ratings.pkl","wb"))

In [59]:
def find_similar_movies1(movie):
    movieId= movies[(movies['title']== movie)]["movieId"].iloc[0]
    similar_users = ratings[(ratings["movieId"] == movie_id)& (ratings["rating"]>=5)]["userId"].unique()
    similar_users_recs=ratings[(ratings["userId"].isin(similar_users))& (ratings["rating"]>=5)]["movieId"]
    
    similar_user_recs=similar_users_recs.value_counts() / len(similar_users)
    similar_user_recs= similar_user_recs[similar_user_recs > 0.1]
    
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] >=5)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"]= rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages =  rec_percentages.sort_values("score", ascending = False)
    
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score","title","genres"]]

In [58]:
# movieeeee= "Toy Story (1995)"
# MovieIDD = movies[(movies['title']== movieeeee)]["movieId"].iloc[0]
# MovieIDD

1

In [67]:
find_similar_movies1("Toy Story (1995)")["title"]


0                                  Toy Story (1995)
3021                             Toy Story 2 (1999)
14813                            Toy Story 3 (2010)
4780                          Monsters, Inc. (2001)
580                                  Aladdin (1992)
6258                            Finding Nemo (2003)
587                     Beauty and the Beast (1991)
359                           Lion King, The (1994)
1047     Willy Wonka & the Chocolate Factory (1971)
8246                        Incredibles, The (2004)
Name: title, dtype: object

In [None]:
find=find_similar_movies1("Toy Story (1995)")

for i in find:
    print(i)