## Recommending movies using Collaborative Filtering 

In [34]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import hamming

In [35]:
import warnings
warnings.simplefilter(action='ignore',category=Warning)

In [36]:
df_ratings = pd.read_csv("recent_ratings.csv")
df_movies = pd.read_csv('recent_movies.csv')

In [37]:
df_ratings.shape, df_movies.shape

((552, 4), (188, 4))

In [38]:
df_ratings.userId.unique().size  # No. of unique users 

63

In [39]:
df_ratings.movieId.unique().size  # No. of unique movies 

188

In [40]:
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
182,209,177593,4.5,1524522248
255,249,185029,4.5,1537293500
221,233,183897,3.5,1536181590
477,567,182823,1.0,1525289422
285,306,168418,3.5,1518381253


In [41]:
# Remove timestamp column  
df_ratings.drop(columns='timestamp', inplace=True)

In [42]:
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
379,414,176751,4.0
143,153,184471,0.5
478,567,183897,3.5
457,567,122918,3.5
305,331,187593,4.0


In [43]:
df_movies.sample(5)

Unnamed: 0,movieId,title,genres,year
114,179749,Creep 2 (2017),Comedy|Horror,2017
173,188797,Tag (2018),Comedy,2018
131,181315,Phantom Thread (2017),Drama|Romance,2017
64,173291,Valerian and the City of a Thousand Planets (2...,Action|Adventure|Sci-Fi,2017
85,176101,Kingsman: The Golden Circle (2017),Action|Adventure|Comedy,2017


In [44]:
# Get rating for each user and movie - userid is row label and movieid is column label 
ratings = df_ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating')

In [45]:
ratings.sample(5)

movieId,122896,122898,122906,122912,122916,122918,122926,143355,166534,167064,...,189381,189713,190183,190209,190215,191005,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
401,,,,,,4.5,,,,,...,,,,,,,,,,
305,,,,4.5,4.5,3.5,4.0,,,,...,,,,,,,,,,
119,,,,,,,,,,,...,,,,,,,,,,
551,,,,,,4.0,,,,,...,,,,,,,,,,
306,,,,,,3.0,,,4.0,,...,,,,,,,,,,


In [46]:
ratings.loc[233, ratings.loc[233,:].notna()]  # Ratings given by 233 

movieId
122912    2.0
168266    3.5
174055    2.5
177593    5.0
178061    3.0
180031    3.5
183011    1.5
183897    3.5
187593    2.5
Name: 233, dtype: float64

### Hamming Distance
Measures how different two sequences are. It is <b>% of disagreement </b>. A value of 1 indicates sequences are very different, 0 indicates they are very similar.

In [47]:
l1 = (1,2,4,np.nan)
l2 = (1,2,np.nan,3)
l3 = (1,np.nan, 3,5)
print(hamming(l1,l2))
print(hamming(l1,l3))

0.5
0.75


In [48]:
# Find out hamming distance between ratings of two users
def hamming_distance(user1,user2):
    # Ratings of a user 
    try:
        user1_ratings = ratings.loc[user1,:]
        user2_ratings = ratings.loc[user2,:]
        distance = hamming(user1_ratings, user2_ratings)
    except:
        distance = np.NaN

    return distance    

In [49]:
# Get neighbours of the given user 
def get_nearest_users(active_user, k = 10):
    all_users = pd.DataFrame(ratings.index)    # UserIds 
    other_users = all_users[all_users.userId != active_user] # all other users 
    
    other_users['distance'] = other_users['userId'].apply(lambda x: hamming_distance(active_user,x))
    
    # find out hamming distance and return users with low hamming distance from active user 
    return  other_users.sort_values(['distance'], ascending = True).userId[:k]

In [54]:
def get_recommended_movies(ratings, movies, user, top=5):
    # Find out nearest neighbours based on hamming distance 
    nn_users = get_nearest_users(user,10)
    print(nn_users)
    # Get ratings of nearest neighbours(users)
    nn_ratings = ratings[ratings.index.isin(nn_users)]
    
    # Average ratings given by nearest neighbours for all movies
    avg_ratings = nn_ratings.apply(np.nanmean).dropna()
    print(avg_ratings[:20])
    
    # Find out movies that user had already watched
    movies_watched = ratings.transpose()[user].dropna().index
    
    # remove movies that user already watched
    avg_ratings = avg_ratings[~ avg_ratings.index.isin(movies_watched)]
    
    # Findout top n movies based on avg ratings given by nearest neighbours 
    top_movies_ids = avg_ratings.sort_values(ascending=False).index[:top]
   
    print(avg_ratings.sort_values(ascending=False)[:top])
    
    # Return recommended movies 
    return movies[movies.movieId.isin(top_movies_ids)].title

In [56]:
get_recommended_movies(ratings,df_movies,249,5)

18    210
15    184
39    414
28    305
58    586
5      62
62    610
19    212
13    125
11    111
Name: userId, dtype: int64
movieId
122896    3.875000
122898    3.000000
122906    4.100000
122912    4.500000
122916    4.357143
122918    4.166667
122926    4.285714
143355    3.600000
166534    3.000000
167634    4.500000
167746    3.750000
168218    4.500000
168248    4.500000
168250    4.000000
168252    4.500000
168254    3.000000
168266    3.750000
168326    4.500000
168366    4.000000
168418    4.750000
dtype: float64
movieId
172705    5.00
178827    5.00
183897    5.00
177593    4.75
168418    4.75
dtype: float64


22                                 The Boss Baby (2017)
56                               Tickling Giants (2017)
97     Three Billboards Outside Ebbing, Missouri (2017)
105                                 Paddington 2 (2017)
145                                 Isle of Dogs (2018)
Name: title, dtype: object

In [52]:
get_recommended_movies(ratings, df_movies, 433,5)

14    153
0      18
33    339
34    362
35    363
36    380
37    400
38    401
39    414
40    417
Name: userId, dtype: int64
movieId
122898    3.000000
122906    3.750000
122912    5.000000
122916    4.500000
122918    4.500000
122926    4.333333
143355    3.750000
167746    5.000000
168248    5.000000
168250    3.500000
168252    4.200000
168254    4.000000
168326    5.000000
168492    4.500000
168612    4.000000
169982    3.000000
169984    2.000000
169992    3.500000
170939    3.500000
171023    4.000000
dtype: float64


3     Avengers: Infinity War - Part I (2018)
12              The Lego Batman Movie (2017)
15             John Wick: Chapter Two (2017)
20                       The Big Sick (2017)
23               Call Me by Your Name (2017)
Name: title, dtype: object

In [57]:
get_recommended_movies(ratings, df_movies, 125, 10)

60    599
22    249
5      62
6      68
28    305
35    363
49    514
24    258
46    475
39    414
Name: userId, dtype: int64
movieId
122896    3.500000
122898    4.000000
122906    3.500000
122912    4.333333
122916    3.833333
122918    3.777778
122926    4.083333
143355    3.625000
166534    4.500000
167746    4.000000
168248    3.750000
168250    3.375000
168252    4.187500
168254    4.000000
168266    3.750000
168326    4.000000
168492    3.000000
168608    3.000000
168612    4.000000
169958    1.000000
dtype: float64
movieId
177593    4.500000
185031    4.500000
166534    4.500000
173197    4.500000
187593    4.375000
122912    4.333333
185029    4.250000
173925    4.250000
122926    4.083333
122898    4.000000
dtype: float64


1                                 Justice League (2017)
3                Avengers: Infinity War - Part I (2018)
6                     Untitled Spider-Man Reboot (2017)
8                                          Split (2017)
58                                    The Square (2017)
65                                 Seven Sisters (2017)
97     Three Billboards Outside Ebbing, Missouri (2017)
157                                A Quiet Place (2018)
158                                        Alpha (2018)
166                                   Deadpool 2 (2018)
Name: title, dtype: object