## Recommending movies using Collaborative Filtering 

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import hamming

In [2]:
import warnings
warnings.simplefilter(action='ignore',category=Warning)

In [3]:
df_ratings = pd.read_csv("recent_ratings.csv")
df_movies = pd.read_csv('recent_movies.csv')

In [4]:
df_ratings.shape, df_movies.shape

((552, 4), (188, 4))

In [5]:
df_ratings.userId.unique().size  # No. of unique users 

63

In [6]:
df_ratings.movieId.unique().size  # No. of unique movies 

188

In [7]:
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
286,306,175199,4.0,1518380703
82,105,179133,5.0,1526207351
419,514,175303,3.0,1533874664
540,601,122916,3.5,1521397630
379,414,176751,4.0,1521844173


In [8]:
# Remove timestamp column  
df_ratings.drop(columns='timestamp', inplace=True)

In [9]:
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
313,338,185029,1.0
258,252,122896,3.0
237,249,166534,4.5
227,248,176371,2.5
481,586,122906,5.0


In [10]:
df_movies.sample(5)

Unnamed: 0,movieId,title,genres,year
60,173209,War Machine (2017),Comedy|Drama|War,2017
56,172705,Tickling Giants (2017),Documentary,2017
159,185033,I Kill Giants (2018),Drama|Fantasy|Thriller,2018
103,178111,"Fireworks, Should We See It from the Side or t...",Animation,2017
145,183897,Isle of Dogs (2018),Animation|Comedy,2018


In [11]:
# Get rating for each user and movie - userid is row label and movieid is column label 
ratings = df_ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating')

In [12]:
ratings.sample(5)

movieId,122896,122898,122906,122912,122916,122918,122926,143355,166534,167064,...,189381,189713,190183,190209,190215,191005,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18,,,,,,4.0,3.5,,,,...,,,,,,,,,,
50,,0.5,,,,1.0,,,,,...,,,3.5,,,,,,,
318,,,4.0,,,,,3.5,,,...,2.5,,,,,,,,,
448,,,,,,,,,,,...,,,,,,,,,,
610,,,,,,,,,4.0,,...,,,,,,,,,,


In [13]:
ratings.loc[233, ratings.loc[233,:].notna()]  # Ratings given by 233 

movieId
122912    2.0
168266    3.5
174055    2.5
177593    5.0
178061    3.0
180031    3.5
183011    1.5
183897    3.5
187593    2.5
Name: 233, dtype: float64

### Hamming Distance
Measures how different two sequences are. It is <b>% of disagreement </b>. A value of 1 indicates sequences are very different, 0 indicates they are very similar.

In [14]:
l1 = (1,2,4,np.nan)
l2 = (1,2,np.nan,3)
l3 = (1,np.nan, 3,5)
print(hamming(l1,l2))
print(hamming(l1,l3))

0.5
0.75


In [15]:
# Find out hamming distance between ratings of two users
def hamming_distance(user1,user2):
    # Ratings of a user 
    try:
        user1_ratings = ratings.loc[user1,:]
        user2_ratings = ratings.loc[user2,:]
        distance = hamming(user1_ratings, user2_ratings)
    except:
        distance = np.NaN

    return distance    

In [16]:
# Get neighbours of the given user 
def get_nearest_users(active_user, k = 10):
    all_users = pd.DataFrame(ratings.index) # UserIds 
    other_users = all_users[all_users.userId != active_user]
    
    other_users['distance'] = other_users['userId'].apply(lambda x: hamming_distance(active_user,x))
    
    # find out hamming distance and return users with low hamming distance from active user 
    return  other_users.sort_values(['distance'], ascending = True).userId[:k]

In [17]:
def get_recommended_movies(ratings, movies, user, top=5):
    # Find out nearest neighbours based on hamming distance 
    nn_users = get_nearest_users(user,10)
    print(nn_users)
    # Get ratings of nearest neighbours(users)
    nn_ratings = ratings[ratings.index.isin(nn_users)]
    
    # Average ratings given by nearest neighbours for all movies
    avg_ratings = nn_ratings.apply(np.nanmean).dropna()
    print(avg_ratings[:20])
    
    # Find out movies that user had already watched
    movies_watched = ratings.transpose()[user].dropna().index
    
    # remove movies that user already watched
    avg_ratings = avg_ratings[~ avg_ratings.index.isin(movies_watched)]
    
    # Findout top n movies based on avg ratings given by nearest neighbours 
    top_movies_ids = avg_ratings.sort_values(ascending=False).index[:top]
   
    # Return recommended movies 
    return movies[movies.movieId.isin(top_movies_ids)].title

In [18]:
get_recommended_movies(ratings,df_movies,249,5)

18    210
15    184
39    414
28    305
58    586
5      62
62    610
19    212
13    125
11    111
Name: userId, dtype: int64
movieId
122896    3.875000
122898    3.000000
122906    4.100000
122912    4.500000
122916    4.357143
122918    4.166667
122926    4.285714
143355    3.600000
166534    3.000000
167634    4.500000
167746    3.750000
168218    4.500000
168248    4.500000
168250    4.000000
168252    4.500000
168254    3.000000
168266    3.750000
168326    4.500000
168366    4.000000
168418    4.750000
dtype: float64


22                                 The Boss Baby (2017)
56                               Tickling Giants (2017)
97     Three Billboards Outside Ebbing, Missouri (2017)
105                                 Paddington 2 (2017)
145                                 Isle of Dogs (2018)
Name: title, dtype: object

In [19]:
get_recommended_movies(ratings, df_movies, 433,5)

14    153
0      18
33    339
34    362
35    363
36    380
37    400
38    401
39    414
40    417
Name: userId, dtype: int64
movieId
122898    3.000000
122906    3.750000
122912    5.000000
122916    4.500000
122918    4.500000
122926    4.333333
143355    3.750000
167746    5.000000
168248    5.000000
168250    3.500000
168252    4.200000
168254    4.000000
168326    5.000000
168492    4.500000
168612    4.000000
169982    3.000000
169984    2.000000
169992    3.500000
170939    3.500000
171023    4.000000
dtype: float64


3     Avengers: Infinity War - Part I (2018)
12              The Lego Batman Movie (2017)
15             John Wick: Chapter Two (2017)
20                       The Big Sick (2017)
23               Call Me by Your Name (2017)
Name: title, dtype: object

In [20]:
get_recommended_movies(ratings, df_movies, 125, 10)

60    599
22    249
5      62
6      68
28    305
35    363
49    514
24    258
46    475
39    414
Name: userId, dtype: int64
movieId
122896    3.500000
122898    4.000000
122906    3.500000
122912    4.333333
122916    3.833333
122918    3.777778
122926    4.083333
143355    3.625000
166534    4.500000
167746    4.000000
168248    3.750000
168250    3.375000
168252    4.187500
168254    4.000000
168266    3.750000
168326    4.000000
168492    3.000000
168608    3.000000
168612    4.000000
169958    1.000000
dtype: float64


1                                 Justice League (2017)
3                Avengers: Infinity War - Part I (2018)
6                     Untitled Spider-Man Reboot (2017)
8                                          Split (2017)
58                                    The Square (2017)
65                                 Seven Sisters (2017)
97     Three Billboards Outside Ebbing, Missouri (2017)
157                                A Quiet Place (2018)
158                                        Alpha (2018)
166                                   Deadpool 2 (2018)
Name: title, dtype: object