## Recommending movies using Collaborative Filtering 

In [47]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import hamming

In [29]:
import warnings
warnings.simplefilter(action='ignore',category=Warning)

In [30]:
df_ratings = pd.read_csv("recent_ratings.csv")
df_movies = pd.read_csv('recent_movies.csv')

In [31]:
df_ratings.shape, df_movies.shape

((552, 4), (188, 4))

In [32]:
df_ratings.userId.unique().size  # No. of unique users 

63

In [33]:
df_ratings.movieId.unique().size  # No. of unique movies 

188

In [34]:
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
391,417,168250,3.0,1529285245
59,68,122918,3.5,1526947569
344,380,173291,2.0,1508110319
235,249,122926,4.5,1501416792
36,62,122912,4.0,1526028975


In [35]:
# Remove timestamp column  
df_ratings.drop(columns='timestamp', inplace=True)

In [36]:
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
186,210,122918,5.0
114,111,181065,3.5
370,414,171765,4.0
209,212,180031,4.0
468,567,176419,3.0


In [37]:
# Get rating for each user and movie - userid is row label and movieid is column label 
ratings = df_ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating')

In [38]:
ratings.sample(5)

movieId,122896,122898,122906,122912,122916,122918,122926,143355,166534,167064,...,189381,189713,190183,190209,190215,191005,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
153,,,,,,,,,,,...,,,,,,,,,,
414,,,4.0,,4.0,4.0,4.5,4.5,,,...,,,,,,,,,,
556,,,,,,,,,,,...,,,,,,,,,,
417,,,,,,,,,,,...,,,,,,,,,,
98,,,,5.0,5.0,4.0,5.0,,,,...,,,,,,,,,,


### Hamming Distance
Measures how different two sequences are. It is % of disagreement. A value of 1 indicates sequences are very different, 0 indicates they are very similar.

In [39]:
l1 = (1,2,3)
l2 = (1,2,3)
l3 = (1,4,5)
print(hamming(l1,l2))
print(hamming(l1,l3))

0.0
0.6666666666666666


In [40]:
# Find out hamming distance between two users
def hamming_distance(user1,user2):
    # Ratings of a user 
    try:
        user1_ratings = ratings.loc[user1,:]
        user2_ratings = ratings.loc[user2,:]
        distance = hamming(user1_ratings, user2_ratings)
    except:
        distance = np.NaN

    return distance    

In [41]:
# Get neighbours of the give user 
def get_nearest_users(active_user, k = 10):
    all_users = pd.DataFrame(ratings.index) # UserIds 
    other_users = all_users[all_users.userId != active_user]
    
    other_users['distance'] = other_users['userId'].apply(lambda x: hamming_distance(active_user,x))
    
    # find out hamming distance and return users with low hamming distance from active user 
    return  other_users.sort_values(['distance'], ascending = True).userId[:k]

In [42]:
def get_recommended_movies(ratings,movies, user,top=5):
    # Find out nearest neighbours based on hamming distance 
    nn_users = get_nearest_users(user,10)
    
    # Get ratings of other nearest neighbours(users)
    nn_ratings = ratings[ratings.index.isin(nn_users)]
    
    # Average ratings gived by nearest neighbours for all movies
    avg_ratings = nn_ratings.apply(np.nanmean).dropna()
    
    # Find out movies that user had already watched
    movies_watched = ratings.transpose()[user].dropna().index
    
    # remove movies that user already watched
    avg_ratings = avg_ratings[~ avg_ratings.index.isin(movies_watched)]
    
    # Findout top n movies based on avg ratings given by other nearest neighbours 
    top_movies_ids = avg_ratings.sort_values(ascending=False).index[:top]
   
    # Return recommended movies 
    return movies[movies.movieId.isin(top_movies_ids)].title

In [43]:
df_ratings.userId.unique()    # Unique userids

array([ 18,  21,  25,  49,  50,  62,  68,  89,  98, 103, 105, 111, 119,
       125, 153, 184, 205, 209, 210, 212, 233, 248, 249, 252, 258, 272,
       279, 296, 305, 306, 318, 331, 338, 339, 362, 363, 380, 400, 401,
       414, 417, 433, 448, 459, 462, 471, 475, 491, 511, 514, 515, 517,
       523, 548, 551, 556, 561, 567, 586, 596, 599, 601, 610], dtype=int64)

In [44]:
get_recommended_movies(ratings,df_movies,249,5)

22                                 The Boss Baby (2017)
56                               Tickling Giants (2017)
97     Three Billboards Outside Ebbing, Missouri (2017)
105                                 Paddington 2 (2017)
145                                 Isle of Dogs (2018)
Name: title, dtype: object

In [45]:
get_recommended_movies(ratings,df_movies, 433,5)

3               Avengers: Infinity War - Part I (2018)
12                        The Lego Batman Movie (2017)
15                       John Wick: Chapter Two (2017)
20                                 The Big Sick (2017)
97    Three Billboards Outside Ebbing, Missouri (2017)
Name: title, dtype: object

In [46]:
get_recommended_movies(ratings,df_movies, 125,10)

3                Avengers: Infinity War - Part I (2018)
6                     Untitled Spider-Man Reboot (2017)
8                                          Split (2017)
58                                    The Square (2017)
65                                 Seven Sisters (2017)
90                                        Icarus (2017)
97     Three Billboards Outside Ebbing, Missouri (2017)
157                                A Quiet Place (2018)
158                                        Alpha (2018)
166                                   Deadpool 2 (2018)
Name: title, dtype: object