## Recommending movies using Collaborative Filtering 

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import hamming

In [2]:
import warnings
warnings.simplefilter(action='ignore',category=Warning)

In [3]:
df_ratings = pd.read_csv("recent_ratings.csv")
df_movies = pd.read_csv('recent_movies.csv')

In [4]:
df_ratings.shape, df_movies.shape

((552, 4), (188, 4))

In [5]:
df_ratings.userId.unique().size  # No. of unique users 

63

In [6]:
df_ratings.movieId.unique().size  # No. of unique movies 

188

In [7]:
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
471,567,179491,1.0,1525290389
321,339,176371,4.5,1507502471
363,414,122926,4.5,1511535806
179,184,193585,3.5,1537109805
396,462,122926,3.0,1506203358


In [8]:
# Remove timestamp column  
df_ratings.drop(columns='timestamp', inplace=True)

In [9]:
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
384,414,179819,4.5
364,414,143355,4.5
422,514,179819,2.0
133,153,168492,4.5
183,209,185029,1.5


In [10]:
df_movies.sample(5)

Unnamed: 0,movieId,title,genres,year
116,179815,"Roman J. Israel, Esq. (2017)",Drama|Thriller,2017
96,177285,Sword Art Online The Movie: Ordinal Scale (2017),Action|Adventure|Animation|Fantasy|Sci-Fi,2017
162,185585,Pacific Rim: Uprising (2018),Action|Fantasy|Sci-Fi,2018
119,179953,A Bad Moms Christmas (2017),Comedy,2017
84,176051,LEGO DC Super Hero Girls: Brain Drain (2017),Animation,2017


In [11]:
# Get rating for each user and movie - userid is row label and movieid is column label 
ratings = df_ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating')

In [13]:
ratings.shape

(63, 188)

In [14]:
ratings.sample(5)

movieId,122896,122898,122906,122912,122916,122918,122926,143355,166534,167064,...,189381,189713,190183,190209,190215,191005,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
471,,,,,,,,,,,...,,,,,,,,,,
296,,,,,,,,,,,...,,,,,,,,,,
98,,,,5.0,5.0,4.0,5.0,,,,...,,,,,,,,,,
362,,,,,,,,,,,...,,,,,,,,,,
205,,,,,,,,,,,...,,,,,,,,,,


In [20]:
ratings.loc[233, ratings.loc[233,:].notna()]  # Ratings given by 233 

movieId
122912    2.0
168266    3.5
174055    2.5
177593    5.0
178061    3.0
180031    3.5
183011    1.5
183897    3.5
187593    2.5
Name: 233, dtype: float64

In [21]:
ratings.loc[471, ratings.loc[471,:].notna()]  # Ratings given by 471

movieId
168252    4.5
Name: 471, dtype: float64

In [22]:
df_movies.loc[df_movies.movieId == 168252, 'title']

17    Logan (2017)
Name: title, dtype: object

### Hamming Distance
Measures how different two sequences are. It is <b>% of disagreement </b>. A value of 1 indicates sequences are very different, 0 indicates they are very similar.

In [23]:
l1 = (1,2,4,np.nan)
l2 = (1,2,np.nan,3)
l3 = (1,np.nan, 3,5)
l4 = (np.nan, np.nan, np.nan, np.nan)
print(hamming(l1,l2))
print(hamming(l1,l3))
print(hamming(l1,l4))

0.5
0.75
1.0


In [24]:
# Find out hamming distance between ratings of two users
def hamming_distance(ratings, user1, user2):
    # Ratings of a user 
    try:
        user1_ratings = ratings.loc[user1,:]
        user2_ratings = ratings.loc[user2,:]
        distance = hamming(user1_ratings, user2_ratings)
    except:
        distance = np.NaN

    return distance    

In [26]:
# Get k neighbours of the given user based on hamming distance 
def get_nearest_users(ratings, active_user, k = 10):
    all_users = pd.DataFrame(ratings.index) # UserIds 
    other_users = all_users[all_users.userId != active_user]
    
    other_users['distance'] = other_users['userId'].apply(lambda userid: hamming_distance(ratings,active_user,userid))
    
    # find out hamming distance and return users with low hamming distance from active user 
    return  other_users.sort_values(['distance'], ascending = True).userId[:k]

In [27]:
print (get_nearest_users(ratings, 184))

22    249
28    305
19    212
2      25
57    567
30    318
8      98
49    514
50    515
0      18
Name: userId, dtype: int64


In [36]:
def get_recommended_movies(ratings, movies, user, top=5):
    # Find out nearest neighbours(userids) based on hamming distance 
    nn_users = get_nearest_users(ratings, user, 10)
    #print(nn_users)
    # Get ratings of nearest neighbours(users)
    nn_ratings = ratings[ratings.index.isin(nn_users)]
    #print(nn_ratings)
    # Average ratings given by nearest neighbours for all movies
    avg_ratings = nn_ratings.apply(np.nanmean).dropna()
    #print(avg_ratings)
    #print(avg_ratings.shape)
    #print(avg_ratings.sort_values(ascending=False)[:20])
    
    # Find out movies that user had already watched
    movies_watched = ratings.transpose()[user].dropna().index
    
    # remove movies that user already watched
    avg_ratings = avg_ratings[~ avg_ratings.index.isin(movies_watched)]
    
    # Findout top n movies based on avg ratings given by nearest neighbours 
    top_movies_ids = avg_ratings.sort_values(ascending=False).index[:top]
    print(top_movies_ids)
    # Return recommended movies 
    return movies[movies.movieId.isin(top_movies_ids)].title

In [37]:
get_recommended_movies(ratings, df_movies, 249, 5)

Index([172705, 178827, 183897, 177593, 168418], dtype='int64', name='movieId')


22                                 The Boss Baby (2017)
56                               Tickling Giants (2017)
97     Three Billboards Outside Ebbing, Missouri (2017)
105                                 Paddington 2 (2017)
145                                 Isle of Dogs (2018)
Name: title, dtype: object

In [38]:
get_recommended_movies(ratings, df_movies, 433,10)

Index([168326, 122912, 167746, 168248, 168492, 176371, 171763, 176751, 177593,
       173197],
      dtype='int64', name='movieId')


3               Avengers: Infinity War - Part I (2018)
12                        The Lego Batman Movie (2017)
15                       John Wick: Chapter Two (2017)
20                                 The Big Sick (2017)
23                         Call Me by Your Name (2017)
51                                  Baby Driver (2017)
58                                   The Square (2017)
87                            Blade Runner 2049 (2017)
92                                American Made (2017)
97    Three Billboards Outside Ebbing, Missouri (2017)
Name: title, dtype: object

In [39]:
get_recommended_movies(ratings, df_movies, 125, 10)

Index([177593, 185031, 166534, 173197, 187593, 122912, 185029, 173925, 122926,
       122898],
      dtype='int64', name='movieId')


1                                 Justice League (2017)
3                Avengers: Infinity War - Part I (2018)
6                     Untitled Spider-Man Reboot (2017)
8                                          Split (2017)
58                                    The Square (2017)
65                                 Seven Sisters (2017)
97     Three Billboards Outside Ebbing, Missouri (2017)
157                                A Quiet Place (2018)
158                                        Alpha (2018)
166                                   Deadpool 2 (2018)
Name: title, dtype: object