## Recommending movies using Collaborative Filtering 

In [19]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import hamming

In [20]:
import warnings

# Install a global "ignore" filter for the base Warning class, so no warning
# of any category is shown by the cells that follow.
warnings.simplefilter("ignore", category=Warning)

In [21]:
# Load the two CSV inputs (paths are relative to the working directory):
# the movie catalogue and the per-user ratings table.
df_movies = pd.read_csv("recent_movies.csv")
df_ratings = pd.read_csv("recent_ratings.csv")

In [22]:
# (rows, columns) of the ratings table as loaded from CSV
df_ratings.shape

(552, 4)

In [23]:
# Show 5 randomly drawn rows of the ratings table
df_ratings.sample(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
28,50,175661,1.5,1514240027
438,515,122918,5.0,1513602063
217,233,177593,5.0,1524781384
389,414,184791,2.5,1519592410
86,111,167634,4.5,1516153933


In [24]:
# Remove the timestamp column; it is not used by the recommender.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# idempotent: running it a second time no longer raises KeyError because the
# column is already gone.
df_ratings = df_ratings.drop(columns='timestamp', errors='ignore')

In [25]:
# Show 5 randomly drawn rows to confirm the timestamp column is gone
df_ratings.sample(n=5)

Unnamed: 0,userId,movieId,rating
451,556,187031,4.0
429,514,185033,2.0
35,62,122898,4.0
266,279,175303,5.0
65,89,176805,4.0


In [26]:
# Reshape the long ratings table into a user x movie matrix:
# one row per userId, one column per movieId, each cell holding that user's
# rating (NaN where the user has not rated the movie).
ratings = df_ratings.pivot(index='userId', columns='movieId', values='rating')

In [27]:
# Show 5 randomly drawn users (rows) of the user x movie matrix
ratings.sample(n=5)

movieId,122896,122898,122906,122912,122916,122918,122926,143355,166534,167064,...,189381,189713,190183,190209,190215,191005,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
515,,,,,5.0,5.0,,,,,...,,,,,,,,,,
362,,,,,,,,,,,...,,,,,,,,,,
305,,,,4.5,4.5,3.5,4.0,,,,...,,,,,,,,,,
448,,,,,,,,,,,...,,,,,,,,,,
514,,,2.0,,,,4.0,,,,...,,,,,,,,,,


### Hamming Distance
The Hamming distance measures how different two equal-length sequences are: it is the fraction of positions at which they disagree. A value of 1 means the sequences differ at every position; a value of 0 means they are identical.

In [28]:
# Find out hamming distance between two users
def hamming_distance(user1,user2):
    """Return the Hamming distance between the rating rows of two users.

    Both rows are read from the module-level ``ratings`` matrix (users as
    rows, movies as columns) and compared with
    ``scipy.spatial.distance.hamming``, i.e. the fraction of positions at
    which the two rows differ.

    Note that an unrated movie is stored as NaN and NaN never compares equal
    to anything, so every movie that at least one of the two users has not
    rated counts as a disagreement. Only movies that both users rated with
    the same value lower the distance.

    Parameters
    ----------
    user1, user2 :
        Row labels (user ids) in ``ratings``.

    Returns
    -------
    float
        Distance in ``[0, 1]``, or ``nan`` when a user id is not present in
        ``ratings`` or the two rows cannot be compared as 1-D vectors.
    """
    try:
        user1_ratings = ratings.loc[user1, :]
        user2_ratings = ratings.loc[user2, :]
        distance = hamming(user1_ratings, user2_ratings)
    except (KeyError, ValueError):
        # KeyError: unknown user id. ValueError: scipy rejected the inputs
        # (e.g. not 1-D). Anything else is a real error and must surface.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        distance = np.nan

    return distance

In [29]:
def get_nearest_users(active_user, k = 10):
    """Return the ids of the ``k`` users closest to ``active_user``.

    Closeness is the Hamming distance computed by ``hamming_distance`` on the
    module-level ``ratings`` matrix; the active user is excluded from the
    candidates.

    Parameters
    ----------
    active_user :
        User id whose neighbours are wanted.
    k : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.Series
        Series named ``userId`` holding up to ``k`` user ids, ordered from
        smallest to largest distance. Users whose distance is NaN are placed
        last by ``sort_values``.
    """
    all_users = pd.DataFrame(ratings.index)  # single 'userId' column

    # Take an explicit copy of the filtered frame: adding a column to a
    # boolean-mask slice is chained assignment (SettingWithCopyWarning).
    other_users = all_users[all_users.userId != active_user].copy()
    other_users['distance'] = other_users['userId'].apply(
        lambda other: hamming_distance(active_user, other)
    )

    # Smallest distance first, then keep the first k user ids.
    return other_users.sort_values(['distance'], ascending=True).userId[:k]

In [30]:
def get_recommended_movies(ratings,movies, user,top=5, n_neighbors=10):
    """Recommend movie titles to ``user`` from the ratings of similar users.

    The nearest neighbours of ``user`` are looked up with
    ``get_nearest_users``, their ratings are averaged per movie, movies the
    user has already rated are removed, and the ``top`` highest-averaged
    movies are selected.

    Notes
    -----
    * ``get_nearest_users`` is called with only the user id and neighbour
      count; it reads the module-level ratings matrix, so neighbour selection
      does not use the ``ratings`` argument passed here.
    * The selected titles are returned in the row order of ``movies`` (they
      are picked with a boolean mask), not ranked by average rating.

    Parameters
    ----------
    ratings : pandas.DataFrame
        User x movie matrix (user ids as index, movie ids as columns, NaN for
        unrated).
    movies : pandas.DataFrame
        Movie catalogue with ``movieId`` and ``title`` columns.
    user :
        Id of the user to recommend for; must be a row label of ``ratings``.
    top : int, default 5
        Maximum number of movies to recommend.
    n_neighbors : int, default 10
        Number of nearest users whose ratings are averaged.

    Returns
    -------
    pandas.Series
        The ``title`` values of the recommended movies.
    """
    # Users most similar to `user`
    nn_users = get_nearest_users(user, n_neighbors)

    # Rating rows of those neighbours only
    nn_ratings = ratings[ratings.index.isin(nn_users)]

    # Per-movie mean over the neighbours. DataFrame.mean skips NaN and gives
    # NaN (dropped here) for movies that no neighbour rated, without the
    # "Mean of empty slice" RuntimeWarning that np.nanmean emits.
    avg_ratings = nn_ratings.mean().dropna()

    # Movies the user has already rated (read the row directly instead of
    # transposing the whole matrix)
    movies_watched = ratings.loc[user].dropna().index

    # Keep only movies the user has not rated yet
    avg_ratings = avg_ratings[~avg_ratings.index.isin(movies_watched)]

    # Ids of the `top` best-averaged movies
    top_movies = avg_ratings.sort_values(ascending=False).index[:top]

    # Titles of the selected movies
    return movies[movies.movieId.isin(top_movies)].title

In [31]:
# Up to 10 recommendations for user 249
get_recommended_movies(ratings, df_movies, user=249, top=10)

22                                  The Boss Baby (2017)
28                           Neal Brennan: 3 Mics (2017)
38                                         Gifted (2017)
56                                Tickling Giants (2017)
73                                    Logan Lucky (2017)
97      Three Billboards Outside Ebbing, Missouri (2017)
105                                  Paddington 2 (2017)
125                           The Disaster Artist (2017)
132    Too Funny to Fail: The Life and Death of The D...
145                                  Isle of Dogs (2018)
Name: title, dtype: object

In [32]:
# Up to 5 recommendations for user 433
get_recommended_movies(ratings, df_movies, user=433, top=5)

3               Avengers: Infinity War - Part I (2018)
12                        The Lego Batman Movie (2017)
15                       John Wick: Chapter Two (2017)
20                                 The Big Sick (2017)
97    Three Billboards Outside Ebbing, Missouri (2017)
Name: title, dtype: object