## Recommending movies using Collaborative Filtering 

In [11]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import hamming

In [12]:
import warnings
# Ignore warnings for the rest of the session. `Warning` is the base class of
# every warning category, so this filter hides all of them, including any
# pandas / NumPy warnings raised by later cells.
# NOTE(review): consider narrowing `category` so genuine problems stay visible.
warnings.simplefilter(action='ignore',category=Warning)

In [13]:
# Load the two input tables from CSV files located in the working directory:
# the movie catalogue and the user ratings.
df_movies = pd.read_csv("recent_movies.csv")
df_ratings = pd.read_csv("recent_ratings.csv")

In [14]:
# Display (rows, columns) of the ratings table as a quick check on the load.
df_ratings.shape

(552, 4)

In [15]:
# Preview five randomly drawn rating rows. No random_state is passed, so the
# rows shown differ from run to run.
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
123,125,122918,3.5,1494785239
58,62,187595,4.0,1528934550
100,111,170957,3.0,1517440869
99,111,170937,3.0,1517440974
498,596,122916,5.0,1535627433


In [16]:
# Remove the timestamp column.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' is passed, so this cell is idempotent: re-running it after
# the column is already gone is a no-op instead of a KeyError.
df_ratings = df_ratings.drop(columns='timestamp', errors='ignore')

In [17]:
# Preview five random rows again to inspect the columns that remain
# (unseeded sample, so the rows differ from run to run).
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
181,205,179817,4.5
352,380,182823,3.0
151,184,168218,4.5
356,380,188301,4.0
85,111,166534,0.5


In [18]:
# Reshape the long ratings table into a user x movie matrix:
# one row per userId, one column per movieId, each cell holding that rating.
ratings = df_ratings.pivot(index='userId', columns='movieId', values='rating')

In [19]:
# Preview five randomly chosen user rows of the pivoted matrix
# (unseeded sample, so the rows differ from run to run).
ratings.sample(5)

movieId,122896,122898,122906,122912,122916,122918,122926,143355,166534,167064,...,189381,189713,190183,190209,190215,191005,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
68,,,,,,3.5,,,,,...,,,,,,,,,,
153,,,,,,,,,,,...,,,,,,,,,,
601,,,,,3.5,4.0,,,,,...,,,,,,,,,,
205,,,,,,,,,,,...,,,,,,,,,,
62,3.5,4.0,,4.0,4.0,4.0,4.0,3.5,4.5,,...,,,,,,,,,,


### Hamming Distance
The Hamming distance measures how different two equal-length sequences are: it is the fraction of positions at which the two sequences disagree. A value of 1 means the sequences differ at every position; a value of 0 means they are identical.

In [20]:
# Find out hamming distance between two users
def hamming_distance(user1,user2):
    """Return the Hamming distance between the rating rows of two users.

    Each user's row is read from the module-level ``ratings`` frame and the
    two rows are compared position by position with
    ``scipy.spatial.distance.hamming``, i.e. the result is the fraction of
    positions whose values differ. A missing rating is NaN, and NaN never
    compares equal to anything, so a position counts as agreement only when
    both users gave that movie the same rating.

    Parameters
    ----------
    user1, user2
        Row labels of ``ratings`` identifying the two users.

    Returns
    -------
    float
        The distance between 0 and 1, or NaN when either row cannot be looked
        up or the rows cannot be compared.
    """
    try:
        user1_ratings = ratings.loc[user1,:]
        user2_ratings = ratings.loc[user2,:]
        distance = hamming(user1_ratings, user2_ratings)
    except (KeyError, TypeError, ValueError):
        # Unknown label or incomparable rows: report "no distance" instead of
        # failing. Only lookup/shape errors are caught so that interrupts and
        # unrelated bugs still surface. np.nan is used because the np.NaN
        # alias no longer exists in NumPy 2.0.
        distance = np.nan

    return distance

In [21]:
def get_nearest_users(active_user, k = 10):
    """Return the ids of the ``k`` users closest to ``active_user``.

    Closeness is the value returned by ``hamming_distance(active_user, other)``
    for every other row label of the module-level ``ratings`` frame; smaller
    means closer.

    Parameters
    ----------
    active_user
        Row label of ``ratings`` for the user to find neighbours for.
    k : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.Series
        The ``userId`` values of the nearest users, ordered by ascending
        distance. Users whose distance is NaN sort last.
    """
    all_users = pd.DataFrame(ratings.index) # UserIds
    other_users = all_users[all_users.userId != active_user]

    # Compute the distances first and attach them with assign(), which returns
    # a new frame. Writing a column directly onto the filtered slice is what
    # triggers pandas' SettingWithCopyWarning.
    distances = other_users['userId'].apply(lambda x: hamming_distance(active_user,x))
    ranked = other_users.assign(distance=distances).sort_values(['distance'], ascending = True)

    # Users with the lowest Hamming distance from the active user come first
    return ranked.userId[:k]

In [22]:
def get_recommended_movies(ratings,movies, user,top=5, k=10):
    """Recommend movies to ``user`` from the ratings of the nearest users.

    The nearest users are obtained from ``get_nearest_users(user, k)``. Their
    ratings are averaged per movie, movies that ``user`` has already rated are
    removed, and the ``top`` movies with the highest average are looked up in
    ``movies``.

    Parameters
    ----------
    ratings : pandas.DataFrame
        User x movie matrix (row labels are user ids, column labels are movie
        ids); missing ratings are NaN.
    movies : pandas.DataFrame
        Catalogue with at least the columns ``movieId`` and ``title``.
    user
        Row label of ``ratings`` for the user to recommend to.
    top : int, default 5
        Maximum number of movies to recommend.
    k : int, default 10
        Number of nearest users whose ratings are averaged.

    Returns
    -------
    pandas.Series
        Titles of the recommended movies. Rows keep the order and index they
        have in ``movies``; they are not sorted by average rating.

    Raises
    ------
    KeyError
        If ``user`` is not a row label of ``ratings``.
    """
    # Find out nearest neighbours based on hamming distance
    nn_users = get_nearest_users(user, k)

    # Get ratings of the nearest neighbours (users)
    nn_ratings = ratings[ratings.index.isin(nn_users)]

    # Average rating given by the nearest neighbours for every movie.
    # DataFrame.mean() skips NaN and yields NaN for a movie none of them rated
    # (dropped right after), without the "Mean of empty slice" RuntimeWarning
    # that np.nanmean emits for such columns.
    avg_ratings = nn_ratings.mean().dropna()

    # Find out movies that the user has already rated (row lookup; no need to
    # transpose the whole matrix)
    movies_watched = ratings.loc[user].dropna().index

    # Remove movies that the user already watched
    avg_ratings = avg_ratings[~ avg_ratings.index.isin(movies_watched)]

    # Find the top n movies based on the average rating given by the neighbours
    top_movies_ids = avg_ratings.sort_values(ascending=False).index[:top]

    # Return recommended movies
    return movies[movies.movieId.isin(top_movies_ids)].title

In [23]:
# Example: request up to 10 recommended titles for user 249.
get_recommended_movies(ratings,df_movies,249,10)

22                                  The Boss Baby (2017)
28                           Neal Brennan: 3 Mics (2017)
38                                         Gifted (2017)
56                                Tickling Giants (2017)
73                                    Logan Lucky (2017)
97      Three Billboards Outside Ebbing, Missouri (2017)
105                                  Paddington 2 (2017)
125                           The Disaster Artist (2017)
132    Too Funny to Fail: The Life and Death of The D...
145                                  Isle of Dogs (2018)
Name: title, dtype: object

In [24]:
# Example: request up to 5 recommended titles for user 433.
get_recommended_movies(ratings,df_movies, 433,5)

3               Avengers: Infinity War - Part I (2018)
12                        The Lego Batman Movie (2017)
15                       John Wick: Chapter Two (2017)
20                                 The Big Sick (2017)
97    Three Billboards Outside Ebbing, Missouri (2017)
Name: title, dtype: object