In [1]:
!! cd

['C:\\Users\\nagal\\demo\\movies']

# Recommending movies using Collaborative Filtering

### Import libraries

In [2]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import hamming

In [4]:
import warnings
warnings.simplefilter(action = "ignore", category = Warning)

In [5]:
# Load the ratings and the movie metadata; both paths are relative to the
# notebook's working directory.
df_ratings, df_movies = (
    pd.read_csv(csv_name) for csv_name in ("recent_ratings.csv", "recent_movies.csv")
)

In [6]:
df_ratings.shape, df_movies.shape

((552, 4), (188, 4))

In [8]:
df_ratings.userId

0       18
1       18
2       18
3       18
4       18
      ... 
547    610
548    610
549    610
550    610
551    610
Name: userId, Length: 552, dtype: int64

In [14]:
df_ratings.userId.unique().size # NUMBER OF UNIQUE USERS

63

In [15]:
df_movies.movieId.unique().size # NUMBER OF UNIQUE MOVIES

188

In [16]:
df_ratings.sample(10)

Unnamed: 0,userId,movieId,rating,timestamp
109,111,176329,4.0,1517440844
192,212,122906,3.0,1527795017
320,339,174055,5.0,1507760155
68,89,179211,2.5,1520408792
373,414,173209,4.0,1511535879
169,184,180497,4.5,1537109959
279,305,180031,3.0,1521100564
495,596,122898,3.5,1535711652
281,305,187593,5.0,1532877841
377,414,176371,5.0,1511535779


In [17]:
df_movies.sample(10)

Unnamed: 0,movieId,title,genres,year
171,188675,Dogman (2018),Crime|Drama,2018
71,174727,Good Time (2017),Crime|Drama,2017
21,168366,Beauty and the Beast (2017),Fantasy|Romance,2017
33,169992,Free Fire (2017),Action|Crime|Drama,2017
116,179815,"Roman J. Israel, Esq. (2017)",Drama|Thriller,2017
161,185473,Blockers (2018),Comedy,2018
103,178111,"Fireworks, Should We See It from the Side or t...",Animation,2017
100,177765,Coco (2017),Adventure|Animation|Children,2017
8,166534,Split (2017),Drama|Horror|Thriller,2017
54,172461,Get Me Roger Stone (2017),Documentary,2017


In [18]:
# REMOVE TIMESTAMP COLUMN
# Reassign instead of mutating in place, and tolerate an already-missing column,
# so re-running this cell does not raise KeyError.
df_ratings = df_ratings.drop(columns="timestamp", errors="ignore")

In [19]:
df_ratings.sample(10)

Unnamed: 0,userId,movieId,rating
31,50,176371,3.0
188,210,177765,5.0
334,380,122926,5.0
286,306,175199,4.0
527,599,173291,3.0
373,414,173209,4.0
13,25,177593,5.0
137,153,175303,0.5
471,567,179491,1.0
86,111,167634,4.5


In [24]:
# Reshape the long (userId, movieId, rating) table into a wide matrix:
# one row per userId, one column per movieId, NaN where a user has no rating.
ratings = pd.pivot(
    df_ratings,
    index="userId",
    columns="movieId",
    values="rating",
)

In [25]:
ratings.sample(5)

movieId,122896,122898,122906,122912,122916,122918,122926,143355,166534,167064,...,189381,189713,190183,190209,190215,191005,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
98,,,,5.0,5.0,4.0,5.0,,,,...,,,,,,,,,,
548,,,,,,,,,,,...,,,,,,,,,,
414,,,4.0,,4.0,4.0,4.5,4.5,,,...,,,,,,,,,,
601,,,,,3.5,4.0,,,,,...,,,,,,,,,,
338,,,,1.5,1.0,,,,,,...,,,,4.0,1.5,,,,,


In [26]:
ratings.loc[233]

movieId
122896    NaN
122898    NaN
122906    NaN
122912    2.0
122916    NaN
         ... 
191005    NaN
193581    NaN
193583    NaN
193585    NaN
193587    NaN
Name: 233, Length: 188, dtype: float64

In [28]:
# Only the movies user 233 actually rated (NaN entries removed)
ratings.loc[233].dropna()

movieId
122912    2.0
168266    3.5
174055    2.5
177593    5.0
178061    3.0
180031    3.5
183011    1.5
183897    3.5
187593    2.5
Name: 233, dtype: float64

# Hamming Distance

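### Measures how different two equal-length sequences are: it is the fraction of positions at which they disagree. A value of 1 means the sequences differ at every position; 0 means they are identical. Note that NaN never compares equal to anything, so a position with a missing value counts as a disagreement.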

In [55]:
# Toy vectors of equal length; np.nan stands for a missing entry.
l1 = (1, 2, 4, np.nan)
l2 = (1, 2, np.nan, 3)
l3 = (1, np.nan, 4, 5)
# Distance from l1 to each of the other two vectors, one value per line.
print(*(hamming(l1, candidate) for candidate in (l2, l3)), sep="\n")

0.5
0.5


In [56]:
# Find out hamming distance between ratings of two users
def hamming_distance(user1, user2):
    """Return the Hamming distance between the rating rows of two users.

    Both users are looked up by row label in the module-level ``ratings``
    frame and the two rows are handed to ``scipy.spatial.distance.hamming``.

    Parameters
    ----------
    user1, user2
        Row labels of ``ratings`` identifying the two users to compare.

    Returns
    -------
    float
        Fraction of columns at which the two rows differ, or ``np.nan`` when
        a user cannot be looked up or the rows cannot be compared.

    Notes
    -----
    NaN never compares equal (not even to another NaN), so every column in
    which either row has a missing rating counts as a disagreement.
    """
    try:
        user1_ratings = ratings.loc[user1, :]
        user2_ratings = ratings.loc[user2, :]
        return hamming(user1_ratings, user2_ratings)
    except (KeyError, ValueError):
        # KeyError: unknown user label. ValueError: rows that scipy cannot
        # compare (e.g. a lookup that did not yield a single 1-D row).
        # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
        return np.nan

In [57]:
# Get neighbours of the given user 
def get_nearest_users(active_user, k = 10):
    """Return the ``k`` users closest to ``active_user`` by Hamming distance.

    Candidate users are the row labels of the module-level ``ratings`` frame
    (its index is expected to be named ``userId``), excluding ``active_user``.

    Parameters
    ----------
    active_user
        Row label of the user whose neighbours are wanted.
    k : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.Series
        Up to ``k`` user ids named ``userId``, ordered by ascending distance.
        Users whose distance is NaN are sorted last.
    """
    all_users = pd.DataFrame(ratings.index)  # one column, named after the index

    # Take an explicit copy: adding a column to a boolean-filtered slice
    # triggers pandas' SettingWithCopyWarning.
    other_users = all_users[all_users.userId != active_user].copy()

    other_users["distance"] = other_users["userId"].apply(
        lambda other_user: hamming_distance(active_user, other_user)
    )

    # Smallest distance first, then keep the first k user ids.
    return other_users.sort_values(["distance"], ascending=True).userId[:k]

In [58]:
def get_recommended_movies(ratings, movies, user, top=5, k=10, verbose=True):
    """Recommend movies to ``user`` from the ratings of their nearest neighbours.

    The ``k`` nearest users are obtained from ``get_nearest_users``; each movie
    is scored by the mean rating those neighbours gave it, movies ``user`` has
    already rated are discarded, and the ``top`` best-scoring ones are returned.

    Parameters
    ----------
    ratings : pandas.DataFrame
        User x movie rating matrix (row labels are user ids, column labels are
        movie ids, NaN where there is no rating).
    movies : pandas.DataFrame
        Movie metadata; must provide ``movieId`` and ``title`` columns.
    user
        Row label of the user to recommend for.
    top : int, default 5
        Maximum number of titles to return.
    k : int, default 10
        Number of nearest neighbours whose ratings are averaged.
    verbose : bool, default True
        When true, print the neighbour ids and the first 20 average ratings
        (the function's historical behaviour). Pass ``False`` to silence it.

    Returns
    -------
    pandas.Series
        Titles of the recommended movies, best average rating first. The index
        holds the corresponding row labels of ``movies``.

    Raises
    ------
    KeyError
        If ``user`` is not a row label of ``ratings``.

    Notes
    -----
    ``get_nearest_users`` reads the module-level ``ratings`` frame, not the
    ``ratings`` argument passed here, so the two should be the same frame.
    """
    # Fail fast: without a rating row there is nothing to compare or exclude.
    if user not in ratings.index:
        raise KeyError(user)

    # Find out nearest neighbours based on hamming distance
    nn_users = get_nearest_users(user, k)
    if verbose:
        print(nn_users)

    # Ratings given by the nearest neighbours
    nn_ratings = ratings[ratings.index.isin(nn_users)]

    # Per-movie mean over the neighbours. DataFrame.mean skips NaN like
    # np.nanmean but does not warn on columns nobody rated; those columns come
    # out as NaN and are dropped.
    avg_ratings = nn_ratings.mean(axis=0).dropna()
    if verbose:
        print(avg_ratings[:20])

    # Movies the user has already rated are not recommended again
    movies_watched = ratings.loc[user].dropna().index
    avg_ratings = avg_ratings[~avg_ratings.index.isin(movies_watched)]

    # Ids of the best-rated remaining movies, highest average first
    top_movies_ids = avg_ratings.sort_values(ascending=False).index[:top]

    # isin() alone would return the rows in `movies` order and lose the ranking,
    # so reorder the selected rows by each movie's position in top_movies_ids.
    rank_of = {movie_id: position for position, movie_id in enumerate(top_movies_ids)}
    recommended = movies[movies.movieId.isin(top_movies_ids)]
    order = np.argsort(recommended.movieId.map(rank_of).to_numpy(), kind="stable")
    return recommended["title"].iloc[order]

In [59]:
get_recommended_movies(ratings,df_movies,249,5)

18    210
15    184
39    414
28    305
58    586
5      62
62    610
19    212
13    125
11    111
Name: userId, dtype: int64
movieId
122896    3.875000
122898    3.000000
122906    4.100000
122912    4.500000
122916    4.357143
122918    4.166667
122926    4.285714
143355    3.600000
166534    3.000000
167634    4.500000
167746    3.750000
168218    4.500000
168248    4.500000
168250    4.000000
168252    4.500000
168254    3.000000
168266    3.750000
168326    4.500000
168366    4.000000
168418    4.750000
dtype: float64


22                                 The Boss Baby (2017)
56                               Tickling Giants (2017)
97     Three Billboards Outside Ebbing, Missouri (2017)
105                                 Paddington 2 (2017)
145                                 Isle of Dogs (2018)
Name: title, dtype: object

In [60]:
get_recommended_movies(ratings, df_movies, 433,5)

14    153
0      18
33    339
34    362
35    363
36    380
37    400
38    401
39    414
40    417
Name: userId, dtype: int64
movieId
122898    3.000000
122906    3.750000
122912    5.000000
122916    4.500000
122918    4.500000
122926    4.333333
143355    3.750000
167746    5.000000
168248    5.000000
168250    3.500000
168252    4.200000
168254    4.000000
168326    5.000000
168492    4.500000
168612    4.000000
169982    3.000000
169984    2.000000
169992    3.500000
170939    3.500000
171023    4.000000
dtype: float64


3     Avengers: Infinity War - Part I (2018)
12              The Lego Batman Movie (2017)
15             John Wick: Chapter Two (2017)
20                       The Big Sick (2017)
23               Call Me by Your Name (2017)
Name: title, dtype: object

In [62]:
get_recommended_movies(ratings,df_movies,18,10)

39    414
19    212
35    363
61    601
30    318
26    279
21    248
17    209
15    184
54    551
Name: userId, dtype: int64
movieId
122896    3.750000
122898    2.000000
122906    3.750000
122912    4.500000
122916    4.000000
122918    4.214286
122926    4.166667
143355    3.625000
168218    4.500000
168248    4.000000
168250    3.666667
168252    3.900000
168254    3.000000
168326    4.500000
168366    4.000000
168492    4.000000
168612    3.750000
169992    3.500000
170697    4.166667
170813    2.000000
dtype: float64


51                                    Baby Driver (2017)
56                                Tickling Giants (2017)
73                                    Logan Lucky (2017)
101              The Night Is Short, Walk on Girl (2017)
105                                  Paddington 2 (2017)
109                                Blue Planet II (2017)
118                      Star Wars: The Last Jedi (2017)
125                           The Disaster Artist (2017)
132    Too Funny to Fail: The Life and Death of The D...
145                                  Isle of Dogs (2018)
Name: title, dtype: object

In [65]:
get_recommended_movies(ratings,df_movies,50,5)

20    233
0      18
34    362
35    363
36    380
37    400
38    401
39    414
40    417
41    433
Name: userId, dtype: int64
movieId
122898    3.000000
122906    3.750000
122912    3.500000
122916    4.500000
122918    4.500000
122926    4.333333
143355    3.750000
167746    5.000000
168248    5.000000
168250    3.500000
168252    4.200000
168254    4.000000
168266    3.500000
168326    5.000000
168612    4.000000
169982    3.000000
169984    2.000000
169992    3.500000
170939    3.500000
171023    4.000000
dtype: float64


12                        The Lego Batman Movie (2017)
15                       John Wick: Chapter Two (2017)
20                                 The Big Sick (2017)
57               War for the Planet of the Apes (2017)
97    Three Billboards Outside Ebbing, Missouri (2017)
Name: title, dtype: object