# Collaborative Filtering - User - User similarity
Collaborative filtering is built around the premise that users who have ranked items similarly in the past have similar tastes, and therefore are likely to rate new items in a similar fashion.

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
user_ratings_df = pd.read_csv('user_ratings.csv')

In [3]:
user_ratings_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [15]:
user_ratings_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [58]:
print('number of unique movie titles are : ', len(user_ratings_df.title.unique()))
print('number of unique movieIDs are     : ', len(user_ratings_df.movieId.unique()))
print('number of unique userIDs are      : ', len(user_ratings_df.userId.unique()))      

number of unique movie titles are :  9446
number of unique movieIDs are     :  9724
number of unique userIDs are      :  610


In [38]:
user_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 4.6+ MB


As we can note that there are there are total 9724 unique movie ids, whereas 9446 unique movies titles and there are total 100836 users. We need to find user-user similarity so that we can recommend movies rated highly by one user to other similar user.

In [65]:
# Custom function to clean the data
def clean_title(x):
    return re.sub(r'[^a-zA-Z0-9_: ]',"",x).strip()

# Custom function to create custom heading
def heading(title):
    print(f'\n \033[7m {title} \033[0m \n')

In [67]:
user_ratings_df['title'] = user_ratings_df['title'].apply(clean_title)
user_ratings_df['title']

0                         Toy Story
1                         Toy Story
2                         Toy Story
3                         Toy Story
4                         Toy Story
                    ...            
100831                    Bloodmoon
100832    Sympathy for the Underdog
100833                       Hazard
100834                  Blair Witch
100835                           31
Name: title, Length: 100836, dtype: object

In [68]:
# Transform the table
user_ratings_table = user_ratings_df.pivot_table(index='userId', columns='title', values='rating')
user_ratings_table

title,Unnamed: 1_level_0,00 Schneider Jagd auf Nihil Baxter,10,10 Cent Pistol,10 Cloverfield Lane,10 Items or Less,10 Things I Hate About You,10 Years,100 Girls,100 Streets,...,Zootopia,Zulu,anohana: The Flower We Saw That Day The Movie,batteries not included,burbs The,eXistenZ,night Mother,nous la libert Freedom for Us,xXx,xXx: State of the Union
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,3.5,,,,,...,,,,,,4.5,,,3.5,
609,,,,,,,,,,,...,,,,,,,,,,


### Normalize the user_ratings to avoid NaN issues

In [69]:
# Get the average rating given by each user for all movies
avg_ratings = user_ratings_table.mean(axis=1)
heading('Average Rating for each user / accross rows')
print(avg_ratings)

# Center each users ratings around 0
user_ratings_table_centered = np.round(user_ratings_table.sub(avg_ratings, axis=0),0)
heading('User Ratings Centered around the mean')
print(user_ratings_table_centered.mean(axis=1))

# Fill in the missing data with 0s
user_ratings_table_normed = user_ratings_table_centered.fillna(0)
heading('Normalized User Ratings')
print(user_ratings_table_normed.mean(axis=1))


 [7m Average Rating for each user / accross rows [0m 

userId
1      4.373362
2      3.948276
3      2.435897
4      3.555556
5      3.636364
         ...   
606    3.656476
607    3.786096
608    3.129394
609    3.270270
610    3.687163
Length: 610, dtype: float64

 [7m User Ratings Centered around the mean [0m 

userId
1      0.371179
2      0.103448
3      0.102564
4     -0.444444
5     -0.363636
         ...   
606   -0.128623
607   -0.213904
608   -0.098182
609    0.270270
610   -0.100231
Length: 610, dtype: float64

 [7m Normalized User Ratings [0m 

userId
1      0.009004
2      0.000318
3      0.000424
4     -0.010169
5     -0.001695
         ...   
606   -0.015042
607   -0.004237
608   -0.008581
609    0.001059
610   -0.013771
Length: 610, dtype: float64


In [70]:
user_ratings_table_normed

title,Unnamed: 1_level_0,00 Schneider Jagd auf Nihil Baxter,10,10 Cent Pistol,10 Cloverfield Lane,10 Items or Less,10 Things I Hate About You,10 Years,100 Girls,100 Streets,...,Zootopia,Zulu,anohana: The Flower We Saw That Day The Movie,batteries not included,burbs The,eXistenZ,night Mother,nous la libert Freedom for Us,xXx,xXx: State of the Union
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Generate user-user similarity matrix

In [71]:
from sklearn.metrics.pairwise import cosine_similarity

In [73]:
# Generate the similarity matrix
similarities = np.round(cosine_similarity(user_ratings_table_normed), 3)
heading('User User Similarities')
print(similarities)

# Wrap the similarities in a DataFrame
user_similarity_df = pd.DataFrame(similarities,
                                  index=user_ratings_table_normed.index,
                                  columns=user_ratings_table_normed.index)

heading('User-User Similarity Table')
user_similarity_df


 [7m User User Similarities [0m 

[[ 1.     0.    -0.    ...  0.066  0.     0.034]
 [ 0.     1.     0.    ...  0.    -0.073  0.037]
 [-0.     0.     1.    ... -0.015  0.     0.016]
 ...
 [ 0.066  0.    -0.015 ...  1.     0.039  0.038]
 [ 0.    -0.073  0.    ...  0.039  1.     0.   ]
 [ 0.034  0.037  0.016 ...  0.038  0.     1.   ]]

 [7m User-User Similarity Table [0m 



userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000,0.000,-0.000,0.015,0.022,0.009,0.019,0.032,0.044,-0.026,...,0.064,0.037,-0.062,-0.026,-0.006,0.015,0.028,0.066,0.000,0.034
2,0.000,1.000,0.000,-0.034,0.033,-0.026,0.000,-0.032,0.000,0.031,...,-0.078,-0.038,-0.006,0.000,0.000,0.009,-0.017,0.000,-0.073,0.037
3,-0.000,0.000,1.000,-0.007,-0.021,-0.008,0.000,-0.020,0.000,0.000,...,-0.024,-0.023,0.024,0.000,0.000,-0.029,-0.005,-0.015,0.000,0.016
4,0.015,-0.034,-0.007,1.000,-0.035,-0.023,0.054,0.034,-0.011,0.064,...,-0.066,0.016,0.032,-0.039,0.028,-0.002,0.029,-0.034,-0.031,-0.017
5,0.022,0.033,-0.021,-0.035,1.000,-0.083,0.000,0.000,0.000,-0.020,...,0.000,-0.154,0.045,-0.082,-0.047,0.029,0.043,0.004,0.000,-0.005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.015,0.009,-0.029,-0.002,0.029,-0.023,0.032,0.028,0.032,-0.019,...,0.068,0.007,0.071,0.041,0.016,1.000,0.021,0.050,0.013,0.050
607,0.028,-0.017,-0.005,0.029,0.043,0.026,-0.005,0.031,-0.017,-0.010,...,0.063,0.000,0.070,-0.017,-0.042,0.021,1.000,0.028,0.023,0.005
608,0.066,0.000,-0.015,-0.034,0.004,-0.030,0.021,0.077,0.028,-0.044,...,0.100,0.041,-0.001,-0.039,-0.010,0.050,0.028,1.000,0.039,0.038
609,0.000,-0.073,0.000,-0.031,0.000,0.127,0.039,-0.044,0.000,-0.043,...,0.053,0.155,-0.025,0.143,-0.026,0.013,0.023,0.039,1.000,0.000


In [88]:
np.array(similarities).shape

(610, 610)

In [74]:
# Custom Funtion to Find the similarity scores for user_1 with all other users and sort them

def similar_users(user, n):
    user_similarity_series = user_similarity_df.loc[user]
    ordered_similarities = user_similarity_series.sort_values(ascending=False)
    nearest_neighbors = ordered_similarities[1:n].index
    return list(nearest_neighbors)

# Custom Function to Extract the ratings of the neighbors
def avg_neighbor_ratings(movie, user_list):
    neighbor_ratings = user_ratings_table.reindex(user_list)
    avg_ratings = np.round(neighbor_ratings[movie].mean(),2)
    return avg_ratings

# Custom Function to Get the average ratings given by similar users for a particular movie
def recommend_movie(user, movie):
    num_of_neighbors = 10
    neighbors = similar_users(user,num_of_neighbors)
    avg_ratings = avg_neighbor_ratings(movie, neighbors)
    print("The average rating given by similar users for the movie '{0}' is {1}".format(movie,avg_ratings ))
    if avg_ratings > 3.5 :
        print("The user-user model recommends '{0}' to user {1}".format(movie,user ))
    else:
        print("The user-user model does not recommend '{0}' to user {1}".format(movie,user ))  

In [75]:
recommend_movie(user=6, movie='Apollo 13')

The average rating given by similar users for the movie 'Apollo 13' is 3.88
The user-user model recommends 'Apollo 13' to user 6


## k-NN Approach

In [76]:
# Drop the column you are trying to predict
#user_ratings_table_normed.loc[[1]].drop("Apollo 13", axis=1, inplace=True)

# Get the data for the user you are predicting for
target_user_x = user_ratings_table_normed.loc[[1]]
target_user_x.drop("Apollo 13", axis=1, inplace=True)

In [91]:
user_ratings_table_normed.loc[[1]]

title,Unnamed: 1_level_0,00 Schneider Jagd auf Nihil Baxter,10,10 Cent Pistol,10 Cloverfield Lane,10 Items or Less,10 Things I Hate About You,10 Years,100 Girls,100 Streets,...,Zootopia,Zulu,anohana: The Flower We Saw That Day The Movie,batteries not included,burbs The,eXistenZ,night Mother,nous la libert Freedom for Us,xXx,xXx: State of the Union
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
# Get the target data from user_ratings_table
other_users_y = user_ratings_table["Apollo 13"].drop(1,axis=0)

In [95]:
#user_ratings_table["Apollo 13"]
other_users_y

userId
2      NaN
3      NaN
4      NaN
5      3.0
6      4.0
      ... 
606    NaN
607    5.0
608    2.0
609    3.0
610    NaN
Name: Apollo 13, Length: 609, dtype: float64

In [96]:
# Get the data for only those that have seen the movie
users_x = user_ratings_table_normed.loc[:, user_ratings_table_normed.columns != "Apollo 13"]
other_users_x = users_x.drop(1, axis=0)
other_users_x

title,Unnamed: 1_level_0,00 Schneider Jagd auf Nihil Baxter,10,10 Cent Pistol,10 Cloverfield Lane,10 Items or Less,10 Things I Hate About You,10 Years,100 Girls,100 Streets,...,Zootopia,Zulu,anohana: The Flower We Saw That Day The Movie,batteries not included,burbs The,eXistenZ,night Mother,nous la libert Freedom for Us,xXx,xXx: State of the Union
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
other_users_x = other_users_x[other_users_y.notnull()]
other_users_x

title,Unnamed: 1_level_0,00 Schneider Jagd auf Nihil Baxter,10,10 Cent Pistol,10 Cloverfield Lane,10 Items or Less,10 Things I Hate About You,10 Years,100 Girls,100 Streets,...,Zootopia,Zulu,anohana: The Flower We Saw That Day The Movie,batteries not included,burbs The,eXistenZ,night Mother,nous la libert Freedom for Us,xXx,xXx: State of the Union
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
605,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [81]:
# Remove those that have not seen the movie from the target
other_users_y.dropna(inplace=True)
other_users_y

userId
5      3.0
6      4.0
7      4.5
8      4.0
11     5.0
      ... 
602    4.0
605    5.0
607    5.0
608    2.0
609    3.0
Name: Apollo 13, Length: 201, dtype: float64

In [82]:
# Import the regressor
from sklearn.neighbors import KNeighborsRegressor

# Instantiate the user KNN model
user_knn = KNeighborsRegressor(metric='cosine', n_neighbors=10)

In [83]:
# Fit the model and predict the target user
user_knn.fit(other_users_x, other_users_y)
user_user_pred = user_knn.predict(target_user_x)

print(user_user_pred)

[3.95]
