In [12]:
import pandas as pd
import numpy as np

In [13]:
movies_df = pd.read_csv('Movie.csv')

In [None]:
movies_df.head(20)

In [None]:
movies_df[(movies_df['userId']==6)]

In [None]:
movies_df.sort_values('userId')

In [17]:
movies_df.shape

(8992, 3)

In [18]:
#number of unique users in the dataset
len(movies_df.userId.unique())

4081

In [19]:
(movies_df['userId'].value_counts()).shape[0]

4081

In [None]:
movies_df['rating'].value_counts()

In [22]:
len(movies_df.movie.unique())

10

In [None]:
movies_df.movie.value_counts()

In [24]:
# change structure of dataset so that we can compute the similarity score
user_movies_df = movies_df.pivot(index='userId', # row = userID
                                 columns='movie', # Col = Movie name
                                 values='rating')

In [None]:
user_movies_df
# NaN - Person did not watch movie so he didn't give rating. Now we can recommend those (not watched) movies to him.

In [26]:
user_movies_df.shape

(4081, 10)

In [None]:
#user_movies_df.index = movies_df.userId.unique()

In [None]:
user_movies_df

In [27]:
#Impute those NaNs with 0 values ) 0 - did not watch movie
user_movies_df.fillna(0, inplace=True)

In [None]:
user_movies_df

In [29]:
#Calculating Cosine Similarity between Users
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [30]:
# User similarity - pairwise - taking 2 rows at a time it will compute distance
# That distance is scaled between 0 to 1
# Suppose distance between 2 rows is 0.  We want similarity, not a distance.
# That is calculated as Similarity=1-distance. So 1-0=1. i.e. similarity in rows is 1.
# Suppose distance is 0.9. So 1-0.9=0.1
user_sim = 1 - pairwise_distances(user_movies_df.values,metric='cosine')

In [None]:
user_sim # This is similarity matrix - diagonal values are 1

In [32]:
np.fill_diagonal(user_sim, 0)

In [33]:
#Store the results in a dataframe
user_sim_df = pd.DataFrame(user_sim)

In [35]:
user_sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080
0,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.553372
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.390567,0.707107,0.615457,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.458831,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.650945,0.000000,0.492366,1.000000,0.874157,...,0.000000,1.000000,0.000000,0.707107,0.000000,0.000000,0.752577,0.458831,1.000000,0.622543
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.615457,0.000000,0.388514,...,0.800000,0.000000,0.000000,0.000000,0.989949,0.000000,0.000000,0.619422,0.000000,0.000000
4,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.553372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4076,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.658505,0.000000,0.000000,0.000000
4077,0.000000,0.000000,0.752577,0.000000,0.000000,0.489886,0.000000,0.370543,0.752577,0.657870,...,0.000000,0.752577,0.000000,0.532152,0.000000,0.658505,0.000000,0.345306,0.752577,0.468511
4078,0.000000,0.458831,0.458831,0.619422,0.000000,0.701884,0.567775,0.889532,0.458831,0.568212,...,0.344124,0.458831,0.000000,0.324443,0.648886,0.000000,0.345306,0.000000,0.458831,0.476071
4079,0.000000,0.000000,1.000000,0.000000,0.000000,0.650945,0.000000,0.492366,1.000000,0.874157,...,0.000000,1.000000,0.000000,0.707107,0.000000,0.000000,0.752577,0.458831,0.000000,0.622543


In [36]:
#Set the index and column names to user ids
user_sim_df.index = movies_df.userId.unique()
user_sim_df.columns = movies_df.userId.unique()

In [37]:
user_sim_df

Unnamed: 0,3,6,8,10,11,12,13,14,16,19,...,6975,6979,6993,7030,7031,7044,7070,7080,7087,7105
3,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.553372
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.390567,0.707107,0.615457,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.458831,0.000000,0.000000
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.650945,0.000000,0.492366,1.000000,0.874157,...,0.000000,1.000000,0.000000,0.707107,0.000000,0.000000,0.752577,0.458831,1.000000,0.622543
10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.615457,0.000000,0.388514,...,0.800000,0.000000,0.000000,0.000000,0.989949,0.000000,0.000000,0.619422,0.000000,0.000000
11,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.553372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7044,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.658505,0.000000,0.000000,0.000000
7070,0.000000,0.000000,0.752577,0.000000,0.000000,0.489886,0.000000,0.370543,0.752577,0.657870,...,0.000000,0.752577,0.000000,0.532152,0.000000,0.658505,0.000000,0.345306,0.752577,0.468511
7080,0.000000,0.458831,0.458831,0.619422,0.000000,0.701884,0.567775,0.889532,0.458831,0.568212,...,0.344124,0.458831,0.000000,0.324443,0.648886,0.000000,0.345306,0.000000,0.458831,0.476071
7087,0.000000,0.000000,1.000000,0.000000,0.000000,0.650945,0.000000,0.492366,1.000000,0.874157,...,0.000000,1.000000,0.000000,0.707107,0.000000,0.000000,0.752577,0.458831,0.000000,0.622543


In [38]:
# idxmax() method returns a Series with the index of the maximum value for each column. (row 3 anc col 11 has highest value as 1)
# By specifying the column axis (axis='columns' or 1), the idxmax() method returns a Series with the index of the maximum value for each row.
user_sim_df.idxmax(axis=1)

3         11
6        168
8         16
10      4047
11         3
        ... 
7044      80
7070    1808
7080     708
7087       8
7105    4110
Length: 4081, dtype: int64

In [39]:
#Most Similar Users
user_sim_df.idxmax(axis=1)[0:10]

3       11
6      168
8       16
10    4047
11       3
12    6676
13    5953
14    4138
16       8
19    3603
dtype: int64

In [None]:
user_sim_df.iloc[0:5, 0:5]
# similarity 1 - between customer 3 to 3, 6 to 6.
# similarity between customer 11 and customer 3 is 1. i.e. they are very similar.

In [None]:
# eg. find movies watched by customer 6 and 168 as they are similar
movies_df[(movies_df['userId']==6) | (movies_df['userId']==168)]
# Both watched Toy Story with good rating, 6 watched 2 more movies.
# Now rating for Sabrina is more than other movie. So we can recommend that movie to 168.

In [42]:
# Other way to display above table
user_1=movies_df[movies_df['userId']==6] # Movie name watched by user 6

In [43]:
user_2=movies_df[movies_df['userId']==168] # Movie name watched by user 168

In [44]:
user_1.movie # 6 in index no. here, Display Movie name watched by user 168. Movies watched by customer 6

1              Toy Story (1995)
3725    Grumpier Old Men (1995)
6464             Sabrina (1995)
Name: movie, dtype: object

In [45]:
user_2.movie # Movies watched by customer 168

60    Toy Story (1995)
Name: movie, dtype: object

In [46]:
pd.merge(user_1,user_2,on='movie',how='left') # Merge 2 outputs of user_1 and user_2

Unnamed: 0,userId_x,movie,rating_x,userId_y,rating_y
0,6,Toy Story (1995),5.0,168.0,4.5
1,6,Grumpier Old Men (1995),3.0,,
2,6,Sabrina (1995),5.0,,
