In [1]:
import pandas as pd
import numpy as np

In [2]:
movies_df = pd.read_csv('Movie.csv')

In [3]:
movies_df.head(20)

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5
5,12,Toy Story (1995),4.0
6,13,Toy Story (1995),4.0
7,14,Toy Story (1995),4.5
8,16,Toy Story (1995),3.0
9,19,Toy Story (1995),5.0


In [4]:
movies_df[(movies_df['userId']==6)]

Unnamed: 0,userId,movie,rating
1,6,Toy Story (1995),5.0
3725,6,Grumpier Old Men (1995),3.0
6464,6,Sabrina (1995),5.0


In [5]:
movies_df.sort_values('userId')

Unnamed: 0,userId,movie,rating
2569,1,Jumanji (1995),3.5
3724,2,Grumpier Old Men (1995),4.0
0,3,Toy Story (1995),4.0
5204,4,Heat (1995),3.0
7444,4,GoldenEye (1995),4.0
...,...,...,...
6463,7117,Heat (1995),5.0
2567,7119,Toy Story (1995),5.0
2568,7120,Toy Story (1995),4.5
3723,7120,Jumanji (1995),4.0


In [6]:
movies_df.shape

(8992, 3)

In [None]:
#number of unique users in the dataset
len(movies_df.userId.unique())

In [8]:
(movies_df['userId'].value_counts()).shape[0]

4081

In [None]:
movies_df['rating'].value_counts()

In [None]:
len(movies_df.movie.unique())

In [None]:
movies_df.movie.value_counts()

In [12]:
# change structure of dataset so that we can compute the similarity score
user_movies_df = movies_df.pivot(index='userId', # row = userID
                                 columns='movie', # Col = Movie name
                                 values='rating')

In [13]:
user_movies_df
# NaN - Person did not watch movie so he didn't give rating. Now we can recommend those (not watched) movies to him.

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,3.5,,,,,
2,,,4.0,,,,,,,
3,,,,,,,,,4.0,
4,,4.0,,3.0,,,,,,
5,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
7115,4.0,,,,,,,,,
7116,3.5,,,,,,,,4.0,
7117,,3.0,4.0,5.0,,3.0,1.0,,4.0,
7119,,,,,,,,,5.0,


In [14]:
user_movies_df.shape

(4081, 10)

In [None]:
#user_movies_df.index = movies_df.userId.unique()

In [None]:
user_movies_df

In [16]:
#Impute those NaNs with 0 values ) 0 - did not watch movie
user_movies_df.fillna(0, inplace=True)

In [None]:
user_movies_df

In [18]:
#Calculating Cosine Similarity between Users
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [19]:
# User similarity - pairwise - taking 2 rows at a time it will compute distance
# That distance is scaled between 0 to 1
# Suppose distance between 2 rows is 0.  We want similarity, not a distance.
# That is calculated as Similarity=1-distance. So 1-0=1. i.e. similarity in rows is 1.
# Suppose distance is 0.9. So 1-0.9=0.1
user_sim = 1 - pairwise_distances(user_movies_df.values,metric='cosine')

In [None]:
user_sim # This is similarity matrix - diagonal values are 1

In [21]:
np.fill_diagonal(user_sim, 0)

In [22]:
#Store the results in a dataframe
user_sim_df = pd.DataFrame(user_sim)

In [None]:
user_sim_df

In [24]:
#Set the index and column names to user ids
user_sim_df.index = movies_df.userId.unique()
user_sim_df.columns = movies_df.userId.unique()

In [None]:
user_sim_df

In [None]:
# idxmax() method returns a Series with the index of the maximum value for each column. (row 3 anc col 11 has highest value as 1)
# By specifying the column axis (axis='columns' or 1), the idxmax() method returns a Series with the index of the maximum value for each row.
user_sim_df.idxmax(axis=1)

In [28]:
#Most Similar Users
user_sim_df.idxmax(axis=1)[0:10]

3       11
6      168
8       16
10    4047
11       3
12    6676
13    5953
14    4138
16       8
19    3603
dtype: int64

In [29]:
user_sim_df.iloc[0:5, 0:5]
# similarity 1 - between customer 3 to 3, 6 to 6.
# similarity between customer 11 and customer 3 is 1. i.e. they are very similar.

Unnamed: 0,3,6,8,10,11
3,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0
11,1.0,0.0,0.0,0.0,0.0


In [30]:
# eg. find movies watched by customer 6 and 168 as they are similar
movies_df[(movies_df['userId']==6) | (movies_df['userId']==168)]
# Both watched Toy Story with good rating, 6 watched 2 more movies.
# Now rating for Sabrina is more than other movie. So we can recommend that movie to 168.

Unnamed: 0,userId,movie,rating
1,6,Toy Story (1995),5.0
60,168,Toy Story (1995),4.5
3725,6,Grumpier Old Men (1995),3.0
6464,6,Sabrina (1995),5.0


In [31]:
# Other way to display above table
user_1=movies_df[movies_df['userId']==6] # Movie name watched by user 6

In [32]:
user_2=movies_df[movies_df['userId']==168] # Movie name watched by user 168

In [33]:
user_1.movie # 6 in index no. here, Display Movie name watched by user 168. Movies watched by customer 6

1              Toy Story (1995)
3725    Grumpier Old Men (1995)
6464             Sabrina (1995)
Name: movie, dtype: object

In [34]:
user_2.movie # Movies watched by customer 168

60    Toy Story (1995)
Name: movie, dtype: object

In [35]:
pd.merge(user_1,user_2,on='movie',how='left') # Merge 2 outputs of user_1 and user_2

Unnamed: 0,userId_x,movie,rating_x,userId_y,rating_y
0,6,Toy Story (1995),5.0,168.0,4.5
1,6,Grumpier Old Men (1995),3.0,,
2,6,Sabrina (1995),5.0,,
