In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [2]:
movies= pd.read_csv(' /10.Movie.csv')
movies.head()

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5


In [3]:
movies.shape

(8992, 3)

In [4]:
# number of unique users and rows as per recommender system
len(movies.userId.unique())

4081

In [5]:
# number of unique movies and columns as per recommender system
len(movies.movie.unique())

10

In [6]:
# to create crosstab as per recommender system
mov= movies.pivot(index='userId',columns='movie',values='rating').reset_index(drop=True)
mov

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
0,,,,,3.5,,,,,
1,,,4.0,,,,,,,
2,,,,,,,,,4.0,
3,,4.0,,3.0,,,,,,
4,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
4076,4.0,,,,,,,,,
4077,3.5,,,,,,,,4.0,
4078,,3.0,4.0,5.0,,3.0,1.0,,4.0,
4079,,,,,,,,,5.0,


In [7]:
# set unique user-id's from original dataset as index for new created pivot matrix
mov.index = movies.userId.unique()
mov.head()

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
3,,,,,3.5,,,,,
6,,,4.0,,,,,,,
8,,,,,,,,,4.0,
10,,4.0,,3.0,,,,,,
11,,,,,3.0,,,,,


In [8]:
#Impute those NaNs with 0 values
mov.fillna(0, inplace=True) # inplace-within the dataset
mov

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
3,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
10,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7044,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7070,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
7080,0.0,3.0,4.0,5.0,0.0,3.0,1.0,0.0,4.0,0.0
7087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


In [9]:
# Calculating Cosine Similarity between Users
pict= 1 - pairwise_distances( mov.values,metric='cosine')
pict

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.55337157],
       [0.        , 1.        , 0.        , ..., 0.45883147, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       ...,
       [0.        , 0.45883147, 0.45883147, ..., 1.        , 0.45883147,
        0.47607054],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       [0.55337157, 0.        , 0.62254302, ..., 0.47607054, 0.62254302,
        1.        ]])

In [10]:
#Store the results in a dataframe
picture= pd.DataFrame(pict)
picture

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080
0,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.553372
1,0.000000,1.000000,0.000000,0.000000,0.000000,0.390567,0.707107,0.615457,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.458831,0.000000,0.000000
2,0.000000,0.000000,1.000000,0.000000,0.000000,0.650945,0.000000,0.492366,1.000000,0.874157,...,0.000000,1.000000,0.000000,0.707107,0.000000,0.000000,0.752577,0.458831,1.000000,0.622543
3,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.615457,0.000000,0.388514,...,0.800000,0.000000,0.000000,0.000000,0.989949,0.000000,0.000000,0.619422,0.000000,0.000000
4,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.553372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4076,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.658505,0.000000,0.000000,0.000000
4077,0.000000,0.000000,0.752577,0.000000,0.000000,0.489886,0.000000,0.370543,0.752577,0.657870,...,0.000000,0.752577,0.000000,0.532152,0.000000,0.658505,1.000000,0.345306,0.752577,0.468511
4078,0.000000,0.458831,0.458831,0.619422,0.000000,0.701884,0.567775,0.889532,0.458831,0.568212,...,0.344124,0.458831,0.000000,0.324443,0.648886,0.000000,0.345306,1.000000,0.458831,0.476071
4079,0.000000,0.000000,1.000000,0.000000,0.000000,0.650945,0.000000,0.492366,1.000000,0.874157,...,0.000000,1.000000,0.000000,0.707107,0.000000,0.000000,0.752577,0.458831,1.000000,0.622543


In [11]:
#Set the index and column names to user ids 
picture.index = movies.userId.unique()
picture.columns= movies.userId.unique()

In [12]:
# similarity scores for customers (3&11 are similar here)
picture.iloc[0:5, 0:5]

Unnamed: 0,3,6,8,10,11
3,1.0,0.0,0.0,0.0,1.0
6,0.0,1.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0
10,0.0,0.0,0.0,1.0,0.0
11,1.0,0.0,0.0,0.0,1.0


In [13]:
# correlation with self is always 1,so replace diagonal values of correlation matrix value with 0.
np.fill_diagonal(pict, 0)
picture.iloc[0:5, 0:5]

Unnamed: 0,3,6,8,10,11
3,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0
11,1.0,0.0,0.0,0.0,0.0


In [14]:
#Most Similar Users based on columns 
picture.idxmax(axis=1)[0:]

3         11
6        168
8         16
10      4047
11         3
        ... 
7044      80
7070    1808
7080     708
7087       8
7105    4110
Length: 4081, dtype: int64

In [15]:
# to check movies of simlar customer
movies[(movies['userId']==3) | (movies['userId']==11)]

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5
7446,11,GoldenEye (1995),2.5


In [16]:
# to check for individual customer
user1=movies[movies['userId']==6]
user1.movie

1              Toy Story (1995)
3725    Grumpier Old Men (1995)
6464             Sabrina (1995)
Name: movie, dtype: object

In [17]:
user2=movies[movies['userId']==11]
user2.movie

4       Toy Story (1995)
7446    GoldenEye (1995)
Name: movie, dtype: object

In [18]:
# join movies watched by both and then we can recommend the ones which have not been wacthed.
pd.merge(user1,user2,on='movie',how='outer')

Unnamed: 0,userId_x,movie,rating_x,userId_y,rating_y
0,6.0,Toy Story (1995),5.0,11.0,4.5
1,6.0,Grumpier Old Men (1995),3.0,,
2,6.0,Sabrina (1995),5.0,,
3,,GoldenEye (1995),,11.0,2.5
