# **Develop a Collaborative Similarity Function between Users**

---

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Folder that holds the datasets, given relative to the current working directory.
DATA_DIR = Path("data")
# Name of the dataset sub-folder read by the loading cell below.
DATASETS_NAME = "hetrec2011-movielens-2k-v2"


In [3]:
# Read the tab-separated, UTF-8 encoded user/movie ratings file (with timestamps)
# from the dataset folder, then preview its first five rows.
user_rated_movies = pd.read_csv(
    DATA_DIR.joinpath(DATASETS_NAME, "user_ratedmovies-timestamps.dat"),
    encoding="utf-8",
    sep="\t",
)
user_rated_movies.head(5)

Unnamed: 0,userID,movieID,rating,timestamp
0,75,3,1.0,1162160236000
1,75,32,4.5,1162160624000
2,75,110,4.0,1162161008000
3,75,160,2.0,1162160212000
4,75,163,4.0,1162160970000


In [5]:
# Users whose rating histories are compared in the rest of the notebook.
user_ids = [75, 14454, 22035]

# Keep only the rows belonging to those users. Note that this rebinds
# `user_rated_movies` to the filtered subset, so the full table is no longer
# available under this name after the cell has run.
user_rated_movies = user_rated_movies.loc[user_rated_movies["userID"].isin(user_ids)]
user_rated_movies


Unnamed: 0,userID,movieID,rating,timestamp
0,75,3,1.0,1162160236000
1,75,32,4.5,1162160624000
2,75,110,4.0,1162161008000
3,75,160,2.0,1162160212000
4,75,163,4.0,1162160970000
...,...,...,...,...
262698,22035,7044,4.0,1106800437000
262699,22035,8207,4.0,1111441710000
262700,22035,8830,0.5,1106800996000
262701,22035,30894,4.0,1106663752000


In [8]:
# How many movies each selected user has rated (non-null movieID entries per userID).
user_rated_movies.groupby("userID")["movieID"].count()

userID
75        55
14454    853
22035    280
Name: movieID, dtype: int64

In [9]:
# Reshape the long ratings table into a users x movies matrix: one row per
# userID, one column per movieID, cell = rating. Movies a user has not rated
# come out of the pivot as NaN and are replaced by 0.
sp_matrix = (
    user_rated_movies
    .pivot(index="userID", columns="movieID", values="rating")
    .fillna(0)
)
sp_matrix

movieID,1,3,16,18,19,21,24,25,29,32,...,38038,38886,39052,39292,39381,40851,41569,45431,45722,49272
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,...,0.0,0.0,3.5,0.0,0.0,0.0,0.0,3.5,4.5,0.0
14454,2.5,0.0,3.0,1.0,1.0,3.0,0.0,0.0,3.5,3.5,...,3.0,3.5,0.0,2.5,3.5,2.0,3.0,0.0,0.0,0.0
22035,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [10]:
# Cosine similarity between every pair of user rating rows, labelled by userID
# on both axes (higher = more similar; the diagonal is each user with itself).
pd.DataFrame(
    data=cosine_similarity(sp_matrix),
    index=sp_matrix.index,
    columns=sp_matrix.index,
)


userID,75,14454,22035
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
75,1.0,0.096066,0.12122
14454,0.096066,1.0,0.249523
22035,0.12122,0.249523,1.0


In [12]:
# Jaccard distance between users, labelled by userID on both axes.
#
# Jaccard is a set/boolean metric: it compares WHICH movies two users rated,
# not the rating values. The matrix is therefore binarised explicitly
# (non-zero rating -> True, unrated/0 -> False). Passing the float ratings
# directly makes scikit-learn perform this same cast implicitly and emit a
# DataConversionWarning; doing it here yields identical distances without the
# warning and makes the intent visible to the reader.
#
# Values are distances (0 = identical sets of rated movies, 1 = no overlap),
# i.e. 1 - Jaccard similarity.
pd.DataFrame(
    pairwise_distances(
        sp_matrix.values.astype(bool),
        metric="jaccard",
    ),
    index=sp_matrix.index,
    columns=sp_matrix.index,
)



userID,75,14454,22035
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
75,0.0,0.974011,0.953125
14454,0.974011,0.0,0.893555
22035,0.953125,0.893555,0.0


In [13]:
# Correlation distance between every pair of user rating rows, labelled by
# userID on both axes. These are distances (1 - correlation coefficient), so
# smaller means more similar and values above 1 indicate negative correlation.
# The rows come from the zero-filled matrix, so unrated movies enter the
# computation as a rating of 0.
pd.DataFrame(
    data=pairwise_distances(sp_matrix.to_numpy(), metric="correlation"),
    index=sp_matrix.index,
    columns=sp_matrix.index,
)

userID,75,14454,22035
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
75,0.0,1.186404,0.984422
14454,1.186404,0.0,1.396767
22035,0.984422,1.396767,0.0
