# Computing User-User Similarities using different similarity functions

In [None]:
import pandas as pd
import numpy as np
from util.cloud_connection import bucket_connection
pd.options.display.max_columns = None
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
# Relative path of the normalized ratings CSV that is loaded with pandas.
RATING_CSV = "data/rating_normalized.csv"

In [None]:
# Ratings table read from the CSV at RATING_CSV; later cells use its
# 'user', 'm_id' and 'rating' columns.
df_ratings = pd.read_csv(RATING_CSV)
# Meal table from the project's bucket_connection helper; later cells use its
# 'm_id' and 'title_prim' columns.
# NOTE(review): presumably this fetches from cloud storage -- confirm in
# util.cloud_connection.
df_meals = bucket_connection.get_meals()

In [None]:
# Attach each rating's primary meal title by looking its m_id up in df_meals.
#
# One vectorized Series.map replaces the previous per-rating boolean scan of
# df_meals (a full pass over df_meals for every rating row).  It also stores
# the title value itself rather than the display text of Series.to_string,
# which produced the repr of an empty Series for an unknown m_id and a
# multi-line string when an m_id occurred more than once.
#
# NOTE(review): ratings whose m_id is not in df_meals now get NaN as title, and
# for a duplicated m_id the first title wins -- confirm that m_id is unique in
# df_meals and that every rated meal is present there.
title_by_meal = (
    df_meals.drop_duplicates(subset="m_id")  # Series.map needs a unique index
    .set_index("m_id")["title_prim"]
)
df_ratings = df_ratings.assign(title_prim=df_ratings["m_id"].map(title_by_meal))

df_ratings.head()

## User - Item Matrix

In [None]:
# User x meal rating matrix: one row per user, one column per meal title, each
# cell holding the mean of that user's ratings for that meal.
#
# aggfunc is passed as the string "mean": handing pandas the np.mean callable
# is deprecated in recent pandas versions (FutureWarning) while the string
# keeps the same result.
#
# User/meal combinations without a rating are filled with 0.
# NOTE(review): after this fill an unrated meal cannot be told apart from a
# rating whose value is exactly 0 -- relevant if normalized ratings can be 0.
df_user_item = df_ratings.pivot_table(
    index="user",
    columns="title_prim",
    values="rating",
    aggfunc="mean",
).fillna(0)

In [None]:
# Display the user-item matrix (rows: users, columns: meal titles).
df_user_item

## User similarities
### Cosine similarity
Computed as 1 − cosine distance, using the pairwise distances from sklearn.

In [None]:
# Cosine similarity between every pair of users, obtained as
# 1 - cosine distance over the rows of the user-item matrix.
# The user labels of df_user_item are carried over to both axes so that rows
# and columns of the result identify users instead of anonymous positions
# 0..n-1.
user_similarity_cos = pd.DataFrame(
    1 - pairwise_distances(df_user_item, metric="cosine"),
    index=df_user_item.index,
    columns=df_user_item.index,
)
user_similarity_cos

In [None]:
# Column-wise totals of the cosine similarity matrix (one value per column).
user_similarity_cos.sum(axis=0)

For one user, the cosine similarities summed over all users (including the user themselves) add up to exactly 1. The only contribution is their self-similarity, which means this user has no similarity with any other user.

In [None]:
# Grand total of all entries: column sums first, then the sum of those sums.
user_similarity_cos.sum().sum()

In [None]:
# Density of the user-item matrix, one figure per line:
# total number of cells, number of non-zero cells, share of non-zero cells.
print(
    df_user_item.size,
    np.count_nonzero(df_user_item),
    np.count_nonzero(df_user_item) / df_user_item.size,
    sep="\n",
)

In [None]:
# Density of the cosine similarity matrix, one figure per line:
# total number of cells, number of non-zero cells, share of non-zero cells.
print(
    user_similarity_cos.size,
    np.count_nonzero(user_similarity_cos),
    np.count_nonzero(user_similarity_cos) / user_similarity_cos.size,
    sep="\n",
)

The user-item matrix is 6.3% nonzero, whereas the user-user similarity matrix is 25% nonzero.

### Pearson correlation coefficient

In [None]:
# Pearson correlation between every pair of users, obtained as
# 1 - correlation distance over the rows of the user-item matrix.
# The user labels of df_user_item are carried over to both axes so that rows
# and columns of the result identify users instead of anonymous positions
# 0..n-1.
user_similarity_pearson = pd.DataFrame(
    1 - pairwise_distances(df_user_item, metric="correlation"),
    index=df_user_item.index,
    columns=df_user_item.index,
)
user_similarity_pearson

In [None]:
# Number of ordered user pairs (i, j), i != j, whose Pearson similarity exceeds
# 0.5.  The diagonal (each user compared with themselves) is masked out
# explicitly instead of subtracting a hard-coded 53 -- presumably the user
# count of the original data set -- so the figure stays correct for any number
# of users.
off_diagonal = ~np.eye(len(user_similarity_pearson), dtype=bool)
print("{} valus bigger then 0.5".format(
    np.count_nonzero((user_similarity_pearson > 0.5).to_numpy() & off_diagonal)))

In [None]:
# Show the Pearson matrix with every entry that is not above 0.5 blanked to NaN.
user_similarity_pearson.where(user_similarity_pearson > 0.5)

In [None]:
# Two summaries of the Pearson matrix, separated by a blank line, one figure
# per line:
#   1) total cells, non-zero cells, share of non-zero cells
#   2) total cells, negative cells, share of cells that are not negative
print(
    user_similarity_pearson.size,
    np.count_nonzero(user_similarity_pearson),
    np.count_nonzero(user_similarity_pearson) / user_similarity_pearson.size,
    "",
    user_similarity_pearson.size,
    np.count_nonzero(user_similarity_pearson < 0),
    1 - np.count_nonzero(user_similarity_pearson < 0) / user_similarity_pearson.size,
    sep="\n",
)

With the Pearson correlation coefficient we get a dense matrix, but it contains many negative values. We need to check how these affect the subsequent recommendation steps.

### Dice similarity

In [None]:
# Dice similarity between every pair of users, obtained as 1 - Dice distance.
# Dice is a boolean metric, so the user-item matrix is binarised explicitly
# (non-zero entry -> True) rather than relying on sklearn's implicit
# conversion, which emits a DataConversionWarning for non-boolean input.
# The user labels of df_user_item are carried over to both axes so that rows
# and columns of the result identify users instead of anonymous positions
# 0..n-1.
user_similarity_dice = pd.DataFrame(
    1 - pairwise_distances(df_user_item.to_numpy().astype(bool), metric="dice"),
    index=df_user_item.index,
    columns=df_user_item.index,
)
user_similarity_dice

In [None]:
# Number of ordered user pairs (i, j), i != j, whose Dice similarity exceeds 0
# and 0.5 respectively.  The diagonal (each user compared with themselves) is
# masked out explicitly instead of subtracting a hard-coded 53 -- presumably
# the user count of the original data set -- so the figures stay correct for
# any number of users.
off_diagonal = ~np.eye(len(user_similarity_dice), dtype=bool)
print("{} valus bigger then 0".format(
    np.count_nonzero((user_similarity_dice > 0).to_numpy() & off_diagonal)))
print("{} valus bigger then 0.5".format(
    np.count_nonzero((user_similarity_dice > 0.5).to_numpy() & off_diagonal)))

In [None]:
# Column-wise totals of the Dice similarity matrix (one value per column).
user_similarity_dice.sum(axis=0)

With the Dice similarity, one user has no similarity to any other user. It is the same user as with the cosine similarity.

In [None]:
# Density of the Dice similarity matrix, one figure per line:
# total number of cells, number of non-zero cells, share of non-zero cells.
print(
    user_similarity_dice.size,
    np.count_nonzero(user_similarity_dice),
    np.count_nonzero(user_similarity_dice) / user_similarity_dice.size,
    sep="\n",
)

In [None]:
# Grand total of all entries: column sums first, then the sum of those sums.
user_similarity_dice.sum().sum()

### Jaccard similarity

In [None]:
# Jaccard similarity between every pair of users, obtained as
# 1 - Jaccard distance.
# Jaccard is a boolean metric, so the user-item matrix is binarised explicitly
# (non-zero entry -> True) rather than relying on sklearn's implicit
# conversion, which emits a DataConversionWarning for non-boolean input.
# The user labels of df_user_item are carried over to both axes so that rows
# and columns of the result identify users instead of anonymous positions
# 0..n-1.
user_similarity_jaccard = pd.DataFrame(
    1 - pairwise_distances(df_user_item.to_numpy().astype(bool), metric="jaccard"),
    index=df_user_item.index,
    columns=df_user_item.index,
)
user_similarity_jaccard

In [None]:
# Number of ordered user pairs (i, j), i != j, whose Jaccard similarity exceeds
# 0 and 0.5 respectively.  The diagonal (each user compared with themselves)
# is masked out explicitly instead of subtracting a hard-coded 53 -- presumably
# the user count of the original data set -- so the figures stay correct for
# any number of users.
off_diagonal = ~np.eye(len(user_similarity_jaccard), dtype=bool)
print("{} valus bigger then 0".format(
    np.count_nonzero((user_similarity_jaccard > 0).to_numpy() & off_diagonal)))
print("{} valus bigger then 0.5".format(
    np.count_nonzero((user_similarity_jaccard > 0.5).to_numpy() & off_diagonal)))

In [None]:
# Column-wise totals of the Jaccard similarity matrix (one value per column).
user_similarity_jaccard.sum(axis=0)

In [None]:
# Density of the Jaccard similarity matrix, one figure per line:
# total number of cells, number of non-zero cells, share of non-zero cells.
print(
    user_similarity_jaccard.size,
    np.count_nonzero(user_similarity_jaccard),
    np.count_nonzero(user_similarity_jaccard) / user_similarity_jaccard.size,
    sep="\n",
)

In [None]:
# Grand total of all entries: column sums first, then the sum of those sums.
user_similarity_jaccard.sum().sum()