In [194]:
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [204]:
# Read only the three review columns used downstream (keeping the product id
# as a string) and give them shorter names in a single pass.
data_origin_df = (
    pd.read_csv(
        "test.csv",
        usecols=["id", "reviews.rating", "reviews.username"],
        dtype={'id': object},
    )
    .rename(
        columns={
            "reviews.rating": "rating",
            "id": "product_id",
            "reviews.username": "username",
        }
    )
)
data_origin_df

Unnamed: 0,product_id,username,rating
0,1,user1,4
1,2,user1,5
2,3,user1,3
3,5,user1,2
4,6,user1,1
5,1,user2,5
6,2,user2,3
7,3,user2,3
8,4,user2,2
9,5,user2,2


In [205]:
# Build the user x product rating matrix: ratings for the same
# (username, product_id) pair are summed, and pairs with no rating become 0.
data_pivot_df = (
    data_origin_df
    .groupby(by=['username', 'product_id'])["rating"]
    .sum()
    .reset_index()
    .pivot(index="username", columns="product_id", values="rating")
    .fillna(0)
)
data_pivot_df

product_id,1,2,3,4,5,6
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
user1,4.0,5.0,3.0,0.0,2.0,1.0
user2,5.0,3.0,3.0,2.0,2.0,0.0
user3,1.0,0.0,0.0,4.0,5.0,4.0
user4,0.0,2.0,1.0,4.0,0.0,3.0
user5,1.0,0.0,2.0,3.0,3.0,4.0


In [206]:
def standardize(row):
    """Mean-center a vector of ratings and scale it by its range.

    Computes ``(row - mean) / (max - min)`` element-wise.

    Despite the parameter name, this notebook passes the function to
    ``DataFrame.apply`` without an ``axis`` argument, so each call receives
    one product *column* (the ratings of every user for that product).

    Parameters
    ----------
    row : pandas.Series
        Numeric values to normalise.

    Returns
    -------
    pandas.Series
        Normalised values with the same index as ``row``. A vector whose
        values are all identical (range of zero) is returned as all zeros
        rather than NaN/inf.
    """
    spread = row.max() - row.min()
    if spread == 0:
        # Constant vector: dividing by the zero range would produce NaN/inf,
        # which later breaks the cosine-similarity step. It carries no signal,
        # so represent it as zeros.
        return row * 0.0
    return (row - row.mean()) / spread

In [207]:
# axis=0 is apply()'s default: standardize receives one product column at a
# time, so every product is normalised across users.
data_std = data_pivot_df.apply(standardize, axis=0)
data_std

product_id,1,2,3,4,5,6
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
user1,0.36,0.6,0.4,-0.65,-0.08,-0.35
user2,0.56,0.2,0.4,-0.15,-0.08,-0.6
user3,-0.24,-0.4,-0.6,0.35,0.52,0.4
user4,-0.44,0.0,-0.266667,0.35,-0.48,0.15
user5,-0.24,-0.4,0.066667,0.1,0.12,0.4


In [208]:
# Transpose so each product is a row vector of per-user scores; the result is
# the pairwise (product x product) cosine-similarity matrix.
product_similarity = cosine_similarity(data_std.transpose())
product_similarity

array([[ 1.        ,  0.70668875,  0.81368151, -0.79941088, -0.02539184,
        -0.91410609],
       [ 0.70668875,  1.        ,  0.72310153, -0.84515425, -0.5189993 ,
        -0.84337386],
       [ 0.81368151,  0.72310153,  1.        , -0.84794611, -0.3799803 ,
        -0.80218063],
       [-0.79941088, -0.84515425, -0.84794611,  1.        ,  0.14803913,
         0.72374686],
       [-0.02539184, -0.5189993 , -0.3799803 ,  0.14803913,  1.        ,
         0.39393939],
       [-0.91410609, -0.84337386, -0.80218063,  0.72374686,  0.39393939,
         1.        ]])

In [209]:
# Wrap the raw similarity array in a DataFrame labelled by product id on both
# axes so a product's similarity column can be looked up by id.
product_similaritty_df = pd.DataFrame(
    data=product_similarity,
    index=data_pivot_df.columns,
    columns=data_pivot_df.columns,
)
product_similaritty_df

product_id,1,2,3,4,5,6
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
4,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
5,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939
6,-0.914106,-0.843374,-0.802181,0.723747,0.393939,1.0


In [210]:
def get_similarity_products(product_id, user_rating):
    """Score every product against ``product_id``, weighted by ``user_rating``.

    Looks up the ``product_id`` column of the module-level
    ``product_similaritty_df`` and multiplies each similarity by
    ``user_rating``.

    Parameters
    ----------
    product_id
        Column label in ``product_similaritty_df`` (the notebook uses string
        ids such as ``"6"``).
    user_rating
        Numeric weight applied to every similarity value.

    Returns
    -------
    pandas.Series
        Weighted scores indexed by product id, sorted from highest to lowest.

    Notes
    -----
    NOTE(review): the rating is used as a raw multiplier, so any positive
    rating (even a low one) keeps similar products ranked above dissimilar
    ones -- confirm whether ratings should be centered first.
    """
    weighted = product_similaritty_df[product_id] * user_rating
    return weighted.sort_values(ascending=False)

In [212]:
# Sanity check: scores of all products relative to product "6" with weight 1.
print(get_similarity_products(product_id="6", user_rating=1))

product_id
6    1.000000
4    0.723747
5    0.393939
3   -0.802181
2   -0.843374
1   -0.914106
Name: 6, dtype: float64


In [216]:
# (product_id, rating) pairs representing what the user has rated so far.
user_actions = [("1", 5), ("2", 4), ("6", 1)]

# One weighted-similarity Series per rated product. DataFrame.append was
# removed in pandas 2.0 (and re-copied the frame on every iteration), so the
# rows are collected first and the frame is built once; the constructor aligns
# the Series on product id.
score_rows = [
    get_similarity_products(product_id, user_rating)
    for product_id, user_rating in user_actions
]
similar_products = pd.DataFrame(score_rows).reset_index(drop=True)

# Total score per product across all of the user's ratings, best first.
similar_products.sum().sort_values(ascending=False)

1    6.912649
2    6.690070
3    6.158633
5   -1.809017
4   -6.653925
6   -6.944026
dtype: float64