In [1]:
import os
import json
import pandas as pd
import numpy as np
import tqdm
import scipy.sparse as sp

from pprint import pprint

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import datetime
import swifter

sns.set_style('whitegrid')

In [3]:
DATA_PATH = '../data'

In [4]:
%%time
ratings = pd.read_csv(
    os.path.join(DATA_PATH, 'ratings.csv'),
    dtype={
        'element_uid': np.uint16,
        'user_uid': np.uint32,
        'ts': np.float64,
        'rating': np.uint8
    }
)

ratings_df = pd.DataFrame.from_dict(ratings)

Wall time: 243 ms


In [5]:
ratings_df['real_ts'] = ratings_df['ts'].swifter.apply(lambda ts: datetime.utcfromtimestamp(ts + 1500000000))

HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=8.0, style=ProgressStyle(description_wid…




In [6]:
ratings_df.head(10)

Unnamed: 0,user_uid,element_uid,rating,ts,real_ts
0,571252,1364,10,44305170.0,2018-12-08 21:39:34.263099
1,63140,3037,10,44305140.0,2018-12-08 21:38:59.282818
2,443817,4363,8,44305140.0,2018-12-08 21:38:56.205849
3,359870,1364,10,44305060.0,2018-12-08 21:37:43.006373
4,359870,3578,9,44305060.0,2018-12-08 21:37:40.739133
5,557663,1918,10,44305050.0,2018-12-08 21:37:30.536551
6,230987,8273,10,44305000.0,2018-12-08 21:36:41.952828
7,95790,5368,9,44305000.0,2018-12-08 21:36:37.418347
8,16810,2245,8,44304980.0,2018-12-08 21:36:24.786579
9,36122,7587,6,44304930.0,2018-12-08 21:35:34.097562


In [6]:
mean = ratings_df.groupby(['user_uid'], as_index=False, sort=False).mean()[['user_uid', 'rating']].rename(columns={'rating': 'mean_rating'})

In [12]:
mean

Unnamed: 0,user_uid,mean_rating
0,571252,10.000000
1,63140,9.244444
2,443817,8.133333
3,359870,7.444444
4,557663,10.000000
...,...,...
104558,561109,9.000000
104559,201980,8.000000
104560,569421,10.000000
104561,541238,8.000000


In [7]:
ratings_df = pd.merge(ratings_df, mean, how='left', on='user_uid', sort=False)

In [8]:
ratings_df['adjusted_rating'] = ratings_df['rating'] - ratings_df['mean_rating']

In [9]:
ratings_df.head()

Unnamed: 0,user_uid,element_uid,rating,ts,real_ts,mean_rating,adjusted_rating
0,571252,1364,10,44305170.0,2018-12-08 21:39:34.263099,10.0,0.0
1,63140,3037,10,44305140.0,2018-12-08 21:38:59.282818,9.244444,0.755556
2,443817,4363,8,44305140.0,2018-12-08 21:38:56.205849,8.133333,-0.133333
3,359870,1364,10,44305060.0,2018-12-08 21:37:43.006373,7.444444,2.555556
4,359870,3578,9,44305060.0,2018-12-08 21:37:40.739133,7.444444,1.555556


In [10]:
result_df = pd.DataFrame({'user_uid': ratings_df['user_uid'], 
                          'element_uid': ratings_df['element_uid'],
                          'rating': ratings_df['rating']}).pivot_table(index='user_uid', 
                                                                      columns='element_uid',
                                                                      values='rating').fillna(0)

In [11]:
result_df

element_uid,3,4,6,7,9,12,13,15,18,21,...,10182,10183,10184,10185,10186,10187,10194,10196,10197,10199
user_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
593465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
593467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
593478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
all_users = result_df.values

In [24]:
all_users[0].shape

(7519,)

In [29]:
target_user = all_users[0]
denominator = np.sqrt(sum([np.square(x) for x in target_user]))

cosine_similarity = [(result_df.index[0], 1)]
i = 1
for user in tqdm.tqdm(all_users[1:]):
    numerator = [x*y for x,y in zip(target_user, user)]
    denominator2 = np.sqrt(sum([np.square(x) for x in user]))
    cos_theta = sum(numerator)/(denominator*denominator2)
    cosine_similarity.append((result_df.index[i], cos_theta))
    
cosine_similarity.sort(key=lambda x: x[1], reverse=True)

  if __name__ == '__main__':
 12%|█████████                                                                  | 12602/104562 [05:29<40:03, 38.27it/s]


KeyboardInterrupt: 

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [13]:
simple_matrix = result_df.values

In [14]:
%%time

sparse_matrix = sparse.csr_matrix(simple_matrix)

Wall time: 28.2 s


In [15]:
%%time

similarities = cosine_similarity(sparse_matrix)
print('pairwise dense output:\n {}\n'.format(similarities))

MemoryError: Unable to allocate 81.5 GiB for an array with shape (104563, 104563) and data type float64