In [1]:
!pip3 install progressbar2



In [2]:
import json
import random
from collections import defaultdict

import pandas as pd
import numpy as np
import sklearn

import scipy as sp
import scipy.sparse
import scipy.sparse.linalg

from progressbar import ProgressBar

from metrics import apk

In [3]:
genres_count = 10

-----

In [4]:
likes_df = pd.read_csv('train_likes.csv')

In [5]:
likes_df.iloc[1]

user_id    71a7f1d1be96603971ba66e4a17e845c
item_id    5edaf734b432e5cc954a10b59cb97e70
channel    ec5decca5ed3d6b8079e2e7e7bacc9f2
time                            1.39046e+09
Name: 1, dtype: object

In [6]:
items_data = dict()
for item in json.load(open('items.json')):
    items_data[item['id']] = item

In [7]:
items_data['5edaf734b432e5cc954a10b59cb97e70']

{'duration': -0.6535735882045764,
 'f_106593': 1,
 'f_114306': 2,
 'f_122038': 1,
 'f_122097': 1,
 'f_125216': 1,
 'f_127794': 1,
 'f_131919': 1,
 'f_137909': 1,
 'f_143336': 1,
 'f_153548': 1,
 'f_161336': 1,
 'f_173035': 1,
 'f_173938': 1,
 'f_174560': 1,
 'f_177744': 1,
 'f_180790': 1,
 'f_183958': 1,
 'f_192481': 1,
 'f_193515': 1,
 'f_196983': 1,
 'f_205162': 1,
 'f_35526': 1,
 'f_38095': 1,
 'f_61462': 1,
 'f_63680': 1,
 'f_64513': 1,
 'f_68894': 1,
 'f_96274': 2,
 'genre': 2,
 'id': '5edaf734b432e5cc954a10b59cb97e70',
 'year': 0.13337536834806843}

-----

In [8]:
users = likes_df.user_id.unique()
users_index = dict(zip(users, range(len(users))))

In [9]:
items_index = dict(zip(items_data.keys(), range(len(items_data))))

In [10]:
features_index = dict()

for data in items_data.values():
    for k in data.keys():
        if k[0] == 'f':
            features_index[k] = int(k[2:]) + genres_count

-----

In [11]:
users_likes = defaultdict(set)
for i, data in likes_df.iterrows():
    users_likes[data['user_id']].add(data['item_id'])

-----

In [12]:
bar = ProgressBar()

items_features_matrix = sp.sparse.lil_matrix((len(items_index), len(features_index) + genres_count), dtype='bool')

for item, item_index in bar(items_index.items()):
    data = items_data[item]
    
    items_features_matrix[item_index, data['genre']] = 1
    
    for it in data:
        if it[0] == 'f':
            items_features_matrix[item_index, features_index[it]] = 1
            
items_features_matrix = items_features_matrix.tocsr()

100% (138078 of 138078) |#################| Elapsed Time: 0:00:27 Time: 0:00:27


In [13]:
items_features_matrix.shape

(138078, 213512)

In [14]:
bar = ProgressBar()

users_items_matrix = sp.sparse.lil_matrix((len(users_index), len(items_index)), dtype='bool')

for user, user_index in bar(users_index.items()):
    liked_items = users_likes[user]
    
    for liked_item in liked_items:
        liked_item_index = items_index.get(liked_item)
        
        if liked_item_index is not None:
            users_items_matrix[user_index, liked_item_index] = 1
            
users_items_matrix = users_items_matrix.tocsr()

100% (55863 of 55863) |###################| Elapsed Time: 0:00:01 Time: 0:00:01


In [15]:
users_items_matrix.shape

(55863, 138078)

In [16]:
bar = ProgressBar()

users_features_matrix = sp.sparse.lil_matrix((len(users_index), len(features_index) + genres_count), dtype='bool')

for user, user_index in bar(users_index.items()):
    liked_items = users_likes[user]
    
    for liked_item in liked_items:
        liked_item_index = items_index.get(liked_item)
        
        if liked_item_index is not None:
            data = items_data[item]

            users_features_matrix[user_index, data['genre']] = 1

            for it in data:
                if it[0] == 'f':
                    users_features_matrix[user_index, features_index[it]] = 1

users_features_matrix = users_features_matrix.tocsr()

100% (55863 of 55863) |###################| Elapsed Time: 0:00:06 Time: 0:00:06


In [17]:
users_features_matrix.shape

(55863, 213512)

-----

In [18]:
u_users_items_matrix, s_users_items_matrix, vt_users_items_matrix = \
    sp.sparse.linalg.svds(users_items_matrix.astype(np.float32), k=100)

In [19]:
u_users_items_matrix.shape, s_users_items_matrix.shape, vt_users_items_matrix.shape

((55863, 100), (100,), (100, 138078))

-----

In [20]:
def cos_angle_of_vectors(vec1, vec2):
    l = np.sqrt(np.sum(vec1 ** 2) * np.sum(vec2 ** 2))
    if l != 0:
        cos_val = np.dot(vec1, vec2) / l
        return cos_val
    else:
        return None

In [21]:
def predict_item_like_likeness_via_other_users(user, item):
    user_index = users_index[user]
    item_index = items_index[item]
    
    u, s, vt = u_users_items_matrix, s_users_items_matrix, vt_users_items_matrix
    
    cos_angle = cos_angle_of_vectors(u[user_index,:] * s, vt[:,item_index])
    if cos_angle is None:
        return 0
    else:
        return np.abs(cos_angle)

In [22]:
def predict_item_like_likeness_via_features(user, item):
    user_index = users_index[user]
    item_index = items_index[item]
    uf_i = users_features_matrix[user_index].toarray().flatten()
    if_i = items_features_matrix[item_index].toarray().flatten()
    
    cos_angle = cos_angle_of_vectors(uf_i, if_i)
    if cos_angle is None:
        return 0
    else:
        return np.abs(cos_angle)

In [23]:
def predict_item_like_likeness(user, item):
    predictions = [
        predict_item_like_likeness_via_other_users(user, item),
        predict_item_like_likeness_via_features(user, item)
    ]
    return max(predictions)

-----

In [24]:
def get_random_user():
    return users[random.randrange(len(users))]

In [25]:
def get_random_item():
    return random.sample(items_data.keys(), 1)[0]

In [26]:
predict_item_like_likeness(get_random_user(), get_random_item())

0.11785113019775793

-----

In [27]:
def get_recommendations(user, k):
    predictions = dict()
    bar = ProgressBar()
    for item, item_index in bar(items_index.items()):
        predictions[item] = predict_item_like_likeness(user, item)

    best = []
    for i in range(k):
        m = 0
        mi = None
        for k, v in predictions.items():
            if v > m:
                m = v
                mi = k
        best.append(mi)
        del predictions[mi]
        
    return best

-----

In [28]:
m = 0
best_user = None
for user, liked_items in users_likes.items():
    l = len(liked_items)
    if l > m:
        m = l
        best_user = user
        
best_user

'bf3e98f81f7a72d07cd6306a556292be'

In [29]:
K = 1000

bar = ProgressBar()
APatK_per_user = []
for user in bar([best_user]):
    test_items = users_likes[user]
    recommendation_list = get_recommendations(user, k=K)
    user_APatK = apk(test_items, recommendation_list, k=K)
    APatK_per_user.append(user_APatK)
    
print('mAP@{} = {}'.format(K, np.mean(APatK_per_user)))

100% (138078 of 138078) |#################| Elapsed Time: 0:03:50 Time: 0:03:50


mAP@1000 = 0.7143695196639056


100% (1 of 1) |###########################| Elapsed Time: 0:07:16 Time: 0:07:16


In [30]:
test_items

{'004b70556c696bcb2c17c8bf19679d35',
 '005d8b0023814109c31167048af168fc',
 '01bef0b49088c0f070ba7b4edf50dd02',
 '02305163183984c30d65641c068a4093',
 '02636779dcbd81818bfd6f9e4bc242f3',
 '026da7fcafbf2cf39abcf4753ddc7ab2',
 '029c1e84546ac4721e13c7aba2f0165d',
 '030e898032ece371c52cfaf3401b87d2',
 '034c263dd3be3ca53c49b82f61a075a2',
 '03bc6143d205819a444c01b55b3756b6',
 '0441b5ff6b3e602a1d1ee537c9dff0ff',
 '0454c5f89d59dadd3e2db4d9ddfbaf71',
 '04b0c650d532058a9a4bff94d3cc02a2',
 '04beab30795f4bd11df2879fa9ea86c3',
 '04f3d9c63517f5a7d2739f10ad193f80',
 '055771f99d2d3beb97da6b49b9a62621',
 '0567587b6316bc733683bed7f468f534',
 '05d7d96eb433fca4eb45dbcd3af1aecf',
 '0604e08d80432da9d762d1c2f1313b15',
 '0641f4fbdc4bf04cff43d4546934ce65',
 '064cec42dd730a57108a4be28b61818a',
 '0726681d16908583bb742c27cf4e6d74',
 '0753360a647686ee100fd9cf29b64811',
 '07c76e645fd0caaf3f4c72484227fb19',
 '08cc925c7719cdb2392abd44b2a6edf5',
 '08d8abf4d99605ce9735dd33612dd9d9',
 '09829f159fa61e5f8c3e96874bf4c665',
 

In [31]:
recommendation_list

['696b705f7b6602299e5b8d6ea3a36098',
 '530ea7e0526e1f8906f7af9af581a094',
 '5dbe3ac0ad9a97dd1aff9dd6aa779daf',
 'adf5c3c87bfc53a9130dbb4a08bccc9d',
 '7e37669992c974911ac524e666916d8f',
 '390e3b61c88d2424b7589400f4210c04',
 'd0aaf17e3d058a3df5c480598bf4ebe4',
 'e68e362673b1ba6dfaf4ae16272fe2af',
 '1df32e7142a6af69110f06aae7b7c722',
 '66206e08ad31b178cfaa305ee7320877',
 '9897ba59bae4b1fc8e60ceb0279f2bfa',
 '0e68a7ab31fcbead4279d4210123c3bb',
 'eb1a2c7c46b318d819baf46ee232d53e',
 'a8c89ad8f4efb7ecc7787150af167b4c',
 '4c2ed5687ea1d7de589dfe359c816c47',
 '870a5234d3acf52d35c2d9c56fc6facf',
 '38589a9423c9fd66eba0187bdeb368b7',
 '2ff43ce8158e2faffa02405cdcf68e5d',
 '159d571a2d9eac95f42ccc2f459386bb',
 'fa3c7162e01d964d4415d2d827e057a4',
 '1a3b8ffb9ca54cf6b730b0be1b90398c',
 '6b070e1a73a8e58816567158afd62532',
 'b956d47230767986a57ba1aa52ffee23',
 'd391822ed231b06cf4af8b91b2c8b4d4',
 '84ec426e929d40cade24da5c2c686fc1',
 '3d7034ce65d5ac0ce09b241e58e492bc',
 'c05b9613e7172e93260aa6b9fdd1a799',
 

In [40]:
def similar_features_count(user, item):
    a = users_features_matrix[users_index[user]].toarray().flatten()
    b = items_features_matrix[items_index[item]].toarray().flatten()
    return (a == b).sum()

In [41]:
items_features_matrix.shape[1]

213512

In [42]:
similar_features_count('bf3e98f81f7a72d07cd6306a556292be', '004b70556c696bcb2c17c8bf19679d35')

213478

In [43]:
similar_features_count('bf3e98f81f7a72d07cd6306a556292be', '696b705f7b6602299e5b8d6ea3a36098')

213475