In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import re
import random
from typing import List

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] <  data['week_no'].max() - test_size_weeks]
data_test  = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 0. Товар 999999
На вебинаре мы использовали товар 999999 - что это за товар?  
Зачем он нужен?  
Используя этот товар мы смещаем качество рекомендаций.
В какую сторону?   
Можно ли удалить этот товар?   
Уберите этот товар и сравните с качеством на семинаре.

In [4]:
def precision_at_k_l(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
#     print(f"bought_list: {bought_list}")
#     print(f"recommended_list: {recommended_list}")

    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
#     print(flags)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision

In [5]:
def adapt_par(inc: List) -> List:
    return list(map(int, inc.replace("\n", "").replace(",", "").replace("[", "").replace("]", "").split()))

In [6]:
predictions_basic = pd.read_csv('./sem_2/predictions_basic.csv')
predictions_basic.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[963664, 733146, 949877, 9682312, 1056548]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[870170, 824571, 920025, 953351, 9878598]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [7]:
col_list = predictions_basic.columns[1:]
col_list

Index(['actual', 'random_recommendation', 'popular_recommendation', 'itemitem',
       'cosine', 'tfidf', 'own_purchases'],
      dtype='object')

In [8]:
for col_n in col_list:
    print(type(predictions_basic[col_n][0]))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [9]:
for col_n in col_list:
    predictions_basic[col_n] = list(map(adapt_par, predictions_basic[col_n]))
    print(type(predictions_basic[col_n][0][0]))
    
predictions_basic.head(2)

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[963664, 733146, 949877, 9682312, 1056548]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[870170, 824571, 920025, 953351, 9878598]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [10]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0)

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

In [11]:
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

In [12]:
user_item_matrix.head(3)

item_id,25671,26081,26093,26190,26355,26426,26540,26601,26636,26691,...,17328742,17329473,17329749,17330255,17330511,17381856,17382205,17383227,17827644,17829232
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [14]:
%%time

model = ItemItemRecommender(K=5, num_threads=2) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                       user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                       N=5, # кол-во рекомендаций 
                       filter_already_liked_items=False, 
                       filter_items=None, 
                       recalculate_user=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=86865.0), HTML(value='')))


CPU times: user 51.2 s, sys: 606 ms, total: 51.9 s
Wall time: 38.8 s


In [15]:
%%time

predictions_basic['itemitem_noadd'] = predictions_basic['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

CPU times: user 359 ms, sys: 7.28 ms, total: 366 ms
Wall time: 413 ms


In [16]:
f"{sum(list(map(lambda x, y: precision_at_k_l(x, y, 5), predictions_basic.itemitem, predictions_basic.actual))):.4f}"

'279.6000'

In [17]:
f"{sum(list(map(lambda x, y: precision_at_k_l(x, y, 5), predictions_basic.itemitem_noadd, predictions_basic.actual))):.4f}"

'314.6000'

Дополнительная метка события в списке товаров, конечно, негативно влияет на метрики, т.к. метка события вытесняет релевантные товары.

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [18]:
items_weights = data.groupby('item_id')['sales_value'].sum()
items_weights

item_id
25671       20.94
26081        0.99
26093        1.59
26190        1.54
26355        1.98
            ...  
17991689     2.49
17991691     2.49
18000012    19.96
18024155     3.99
18024556    30.51
Name: sales_value, Length: 89051, dtype: float64

In [19]:
min_val = min(items_weights.values[items_weights.values>0])
min_val

8.881784e-16

In [20]:
items_weights.values[items_weights.values==0] = min_val

In [21]:
items_weights = np.log(items_weights)
items_weights

item_id
25671       3.041661
26081      -0.010050
26093       0.463734
26190       0.431782
26355       0.683097
              ...   
17991689    0.912283
17991691    0.912283
18000012    2.993730
18024155    1.383791
18024556    3.418054
Name: sales_value, Length: 89051, dtype: float64

In [22]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    recs = random.choices(items_weights.index, items_weights.values, k=n)
    
    return random.choices(items_weights.index, items_weights.values, k=n) # recs.tolist()

Сделайте предсказания

In [23]:
%%time

predictions_basic['weighted_rnd'] = \
        predictions_basic.user_id.apply(lambda _: weighted_random_recommendation(items_weights, 5))

CPU times: user 2min 11s, sys: 1.04 s, total: 2min 12s
Wall time: 2min 12s


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [24]:
col_list = predictions_basic.columns[1:]
col_list

Index(['actual', 'random_recommendation', 'popular_recommendation', 'itemitem',
       'cosine', 'tfidf', 'own_purchases', 'itemitem_noadd', 'weighted_rnd'],
      dtype='object')

In [25]:
predictions_basic.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,itemitem_noadd,weighted_rnd
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[963664, 733146, 949877, 9682312, 1056548]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]","[1082185, 981760, 1127831, 995242, 840361]","[1121865, 857773, 914244, 1117249, 1014140]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[870170, 824571, 920025, 953351, 9878598]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]","[1082185, 981760, 1098066, 826249, 995242]","[1000205, 1059316, 911140, 5568682, 6423751]"


In [26]:
prec_5 = {}

for col_n in col_list[1:]:
#     print(f"Precision@5 {col_n}: "
#           f"{np.mean(list(map(lambda x, y: precision_at_k_l(x, y, 5), predictions_basic[col_n], predictions_basic.actual))):.4f}")
    prec_5[col_n] = np.mean(list(map(lambda x, y: precision_at_k_l(x, y, 5), predictions_basic[col_n], predictions_basic.actual)))

for k, v in dict(sorted(prec_5.items(), key=lambda item: item[1])).items():
    print(f"Precision@5: {v:0.4f} - {k}")

Precision@5: 0.0008 - random_recommendation
Precision@5: 0.0020 - weighted_rnd
Precision@5: 0.1329 - cosine
Precision@5: 0.1369 - itemitem
Precision@5: 0.1390 - tfidf
Precision@5: 0.1541 - itemitem_noadd
Precision@5: 0.1552 - popular_recommendation
Precision@5: 0.1797 - own_purchases


### Задание 3. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.
-  *Попробуйте стратегии ансамблирования изученных алгоритмов

Обязательно нужно сделать первые 2 пункта!

In [27]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

#### Random recommendation

In [28]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [29]:
%%time

predictions_basic['random_recommendation_t5k'] = predictions_basic['user_id'].apply(lambda _: random_recommendation(top_5000, n=5))
predictions_basic.head(2)

CPU times: user 1.54 s, sys: 37 µs, total: 1.54 s
Wall time: 1.59 s


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,itemitem_noadd,weighted_rnd,random_recommendation_t5k
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[963664, 733146, 949877, 9682312, 1056548]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]","[1082185, 981760, 1127831, 995242, 840361]","[1121865, 857773, 914244, 1117249, 1014140]","[1013703, 885290, 1033142, 9297615, 1122112]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[870170, 824571, 920025, 953351, 9878598]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]","[1082185, 981760, 1098066, 826249, 995242]","[1000205, 1059316, 911140, 5568682, 6423751]","[6979427, 1074223, 945662, 1092040, 5568197]"


#### Weighted random recommender

In [30]:
items_weights.sort_values(ascending=False, inplace=True)
items_weights[:5000]

item_id
6534178    13.056210
6533889    10.660683
1029743    10.544865
6534166    10.351340
1082185    10.214313
             ...    
7410342     5.631678
914188      5.631678
1036297     5.631463
1071845     5.631391
973042      5.631212
Name: sales_value, Length: 5000, dtype: float64

In [31]:
%%time

predictions_basic['weighted_rnd_t5k'] = \
        predictions_basic.user_id.apply(lambda _: weighted_random_recommendation(items_weights[:5000], 5))

CPU times: user 7.09 s, sys: 36 ms, total: 7.13 s
Wall time: 7.14 s


In [32]:
col_list = predictions_basic.columns[1:]
col_list

Index(['actual', 'random_recommendation', 'popular_recommendation', 'itemitem',
       'cosine', 'tfidf', 'own_purchases', 'itemitem_noadd', 'weighted_rnd',
       'random_recommendation_t5k', 'weighted_rnd_t5k'],
      dtype='object')

In [33]:
prec_5 = {}

for col_n in col_list[1:]:
#     print(f"Precision@5 {col_n}: "
#           f"{np.mean(list(map(lambda x, y: precision_at_k_l(x, y, 5), predictions_basic[col_n], predictions_basic.actual))):.4f}")
    prec_5[col_n] = np.mean(list(map(lambda x, y: precision_at_k_l(x, y, 5), predictions_basic[col_n], predictions_basic.actual)))

for k, v in dict(sorted(prec_5.items(), key=lambda item: item[1])).items():
    print(f"Precision@5: {v:0.4f} - {k}")

Precision@5: 0.0008 - random_recommendation
Precision@5: 0.0020 - weighted_rnd
Precision@5: 0.0067 - random_recommendation_t5k
Precision@5: 0.0067 - weighted_rnd_t5k
Precision@5: 0.1329 - cosine
Precision@5: 0.1369 - itemitem
Precision@5: 0.1390 - tfidf
Precision@5: 0.1541 - itemitem_noadd
Precision@5: 0.1552 - popular_recommendation
Precision@5: 0.1797 - own_purchases


#### ItemItemRecommender

In [34]:
%%time

for i in range(1, 7):

    model = ItemItemRecommender(K=i, num_threads=2) # K - кол-во билжайших соседей

    model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
              show_progress=True)

    recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                           user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                           N=5, # кол-во рекомендаций 
                           filter_already_liked_items=False, 
                           filter_items=None, 
                           recalculate_user=True)

    col_name_str = 'itemitem_noadd_k' + str(i)
    predictions_basic[col_name_str] = predictions_basic['user_id'].\
        apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                        model.recommend(userid=userid_to_id[x], 
                                        user_items=sparse_user_item,   # на вход user-item matrix
                                        N=5, 
                                        filter_already_liked_items=False, 
                                        filter_items=None, 
                                        recalculate_user=True)])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=86865.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=86865.0), HTML(value='')))


CPU times: user 5min 16s, sys: 2.33 s, total: 5min 19s
Wall time: 3min 59s


In [35]:
col_list = predictions_basic.columns[1:]
col_list

Index(['actual', 'random_recommendation', 'popular_recommendation', 'itemitem',
       'cosine', 'tfidf', 'own_purchases', 'itemitem_noadd', 'weighted_rnd',
       'random_recommendation_t5k', 'weighted_rnd_t5k', 'itemitem_noadd_k1',
       'itemitem_noadd_k2', 'itemitem_noadd_k3', 'itemitem_noadd_k4',
       'itemitem_noadd_k5', 'itemitem_noadd_k6'],
      dtype='object')

In [36]:
prec_5 = {}

for col_n in col_list[1:]:
#     print(f"Precision@5 {col_n}: "
#           f"{np.mean(list(map(lambda x, y: precision_at_k_l(x, y, 5), predictions_basic[col_n], predictions_basic.actual))):.4f}")
    prec_5[col_n] = np.mean(list(map(lambda x, y: precision_at_k_l(x, y, 5), predictions_basic[col_n], predictions_basic.actual)))

for k, v in dict(sorted(prec_5.items(), key=lambda item: item[1])).items():
    print(f"Precision@5: {v:0.4f} - {k}")

Precision@5: 0.0008 - random_recommendation
Precision@5: 0.0020 - weighted_rnd
Precision@5: 0.0067 - random_recommendation_t5k
Precision@5: 0.0067 - weighted_rnd_t5k
Precision@5: 0.1329 - cosine
Precision@5: 0.1369 - itemitem
Precision@5: 0.1390 - tfidf
Precision@5: 0.1510 - itemitem_noadd_k4
Precision@5: 0.1541 - itemitem_noadd
Precision@5: 0.1541 - itemitem_noadd_k5
Precision@5: 0.1552 - popular_recommendation
Precision@5: 0.1591 - itemitem_noadd_k6
Precision@5: 0.1700 - itemitem_noadd_k3
Precision@5: 0.1797 - own_purchases
Precision@5: 0.2063 - itemitem_noadd_k2
Precision@5: 0.2195 - itemitem_noadd_k1


#### Stack

In [37]:
np.mean(list(map(lambda x, y: precision_at_k_l(x, y, 10), 
                 predictions_basic['own_purchases'], 
                 predictions_basic.actual)))

0.17969311132876264

In [38]:
np.mean(list(map(lambda x, y: precision_at_k_l(x, y, 10), 
                 predictions_basic['popular_recommendation'], 
                 predictions_basic.actual)))

0.15523996082272282

In [39]:
np.mean(list(map(lambda x, y: precision_at_k_l(x, y, 10), 
                 predictions_basic['own_purchases'] + predictions_basic['popular_recommendation'], 
                 predictions_basic.actual)))

0.09635686301944872

### Задание 4. Улучшение детерминированных алгоритмов
На семинаре мы рассматривали 



Далее $U \equiv N_i(u) $

$$r_{u,i} =  \frac{1}{S}\sum\limits_{v \in U}\operatorname{sim}(u,v)r_{v, i}$$
$$ S = \sum\limits_{v \in U} \operatorname{sim}(u,v)$$

Предлагается улучшить эту формулу и учесть средние предпочтения всех пользователей

$$r_{u,i} = \mu + \bar{r_u} + \frac{1}{S}\sum\limits_{v \in U}\operatorname{sim}(u,v)(r_{v, i}-\bar{r_{v}} - \mu)$$

Какие смысл имееют $ \mu $ и $ \bar{r_u}$ ?

Реализуйте алгоритм, прогнозирующий рейтинги на основе данной формулы, на numpy (векторизованно!)

В качестве схожести возьмите CosineSimilarity.

Примените к user_item_matrix. В качестве рейтингов возьмите количество или стоимость купленного товара. 
Данный алгоритм предсказывает рейтинги. Как на основании предсказанных рейтингов предсказать факт покупки?

Предложите вариант.
Посчитайте accuracy@5 и сравните с алгоритмами, разобранными на вебинаре.

In [40]:
item_user_matrix = pd.pivot_table(data_train, 
                                  index='item_id', columns='user_id', 
                                  values='quantity',
                                  aggfunc='sum', 
                                  fill_value=0)

In [41]:
item_user_matrix.head(3)

user_id,1,2,3,4,5,6,7,8,9,10,...,2491,2492,2493,2494,2495,2496,2497,2498,2499,2500
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25671,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26081,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26093,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
import tqdm

In [43]:
def ii_cossim(data, ind_src, n=5):
    sim_list = []

    for i in tqdm.tqdm(range(item_user_matrix.shape[0])):
        if i != ind_src:
            sm = item_user_matrix.iloc[i]@item_user_matrix.iloc[ind_src]
            if sm:
                sim_list.append(sm/np.sqrt(sum(item_user_matrix.iloc[i]**2)*sum(item_user_matrix.iloc[ind_src]**2)))

    sim_list.sort(reverse=True)
    return sim_list[:5]

In [44]:
%%time

ii_cossim(item_user_matrix, 0, n=5)

100%|██████████| 86865/86865 [00:59<00:00, 1455.14it/s]

CPU times: user 59.5 s, sys: 420 ms, total: 59.9 s
Wall time: 59.8 s





[0.9428090415820635,
 0.9428090415820635,
 0.816496580927726,
 0.769800358919501,
 0.7257747386024231]