# Рекомендательные системы
# ДЗ 2 "Бейзлайны и детерминированные алгоритмы item-item" 

In [None]:
pip install implicit

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
# from implicit.evaluation import train_test_split
# from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
data = pd.read_csv('./data/retail_train.csv')
data.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0


In [3]:
users, items, interactions = data.user_id.nunique(), data.item_id.nunique(), data.shape[0]

print('# users: ', users)
print('# items: ', items)
print('# interactions: ', interactions)

# users:  2499
# items:  89051
# interactions:  2396804


In [4]:
data_2 = data.copy()
data_2.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
# найдем продажи по строкам
data_2['sale_multy'] = data_2['quantity'] * data_2['sales_value']
data_2.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,sale_multy
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1.39
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,0.82
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0,0.99


In [6]:
# Найдем ПРОДАЖИ по каждому item_id
# Группируем по item_id и суммируем продажи по каждому item_id
df_sum_id_sales = data_2.groupby('item_id')['sale_multy'].agg(sum).reset_index()
df_sum_id_sales.rename(columns={'sale_multy': 'sum_sale_id'}, inplace=True)
df_sum_id_sales.sort_values('sum_sale_id').head(3)

Unnamed: 0,item_id,sum_sale_id
23308,976473,0.0
82206,13513364,0.0
74772,12384790,0.0


In [7]:
# Присоединим колонку продаж к нашим данным

data_2 = data_2.merge(df_sum_id_sales, on=['item_id'])
data_2.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,sale_multy,sum_sale_id
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1.39,9905.63
1,1929,27021022215,4,1004906,1,1.39,441,-0.6,1755,1,0.0,0.0,1.39,9905.63
2,568,27021090189,4,1004906,1,1.39,446,-0.6,35,1,0.0,0.0,1.39,9905.63


In [8]:
# Выкинем нулевые item (не было покупок)

data_2 = data_2.loc[data_2.sum_sale_id != 0]
data_2.shape[0] 

2388736

In [None]:
# # общая сумма проданых товаров (знаменатель для вычисления весов)
# sum_sales_value = data_2['sale_multy'].sum()
# sum_sales_value

In [10]:
# общая сумма проданых товаров (знаменатель для вычисления весов)
sum_sales_value = df_sum_id_sales['sum_sale_id'].sum()
sum_sales_value

7584303583.379998

In [11]:
# Просчитаем колонку ВЕСОВ для каждого item

data_2['weight_sale'] = data_2['sum_sale_id'] /  sum_sales_value
data_2.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,sale_multy,sum_sale_id,weight_sale
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1.39,9905.63,1e-06
1,1929,27021022215,4,1004906,1,1.39,441,-0.6,1755,1,0.0,0.0,1.39,9905.63,1e-06
2,568,27021090189,4,1004906,1,1.39,446,-0.6,35,1,0.0,0.0,1.39,9905.63,1e-06


In [12]:
# Просчитаем колонку ЛОГАРИФМОВ ВЕСОВ для каждого item

data_2['log_weight_sale'] = np.log(1 +  data_2['weight_sale'])
data_2.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,sale_multy,sum_sale_id,weight_sale,log_weight_sale
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1.39,9905.63,1e-06,1e-06
1,1929,27021022215,4,1004906,1,1.39,441,-0.6,1755,1,0.0,0.0,1.39,9905.63,1e-06,1e-06
2,568,27021090189,4,1004906,1,1.39,446,-0.6,35,1,0.0,0.0,1.39,9905.63,1e-06,1e-06


-------------------------

In [15]:

# Создам датафрейм с даными items и их весами
# items_weigh = data_2.groupby('item_id')['weight_sale'].unique().reset_index()
items_w = data_2.groupby('item_id')['log_weight_sale'].unique().reset_index()
items_w.head(4)

Unnamed: 0,item_id,log_weight_sale
0,25671,[8.282896334479913e-09]
1,26081,[1.30532695754944e-10]
2,26093,[2.0964341371799312e-10]
3,26190,[2.030509093991293e-10]


In [16]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [17]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(5)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [18]:
#  для сравнения сделаем random_recommendation

def random_recommendation(items, n=5):
    """Случайные рекомендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [19]:
def weighted_random_recommendation(items, n=5):


    items_ = (items['item_id'])
    weight_ =  (items['log_weight_sale'])

    recs = np.random.choice(list(map(int, items_weight['item_id'])), n, list(map(float, items_weight['log_weight_sale'])))
    
    return recs.tolist()

In [61]:
%%time

# items = data_train.item_id.unique()
items_weight = data_2.groupby('item_id')['log_weight_sale'].unique().reset_index()

result['weigth_random_recommendation'] = result['user_id'].map(lambda x: weighted_random_recommendation(items_weight, 5))
result

CPU times: total: 49.8 s
Wall time: 49.9 s


Unnamed: 0,user_id,actual,weigth_random_recommendation,random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1955394, 98614, 565285, 1230050, 1002032]","[942361, 1114587, 1111344, 12257869, 1341366]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[12648296, 2648872, 859667, 7166546, 6979716]","[915869, 1182562, 8160464, 904100, 1053656]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1065875, 9528219, 1276557, 9487802, 9677072]","[1110039, 891802, 1310372, 853416, 1881198]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[862010, 1104607, 859444, 1019958, 902396]","[1309669, 1119970, 928190, 854205, 1103406]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[13381770, 7442693, 16223326, 15830614, 9249539]","[13511579, 1123771, 12263570, 6704400, 9575583]"
...,...,...,...,...
2037,2496,[6534178],"[9827853, 9553528, 6469995, 1253919, 12673066]","[9445752, 397172, 2756336, 869951, 836608]"
2038,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[1214976, 5587104, 7170493, 13512647, 6424182]","[1298335, 15452673, 13190648, 922266, 10121895]"
2039,2498,"[15716530, 834484, 901776, 914190, 958382, 972...","[13008290, 1034563, 954918, 1098927, 889864]","[5748585, 6904436, 13776705, 877811, 1040960]"
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[12456488, 9222012, 15452826, 10355413, 862541]","[1040791, 5566974, 8203691, 1046243, 828287]"


In [21]:
def random_recommendation(items, n=5):
    """Случайные рекомендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [22]:
%%time

items = data_train.item_id.unique()

result['random_recommendation'] = result['user_id'].map(lambda x: random_recommendation(items, n=5))
result.head(2)

CPU times: total: 3.25 s
Wall time: 3.25 s


Unnamed: 0,user_id,actual,weigth_random_recommendation,random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1057812, 8357619, 979421, 911319, 6557783]","[942361, 1114587, 1111344, 12257869, 1341366]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1400939, 437389, 16124540, 15596323, 1976518]","[915869, 1182562, 8160464, 904100, 1053656]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5, Precision@3, Recall@5 для каждого алгоритма с помощью функций из вебинара 1. Какой алгоритм показывает лучшее качество?

In [23]:
result.head(2)

Unnamed: 0,user_id,actual,weigth_random_recommendation,random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1057812, 8357619, 979421, 911319, 6557783]","[942361, 1114587, 1111344, 12257869, 1341366]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1400939, 437389, 16124540, 15596323, 1976518]","[915869, 1182562, 8160464, 904100, 1053656]"


In [24]:
# Precision@5

def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(recommended_list, bought_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [59]:
%%time
    
Precision_5_random = result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual'], 5), axis=1).mean()
Precision_5_weigth_random = result.apply(lambda row: precision_at_k(row['weigth_random_recommendation'], row['actual'],5), axis=1).mean()
Precision_3_random = result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual'], 3), axis=1).mean()
Precision_3_weigth_random = result.apply(lambda row: precision_at_k(row['weigth_random_recommendation'], row['actual'],3), axis=1).mean()

print(f'Precision_5_random =        {Precision_5_random}')
print(f'Precision_5_weigth_random = {Precision_5_weigth_random}')
print(f'Precision_3_random =        {Precision_3_random}')
print(f'Precision_3_weigth_random = {Precision_3_weigth_random}')


Precision_5_random =        0.0006856023506366307
Precision_5_weigth_random = 0.000979431929480901
Precision_3_random =        0.0004897159647404506
Precision_3_weigth_random = 0.0004897159647404506
CPU times: total: 438 ms
Wall time: 431 ms


In [53]:
%%time
    
# result['als'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N=5))
Precision_5_weigth_random = result.apply(lambda row: precision_at_k(row['weigth_random_recommendation'], row['actual'],5), axis=1).mean()
Precision_5_weigth_random

CPU times: total: 93.8 ms
Wall time: 104 ms


0.000979431929480901

In [62]:
def recall_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(recommended_list, bought_list)
    
    recall = flags.sum() / len(bought_list)
    
    
    return recall

In [None]:
%%time
    
recall_5_random = result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual'], 5), axis=1).mean()
recall_5_weigth_random = result.apply(lambda row: precision_at_k(row['weigth_random_recommendation'], row['actual'],5), axis=1).mean()
recall_3_random = result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual'], 3), axis=1).mean()
recall_3_weigth_random = result.apply(lambda row: precision_at_k(row['weigth_random_recommendation'], row['actual'],3), axis=1).mean()

print(f'recall_5_random =        {recall_5_random}')
print(f'recall_5_weigth_random = {recall_5_weigth_random}')
print(f'recall_3_random =        {recall_3_random}')
print(f'Precision_3_weigth_random = {Precision_3_weigth_random}')

In [32]:
%%time

# items = data_train.item_id.unique()
items_weight = data_2.groupby('item_id')['log_weight_sale'].unique().reset_index()


result_3['weigth_random_recommendation'] = result_3['user_id'].map(lambda x: weighted_random_recommendation(items_weight, n=3))
result_3

CPU times: total: 49.6 s
Wall time: 49.8 s


Unnamed: 0,user_id,actual,weigth_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[15506797, 9419559, 8204904]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[116219, 13190613, 1097642]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[13007249, 937995, 96670]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[823695, 7442004, 5572278]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[5567057, 12352330, 2649645]"
...,...,...,...
2037,2496,[6534178],"[6979046, 6602643, 12649316]"
2038,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[1288106, 14050504, 16806418]"
2039,2498,"[15716530, 834484, 901776, 914190, 958382, 972...","[13189851, 1194732, 829594]"
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[13945293, 14111471, 1281640]"


In [34]:
%%time

items = data_train.item_id.unique()

result_3['random_recommendation'] = result_3['user_id'].map(lambda x: random_recommendation(items, n=5))
result_3.head(2)

CPU times: total: 3.27 s
Wall time: 3.27 s


Unnamed: 0,user_id,actual,weigth_random_recommendation,random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[15506797, 9419559, 8204904]","[83237, 10207148, 12648295, 6979006, 1951721]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[116219, 13190613, 1097642]","[328490, 8351541, 893087, 1009862, 1065813]"


In [35]:
# Precision@3

def precision_at_k(recommended_list, bought_list, k=3):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(recommended_list, bought_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [41]:
%%time
    
Precision_3_random = result_3.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()
Precision_3_random


CPU times: total: 141 ms
Wall time: 120 ms


0.000652954619653934

In [43]:
%%time
    
# result['als'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N=5))
Precision_3_weigth_random = result_3.apply(lambda row: precision_at_k(row['weigth_random_recommendation'], row['actual']), axis=1).mean()

Precision_3_weigth_random

CPU times: total: 109 ms
Wall time: 112 ms


0.0001632386549134835

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 (или другое количество) товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [None]:
# your_code