In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [4]:
from scipy.special import softmax
from sklearn.preprocessing import minmax_scale

In [5]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [6]:
def weighted_random_recommendation(items, items_weights, n=5):

    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False, p=items_weights)

    return recs.tolist()

In [7]:
items = data_train.item_id.unique()

In [8]:
weights = softmax(minmax_scale(data_train.groupby('item_id')['sales_value'].sum()))

In [9]:
weighted_random_recommendation(items, weights)

[1111922, 7409967, 881655, 1469989, 10122016]

In [10]:
%%time

result['random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items, weights, n=5))
result.head(2)

CPU times: user 1.61 s, sys: 15.6 ms, total: 1.62 s
Wall time: 1.6 s


Unnamed: 0,user_id,actual,random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[12524691, 15511929, 10206904, 1264043, 901543]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[992267, 8016028, 1116476, 9677744, 1038024]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [46]:
result = pd.read_csv('predictions_basic.csv')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [47]:
result['actual'] = result.actual.str.strip('[ ]').str.replace('\n', '').str.replace(r'\d +', ',')

In [20]:
from ast import literal_eval

In [54]:
result['actual'] = result.actual.apply(literal_eval)

In [79]:
result['random_recommendation'] = result['random_recommendation'].apply(literal_eval)

In [85]:
result['itemitem'] = result['itemitem'].apply(literal_eval)

In [87]:
result['cosine'] = result['cosine'].apply(literal_eval)

In [92]:
result['tfidf'] = result['tfidf'].apply(literal_eval)

In [82]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [98]:
result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()

0.0002938295788442704

In [86]:
result.apply(lambda row: precision_at_k(row['itemitem'], row['actual']), axis=1).mean()

0.0006856023506366307

In [91]:
result.apply(lambda row: precision_at_k(row['cosine'], row['actual']), axis=1).mean()

0.0004897159647404506

In [93]:
result.apply(lambda row: precision_at_k(row['tfidf'], row['actual']), axis=1).mean()

0.0004897159647404506

Лучший результат показал Item-Item Recommender

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [101]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [102]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [104]:
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

In [106]:
result = data_test.loc[data_test.item_id != 999999].groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [107]:
result['random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items, weights, n=5))
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1192512, 925626, 2752711, 797898, 1192597]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[984425, 948665, 851130, 9802800, 1020683]"


Результат случайных рекомендаций действительно улучшился

In [108]:
result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()

0.0005876591576885406

In [109]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

In [110]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [114]:
%%time

model = ItemItemRecommender(K=3, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))


CPU times: user 1.08 s, sys: 0 ns, total: 1.08 s
Wall time: 480 ms


In [117]:
def ii_rec(i):
    recs = model.recommend(userid=userid_to_id[i],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=True, 
                        filter_items=None, 
                        recalculate_user=True)
    return [item[0] for item in recs]

In [129]:
%%time
res = []
for user_id in result.user_id.values:
    res.append(ii_rec(user_id))
    

CPU times: user 7min 45s, sys: 46.9 ms, total: 7min 45s
Wall time: 7min 45s


In [140]:
result['ii_3'] = [[id_to_itemid[rec[0]] for rec in recs] for result in res]

In [142]:
result.apply(lambda row: precision_at_k(row['ii_3'], row['actual']), axis=1).mean()

0.0347698334965722