# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
# Directory that holds the raw CSV files. The default is the location used when
# this notebook was written; set the RECSYS_DATA_DIR environment variable to
# run the notebook against a different copy of the data.
data_dir = os.environ.get('RECSYS_DATA_DIR', '/home/alex/Downloads/recsys')

data = pd.read_csv(os.path.join(data_dir, 'retail_train.csv'))
item_features = pd.read_csv(os.path.join(data_dir, 'product.csv'))
user_features = pd.read_csv(os.path.join(data_dir, 'hh_demographic.csv'))

# Column processing: lower-case every feature column name ...
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

# ... and rename the key columns to the 'item_id' / 'user_id' names that the
# rest of the notebook uses for joins.
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


In [3]:
# Count distinct items before and after filtering so the effect is visible.
n_items_before = data['item_id'].nunique()

# prefilter_items comes from src.utils; its filtering rules are not visible in
# this notebook. It is asked to keep the 5000 most popular items.
data = prefilter_items(data, item_features=item_features, take_n_popular=5000)

n_items_after = data['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 89051 to 5001


In [4]:




# The training / validation scheme matters!
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
# Tune the size of the 2nd dataset (6 weeks) with a learning curve
# (recall@k as a function of the dataset size).
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

# Both boundaries are counted back from the last week present in the data.
week_no = data['week_no']
last_week = week_no.max()
lvl_1_val_start = last_week - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)
lvl_2_val_start = last_week - val_lvl_2_size_weeks

# NOTE(review): the level-2 validation mask is `>= last_week - 3`, which keeps
# the week numbers last_week-3 .. last_week, i.e. 4 distinct values rather
# than 3, while the level-1 window keeps exactly 6 -- confirm this is intended.
data_train_lvl_1 = data[week_no < lvl_1_val_start]
data_val_lvl_1 = data[(week_no >= lvl_1_val_start) & (week_no < lvl_2_val_start)]

# Kept as a separate copy for clarity: later cells modify it, so the two diverge.
data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[week_no >= lvl_2_val_start]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99
11,1364,26984896261,1,999999,1,2.19,31742,0.0,1520,1,0.0,0.0,2.19


In [5]:
# Build the first-level recommender (MainRecommender from src.recommenders)
# from the level-1 training split only.
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [5]:
# Sanity check: request N=200 items from get_als_recommendations for user 2375.
recommender.get_als_recommendations(2375, N=200)

[984054,
 983959,
 984176,
 984669,
 984575,
 982743,
 970455,
 978318,
 982469,
 982449,
 982393,
 982360,
 982112,
 981900,
 981677,
 981660,
 981628,
 981601,
 981374,
 981370,
 981288,
 981165,
 981086,
 980984,
 980943,
 980823,
 980433,
 980263,
 980259,
 980246,
 979987,
 979877,
 979792,
 979707,
 979674,
 979551,
 979505,
 979452,
 979439,
 979022,
 978906,
 978879,
 978343,
 978332,
 970747,
 985889,
 956486,
 970202,
 970160,
 970152,
 970028,
 969977,
 969945,
 969941,
 969866,
 969846,
 969836,
 969725,
 969568,
 969403,
 969138,
 969039,
 968992,
 968759,
 968732,
 968695,
 968687,
 968391,
 968363,
 968359,
 968164,
 968072,
 968025,
 967994,
 967948,
 967514,
 967461,
 967205,
 967144,
 967041,
 966778,
 966684,
 966619,
 966546,
 966348,
 965956,
 965889,
 965842,
 965772,
 965766,
 965693,
 965679,
 965555,
 965530,
 965430,
 965267,
 965041,
 965009,
 964968,
 964835,
 964734,
 964717,
 964521,
 964520,
 964462,
 964342,
 964221,
 964133,
 964120,
 963971,
 963868,
 

In [6]:
# Sanity check: request N=200 items from get_own_recommendations for user 2375.
recommender.get_own_recommendations(2375, N=200)

[948640,
 918046,
 847962,
 907099,
 873980,
 884694,
 10285454,
 1107760,
 7169090,
 979674,
 10308345,
 1069531,
 974766,
 1015474,
 950935,
 847066,
 1102207,
 1020770,
 9521787,
 974265,
 940996,
 8019845,
 5567194,
 12811490,
 1003616,
 973181,
 890719,
 982955,
 9677152,
 998519,
 1072685,
 1131382,
 1021715,
 12263119,
 960791,
 7441873,
 986021,
 956666,
 1038692,
 9677748,
 9297223,
 927030,
 12757653,
 1046919,
 6391532,
 989069,
 1068451,
 951954,
 835300,
 937343,
 1047249,
 13876348,
 1061732,
 981601,
 1121028,
 1087547,
 828393,
 996269,
 951951,
 1036093,
 1023815,
 5570408,
 827667,
 1082454,
 1006878,
 5570048,
 841309,
 1078652,
 1115553,
 1056492,
 1138467,
 1004945,
 947858,
 1092885,
 1121694,
 938138,
 8019916,
 827919,
 984315,
 10341855,
 883932,
 8291322,
 1096794,
 1028938,
 1087618,
 8020166,
 1082185,
 866871,
 930666,
 825994,
 910151,
 823990,
 848029,
 896613,
 12301839,
 1117219,
 1135258,
 869868,
 1046545,
 899624,
 6442594,
 1137775,
 825343,
 104290

In [7]:
# Sanity check: request N=200 items from get_similar_items_recommendation for
# user 2375.
# NOTE(review): the output recorded below repeats one item id (15926886) at
# every position -- verify that the method de-duplicates its results.
recommender.get_similar_items_recommendation(2375, N=200)

[15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,
 15926886,

In [8]:
# Sanity check: request N=200 items from get_similar_users_recommendation for
# user 2375.
# NOTE(review): the output recorded below contains repeated ids (for example
# 1104349 and 1057168 each appear more than once) -- verify de-duplication.
recommender.get_similar_users_recommendation(2375, N=200)

[1104349,
 1059347,
 970160,
 8358933,
 1055403,
 1112205,
 1138596,
 1039224,
 931124,
 972235,
 1101932,
 902377,
 1076580,
 899115,
 847434,
 12949590,
 13007846,
 911614,
 822161,
 1095227,
 1057168,
 1038745,
 997987,
 993838,
 8019916,
 917427,
 5707857,
 8090612,
 1097398,
 958067,
 937110,
 920025,
 12262992,
 1088771,
 906844,
 972416,
 1104349,
 890423,
 835499,
 879393,
 12696089,
 9392700,
 847066,
 963686,
 7169090,
 1057168,
 868888,
 823365,
 7409622,
 1102250,
 1134633,
 878715,
 1079528,
 871514,
 1137771,
 9297055,
 1029272,
 1034462,
 10341855,
 959455,
 956125,
 916990,
 6533368,
 5569074,
 1114653,
 5590965,
 7169088,
 1118946,
 969866,
 942475,
 1049922,
 1056212,
 8090653,
 894360,
 7142552,
 948640,
 1013578,
 995408,
 1076187,
 832492,
 1057168,
 857736,
 1092588,
 896666,
 861494,
 906202,
 863324,
 910151,
 1118946,
 1086061,
 1031253,
 1030093,
 837969,
 5568197,
 12427353,
 1027372,
 960421,
 1134483,
 985605,
 8293439,
 829349,
 8357825,
 8090549,
 903529,

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 200 кандидатов (k=200)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendations + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [6]:
# One row per user of the level-1 validation window; 'actual' holds the array
# of distinct item ids that user bought in that window.
result_lvl_1 = (
    data_val_lvl_1
    .groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[865456, 999999, 878285, 922281, 940947, 98389..."
1,2,"[999999, 839656, 866211, 893867, 1011457, 1103..."


In [7]:
# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
result_lvl_1 = result_lvl_1[result_lvl_1['user_id'].isin(train_users)]

result_lvl_1['candidates_own'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=200))

In [10]:
# 200 candidates per user from get_als_recommendations.
# NOTE(review): in the table displayed later in this notebook this column
# starts with the same ids for every user shown -- check whether the method
# actually uses the user id.
result_lvl_1['candidates_als'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=200))

In [11]:
# 200 candidates per user from get_similar_items_recommendation.
# NOTE(review): in the table displayed later in this notebook every user shown
# gets a list that repeats the single id 16809649 -- verify the method.
result_lvl_1['candidates_sim_items'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=200))

In [12]:
# 200 candidates per user from get_similar_users_recommendation.
# NOTE(review): in the table displayed later in this notebook this column
# starts with the same ids for every user shown -- check whether the method
# actually uses the user id.
result_lvl_1['candidates_sim_users'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_similar_users_recommendation(x, N=200))

In [13]:
# Display the actual purchases next to the candidate lists from each generator.
result_lvl_1

Unnamed: 0,user_id,actual,candidates_own,candidates_als,candidates_sim_items,candidates_sim_users
0,1,"[865456, 999999, 878285, 922281, 940947, 98389...","[856942, 9297615, 5577022, 9655212, 888104, 10...","[985893, 985889, 986021, 986869, 986727, 98365...","[16809649, 16809649, 16809649, 16809649, 16809...","[1104349, 1059347, 970160, 8358933, 1055403, 1..."
1,2,"[999999, 839656, 866211, 893867, 1011457, 1103...","[911974, 1076580, 5567582, 1103898, 1056620, 9...","[985893, 985889, 986021, 986869, 986727, 98365...","[16809649, 16809649, 16809649, 16809649, 16809...","[1104349, 1059347, 970160, 8358933, 1055403, 1..."
2,4,"[883932, 970760, 1035676, 999999, 831063, 8914...","[6391541, 1052294, 891423, 936470, 1137010, 83...","[985893, 985889, 986021, 986869, 986727, 98365...","[16809649, 16809649, 16809649, 16809649, 16809...","[1104349, 1059347, 970160, 8358933, 1055403, 1..."
3,6,"[1024306, 6548453, 1098844, 999999, 8357613, 9...","[13003092, 972416, 995598, 923600, 1138596, 10...","[985893, 985889, 986021, 986869, 986727, 98365...","[16809649, 16809649, 16809649, 16809649, 16809...","[1104349, 1059347, 970160, 8358933, 1055403, 1..."
4,7,"[999999, 993638, 1106523, 5572738, 909714, 102...","[998519, 894360, 7147142, 9338009, 896666, 939...","[985893, 985889, 986021, 986869, 986727, 98365...","[16809649, 16809649, 16809649, 16809649, 16809...","[1104349, 1059347, 970160, 8358933, 1055403, 1..."
5,8,"[945611, 999999, 1013389, 1089888, 12302069, 8...","[12808385, 981660, 939860, 7410201, 6463874, 5...","[985893, 985889, 986021, 986869, 986727, 98365...","[16809649, 16809649, 16809649, 16809649, 16809...","[1104349, 1059347, 970160, 8358933, 1055403, 1..."
6,9,"[883616, 1029743, 1051323, 999999, 1114605, 11...","[872146, 918046, 9655676, 1056005, 8019755, 10...","[985893, 985889, 986021, 986869, 986727, 98365...","[16809649, 16809649, 16809649, 16809649, 16809...","[1104349, 1059347, 970160, 8358933, 1055403, 1..."
7,13,"[999999, 7024814, 9488065, 882604, 914346, 951...","[965772, 9488065, 10342382, 6554400, 10308337,...","[985893, 985889, 986021, 986869, 986727, 98365...","[16809649, 16809649, 16809649, 16809649, 16809...","[1104349, 1059347, 970160, 8358933, 1055403, 1..."
8,14,"[1127758, 1135552, 999999]","[902377, 822161, 1123106, 874563, 8090610, 138...","[985893, 985889, 986021, 986869, 986727, 98365...","[16809649, 16809649, 16809649, 16809649, 16809...","[1104349, 1059347, 970160, 8358933, 1055403, 1..."
9,15,"[1014509, 1029743, 1070820, 825994, 845208, 90...","[823576, 1052975, 1071196, 1053530, 1010051, 1...","[985893, 985889, 986021, 986869, 986727, 98365...","[16809649, 16809649, 16809649, 16809649, 16809...","[1104349, 1059347, 970160, 8358933, 1055403, 1..."


In [14]:
# Mean over users of recall_at_k(candidates_own, actual, 200).
result_lvl_1.apply(lambda row: recall_at_k(row['candidates_own'], row['actual'], 200), axis=1).mean()

0.3678429844225468

In [18]:
# Mean over users of recall_at_k(candidates_als, actual, 200).
result_lvl_1.apply(lambda row: recall_at_k(row['candidates_als'], row['actual'], 200), axis=1).mean()

0.03296251062163147

In [19]:
# Mean over users of recall_at_k(candidates_sim_items, actual, 200).
result_lvl_1.apply(lambda row: recall_at_k(row['candidates_sim_items'], row['actual'], 200), axis=1).mean()

0.11927808952662688

In [21]:
# Mean over users of recall_at_k(candidates_sim_users, actual, 200).
result_lvl_1.apply(lambda row: recall_at_k(row['candidates_sim_users'], row['actual'], 200), axis=1).mean()

0.020418144150881078

Как видно, рекомендации собственных товаров и схожих с ними дают значительно лучший результат

In [22]:
# Own-purchase candidates at several list sizes, one column per size, used to
# see how recall@k changes with k (N=200 is already in 'candidates_own').
for n_candidates in (20, 50, 100, 500):
    result_lvl_1['candidates_own_{}'.format(n_candidates)] = result_lvl_1['user_id'].apply(
        lambda x, n=n_candidates: recommender.get_own_recommendations(x, N=n)
    )

In [23]:
# Mean over users of recall_at_k(candidates_own_20, actual, 20).
result_lvl_1.apply(lambda row: recall_at_k(row['candidates_own_20'], row['actual'], 20), axis=1).mean()

0.11105115040192286

In [24]:
# Mean over users of recall_at_k(candidates_own_50, actual, 50).
result_lvl_1.apply(lambda row: recall_at_k(row['candidates_own_50'], row['actual'], 50), axis=1).mean()

0.18172520630192224

In [25]:
# Mean over users of recall_at_k(candidates_own_100, actual, 100).
result_lvl_1.apply(lambda row: recall_at_k(row['candidates_own_100'], row['actual'], 100), axis=1).mean()

0.2658796946273117

In [26]:
# Mean over users of recall_at_k(candidates_own_500, actual, 500).
result_lvl_1.apply(lambda row: recall_at_k(row['candidates_own_500'], row['actual'], 500), axis=1).mean()

0.47673186593969946

Как видно, качество модели растёт вместе с К. Возможно, стоит брать К побольше, учитывая, что товары нужно ещё отфильтровать в соответствии с бизнес-требованиями

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [39]:
# One row per distinct user seen in the level-2 training window.
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Warm start only for now: keep users that also appear in the level-1 training
# split, because the recommender was built on that split.
train_users = data_train_lvl_1['user_id'].unique()
# .copy() detaches the filtered frame from the one it was sliced from, so the
# column assignment below does not raise SettingWithCopyWarning.
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)].copy()

# 200 first-level candidates per user; these become the rows the second-level
# model is trained on.
users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=200))

In [41]:
# Peek at the raw transactions of the level-2 training window.
data_train_lvl_2.head(4)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
2107471,2021,40618753059,594,896862,2,5.0,443,-2.98,101,86,0.0,0.0,2.5
2107473,2021,40618753059,594,1019142,2,5.0,443,-1.98,101,86,0.0,0.0,2.5
2107476,2021,40618753059,594,9835223,1,9.27,443,-3.63,101,86,0.0,0.0,9.27
2108010,1753,40618809138,594,999999,1,29.99,345,0.0,8,86,0.0,0.0,29.99


In [43]:
# Turn the per-user candidate lists into long format (one row per user/item).
# Each list becomes a Series, giving a wide frame with one column per list
# position; stack() pivots it to long form with a (row label, position) index,
# and dropping the position level leaves the original row label as the index.
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

# join() aligns on the index, so each user row is repeated once per candidate.
users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['drop'] = 1  # dummy variable

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,drop
0,2021,950935,1
0,2021,1119454,1
0,2021,835578,1
0,2021,863762,1


In [48]:
# Positive examples: every (user, item) pair purchased in the level-2 training
# window. A pair bought in several transactions occurs several times in the raw
# data, so de-duplicate first -- otherwise the left merge below would emit one
# copy of the candidate row per transaction and inflate the positives.
targets_lvl_2 = (
    data_train_lvl_2[['user_id', 'item_id']]
    .drop_duplicates()
    .assign(target=1)  # only purchases are present here
)

# Attach the label to each candidate pair; candidates that were not bought
# find no match and get NaN.
targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

# Unmatched candidates are the negatives. The result is assigned back rather
# than calling fillna(inplace=True) on a column selection, which may operate on
# a temporary copy and leave the frame unchanged.
targets_lvl_2['target'] = targets_lvl_2['target'].fillna(0)
targets_lvl_2 = targets_lvl_2.drop('drop', axis=1)

In [42]:
# Attach item and user attributes to every (user, item) candidate row.
# NOTE(review): the output recorded below shows '_x'-suffixed feature columns,
# which pandas adds when both sides of a merge share column names -- it looks
# like this cell was run on a frame that already had these features; re-run
# from the target-building cell to confirm.
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')
# NOTE(review): data_train_lvl_2 is the same frame the target was derived from,
# so the transaction columns merged here are non-null only on rows where
# target == 1 (visible in the output below) -- this leaks the label. A pair
# bought in several transactions also multiplies its row. Consider aggregated
# features computed from earlier history instead -- TODO confirm.
targets_lvl_2 = targets_lvl_2.merge(data_train_lvl_2, on=['user_id', 'item_id'], how='left')


targets_lvl_2.head(20)

Unnamed: 0,user_id,item_id,target,manufacturer_x,department_x,brand_x,commodity_desc_x,sub_commodity_desc_x,curr_size_of_product_x,age_desc_x,...,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
0,2021,950935,0.0,2193,GROCERY,National,FRZN NOVELTIES/WTR ICE,WATER ICE,24 CT,,...,,,,,,,,,,
1,2021,1119454,0.0,910,GROCERY,National,BAKED BREAD/BUNS/ROLLS,HAMBURGER BUNS,8 OZ,,...,,,,,,,,,,
2,2021,835578,0.0,539,DRUG GM,National,CIGARETTES,CIGARETTES,971267 PK,,...,,,,,,,,,,
3,2021,863762,0.0,6046,DRUG GM,National,PREPAID WIRELESS&ACCESSORIES,PREPAID WIRELESS CARDS,,,...,,,,,,,,,,
4,2021,1019142,1.0,1007,MEAT-PCKGD,National,BREAKFAST SAUSAGE/SANDWICHES,ROLLS - FLAVORED/OTHER,1 LB,,...,594.0,2.0,5.0,443.0,-1.98,101.0,86.0,0.0,0.0,2.5
5,2021,1029125,0.0,136,GROCERY,National,DOG FOODS,DOG BISCUITS (HARD BISCUIT),24 OZ,,...,,,,,,,,,,
6,2021,1097398,0.0,111,DRUG GM,National,CIGARETTES,CIGARETTES,CTN,,...,,,,,,,,,,
7,2021,870515,0.0,2,PRODUCE,National,ONIONS,ONIONS RED (BULK&BAG),40 LB,,...,,,,,,,,,,
8,2021,6424460,0.0,69,PASTRY,Private,PIES,PIES: FRUIT/NUT,,,...,,,,,,,,,,
9,2021,988277,0.0,253,NUTRITION,National,FROZEN,FROZEN BREAKFAST,9 OZ,,...,,,,,,,,,,


In [37]:
# Peek at the first row of the user-feature table (the recorded output below
# shows a single row).
user_features.head(1)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1


In [36]:
# Distinct item ids bought by each user in the level-1 validation window.
test = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2021,950935,0.0,2193,GROCERY,National,FRZN NOVELTIES/WTR ICE,WATER ICE,24 CT,,,,,,,
1,2021,1119454,0.0,910,GROCERY,National,BAKED BREAD/BUNS/ROLLS,HAMBURGER BUNS,8 OZ,,,,,,,
2,2021,835578,0.0,539,DRUG GM,National,CIGARETTES,CIGARETTES,971267 PK,,,,,,,
3,2021,863762,0.0,6046,DRUG GM,National,PREPAID WIRELESS&ACCESSORIES,PREPAID WIRELESS CARDS,,,,,,,,
4,2021,1019142,1.0,1007,MEAT-PCKGD,National,BREAKFAST SAUSAGE/SANDWICHES,ROLLS - FLAVORED/OTHER,1 LB,,,,,,,
5,2021,1029125,0.0,136,GROCERY,National,DOG FOODS,DOG BISCUITS (HARD BISCUIT),24 OZ,,,,,,,
6,2021,1097398,0.0,111,DRUG GM,National,CIGARETTES,CIGARETTES,CTN,,,,,,,
7,2021,870515,0.0,2,PRODUCE,National,ONIONS,ONIONS RED (BULK&BAG),40 LB,,,,,,,
8,2021,6424460,0.0,69,PASTRY,Private,PIES,PIES: FRUIT/NUT,,,,,,,,
9,2021,988277,0.0,253,NUTRITION,National,FROZEN,FROZEN BREAKFAST,9 OZ,,,,,,,


### Финальный проект

Мы уже прошли всю необходимую теорию для финального проекта. Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ).
Рекомендуем вам **начать делать проект сразу после этого домашнего задания**
- Целевая метрика - money precision@5. Порог для уcпешной сдачи проекта money precision@5 > 20%

Бизнес ограничения в топ-5 товарах:
- Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5)
- **2 новых товара** (юзер никогда не покупал)
- **1 дорогой товар, > 7 долларов**
- **Все товары из разных категорий** (категория - department)  
- **Стоимость каждого рекомендованного товара > 1 доллара**  

- Будет public тестовый датасет, на котором вы сможете измерять метрику
- Также будет private тестовый датасет для измерения финального качества
- НЕ обязательно использовать двухуровневые рекомендательные системы в проекте
- Вы сдаете код проекта в виде github репозитория и .csv файл с рекомендациями. В .csv файле 2 столбца: user_id - (item_id1, item_id2, ..., item_id5)