**Вступительное задание школы MADE на решение задач Learn2rank**

# 1. Текст задания

Для данной задачи вам предстоит построить рекомендательную систему баннеров на основе логов просмотров и лайков.

Логи представлены четырьмя колонками:

 - user_id (идентификатор пользователя),  
 - item_id (идентификатор баннера),  
 - like (флаг: понравился ли баннер пользователю),  
 - timestamp (время совершения действия в формате Unix, в секундах).  

Кроме того, для пользователей и баннеров имеются признаки размерностью 32.

Вам необходимо предсказать 20 баннеров для каждого пользователя. Качество решения будет оцениваться как доля пользователей, для которых "лайкнутый" ими баннер попал в предложенный вами список (top-20 accuracy).

----

Качество решения оценивается по метрике Top-K Accuracy, где k = 20. Код:
```{python}
def calc_score(test_choices, pred_choices, tk):
    """Return the top-k accuracy of the predictions.

    Args:
        test_choices: sized sequence with one ground-truth item per user.
        pred_choices: iterable of ranked, sliceable prediction lists, aligned
            with ``test_choices``.
        tk: number of leading predictions per user that count as a hit.

    Returns:
        Fraction of users whose ground-truth item is among their first ``tk``
        predictions; ``0.0`` when ``test_choices`` is empty.
    """
    total = len(test_choices)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    # Only the first `tk` predictions may produce a hit, otherwise an
    # over-long prediction list would inflate the score.
    hits = sum(int(gt in p[:tk]) for gt, p in zip(test_choices, pred_choices))
    return hits / total
```

На экзамене за данную задачу вы можете получить до 100 баллов. Расчёт баллов производится по формуле (отличается от задачи 1 коэффициентами):

```
result = 100 if y > 0.54 else (40 + (y - 0.5) / 0.04 * 60 if y > 0.5 else 0)
```

где y — это ваш результат по метрике Top-K Accuracy. Количество баллов является округленным целым числом.

Желаем удачи!

# 2. Решение

In [12]:
import sys, gc, random

import pandas as pd
import lightgbm as lgb
import numpy as np

## 2.1 Просмотр данных

In [24]:
# Load the interaction logs (train/test) and the per-user / per-banner feature tables.
dtrain, dtest = pd.read_csv("./train.csv"), pd.read_csv("./test.csv")
duser, ditem = pd.read_csv("./user-features.csv"), pd.read_csv("./item-features.csv")

# Print the table shapes as a quick sanity check.
shapes_msg = f"shapes: dtrain={dtrain.shape}, dtest={dtest.shape}, duser={duser.shape}, ditem={ditem.shape}"
print(shapes_msg)

shapes: dtrain=(8674, 4), dtest=(497, 2), duser=(497, 33), ditem=(444, 33)


### 2.1.1 dtrain

In [26]:
# Preview the first rows of the training interaction log.
dtrain.head()

Unnamed: 0,user_id,item_id,like,timestamp
0,140,342,0,1490936622
1,378,172,1,1490936628
2,150,182,0,1490936650
3,455,17,0,1490936704
4,350,409,0,1490936735


In [27]:
# Summary statistics of the training log columns.
dtrain.describe()

Unnamed: 0,user_id,item_id,like,timestamp
count,8674.0,8674.0,8674.0,8674.0
mean,244.855891,179.805626,0.175928,1491073000.0
std,143.102186,126.366142,0.380781,79702.18
min,0.0,0.0,0.0,1490937000.0
25%,121.0,73.0,0.0,1491004000.0
50%,243.0,154.0,0.0,1491071000.0
75%,368.0,266.0,0.0,1491141000.0
max,496.0,443.0,1.0,1491216000.0


In [28]:
# Column dtypes, non-null counts and memory usage of the training log.
dtrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8674 entries, 0 to 8673
Data columns (total 4 columns):
user_id      8674 non-null int64
item_id      8674 non-null int64
like         8674 non-null int64
timestamp    8674 non-null int64
dtypes: int64(4)
memory usage: 271.2 KB


### 2.1.2 dtest 

In [31]:
# Preview the first rows of the test table (users to predict for).
dtest.head()

Unnamed: 0,user_id,timestamp
0,166,1490944431
1,26,1490957371
2,41,1490958147
3,286,1490971255
4,108,1490976836


In [32]:
# Summary statistics of the test table columns.
dtest.describe()

Unnamed: 0,user_id,timestamp
count,497.0,497.0
mean,248.0,1491146000.0
std,143.615807,58389.27
min,0.0,1490944000.0
25%,124.0,1491117000.0
50%,248.0,1491161000.0
75%,372.0,1491193000.0
max,496.0,1491215000.0


In [33]:
# Column dtypes, non-null counts and memory usage of the test table.
dtest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 2 columns):
user_id      497 non-null int64
timestamp    497 non-null int64
dtypes: int64(2)
memory usage: 7.9 KB


### 2.1.3 duser

In [34]:
# Preview the first rows of the user feature table.
duser.head()

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,0,0.000695,-0.001573,-0.00147,0.002085,-0.000498,0.000685,0.000811,0.000666,-0.003031,...,-0.004196,-0.000698,0.001121,-0.001079,-0.001993,-0.001993,0.000422,-0.001168,-0.001168,0.000297
1,1,0.001204,-0.002725,-0.002546,0.003612,-0.000862,0.001187,0.001404,0.001154,-0.005251,...,-0.007268,-0.001209,0.001942,-0.00187,-0.003451,-0.003451,0.000732,-0.002023,-0.002023,0.000515
2,2,0.000491,-0.001112,-0.001039,0.001475,-0.000352,0.000484,0.000573,0.000471,-0.002144,...,-0.002967,-0.000494,0.000793,-0.000763,-0.001409,-0.001409,0.000299,-0.000826,-0.000826,0.00021
3,3,0.000777,-0.001759,-0.001643,0.002332,-0.000557,0.000766,0.000906,0.000745,-0.003389,...,-0.004691,-0.000781,0.001254,-0.001207,-0.002228,-0.002228,0.000472,-0.001306,-0.001306,0.000332
4,4,0.000695,-0.001573,-0.00147,0.002085,-0.000498,0.000685,0.000811,0.000666,-0.003031,...,-0.004196,-0.000698,0.001121,-0.001079,-0.001993,-0.001993,0.000422,-0.001168,-0.001168,0.000297


In [35]:
# Summary statistics of the user feature columns.
duser.describe()

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
count,497.0,497.0,497.0,497.0,497.0,497.0,497.0,497.0,497.0,497.0,...,497.0,497.0,497.0,497.0,497.0,497.0,497.0,497.0,497.0,497.0
mean,248.0,0.001739,-0.003936,-0.003678,0.005218,-0.001246,0.001714,0.002029,0.001667,-0.007585,...,-0.0105,-0.001747,0.002806,-0.002701,-0.004986,-0.004986,0.001057,-0.002923,-0.002923,0.000744
std,143.615807,0.001148,0.002598,0.002428,0.003445,0.000822,0.001132,0.001339,0.0011,0.005007,...,0.006931,0.001153,0.001852,0.001783,0.003292,0.003292,0.000698,0.00193,0.00193,0.000491
min,0.0,0.000491,-0.016461,-0.015381,0.001475,-0.005209,0.000484,0.000573,0.000471,-0.031721,...,-0.04391,-0.007307,0.000793,-0.011296,-0.020852,-0.020852,0.000299,-0.012224,-0.012224,0.00021
25%,124.0,0.000919,-0.005036,-0.004706,0.002759,-0.001594,0.000906,0.001073,0.000881,-0.009705,...,-0.013434,-0.002236,0.001484,-0.003456,-0.00638,-0.00638,0.000559,-0.00374,-0.00374,0.000393
50%,248.0,0.00139,-0.003146,-0.00294,0.004171,-0.000996,0.00137,0.001622,0.001332,-0.006063,...,-0.008392,-0.001397,0.002243,-0.002159,-0.003985,-0.003985,0.000845,-0.002336,-0.002336,0.000595
75%,372.0,0.002225,-0.002081,-0.001944,0.006676,-0.000658,0.002193,0.002596,0.002132,-0.00401,...,-0.005551,-0.000924,0.00359,-0.001428,-0.002636,-0.002636,0.001353,-0.001545,-0.001545,0.000952
max,496.0,0.007273,-0.001112,-0.001039,0.021822,-0.000352,0.007169,0.008484,0.006969,-0.002144,...,-0.002967,-0.000494,0.011735,-0.000763,-0.001409,-0.001409,0.004421,-0.000826,-0.000826,0.003111


In [36]:
# Column dtypes, non-null counts and memory usage of the user feature table.
duser.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 33 columns):
user_id    497 non-null int64
0          497 non-null float64
1          497 non-null float64
2          497 non-null float64
3          497 non-null float64
4          497 non-null float64
5          497 non-null float64
6          497 non-null float64
7          497 non-null float64
8          497 non-null float64
9          497 non-null float64
10         497 non-null float64
11         497 non-null float64
12         497 non-null float64
13         497 non-null float64
14         497 non-null float64
15         497 non-null float64
16         497 non-null float64
17         497 non-null float64
18         497 non-null float64
19         497 non-null float64
20         497 non-null float64
21         497 non-null float64
22         497 non-null float64
23         497 non-null float64
24         497 non-null float64
25         497 non-null float64
26         497 non-null float64
2

### 2.1.4 ditem

In [37]:
# Preview the first rows of the banner (item) feature table.
ditem.head()

Unnamed: 0,item_id,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,35,0.005646,-0.01278,-0.011941,0.016942,-0.004044,0.005566,0.006587,0.005411,-0.024627,...,-0.03409,-0.005673,0.009111,-0.00877,-0.016189,-0.016189,0.003432,-0.00949,-0.00949,0.002416
1,19,0.00253,-0.005726,-0.00535,0.007591,-0.001812,0.002494,0.002951,0.002424,-0.011035,...,-0.015274,-0.002542,0.004082,-0.003929,-0.007254,-0.007254,0.001538,-0.004252,-0.004252,0.001082
2,145,0.001592,-0.003604,-0.003368,0.004778,-0.001141,0.00157,0.001858,0.001526,-0.006946,...,-0.009615,-0.0016,0.00257,-0.002473,-0.004566,-0.004566,0.000968,-0.002677,-0.002677,0.000681
3,36,0.002866,-0.006486,-0.00606,0.008598,-0.002052,0.002825,0.003343,0.002746,-0.012499,...,-0.017301,-0.002879,0.004624,-0.004451,-0.008216,-0.008216,0.001742,-0.004816,-0.004816,0.001226
4,357,0.000602,-0.001362,-0.001273,0.001806,-0.000431,0.000593,0.000702,0.000577,-0.002625,...,-0.003634,-0.000605,0.000971,-0.000935,-0.001726,-0.001726,0.000366,-0.001012,-0.001012,0.000258


In [38]:
# Summary statistics of the banner (item) feature columns.
ditem.describe()

Unnamed: 0,item_id,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
count,444.0,444.0,444.0,444.0,444.0,444.0,444.0,444.0,444.0,444.0,...,444.0,444.0,444.0,444.0,444.0,444.0,444.0,444.0,444.0,444.0
mean,221.5,0.000439,-0.003094,-0.000707,0.002174,0.001087,0.000673,0.000247,0.000774,-0.004414,...,-0.007104,-0.00118,0.001865,-0.001499,-0.00327,-0.00327,0.000345,-0.002038,-0.002038,-4.6e-05
std,128.316016,0.016297,0.021682,0.022706,0.015934,0.032901,0.024576,0.026334,0.028865,0.007102,...,0.016812,0.01248,0.031342,0.017446,0.008021,0.008021,0.011123,0.011271,0.011271,0.023676
min,0.0,-0.142731,-0.299908,-0.06006,-0.246777,-0.081493,-0.389399,-0.326175,-0.492083,-0.10723,...,-0.284258,-0.176811,-0.286772,-0.172129,-0.08317,-0.08317,-0.112601,-0.110081,-0.110081,-0.370836
25%,110.75,0.000491,-0.002836,-0.00265,0.001475,-0.000897,0.000484,0.000573,0.000471,-0.005465,...,-0.007565,-0.001259,0.000793,-0.001946,-0.003626,-0.003626,0.000299,-0.002106,-0.002106,0.00021
50%,221.5,0.000851,-0.001927,-0.0018,0.002554,-0.00061,0.000839,0.000993,0.000816,-0.003713,...,-0.005139,-0.000855,0.001373,-0.001322,-0.002441,-0.002441,0.000495,-0.001431,-0.001431,0.000364
75%,332.25,0.001253,-0.001112,-0.001039,0.003759,-0.000352,0.001235,0.001462,0.001201,-0.002144,...,-0.002967,-0.000494,0.002022,-0.000763,-0.001409,-0.001409,0.000762,-0.000826,-0.000826,0.000536
max,443.0,0.144249,0.181685,0.315345,0.087781,0.640721,0.236851,0.333698,0.287157,0.043447,...,0.106962,0.113402,0.564544,0.22332,0.096677,0.096677,0.16688,0.159891,0.159891,0.214498


In [39]:
# Column dtypes, non-null counts and memory usage of the banner (item) feature table.
ditem.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 444 entries, 0 to 443
Data columns (total 33 columns):
item_id    444 non-null int64
0          444 non-null float64
1          444 non-null float64
2          444 non-null float64
3          444 non-null float64
4          444 non-null float64
5          444 non-null float64
6          444 non-null float64
7          444 non-null float64
8          444 non-null float64
9          444 non-null float64
10         444 non-null float64
11         444 non-null float64
12         444 non-null float64
13         444 non-null float64
14         444 non-null float64
15         444 non-null float64
16         444 non-null float64
17         444 non-null float64
18         444 non-null float64
19         444 non-null float64
20         444 non-null float64
21         444 non-null float64
22         444 non-null float64
23         444 non-null float64
24         444 non-null float64
25         444 non-null float64
26         444 non-null float64
2

## 2.2 Подготовка данных

In [7]:
# Reload the raw tables so this cell can be run independently of the exploration above.
dtrain = pd.read_csv("./train.csv")
dtest = pd.read_csv("./test.csv")

duser = pd.read_csv("./user-features.csv")
ditem = pd.read_csv("./item-features.csv")

print(
    f"shapes: dtrain={dtrain.shape}, dtest={dtest.shape}, duser={duser.shape}, ditem={ditem.shape}"
)

# Prefix the feature columns once so user and banner features cannot collide
# after merging; the id columns keep their names because they are the join keys.
user_feats = duser.rename(
    columns={c: "user_" + c for c in duser.columns if c != "user_id"}
)
item_feats = ditem.rename(
    columns={c: "item_" + c for c in ditem.columns if c != "item_id"}
)

# Training frame: every logged interaction with its user and banner features,
# ordered by user_id so that each user's rows are contiguous.
# drop=True keeps the old row labels from leaking in as junk
# "index" / "level_0" columns.
dtrain = (
    dtrain.merge(user_feats, on="user_id")
    .merge(item_feats, on="item_id")
    .sort_values("user_id")
    .reset_index(drop=True)
)

# Test frame: every test user paired with every banner. The constant `key`
# column turns the merge into a cross join and is left in the result.
dtest = (
    dtest.merge(user_feats, on="user_id")
    .assign(key=1)
    .merge(item_feats.assign(key=1), on="key")
    .sort_values("user_id")
)


shapes: dtrain=(8674, 4), dtest=(497, 2), duser=(497, 33), ditem=(444, 33)


In [9]:
# Model inputs: every column carrying the user_/item_ prefix, except the two id columns.
feats = [
    col
    for col in dtrain.columns
    if col not in {"user_id", "item_id"} and col.startswith(("user_", "item_"))
]


In [10]:
# Rows per user, indexed by user_id; later passed to LightGBM as the `group` argument.
query_sz = dtrain["user_id"].groupby(dtrain["user_id"]).count()
query_sz


user_id
0      16
1      18
2      16
3      17
4      18
       ..
492    16
493    17
494    16
495    16
496    16
Name: user_id, Length: 497, dtype: int64

## 2.3 Подбор параметров

In [23]:
# d = lgb.Dataset(data=dtrain[feats], label=dtrain["like"], group=query_sz)
# p = {
#     "task": "train",
#     "num_iterations": 1000,
#     "num_leaves": 255,
#     "min_data_in_leaf": 50,
#     "min_sum_hessian_in_leaf": 50,
#     "objective": "lambdarank",
#     "metric": "ndcg",
#     "ndcg_eval_at": [1, 5, 10],
#     "learning_rate": .1,
#     "num_threads": 2
# }
# pd.DataFrame(lgb.cv(params=p, train_set=d, nfold=2, ))

In [153]:
# 10-fold cross-validation of the LambdaRank configuration; the returned
# metric history is wrapped in a DataFrame for display.
d = lgb.Dataset(data=dtrain[feats], label=dtrain["like"], group=query_sz)
p = dict(
    task="train",
    num_iterations=1000,
    num_leaves=255,
    min_data_in_leaf=50,
    min_sum_hessian_in_leaf=25,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[1, 5, 10],
    learning_rate=0.005,
    num_threads=2,
)
pd.DataFrame(lgb.cv(p, d, nfold=10))
# ndcg@1-mean of the first row in the recorded run: 0.859224



Unnamed: 0,ndcg@1-mean,ndcg@1-stdv,ndcg@5-mean,ndcg@5-stdv,ndcg@10-mean,ndcg@10-stdv
0,0.859224,0.051844,0.855293,0.032247,0.874457,0.024529
1,0.905592,0.037943,0.880454,0.023178,0.892663,0.019856
2,0.907592,0.034832,0.880472,0.024641,0.893763,0.020162
3,0.903592,0.044475,0.880089,0.029290,0.891490,0.023709
4,0.907592,0.044870,0.880940,0.029907,0.892998,0.024117
...,...,...,...,...,...,...
995,0.917592,0.035094,0.893433,0.023124,0.901759,0.016875
996,0.917592,0.035094,0.893433,0.023124,0.901759,0.016875
997,0.917592,0.035094,0.893433,0.023124,0.901759,0.016875
998,0.917592,0.035094,0.893433,0.023124,0.901759,0.016875


## 2.4 Обучение

In [136]:
# Final LambdaRank ranker, fitted on the whole training log.
# NOTE(review): these hyper-parameters differ from the cross-validated set
# (learning_rate .1 vs .005, min_sum_hessian_in_leaf 50 vs 25, 500 vs 1000
# trees) — confirm this is intended.
# NOTE(review): the repr printed after fitting lists min_child_samples=20 next
# to min_data_in_leaf=50 (likewise n_jobs=-1 next to num_threads=4) —
# presumably aliases of one setting; verify which value LightGBM applies.
est = lgb.LGBMRanker(
    objective="lambdarank",
    n_estimators=500,
    num_leaves=255, 
    min_data_in_leaf=50, 
    min_sum_hessian_in_leaf=50,
    metric="ndcg",
    ndcg_eval_at=20,
    learning_rate=.1,
    num_threads=4,
)

# `group` holds the per-user row counts; dtrain was sorted by user_id, so each
# user's rows are contiguous and in the same user order as query_sz.
est.fit(X=dtrain[feats], y=dtrain["like"], group=query_sz)

LGBMRanker(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
           importance_type='split', learning_rate=0.1, max_depth=-1,
           metric='ndcg', min_child_samples=20, min_child_weight=0.001,
           min_data_in_leaf=50, min_split_gain=0.0, min_sum_hessian_in_leaf=50,
           n_estimators=500, n_jobs=-1, ndcg_eval_at=20, num_leaves=255,
           num_threads=4, objective='lambdarank', random_state=None,
           reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
           subsample_for_bin=200000, subsample_freq=0)

In [137]:
def get_predicts(data=None, model=None, features=None):
    """Score every candidate banner for every user.

    Args:
        data: frame with ``user_id``, ``item_id`` and the feature columns;
            defaults to the module-level ``dtest``.
        model: object exposing ``predict(X)``; defaults to the module-level
            ``est``.
        features: names of the feature columns passed to the model; defaults
            to the module-level ``feats``.

    Returns:
        A list with one prediction array per user. Users come in ascending
        ``user_id`` order and the scores of one user follow ascending
        ``item_id`` order.
    """
    data = dtest if data is None else data
    model = est if model is None else model
    features = feats if features is None else features

    # Sort once instead of re-sorting the whole frame for every user.
    # groupby keeps this row order inside each group and, with sort=True,
    # yields the groups in ascending user_id order.
    by_item = data.sort_values("item_id")
    return [
        model.predict(rows[features])
        for _, rows in by_item.groupby("user_id", sort=True)
    ]

In [138]:
# One array of model scores per test user (see get_predicts for the ordering).
pred_list = get_predicts()

In [139]:
# Build the submission: for every user the 20 best-scored banners, best first.
# NOTE(review): banners a user already interacted with in train are not
# excluded from the recommendations — confirm this is intended.
top_k = 20

# get_predicts() returns users in ascending user_id order and, per user, scores
# in ascending item_id order, so these arrays label the rows/columns of dpred.
user_ids = np.sort(dtest.user_id.unique())
item_ids = np.sort(dtest.item_id.unique())

dpred = pd.DataFrame(pred_list)

# Column positions of the top_k highest scores in each row, highest first.
top_pos = np.argsort(dpred.values, axis=1)[:, -top_k:][:, ::-1]

# Translate positions into real ids instead of assuming that ids are the
# contiguous range 0..N-1 and therefore equal to their position.
dpred2 = pd.DataFrame(item_ids[top_pos])
dpred2.insert(0, "user_id", user_ids)

# Write the rows in the order the users appear in test.csv.
dtmp = pd.read_csv("./test.csv")
(
    dpred2.set_index("user_id", drop=False)
    .loc[dtmp.user_id.tolist(), :]
    .to_csv("sub003.csv", index=False)
)

In [144]:
# Display the recommendation table: one row per user, columns 0..19 are the ranked banners.
dpred2

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,0,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
1,1,37,35,76,22,60,72,80,65,66,...,58,11,7,40,148,21,36,146,59,87
2,2,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
3,3,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
4,4,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,492,35,22,37,76,60,80,65,72,66,...,58,11,40,7,148,36,21,87,146,5
493,493,76,35,37,22,60,72,80,65,66,...,58,11,7,40,148,21,36,146,59,87
494,494,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
495,495,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
