# 1. Текст задания

В данной задаче вам предстоит построить рекомендательную систему баннеров на основе логов просмотров и лайков.

Логи представлены четырьмя колонками:

 - user_id (идентификатор пользователя),  
 - item_id (идентификатор баннера),  
 - like (флаг: понравился ли пользователю баннер),  
 - timestamp (unix-время совершения действия в секундах).  

Кроме того, для пользователей и баннеров имеются признаки размерностью 32.

Вам необходимо предсказать 20 баннеров для каждого пользователя. Качество решения будет оцениваться как доля пользователей, для которых «лайкнутый» баннер попал в предложенный вами список (top-20 accuracy).

----

Качество решения оценивается по метрике Top-K Accuracy, где k = 20. Код:
```{python}
def calc_score(test_choices, pred_choices, tk):
    """Compute Top-K accuracy of predicted lists against ground-truth choices.

    Args:
        test_choices: Sized sequence of ground-truth items, one per query.
        pred_choices: Iterable of predicted item collections, aligned
            element-wise with ``test_choices``.
        tk: Maximum number of leading predictions considered per query.

    Returns:
        Fraction of queries whose ground-truth item occurs among the first
        ``tk`` predictions; ``0.0`` when ``test_choices`` is empty.
    """
    # Use len() rather than truthiness so array-like inputs also work.
    total = len(test_choices)
    if total == 0:
        return 0.0
    # Only the first `tk` predictions may score a hit; longer lists are cut.
    hits = sum(
        int(gt in list(p)[:tk]) for gt, p in zip(test_choices, pred_choices)
    )
    return hits / total
```

На экзамене за данную задачу вы можете получить до 100 баллов. Расчёт баллов производится по формуле (отличается от задачи 1 коэффициентами):

```
result = 100 if y > 0.54 else (40 + (y - 0.5) / 0.04 * 60 if y > 0.5 else 0)
```

где y — это ваш результат по метрике Top-K Accuracy. Количество баллов округляется до целого числа.

Желаем удачи!

# 2. Решение

In [12]:
import sys, gc, random

import pandas as pd
import lightgbm as lgb
import numpy as np

## 2.1 Подготовка данных

In [7]:
# Load the interaction logs and the per-user / per-item feature tables.
dtrain = pd.read_csv("./train.csv")
dtest = pd.read_csv("./test.csv")

duser = pd.read_csv("./user-features.csv")
ditem = pd.read_csv("./item-features.csv")

print(
    f"shapes: dtrain={dtrain.shape}, dtest={dtest.shape}, duser={duser.shape}, ditem={ditem.shape}"
)

# Prefix every feature column with its origin so user and item features
# cannot collide after merging; the join keys keep their original names.
user_feats = duser.rename(
    columns={c: "user_" + c for c in duser.columns if c != "user_id"}
)
item_feats = ditem.rename(
    columns={c: "item_" + c for c in ditem.columns if c != "item_id"}
)

# Training set: each logged interaction enriched with user and item features,
# ordered by user so that rows of one user are contiguous.
# drop=True keeps the old row labels from leaking in as "index"/"level_0"
# columns.
dtrain = (
    dtrain.merge(user_feats, on="user_id")
    .merge(item_feats, on="item_id")
    .sort_values("user_id")
    .reset_index(drop=True)
)

# Test set: every test user paired with every item (cross join through a
# constant helper key, which is dropped once the join is done).
dtest = (
    dtest.merge(user_feats, on="user_id")
    .assign(key=1)
    .merge(item_feats.assign(key=1), on="key")
    .drop(columns="key")
    .sort_values("user_id")
)


shapes: dtrain=(8674, 4), dtest=(497, 2), duser=(497, 33), ditem=(444, 33)


In [9]:
# Model inputs: all columns carrying the "user_" / "item_" prefix, except the
# two id columns, which share those prefixes but are join keys, not features.
feats = [
    col
    for col in dtrain.columns
    if col.startswith(("user_", "item_")) and col not in ("user_id", "item_id")
]


In [10]:
# Number of training rows per user, indexed by user_id in ascending order;
# passed to LightGBM below as the `group` argument.
rows_by_user = dtrain.groupby("user_id")
query_sz = rows_by_user["user_id"].count()
query_sz


user_id
0      16
1      18
2      16
3      17
4      18
       ..
492    16
493    17
494    16
495    16
496    16
Name: user_id, Length: 497, dtype: int64

## 2.2 Подбор параметров

In [22]:
# 2-fold cross-validation of a LambdaRank booster; the per-user row counts in
# query_sz are supplied as the groups.
d = lgb.Dataset(dtrain[feats], label=dtrain["like"], group=query_sz)
p = dict(
    task="train",
    num_iterations=1000,
    num_leaves=255,
    min_data_in_leaf=50,
    min_sum_hessian_in_leaf=50,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[1, 5, 10],
    learning_rate=0.1,
    num_threads=2,
)
cv_history = lgb.cv(p, d, nfold=2)
# One row per boosting iteration, mean/stdv columns for each NDCG cutoff.
pd.DataFrame(cv_history)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15506
[LightGBM] [Info] Number of data points in the train set: 4344, number of used features: 64
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15506
[LightGBM] [Info] Number of data points in the train set: 4330, number of used features: 64
























Unnamed: 0,ndcg@1-mean,ndcg@1-stdv,ndcg@5-mean,ndcg@5-stdv,ndcg@10-mean,ndcg@10-stdv
0,0.702188,0.012672,0.760111,0.005625,0.800188,0.004582
1,0.875227,0.012323,0.862644,0.006579,0.880207,0.002680
2,0.897380,0.002219,0.875192,0.001185,0.890885,0.000833
3,0.901404,0.002210,0.878007,0.000439,0.892844,0.001209
4,0.903412,0.004218,0.879750,0.001040,0.893279,0.001159
...,...,...,...,...,...,...
995,0.915485,0.004194,0.891001,0.005053,0.899924,0.000716
996,0.915485,0.004194,0.891001,0.005053,0.899924,0.000716
997,0.915485,0.004194,0.891001,0.005053,0.899924,0.000716
998,0.915485,0.004194,0.891001,0.005053,0.899924,0.000716


In [153]:
# 10-fold cross-validation with a much smaller learning rate (0.005) and a
# lower min_sum_hessian_in_leaf (25) than the previous run.
d = lgb.Dataset(dtrain[feats], label=dtrain["like"], group=query_sz)
p = dict(
    task="train",
    num_iterations=1000,
    num_leaves=255,
    min_data_in_leaf=50,
    min_sum_hessian_in_leaf=25,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[1, 5, 10],
    learning_rate=0.005,
    num_threads=2,
)
cv_history = lgb.cv(p, d, nfold=10)
# Recorded result: ndcg@1-mean at the first iteration was 0.859224.
pd.DataFrame(cv_history)



Unnamed: 0,ndcg@1-mean,ndcg@1-stdv,ndcg@5-mean,ndcg@5-stdv,ndcg@10-mean,ndcg@10-stdv
0,0.859224,0.051844,0.855293,0.032247,0.874457,0.024529
1,0.905592,0.037943,0.880454,0.023178,0.892663,0.019856
2,0.907592,0.034832,0.880472,0.024641,0.893763,0.020162
3,0.903592,0.044475,0.880089,0.029290,0.891490,0.023709
4,0.907592,0.044870,0.880940,0.029907,0.892998,0.024117
...,...,...,...,...,...,...
995,0.917592,0.035094,0.893433,0.023124,0.901759,0.016875
996,0.917592,0.035094,0.893433,0.023124,0.901759,0.016875
997,0.917592,0.035094,0.893433,0.023124,0.901759,0.016875
998,0.917592,0.035094,0.893433,0.023124,0.901759,0.016875


## 2.3 Обучение

In [136]:
# Final ranker: LambdaRank objective, NDCG evaluated at cutoff 20.
ranker_params = dict(
    objective="lambdarank",
    n_estimators=500,
    num_leaves=255,
    min_data_in_leaf=50,
    min_sum_hessian_in_leaf=50,
    metric="ndcg",
    ndcg_eval_at=20,
    learning_rate=0.1,
    num_threads=4,
)
est = lgb.LGBMRanker(**ranker_params)

# Fit on the full training set; query_sz supplies the per-user group sizes.
est.fit(dtrain[feats], dtrain["like"], group=query_sz)

LGBMRanker(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
           importance_type='split', learning_rate=0.1, max_depth=-1,
           metric='ndcg', min_child_samples=20, min_child_weight=0.001,
           min_data_in_leaf=50, min_split_gain=0.0, min_sum_hessian_in_leaf=50,
           n_estimators=500, n_jobs=-1, ndcg_eval_at=20, num_leaves=255,
           num_threads=4, objective='lambdarank', random_state=None,
           reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
           subsample_for_bin=200000, subsample_freq=0)

In [137]:
def get_predicts():
    """Score every candidate row of ``dtest`` with ``est``, one array per user.

    Reads the module-level ``dtest``, ``feats`` and ``est``.

    Returns:
        A list with one entry per distinct ``user_id`` in ``dtest``, in
        ascending ``user_id`` order. Each entry is the result of
        ``est.predict`` for that user's rows, taken in the order produced by
        sorting ``dtest`` on ``item_id``.
    """
    # Sort once up front: the ordering does not depend on the user, so doing
    # it inside the loop repeated the same full-table sort for every user.
    ordered = dtest.sort_values("item_id")
    # groupby(sort=True) yields users in ascending order and keeps each
    # group's rows in the item_id order established above.
    return [
        est.predict(user_rows[feats])
        for _, user_rows in ordered.groupby("user_id", sort=True)
    ]

In [138]:
# Model scores for the test set: one entry per test user, as returned by
# get_predicts().
pred_list = get_predicts()

In [139]:
# Number of banners recommended per user (the task asks for a top-20 list).
TOP_K = 20

# One row of scores per test user, one column per candidate item.
dpred = pd.DataFrame(pred_list)

# get_predicts() emits users in ascending user_id order and, within a user,
# scores in ascending item_id order. Recover the real ids for both axes so
# that positions are never mistaken for ids.
user_ids = np.sort(dtest.user_id.unique())
item_ids = np.sort(dtest.item_id.unique())
if dpred.shape != (len(user_ids), len(item_ids)):
    raise ValueError(
        f"score matrix shape {dpred.shape} does not match "
        f"{len(user_ids)} users x {len(item_ids)} items"
    )

# Column positions of the TOP_K highest scores in each row, best first.
top_pos = np.argsort(dpred.to_numpy(), axis=1)[:, -TOP_K:][:, ::-1]

# Translate positions into item ids; rank columns are labelled 0..TOP_K-1.
dpred2 = pd.DataFrame(item_ids[top_pos], columns=list(range(top_pos.shape[1])))
dpred2.insert(0, "user_id", user_ids)

# Write the submission with rows in the same user order as test.csv.
dtmp = pd.read_csv("./test.csv")
(
    dpred2.set_index("user_id")
    .loc[dtmp.user_id.tolist(), :]
    .reset_index()
    .to_csv("sub003.csv", index=False)
)

In [144]:
# Display the recommendation table: user_id followed by the ranked columns.
dpred2

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,0,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
1,1,37,35,76,22,60,72,80,65,66,...,58,11,7,40,148,21,36,146,59,87
2,2,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
3,3,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
4,4,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,492,35,22,37,76,60,80,65,72,66,...,58,11,40,7,148,36,21,87,146,5
493,493,76,35,37,22,60,72,80,65,66,...,58,11,7,40,148,21,36,146,59,87
494,494,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
495,495,35,22,76,60,37,80,72,65,66,...,58,11,7,40,148,36,21,59,5,87
