In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
from pathlib import Path
from collections import Counter

from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import dill
import pandas as pd
import polars as pl
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import GroupKFold

import lightgbm as lgb
import xgboost as xgb

import pytorch_lightning as L
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns

from tasks.data.dataset.mappers import EntityEncoder
from tasks.jobs import Splitter

In [3]:
from otto_utils import *
from otto_features import *
from otto_candidates_covisit import *
from otto_lgbm_utils import *
from otto_implicit import *

In [4]:
%env PYTHONHASHSEED=1
from otto_word2vec import *

w2vec = Word2Vec.load("__valid__word2vec_window=10_negative=20.w2v")

env: PYTHONHASHSEED=1


# Load data

In [5]:
# load
val_df_train = pl.read_parquet(VALIDATION_PATH / "train.parquet", use_pyarrow=True)
val_df_valid_input = pl.read_parquet(VALIDATION_PATH / "valid.parquet", use_pyarrow=True)
val_df_valid_targets = pl.read_parquet(VALIDATION_PATH / "test_labels.parquet", use_pyarrow=True)

# Features

In [6]:
df_user_action_stats_features = pl.read_parquet("__features__valid__user_action_stats_v1.parquet")
df_user_item_history_features = pl.read_parquet("__features__valid__user_item_history_v1.parquet")
df_user_last_type_actions = pl.read_parquet("__features__valid__user_last_type_actions_v1.parquet")
df_item_action_stats_features = pl.read_parquet("__features__valid__item_action_stats_v1.parquet")
df_item_n_sess_multiple_action = pl.read_parquet("__features__valid__item_n_sess_multiple_action_v1.parquet")

# Candidates

In [7]:
ACT_TYPE = "clicks"

## prepare target and candidates set with this target in holdout (other users for action does not change metric)

In [8]:
df_act_target = (
    val_df_valid_targets
    .filter(pl.col("type") == ACT_TYPE)
    .join(val_df_valid_input.select(["session"]).unique(), on="session", how="inner")
    .drop("type")
    .explode("ground_truth")
    .with_column(pl.lit(1).alias("target"))
    .rename({"ground_truth": "aid"})
)

In [9]:
df = val_df_valid_input.unique().sort(["session", "ts"])

In [11]:
df_test_users = (
    df
    .join(df_act_target.select(["session"]).unique(), on="session", how="inner")
    .unique().sort(["session", "ts"])
)
test_sessions_dict = df_test_users.groupby('session').agg([pl.list("aid"), pl.list("type")])
test_sessions_dict = dict(zip(
    test_sessions_dict["session"].to_list(),
    tuple(zip(test_sessions_dict["aid"].to_list(), test_sessions_dict["type"].to_list()))
))

## covisitation top200

In [14]:
# Use top X for clicks, carts and orders
clicks_th = 15
carts_th  = 20
orders_th = 20

def get_top(df, th):
    return (
        df
        .with_column(pl.lit(1).alias("ones"))
        .with_column(pl.col("ones").cumsum().over("aid").alias("rank"))
        .filter(pl.col("rank") <= th)
    )

TOPK_RECOMMEND = 20
TOPK_RERANK = 40

In [15]:
carts_orders = pl.read_parquet("__valid__covisit_carts_orders_all_v3.parquet")
buys2buys = pl.read_parquet("__valid__covisit_buys2buys_all_v4.parquet")
clicks = pl.read_parquet("__valid__covisit_clicks_all_v3.parquet")

In [16]:
carts_orders_top = get_top(carts_orders, carts_th)
buys2buys_top = get_top(buys2buys, orders_th)
clicks_top = get_top(clicks, clicks_th)

In [17]:
top_clicks = df.filter(pl.col("type") == 0)["aid"].value_counts(sort=True)[:TOPK_RECOMMEND]["aid"].to_list()
top_carts = df.filter(pl.col("type") == 1)["aid"].value_counts(sort=True)[:TOPK_RECOMMEND]["aid"].to_list()
top_orders = df.filter(pl.col("type") == 2)["aid"].value_counts(sort=True)[:TOPK_RECOMMEND]["aid"].to_list()

In [18]:
covisit_rec = CovisitationRecommender(
    df_top_k_buys=carts_orders_top,
    df_top_k_buy2buy=buys2buys_top,
    df_top_k_clicks=clicks_top,
    top_carts=top_carts,
    top_orders=top_orders,
    top_clicks=top_clicks,
)

In [19]:
candidates_dict = {
    "session": [],
    "type": [],
    "candidates": [],
    "rank": [],
}

types = ["clicks", "carts", "orders"]
topk = TOPK_RERANK * 5

for session_id, (session_aid_list, session_type_list) in tqdm(test_sessions_dict.items()):
    rec_items_clicks = covisit_rec.recommend_clicks(session_aid_list, session_type_list, topk)
    rec_items_carts = covisit_rec.recommend_carts(session_aid_list, session_type_list, topk)
    rec_items_buys = covisit_rec.recommend_buys(session_aid_list, session_type_list, topk)

    candidates = [rec_items_clicks, rec_items_carts, rec_items_buys]
#     candidates = [rec_items_buys]
    ranks = [
        np.arange(1, len(rec_items) + 1).tolist()
        for rec_items in candidates
    ]
    
    candidates_dict["session"].extend([session_id] * len(types))
    candidates_dict["type"].extend(types)
    candidates_dict["candidates"].extend(candidates)
    candidates_dict["rank"].extend(ranks)

df_candidates_covisit = pl.DataFrame(candidates_dict)

100%|██████████| 1265979/1265979 [02:04<00:00, 10174.48it/s]


## implicit i2i k=100 top100

In [17]:
from implicit.nearest_neighbours import CosineRecommender

### old weights

In [18]:
df = pl.concat([val_df_train, val_df_valid_input]).unique().sort(["session", "ts"])
df = implicit_old_weight_interactions(df)

In [19]:
train_data = make_sparse_matrix(df)
i2i = CosineRecommender(K=100)
i2i.fit(train_data)

  0%|          | 0/1855603 [00:00<?, ?it/s]

In [20]:
df_candidates_i2i_old = implicit_batch_candidates_for_all_types(
    model=i2i, model_name="i2i_old",
    train_data=train_data, test_users=list(test_sessions_dict.keys()),
    topk=100,
)

100%|██████████| 1266/1266 [01:53<00:00, 11.11it/s]


### new weights

In [21]:
df = pl.concat([val_df_train, val_df_valid_input]).unique().sort(["session", "ts"])
df = implicit_new_weight_interactions(df)

In [22]:
train_data = make_sparse_matrix(df)
i2i = CosineRecommender(K=100)
i2i.fit(train_data)

  0%|          | 0/1855603 [00:00<?, ?it/s]

In [23]:
df_candidates_i2i_new = implicit_batch_candidates_for_all_types(
    model=i2i, model_name="i2i_new",
    train_data=train_data, test_users=list(test_sessions_dict.keys()),
    topk=100,
)

100%|██████████| 1266/1266 [01:55<00:00, 10.93it/s]


# Reranker

## Covisit all

In [20]:
# contains target column and all test users
df_candidates_covisit_all = (
    df_candidates_covisit
    .filter(pl.col("type") == "orders")
    .drop("type")
    .explode(["candidates", "rank"])
    .rename({"candidates": "aid", "rank": "rank_orders"})
    .join(
        (
            df_candidates_covisit
            .filter(pl.col("type") == "carts")
            .drop("type")
            .explode(["candidates", "rank"])
            .rename({"candidates": "aid", "rank": "rank_carts"})
        ),
        on=["session", "aid"],
        how="outer"
    )
    .join(
        (
            df_candidates_covisit
            .filter(pl.col("type") == "clicks")
            .drop("type")
            .explode(["candidates", "rank"])
            .rename({"candidates": "aid", "rank": "rank_clicks"})
        ),
        on=["session", "aid"],
        how="outer"
    )
    .fill_null(999)
    .filter(pl.col("aid") != -1)  # some strange items from implicit
    .unique(subset=["session", "aid"], keep="last")
    .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
    .sort("session")
    .fill_null(0)
)

In [22]:
df_sessions_with_positives = (
    df_candidates_covisit_all.groupby(["session"]).agg(pl.sum("target"))
    .filter(pl.col("target") > 0)
    .select(["session"])
)

In [23]:
print("sessions with positives in candidates: ", df_sessions_with_positives["session"].n_unique())
print("sessions with positives in test dataframe: ", df_candidates_covisit_all["session"].n_unique())
print(
    "hit rate: ", 
    df_sessions_with_positives["session"].n_unique() / df_candidates_covisit_all["session"].n_unique()
)

sessions with positives in candidates:  742940
sessions with positives in test dataframe:  1265979
hit rate:  0.5868501768196787


In [24]:
from otto_reranker import *

In [25]:
df_candidates_covisit_with_positives = (
    df_candidates_covisit_all
    .join(df_sessions_with_positives, on="session", how="inner")
)

In [26]:
df_candidates_covisit_with_positives["target"].value_counts()

target,counts
i32,u32
0,73412461
1,742940


In [43]:
df_cand_w2v_features = cand_w2v_features(
    df_candidates_sampled.select(["session", "aid"]),
    df_user_last_type_actions,
    w2vec
)

In [27]:
df_item_to_item_features = cand_item_to_item_features(
    df_candidates_covisit_with_positives.select(["session", "aid"]),
    df_user_last_type_actions,
    df_carts_orders=carts_orders,
    df_buys2buys=buys2buys,
    df_clicks=clicks
)

In [28]:
df_other_features = cand_other_features(
    df_candidates_covisit_with_positives.select(["session", "aid"]),
    df_user_action_stats_features,
    df_item_action_stats_features,
    df_item_n_sess_multiple_action,
    df_user_item_history_features,
)

In [30]:
df_stage_2_dataset = (
    df_candidates_covisit_with_positives
#     .join(df_cand_w2v_features, on=["session", "aid"], how="left")
    .join(df_item_to_item_features, on=["session", "aid"], how="left")
    .join(df_other_features, on=["session", "aid"], how="left")
    .fill_null(0)
)

In [33]:
df_stage_2_dataset.head()

session,aid,rank_orders,rank_carts,rank_clicks,target,user_last_click_aid_carts_orders_weight,user_last_click_aid_buy2buy_weight,user_last_click_aid_click_weight,user_last_cart_aid_carts_orders_weight,user_last_cart_aid_buy2buy_weight,user_last_cart_aid_click_weight,user_last_order_aid_carts_orders_weight,user_last_order_aid_buy2buy_weight,user_last_order_aid_click_weight,user_lifetime_days,user_n_actions,user_n_uniq_items,user_buys_rate,user_uniq_clicks,user_uniq_carts,user_uniq_orders,cl_cnt,ca_cnt,or_cnt,user_ca_cl_ratio,user_or_cl_ratio,user_or_ca_ratio,item_lifetime_days,item_n_actions,item_n_uniq_users,item_buys_rate,item_uniq_clicks,item_uniq_orders,item_uniq_carts,cl_cnt_right,or_cnt_right,ca_cnt_right,item_ca_cl_ratio,item_or_cl_ratio,item_or_ca_ratio,item_n_sess_multi_clicks,item_n_sess_multi_carts,item_n_sess_multi_buys,user_item_log_recency_score,user_item_type_weighted_log_recency_score,user_item_is_in_history
i64,i64,i64,i64,i64,i32,f64,i32,f64,f64,i32,f64,f64,i32,f64,f64,u32,u32,f64,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,u32,u32,f64,u32,u32,u32,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,i32
11107743,1243310,1,1,1,0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,3.063962,3,2,0.0,2,0,0,3,0,0,0.0,0.0,0.0,27.967644,3444,1,0.017712,2255,1,54,3384,1,59,0.017435,0.000296,0.016949,589,3,0,1.071773,1.071773,1
11107743,731062,2,2,2,0,2.0,0,10.126189,0.0,0,0.0,0.0,0,0.0,3.063962,3,2,0.0,2,0,0,3,0,0,0.0,0.0,0.0,27.849486,2827,1,0.012381,2358,0,35,2792,0,35,0.012536,0.0,0.0,281,0,0,0.464086,0.464086,1
11107743,984459,3,3,29,0,14.0,0,48.7508,0.0,0,0.0,0.0,0,0.0,3.063962,3,2,0.0,2,0,0,3,0,0,0.0,0.0,0.0,22.987933,40537,1,0.041444,27854,0,1491,38857,0,1680,0.043235,0.0,0.0,6987,152,0,0.0,0.0,0
11107743,224347,4,4,3,1,241.0,6,751.939773,0.0,0,0.0,0.0,0,0.0,3.063962,3,2,0.0,2,0,0,3,0,0,0.0,0.0,0.0,27.997713,15905,1,0.067274,8664,85,742,14923,88,894,0.059908,0.005897,0.098434,2683,110,3,0.0,0.0,0
11107743,1838173,7,5,4,0,113.5,1,423.716155,0.0,0,0.0,0.0,0,0.0,3.063962,3,2,0.0,2,0,0,3,0,0,0.0,0.0,0.0,27.988076,9425,1,0.007745,6630,0,68,9352,0,73,0.007806,0.0,0.0,1548,4,0,0.0,0.0,0


In [34]:
# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
#     'objective': 'binary',
    'objective': 'lambdarank',
    'metric': '"None"',
    'eval_at': 20,
#     'metric': {'auc', 'binary_logloss'},
#     'min_data_in_leaf': 256, 
#     'num_leaves': 63,
    'max_depth': 7,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
#     'bagging_freq': 5,
#     'device': 'gpu',
    'verbose': -1,
    'num_threads': 40,
}

### candidates covisit top200 all types (orders, carts, clicks) + i2i top100 (old, new), features w2v cosine (CV carts 0.4163) BEST!

In [35]:
feature_cols = df_stage_2_dataset.drop(["session", "aid", "target"]).columns
print("num features: ", len(feature_cols))

num features:  44


In [56]:
skf = GroupKFold(n_splits=5)
df_valid_preds = df_candidates_covisit_all.select(["session", "aid"])

for fold, (train_idx, valid_idx) in tqdm(enumerate(
    skf.split(df_stage_2_dataset,
              df_stage_2_dataset['target'],
              groups=df_stage_2_dataset['session'])
)):
    
    # contains sessions with no positives
    X_train = df_stage_2_dataset[train_idx][feature_cols].to_pandas()
    y_train = df_stage_2_dataset[train_idx]["target"].to_pandas()
    X_valid = df_stage_2_dataset[valid_idx][feature_cols].to_pandas()
    y_valid = df_stage_2_dataset[valid_idx]["target"].to_pandas()
    
    # create dataset for lightgbm
    groups_len_train = (
        df_stage_2_dataset[train_idx]
        .groupby("session").agg(pl.count("aid"))
        .sort("session")["aid"].to_numpy()
    )
    groups_len_valid = (
        df_stage_2_dataset[valid_idx]
        .groupby("session").agg(pl.count("aid"))
        .sort("session")["aid"].to_numpy()
    )    
    lgb_train = lgb.Dataset(X_train, y_train, group=groups_len_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, group=groups_len_valid, reference=lgb_train)
    
    # train model
    gbm_ranking = lgb.train(
        params, 
        lgb_train,
        num_boost_round=2000,
#         return_cvbooster=True,
        feval=lgb_numba_recall,
        valid_sets=lgb_eval,
        callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=20)]
    )
    
    scores = gbm_ranking.predict(X_valid)
    df_valid_preds_fold = (
        df_stage_2_dataset[valid_idx].select(["session", "aid"])
        .with_columns([pl.Series(scores).alias(f"scores_fold{fold}")])
    )
    df_valid_preds = (
        df_valid_preds.join(df_valid_preds_fold, on=["session", "aid"], how="left")
    )
    
    # add save models
    gbm_ranking.save_model(
        f"__model__{ACT_TYPE}_covisit_all_types_merged_top200_reranker_rank+42feat_fold{fold}.lgb"
    )

0it [00:00, ?it/s]

Training until validation scores don't improve for 100 rounds
[20]	valid_0's numba_recall@20: 0.903868
[40]	valid_0's numba_recall@20: 0.908734
[60]	valid_0's numba_recall@20: 0.910349
[80]	valid_0's numba_recall@20: 0.910888
[100]	valid_0's numba_recall@20: 0.91152
[120]	valid_0's numba_recall@20: 0.911971
[140]	valid_0's numba_recall@20: 0.912321
[160]	valid_0's numba_recall@20: 0.912651
[180]	valid_0's numba_recall@20: 0.912866
[200]	valid_0's numba_recall@20: 0.912967
[220]	valid_0's numba_recall@20: 0.913142
[240]	valid_0's numba_recall@20: 0.913311
[260]	valid_0's numba_recall@20: 0.913452
[280]	valid_0's numba_recall@20: 0.913465
[300]	valid_0's numba_recall@20: 0.913539
[320]	valid_0's numba_recall@20: 0.9136
[340]	valid_0's numba_recall@20: 0.91362
[360]	valid_0's numba_recall@20: 0.913546
[380]	valid_0's numba_recall@20: 0.91364
[400]	valid_0's numba_recall@20: 0.913788
[420]	valid_0's numba_recall@20: 0.913836
[440]	valid_0's numba_recall@20: 0.913977
[460]	valid_0's numba_r

1it [09:24, 564.58s/it]

Training until validation scores don't improve for 100 rounds
[20]	valid_0's numba_recall@20: 0.903667
[40]	valid_0's numba_recall@20: 0.908081
[60]	valid_0's numba_recall@20: 0.909979
[80]	valid_0's numba_recall@20: 0.910632
[100]	valid_0's numba_recall@20: 0.91113
[120]	valid_0's numba_recall@20: 0.911662
[140]	valid_0's numba_recall@20: 0.911965
[160]	valid_0's numba_recall@20: 0.912207
[180]	valid_0's numba_recall@20: 0.912389
[200]	valid_0's numba_recall@20: 0.912564
[220]	valid_0's numba_recall@20: 0.912665
[240]	valid_0's numba_recall@20: 0.912813
[260]	valid_0's numba_recall@20: 0.912927
[280]	valid_0's numba_recall@20: 0.912974
[300]	valid_0's numba_recall@20: 0.912974
[320]	valid_0's numba_recall@20: 0.912967
[340]	valid_0's numba_recall@20: 0.913149
[360]	valid_0's numba_recall@20: 0.91327
[380]	valid_0's numba_recall@20: 0.91327
[400]	valid_0's numba_recall@20: 0.913324
[420]	valid_0's numba_recall@20: 0.913405
[440]	valid_0's numba_recall@20: 0.913412
[460]	valid_0's numba

2it [24:30, 765.34s/it]

Training until validation scores don't improve for 100 rounds
[20]	valid_0's numba_recall@20: 0.904003
[40]	valid_0's numba_recall@20: 0.908243
[60]	valid_0's numba_recall@20: 0.909979
[80]	valid_0's numba_recall@20: 0.910888
[100]	valid_0's numba_recall@20: 0.911258
[120]	valid_0's numba_recall@20: 0.911796
[140]	valid_0's numba_recall@20: 0.912126
[160]	valid_0's numba_recall@20: 0.912362
[180]	valid_0's numba_recall@20: 0.91288
[200]	valid_0's numba_recall@20: 0.912873
[220]	valid_0's numba_recall@20: 0.91284
[240]	valid_0's numba_recall@20: 0.912853
[260]	valid_0's numba_recall@20: 0.912994
[280]	valid_0's numba_recall@20: 0.913122
[300]	valid_0's numba_recall@20: 0.913136
[320]	valid_0's numba_recall@20: 0.91323
[340]	valid_0's numba_recall@20: 0.913203
[360]	valid_0's numba_recall@20: 0.913364
[380]	valid_0's numba_recall@20: 0.913412
[400]	valid_0's numba_recall@20: 0.913486
[420]	valid_0's numba_recall@20: 0.913573
[440]	valid_0's numba_recall@20: 0.913593
[460]	valid_0's numba

3it [38:29, 798.78s/it]

Training until validation scores don't improve for 100 rounds
[20]	valid_0's numba_recall@20: 0.90473
[40]	valid_0's numba_recall@20: 0.908734
[60]	valid_0's numba_recall@20: 0.910275
[80]	valid_0's numba_recall@20: 0.910955
[100]	valid_0's numba_recall@20: 0.911722
[120]	valid_0's numba_recall@20: 0.912261
[140]	valid_0's numba_recall@20: 0.912557
[160]	valid_0's numba_recall@20: 0.913035
[180]	valid_0's numba_recall@20: 0.913452
[200]	valid_0's numba_recall@20: 0.913815
[220]	valid_0's numba_recall@20: 0.913943
[240]	valid_0's numba_recall@20: 0.91399
[260]	valid_0's numba_recall@20: 0.914199
[280]	valid_0's numba_recall@20: 0.914152
[300]	valid_0's numba_recall@20: 0.914246
[320]	valid_0's numba_recall@20: 0.91426
[340]	valid_0's numba_recall@20: 0.914293
[360]	valid_0's numba_recall@20: 0.914441
[380]	valid_0's numba_recall@20: 0.914495
[400]	valid_0's numba_recall@20: 0.914441
[420]	valid_0's numba_recall@20: 0.914461
[440]	valid_0's numba_recall@20: 0.9143
[460]	valid_0's numba_r

4it [46:15, 667.53s/it]

Training until validation scores don't improve for 100 rounds
[20]	valid_0's numba_recall@20: 0.903996
[40]	valid_0's numba_recall@20: 0.90796
[60]	valid_0's numba_recall@20: 0.909313
[80]	valid_0's numba_recall@20: 0.910154
[100]	valid_0's numba_recall@20: 0.910773
[120]	valid_0's numba_recall@20: 0.91115
[140]	valid_0's numba_recall@20: 0.911258
[160]	valid_0's numba_recall@20: 0.911702
[180]	valid_0's numba_recall@20: 0.911695
[200]	valid_0's numba_recall@20: 0.911911
[220]	valid_0's numba_recall@20: 0.91187
[240]	valid_0's numba_recall@20: 0.91218
[260]	valid_0's numba_recall@20: 0.912247
[280]	valid_0's numba_recall@20: 0.912342
[300]	valid_0's numba_recall@20: 0.912416
[320]	valid_0's numba_recall@20: 0.912759
[340]	valid_0's numba_recall@20: 0.912725
[360]	valid_0's numba_recall@20: 0.912853
[380]	valid_0's numba_recall@20: 0.91284
[400]	valid_0's numba_recall@20: 0.913021
[420]	valid_0's numba_recall@20: 0.913062
[440]	valid_0's numba_recall@20: 0.913129
[460]	valid_0's numba_r

5it [59:17, 711.57s/it]


In [61]:
_ = calc_oof_score_for_type(df_valid_preds, act_type=ACT_TYPE)

100%|██████████| 1265979/1265979 [00:04<00:00, 271966.76it/s]


validation score: 0.053652706719463754
recall per type: type
clicks   0.5365
dtype: float64


In [60]:
# get oof recs and calc recall
scores = (
    df_valid_preds
    .select([
        pl.col("scores_fold0"),
        pl.col("scores_fold1"),
        pl.col("scores_fold2"), 
        pl.col("scores_fold3"), 
        pl.col("scores_fold4"),
    ]).mean(axis=1)
)

df_valid_preds_sorted = (
    df_valid_preds
    .with_columns([pl.Series(scores).alias("score")])
    .sort("score", reverse=True)
    .groupby("session")
    .agg([
        pl.list("aid"),
        pl.list("score"),
    ])
)

df_recall = (
    df_valid_preds_sorted
    .join(val_df_valid_targets.filter(pl.col("type") == ACT_TYPE), on="session", how="inner")
    .with_column(pl.col("aid").arr.head(20).alias("label"))
    .with_column(
        pl.col("label")
        .arr.concat("ground_truth")
        .arr.eval(pl.element().filter(pl.count().over(pl.element()) == 2))
        .arr.unique()
        .alias("hits")
    )
    .with_column(pl.col("hits").arr.lengths().clip(0, 20).alias("hits_count"))
    .with_column(pl.col("ground_truth").arr.lengths().clip(0, 20).alias("gt_count"))
)

recall = df_recall["hits_count"].sum() / df_recall["gt_count"].sum()
print(f"{ACT_TYPE} recall: ", recall)

clicks recall:  0.5365270671946375


## Covisit all + i2i (old, new)

In [28]:
# contains target column
df_candidates_for_action_all = (
    df_candidates_covisit
    .filter(pl.col("type") == "orders")
    .join(
        df_act_target.select(["session"]).unique(),
        on="session", how="inner"
    )
    .drop("type")
    .explode(["candidates", "rank"])
    .rename({"candidates": "aid", "rank": "rank_orders"})
    .join(
        (
            df_candidates_covisit
            .filter(pl.col("type") == "carts")
            .join(
                df_act_target.select(["session"]).unique(),
                on="session", how="inner"
            )
            .drop("type")
            .explode(["candidates", "rank"])
            .rename({"candidates": "aid", "rank": "rank_carts"})
        ),
        on=["session", "aid"],
        how="outer"
    )
    .join(
        (
            df_candidates_covisit
            .filter(pl.col("type") == "clicks")
            .join(
                df_act_target.select(["session"]).unique(),
                on="session", how="inner"
            )
            .drop("type")
            .explode(["candidates", "rank"])
            .rename({"candidates": "aid", "rank": "rank_clicks"})
        ),
        on=["session", "aid"],
        how="outer"
    )
    .join(
        (
            df_candidates_i2i_new
            .join(
                df_act_target.select(["session"]).unique(),
                on="session", how="inner"
            )
            .explode(["aid", "i2i_new_score"])
        ),
        on=["session", "aid"],
        how="outer"
    )
    .join(
        (
            df_candidates_i2i_old
            .join(
                df_act_target.select(["session"]).unique(),
                on="session", how="inner"
            )
            .explode(["aid", "i2i_old_score"])
        ),
        on=["session", "aid"],
        how="outer"
    )
    .fill_null(999)
    .filter(pl.col("aid") != -1)  # some strange items from implicit
    .unique(subset=["session", "aid"], keep="last")
    .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
    .join(df_act_target.select(["session"]).unique(), on="session", how="inner")
    .sort("session")
    .fill_null(0)
)

In [35]:
# downsample candidates batch
N_NEGATIVES = 50
test_users = list(test_sessions_dict.keys())
b_sz = 50000

candidates_sampled = []

for test_session_start in tqdm(range(0, len(test_users), b_sz)):
    test_sessions = test_users[test_session_start : test_session_start + b_sz]
    
    df_session_n_negatives = (
        df_act_target.filter(pl.col("session").is_in(test_sessions))
        .groupby("session")
        .agg([pl.count("target").alias("cnt")])
        .with_column((pl.col("cnt") * N_NEGATIVES).alias("n_neg"))
        .drop("cnt")
    )
    
    candidates = (
        df_candidates_covisit
        .filter(pl.col("type") == "orders")
        .filter(pl.col("session").is_in(test_sessions))
        .drop("type")
        .explode(["candidates", "rank"])
        .rename({"candidates": "aid", "rank": "rank_orders"})
        .join(
            (
                df_candidates_covisit
                .filter(pl.col("type") == "carts")
                .filter(pl.col("session").is_in(test_sessions))
                .drop("type")
                .explode(["candidates", "rank"])
                .rename({"candidates": "aid", "rank": "rank_carts"})
            ),
            on=["session", "aid"],
            how="outer"
        )
        .join(
            (
                df_candidates_covisit
                .filter(pl.col("type") == "clicks")
                .filter(pl.col("session").is_in(test_sessions))
                .drop("type")
                .explode(["candidates", "rank"])
                .rename({"candidates": "aid", "rank": "rank_clicks"})
            ),
            on=["session", "aid"],
            how="outer"
        )
        .join(
            (
                df_candidates_i2i_new
                .filter(pl.col("session").is_in(test_sessions))
                .explode(["aid", "i2i_new_score"])
            ),
            on=["session", "aid"],
            how="outer"
        )
        .join(
            (
                df_candidates_i2i_old
                .filter(pl.col("session").is_in(test_sessions))
                .explode(["aid", "i2i_old_score"])
            ),
            on=["session", "aid"],
            how="outer"
        )
        .fill_null(999)
        .filter(pl.col("aid") != -1)  # some strange items from implicit
        .unique(subset=["session", "aid"], keep="last")
        .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
        .join(df_act_target.select(["session"]).unique(), on="session", how="inner")
        .sort("session")
        .fill_null(0)
    )
    
    candidates_pos = candidates.filter(pl.col("target") == 1)
    candidates_neg = (
        candidates
        .filter(pl.col("target") == 0)
        .join(df_session_n_negatives, on="session", how="inner")
        .with_column(pl.arange(0, pl.count()).shuffle().over("session").alias("int_id"))
        .filter(pl.col("int_id") < pl.col("n_neg"))
        .drop(["int_id", "n_neg"])
    )
    
    candidates_sampled.append(pl.concat([candidates_pos, candidates_neg]))

100%|██████████| 26/26 [04:33<00:00, 10.53s/it]


In [36]:
df_candidates_sampled = pl.concat(candidates_sampled)

In [37]:
df_candidates_sampled

session,aid,rank_orders,rank_carts,rank_clicks,i2i_new_score,i2i_old_score,target
i64,i64,i64,i64,i64,f64,f64,i32
11107743,224347,4,4,3,0.124877,2.639755,1
11107745,549612,13,13,15,0.070829,2.954173,1
11107747,618078,1,1,1,1.145752,12.037464,1
11107748,1551520,20,6,7,999.0,0.425765,1
11107749,153565,2,2,2,0.095744,10.289484,1
11107750,184976,999,999,39,999.0,999.0,1
11107754,1438446,1,1,1,1.0,10.0,1
11107755,168735,1,1,1,1.0,10.0,1
11107756,100282,1,1,1,1.0,10.0,1
11107760,1781844,2,2,2,3.332681,43.225845,1


In [38]:
df_candidates_sampled["target"].value_counts()

target,counts
i32,u32
0,63289291
1,793057


In [39]:
df_candidates_sampled["session"].n_unique()

1265979

In [42]:
from otto_reranker import *

In [43]:
df_cand_w2v_features = cand_w2v_features(
    df_candidates_sampled.select(["session", "aid"]),
    df_user_last_type_actions,
    w2vec
)

In [44]:
df_item_to_item_features = cand_item_to_item_features(
    df_candidates_sampled.select(["session", "aid"]),
    df_user_last_type_actions,
    df_carts_orders=carts_orders,
    df_buys2buys=buys2buys,
    df_clicks=clicks
)

In [45]:
df_other_features = cand_other_features(
    df_candidates_sampled.select(["session", "aid"]),
    df_user_action_stats_features,
    df_item_action_stats_features,
    df_item_n_sess_multiple_action,
    df_user_item_history_features,
)

In [46]:
df_stage_2_dataset = (
    df_candidates_sampled
    .join(df_cand_w2v_features, on=["session", "aid"], how="left")
    .join(df_item_to_item_features, on=["session", "aid"], how="left")
    .join(df_other_features, on=["session", "aid"], how="left")
    .fill_null(0)
)

In [47]:
df_stage_2_dataset

session,aid,rank_orders,rank_carts,rank_clicks,i2i_new_score,i2i_old_score,target,w2v_cosine_sim_last_click_aid,w2v_cosine_sim_last_cart_aid,w2v_cosine_sim_last_order_aid,user_last_click_aid_carts_orders_weight,user_last_click_aid_buy2buy_weight,user_last_click_aid_click_weight,user_last_cart_aid_carts_orders_weight,user_last_cart_aid_buy2buy_weight,user_last_cart_aid_click_weight,user_last_order_aid_carts_orders_weight,user_last_order_aid_buy2buy_weight,user_last_order_aid_click_weight,user_lifetime_days,user_n_actions,user_n_uniq_items,user_buys_rate,user_uniq_clicks,user_uniq_carts,user_uniq_orders,cl_cnt,ca_cnt,or_cnt,user_ca_cl_ratio,user_or_cl_ratio,user_or_ca_ratio,item_lifetime_days,item_n_actions,item_n_uniq_users,item_buys_rate,item_uniq_clicks,item_uniq_orders,item_uniq_carts,cl_cnt_right,or_cnt_right,ca_cnt_right,item_ca_cl_ratio,item_or_cl_ratio,item_or_ca_ratio,item_n_sess_multi_clicks,item_n_sess_multi_carts,item_n_sess_multi_buys,user_item_log_recency_score,user_item_type_weighted_log_recency_score,user_item_is_in_history
i64,i64,i64,i64,i64,f64,f64,i32,f64,f64,f64,f64,i32,f64,f64,i32,f64,f64,i32,f64,f64,u32,u32,f64,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,u32,u32,f64,u32,u32,u32,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,i32
11107743,224347,4,4,3,0.124877,2.639755,1,0.961559,-999.0,-999.0,241.0,6,751.939773,0.0,0,0.0,0.0,0,0.0,3.063962,3,2,0.0,2,0,0,3,0,0,0.0,0.0,0.0,27.997713,15905,1,0.067274,8664,85,742,14923,88,894,0.059908,0.005897,0.098434,2683,110,3,0.0,0.0,0
11107745,549612,13,13,15,0.070829,2.954173,1,0.984418,-999.0,-999.0,609.5,59,948.512715,0.0,0,0.0,0.0,0,0.0,0.004753,6,3,0.0,3,0,0,6,0,0,0.0,0.0,0.0,27.99701,13965,1,0.148944,5947,321,1007,12236,351,1378,0.112619,0.028686,0.254717,2282,209,24,0.0,0.0,0
11107747,618078,1,1,1,1.145752,12.037464,1,1.0,0.560938,-999.0,0.0,0,0.0,9.0,0,3.678474,0.0,0,0.0,3.915636,6,5,0.166667,4,1,0,5,1,0,0.2,0.0,0.0,27.985034,6875,1,0.0848,3588,114,280,6416,124,335,0.052213,0.019327,0.370149,1236,42,8,1.0,1.0,1
11107748,1551520,20,6,7,999.0,0.425765,1,0.945724,-999.0,-999.0,26.0,0,126.378041,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.9171,2317,1,0.015106,1798,6,17,2288,6,23,0.010052,0.002622,0.26087,288,3,0,0.0,0.0,0
11107749,153565,2,2,2,0.095744,10.289484,1,0.927574,-999.0,-999.0,4.5,0,26.293095,0.0,0,0.0,0.0,0,0.0,0.718783,2,2,0.0,2,0,0,2,0,0,0.0,0.0,0.0,22.893177,2822,1,0.025514,1740,4,48,2754,4,64,0.023239,0.001452,0.0625,493,7,0,0.071773,0.071773,1
11107750,184976,999,999,39,999.0,999.0,1,0.718452,-999.0,-999.0,46.0,0,187.39352,0.0,0,0.0,0.0,0,0.0,2.983888,2,2,0.0,2,0,0,2,0,0,0.0,0.0,0.0,27.997709,64362,1,0.068845,36845,616,2471,60594,663,3105,0.051243,0.010942,0.213527,11094,404,39,0.0,0.0,0
11107754,1438446,1,1,1,1.0,10.0,1,1.0,-999.0,-999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.497284,504,1,0.087302,309,4,30,465,5,34,0.073118,0.010753,0.147059,86,4,1,1.0,1.0,1
11107755,168735,1,1,1,1.0,10.0,1,1.0,-999.0,-999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,20.569614,7,1,0.0,7,0,0,7,0,0,0.0,0.0,0.0,0,0,0,1.0,1.0,1
11107756,100282,1,1,1,1.0,10.0,1,1.0,-999.0,-999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,2.970453,4,4,0.0,4,0,0,4,0,0,0.0,0.0,0.0,22.752161,33,1,0.0,23,0,0,33,0,0,0.0,0.0,0.0,8,0,0,1.0,1.0,1
11107760,1781844,2,2,2,3.332681,43.225845,1,0.909404,-999.0,-999.0,0.5,0,3.545299,0.0,0,0.0,0.0,0,0.0,2.673165,24,15,0.0,15,0,0,24,0,0,0.0,0.0,0.0,23.172633,11,1,0.0,8,0,0,11,0,0,0.0,0.0,0.0,2,0,0,2.255993,2.255993,1


## train

In [48]:
# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
#     'objective': 'binary',
    'objective': 'lambdarank',
    'metric': '"None"',
    'eval_at': 20,
#     'metric': {'auc', 'binary_logloss'},
#     'min_data_in_leaf': 256, 
#     'num_leaves': 63,
    'max_depth': 7,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
#     'bagging_freq': 5,
#     'device': 'gpu',
    'verbose': -1,
    'num_threads': 40,
}

### candidates covisit top200 all types (orders, carts, clicks) + i2i top100 (old, new), features w2v cosine (CV carts 0.4163) BEST!

In [49]:
feature_cols = df_stage_2_dataset.drop(["session", "aid", "target"]).columns
print("num features: ", len(feature_cols))

num features:  49


In [51]:
skf = GroupKFold(n_splits=5)
df_valid_preds = df_stage_2_dataset.select(["session", "aid"])

for fold, (train_idx, valid_idx) in tqdm(enumerate(
    skf.split(df_stage_2_dataset,
              df_stage_2_dataset['target'],
              groups=df_stage_2_dataset['session'])
)):
    
    X_train = df_stage_2_dataset[train_idx][feature_cols].to_pandas()
    y_train = df_stage_2_dataset[train_idx]["target"].to_pandas()
    X_valid = df_stage_2_dataset[valid_idx][feature_cols].to_pandas()
    y_valid = df_stage_2_dataset[valid_idx]["target"].to_pandas()
    
    # create dataset for lightgbm
    groups_len_train = (
        df_stage_2_dataset[train_idx]
        .groupby("session").agg(pl.count("aid"))
        .sort("session")["aid"].to_numpy()
    )
    groups_len_valid = (
        df_stage_2_dataset[valid_idx]
        .groupby("session").agg(pl.count("aid"))
        .sort("session")["aid"].to_numpy()
    )    
    lgb_train = lgb.Dataset(X_train, y_train, group=groups_len_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, group=groups_len_valid, reference=lgb_train)
    
    # train model
    gbm_ranking = lgb.train(
        params, 
        lgb_train,
        num_boost_round=2000,
#         return_cvbooster=True,
        feval=lgb_numba_recall,
        valid_sets=lgb_eval,
        callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=20)]
    )
    
    scores = gbm_ranking.predict(X_valid)
    df_valid_preds_fold = (
        df_stage_2_dataset[valid_idx].select(["session", "aid"])
        .with_columns([pl.Series(scores).alias(f"scores_fold{fold}")])
    )
    df_valid_preds = (
        df_valid_preds.join(df_valid_preds_fold, on=["session", "aid"], how="left")
    )
    
    # add save models
    gbm_ranking.save_model(
        f"__model__{ACT_TYPE}_covisit_all_types_merged_top200+i2i_old_new_k=100_top100_reranker_rank+i2i_score+w2v_cosine+42feat_fold{fold}.lgb"
    )

0it [00:00, ?it/s]

Training until validation scores don't improve for 100 rounds
[20]	valid_0's numba_recall@20: 0.999363
[40]	valid_0's numba_recall@20: 0.999522
[60]	valid_0's numba_recall@20: 0.999554
[80]	valid_0's numba_recall@20: 0.99957
[100]	valid_0's numba_recall@20: 0.999602
[120]	valid_0's numba_recall@20: 0.999602
[140]	valid_0's numba_recall@20: 0.999602
[160]	valid_0's numba_recall@20: 0.999586
[180]	valid_0's numba_recall@20: 0.99957


0it [02:45, ?it/s]


KeyboardInterrupt: 

In [44]:
_ = calc_oof_score_for_type(df_valid_preds, act_type=ACT_TYPE)

100%|██████████| 265206/265206 [00:00<00:00, 269916.65it/s]


validation score: 0.12489264670973022
recall per type: type
carts   0.4163
dtype: float64


## Simple train XGB

In [44]:
feature_cols = df_stage_2_dataset.drop(["session", "aid", "target"]).columns
print("num features: ", len(feature_cols))

num features:  49


In [47]:
skf = GroupKFold(n_splits=5)
df_valid_preds = df_stage_2_dataset.select(["session", "aid"])

for fold, (train_idx, valid_idx) in tqdm(enumerate(
    skf.split(df_stage_2_dataset,
              df_stage_2_dataset['target'],
              groups=df_stage_2_dataset['session'])
)):
    
    X_train = df_stage_2_dataset[train_idx][feature_cols].to_pandas()
    y_train = df_stage_2_dataset[train_idx]["target"].to_pandas()
    X_valid = df_stage_2_dataset[valid_idx][feature_cols].to_pandas()
    y_valid = df_stage_2_dataset[valid_idx]["target"].to_pandas()
    
    # create dataset for lightgbm
    groups_len_train = (
        df_stage_2_dataset[train_idx]
        .groupby("session").agg(pl.count("aid"))
        .sort("session")["aid"].to_numpy()
    )
    groups_len_valid = (
        df_stage_2_dataset[valid_idx]
        .groupby("session").agg(pl.count("aid"))
        .sort("session")["aid"].to_numpy()
    )

    dtrain = xgb.DMatrix(X_train, y_train, group=groups_len_train) 
    dvalid = xgb.DMatrix(X_valid, y_valid, group=groups_len_valid)

    xgb_params = {'objective':'rank:pairwise', 'tree_method':'gpu_hist'}
#     xgb_params = {'objective':'binary:logistic', 'tree_method':'gpu_hist'}
    model = xgb.train(xgb_params, 
        dtrain=dtrain,
        evals=[(dtrain,'train'),(dvalid,'valid')],
        num_boost_round=1000,
        verbose_eval=50)

0it [00:00, ?it/s]

[0]	train-map:0.75289	valid-map:0.75234
[50]	train-map:0.78662	valid-map:0.78428
[100]	train-map:0.79187	valid-map:0.78802
[150]	train-map:0.79406	valid-map:0.78854
[200]	train-map:0.79577	valid-map:0.78943
[250]	train-map:0.79699	valid-map:0.78971
[300]	train-map:0.79832	valid-map:0.78998
[350]	train-map:0.79961	valid-map:0.78985
[400]	train-map:0.80065	valid-map:0.79009
[450]	train-map:0.80195	valid-map:0.79013
[500]	train-map:0.80287	valid-map:0.79017
[550]	train-map:0.80405	valid-map:0.78998
[600]	train-map:0.80492	valid-map:0.79004
[650]	train-map:0.80602	valid-map:0.78985
[700]	train-map:0.80710	valid-map:0.78951
[750]	train-map:0.80786	valid-map:0.79004
[800]	train-map:0.80869	valid-map:0.78974
[850]	train-map:0.80959	valid-map:0.78970
[900]	train-map:0.81035	valid-map:0.78974
[950]	train-map:0.81095	valid-map:0.78981
[999]	train-map:0.81168	valid-map:0.78973


1it [02:11, 131.33s/it]

[0]	train-map:0.75477	valid-map:0.75508
[50]	train-map:0.78683	valid-map:0.78539
[100]	train-map:0.79153	valid-map:0.78834
[150]	train-map:0.79421	valid-map:0.78955
[200]	train-map:0.79569	valid-map:0.78998
[250]	train-map:0.79712	valid-map:0.79018
[300]	train-map:0.79830	valid-map:0.79034
[350]	train-map:0.79943	valid-map:0.79084
[400]	train-map:0.80067	valid-map:0.79077
[450]	train-map:0.80180	valid-map:0.79068
[500]	train-map:0.80272	valid-map:0.79097
[550]	train-map:0.80372	valid-map:0.79052
[600]	train-map:0.80425	valid-map:0.79026
[650]	train-map:0.80529	valid-map:0.79046
[700]	train-map:0.80641	valid-map:0.79051
[750]	train-map:0.80736	valid-map:0.79045
[800]	train-map:0.80837	valid-map:0.78990
[850]	train-map:0.80893	valid-map:0.79018
[900]	train-map:0.80993	valid-map:0.79014
[950]	train-map:0.81050	valid-map:0.78971
[999]	train-map:0.81147	valid-map:0.78953


2it [04:22, 131.22s/it]

[0]	train-map:0.75409	valid-map:0.75027
[50]	train-map:0.78678	valid-map:0.78138
[100]	train-map:0.79224	valid-map:0.78486
[150]	train-map:0.79469	valid-map:0.78631
[200]	train-map:0.79598	valid-map:0.78656
[250]	train-map:0.79714	valid-map:0.78675
[300]	train-map:0.79838	valid-map:0.78661
[350]	train-map:0.79992	valid-map:0.78677
[400]	train-map:0.80105	valid-map:0.78713
[450]	train-map:0.80204	valid-map:0.78691
[500]	train-map:0.80306	valid-map:0.78709
[550]	train-map:0.80417	valid-map:0.78698
[600]	train-map:0.80503	valid-map:0.78699
[650]	train-map:0.80606	valid-map:0.78687
[700]	train-map:0.80732	valid-map:0.78707
[750]	train-map:0.80800	valid-map:0.78686
[800]	train-map:0.80888	valid-map:0.78660
[850]	train-map:0.80946	valid-map:0.78689
[900]	train-map:0.81043	valid-map:0.78674
[950]	train-map:0.81107	valid-map:0.78657
[999]	train-map:0.81176	valid-map:0.78664


3it [06:33, 131.28s/it]

[0]	train-map:0.75260	valid-map:0.75370
[50]	train-map:0.78580	valid-map:0.78653
[100]	train-map:0.79126	valid-map:0.79076
[150]	train-map:0.79357	valid-map:0.79197
[200]	train-map:0.79515	valid-map:0.79213
[250]	train-map:0.79658	valid-map:0.79224
[300]	train-map:0.79773	valid-map:0.79222
[350]	train-map:0.79905	valid-map:0.79232
[400]	train-map:0.80023	valid-map:0.79241
[450]	train-map:0.80119	valid-map:0.79247
[500]	train-map:0.80204	valid-map:0.79218
[550]	train-map:0.80319	valid-map:0.79210
[600]	train-map:0.80409	valid-map:0.79206
[650]	train-map:0.80512	valid-map:0.79163
[700]	train-map:0.80616	valid-map:0.79140
[750]	train-map:0.80688	valid-map:0.79148
[800]	train-map:0.80778	valid-map:0.79142
[850]	train-map:0.80858	valid-map:0.79145
[900]	train-map:0.80957	valid-map:0.79082
[950]	train-map:0.81020	valid-map:0.79089
[999]	train-map:0.81097	valid-map:0.79114


4it [08:44, 131.13s/it]

[0]	train-map:0.75279	valid-map:0.75415
[50]	train-map:0.78612	valid-map:0.78609
[100]	train-map:0.79109	valid-map:0.78971
[150]	train-map:0.79380	valid-map:0.79061
[200]	train-map:0.79533	valid-map:0.79147
[250]	train-map:0.79665	valid-map:0.79171
[300]	train-map:0.79768	valid-map:0.79184
[350]	train-map:0.79894	valid-map:0.79195
[400]	train-map:0.80000	valid-map:0.79217
[450]	train-map:0.80116	valid-map:0.79177
[500]	train-map:0.80194	valid-map:0.79189
[550]	train-map:0.80323	valid-map:0.79176
[600]	train-map:0.80388	valid-map:0.79163
[650]	train-map:0.80512	valid-map:0.79151
[700]	train-map:0.80618	valid-map:0.79138
[750]	train-map:0.80684	valid-map:0.79162
[800]	train-map:0.80768	valid-map:0.79136
[850]	train-map:0.80873	valid-map:0.79102
[900]	train-map:0.80946	valid-map:0.79098
[950]	train-map:0.81028	valid-map:0.79110
[999]	train-map:0.81089	valid-map:0.79081


5it [10:55, 131.15s/it]


In [52]:
model.predict(dvalid)

(6693239,)

## Optuna time to find optimal params

In [48]:
import optuna

In [81]:
def fit_cv_xgb(params, callbacks=None):
    skf = GroupKFold(n_splits=5)
    df_valid_preds = df_stage_2_dataset.select(["session", "aid"])

    for fold, (train_idx, valid_idx) in tqdm(enumerate(
        skf.split(df_stage_2_dataset,
                  df_stage_2_dataset['target'],
                  groups=df_stage_2_dataset['session'])
    )):

        X_train = df_stage_2_dataset[train_idx][feature_cols].to_pandas()
        y_train = df_stage_2_dataset[train_idx]["target"].to_pandas()
        X_valid = df_stage_2_dataset[valid_idx][feature_cols].to_pandas()
        y_valid = df_stage_2_dataset[valid_idx]["target"].to_pandas()

        # group lengths for ranking
        groups_len_train = (
            df_stage_2_dataset[train_idx]
            .groupby("session").agg(pl.count("aid"))
            .sort("session")["aid"].to_numpy()
        )
        groups_len_valid = (
            df_stage_2_dataset[valid_idx]
            .groupby("session").agg(pl.count("aid"))
            .sort("session")["aid"].to_numpy()
        )
        dtrain = xgb.DMatrix(X_train, y_train, group=groups_len_train) 
        dvalid = xgb.DMatrix(X_valid, y_valid, group=groups_len_valid)

        model = xgb.train(
            params, 
            dtrain=dtrain,
            evals=[(dvalid,'valid')],
            num_boost_round=2000,
            early_stopping_rounds=200,
            verbose_eval=100,
            callbacks=callbacks,
        )
    
        scores = model.predict(dvalid)
        df_valid_preds_fold = (
            df_stage_2_dataset[valid_idx].select(["session", "aid"])
            .with_columns([pl.Series(scores).alias(f"scores_fold{fold}")])
        )
        df_valid_preds = df_valid_preds.join(df_valid_preds_fold, on=["session", "aid"], how="left")
    
    return df_valid_preds

In [82]:
def objective(trial: optuna.Trial):
    
    params = {
        "verbosity": 0,
        'objective':'rank:pairwise',
        "eval_metric": "map@20",
        'tree_method':'gpu_hist',
        "gpu_id": 1,
        "booster": "gbtree",
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        
        "subsample": 0.8,
        "sampling_method": "gradient_based",
    }
    
    params["max_depth"] = trial.suggest_int("max_depth", 5, 9)
    params["eta"] = trial.suggest_float("eta", 1e-3, 0.5, log=True)
    params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
    params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "valid-map@20")
    df_valid_preds = fit_cv_xgb(params, callbacks=[pruning_callback])
    
    # get oof recs and calc recall
    scores = (
        df_valid_preds
        .select([
            pl.col("scores_fold0"),
            pl.col("scores_fold1"),
            pl.col("scores_fold2"), 
            pl.col("scores_fold3"), 
            pl.col("scores_fold4"),
        ]).mean(axis=1)
    )
    
    df_valid_preds_sorted = (
        df_valid_preds
        .with_columns([pl.Series(scores).alias("score")])
        .sort("score", reverse=True)
        .groupby("session")
        .agg([
            pl.list("aid"),
            pl.list("score"),
        ])
    )
    
    df_recall = (
        df_valid_preds_sorted
        .join(val_df_valid_targets_orders, on="session", how="inner")
        .with_column(pl.col("aid").arr.head(20).alias("label"))
        .with_column(
            pl.col("label")
            .arr.concat("ground_truth")
            .arr.eval(pl.element().filter(pl.count().over(pl.element()) == 2))
            .arr.unique()
            .alias("hits")
        )
        .with_column(pl.col("hits").arr.lengths().clip(0, 20).alias("hits_count"))
        .with_column(pl.col("ground_truth").arr.lengths().clip(0, 20).alias("gt_count"))
    )
    
    recall = df_recall["hits_count"].sum() / df_recall["gt_count"].sum()
    return recall


In [None]:
pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
study = optuna.create_study(pruner=pruner, direction="maximize")
study.optimize(objective, n_trials=50)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2023-01-29 08:19:56,550][0m A new study created in memory with name: no-name-dd95ff3b-475d-447e-bf4f-d6830527b830[0m
0it [00:00, ?it/s]

[0]	valid-map@20:0.75368
[100]	valid-map@20:0.78627
[200]	valid-map@20:0.78643
[300]	valid-map@20:0.78535
[327]	valid-map@20:0.78507


1it [00:53, 53.99s/it]

[0]	valid-map@20:0.75555
[100]	valid-map@20:0.78783
[200]	valid-map@20:0.78669
[298]	valid-map@20:0.78541


2it [01:45, 52.41s/it]

[0]	valid-map@20:0.75300
[100]	valid-map@20:0.78408
[200]	valid-map@20:0.78322
[300]	valid-map@20:0.78250
[329]	valid-map@20:0.78241


3it [02:41, 54.03s/it]

[0]	valid-map@20:0.75691
[100]	valid-map@20:0.78957
[200]	valid-map@20:0.78806
[300]	valid-map@20:0.78704
[310]	valid-map@20:0.78709


4it [03:34, 53.72s/it]

[0]	valid-map@20:0.75703
[100]	valid-map@20:0.78820
[200]	valid-map@20:0.78761
[300]	valid-map@20:0.78693
[336]	valid-map@20:0.78655


5it [04:31, 54.23s/it]
[32m[I 2023-01-29 08:24:54,682][0m Trial 0 finished with value: 0.6503031942941193 and parameters: {'lambda': 0.0012329423754643327, 'alpha': 0.0012262134432874563, 'max_depth': 7, 'eta': 0.47704648954754, 'gamma': 0.09418712925497813, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.6503031942941193.[0m
0it [00:00, ?it/s]

[0]	valid-map@20:0.75369
[100]	valid-map@20:0.76212
[200]	valid-map@20:0.76562
[300]	valid-map@20:0.76915
[400]	valid-map@20:0.77254
[500]	valid-map@20:0.77648
[600]	valid-map@20:0.77840
[700]	valid-map@20:0.78006
[800]	valid-map@20:0.78117
[900]	valid-map@20:0.78214
[1000]	valid-map@20:0.78308
[1100]	valid-map@20:0.78361
[1200]	valid-map@20:0.78411
[1300]	valid-map@20:0.78486
[1400]	valid-map@20:0.78528
[1500]	valid-map@20:0.78589
[1600]	valid-map@20:0.78608
[1700]	valid-map@20:0.78632
[1800]	valid-map@20:0.78660
[1900]	valid-map@20:0.78671
[1999]	valid-map@20:0.78700


1it [05:09, 309.75s/it]

[0]	valid-map@20:0.75555
[100]	valid-map@20:0.76283
[200]	valid-map@20:0.76630
[300]	valid-map@20:0.77023
[400]	valid-map@20:0.77402
[500]	valid-map@20:0.77784
[600]	valid-map@20:0.77918
[700]	valid-map@20:0.78072
[800]	valid-map@20:0.78167
[900]	valid-map@20:0.78242
[1000]	valid-map@20:0.78324
[1100]	valid-map@20:0.78395
[1200]	valid-map@20:0.78458
[1300]	valid-map@20:0.78512
[1400]	valid-map@20:0.78549
[1500]	valid-map@20:0.78585
[1600]	valid-map@20:0.78635
[1700]	valid-map@20:0.78656
[1800]	valid-map@20:0.78686
[1900]	valid-map@20:0.78697
[1999]	valid-map@20:0.78728


2it [10:20, 310.50s/it]

[0]	valid-map@20:0.75300
[100]	valid-map@20:0.76182
[200]	valid-map@20:0.76413
[300]	valid-map@20:0.76722
[400]	valid-map@20:0.77083
[500]	valid-map@20:0.77431
[600]	valid-map@20:0.77600
[700]	valid-map@20:0.77732
[800]	valid-map@20:0.77813
[900]	valid-map@20:0.77887
[1000]	valid-map@20:0.77961
[1100]	valid-map@20:0.78035
[1200]	valid-map@20:0.78109
[1300]	valid-map@20:0.78147
[1400]	valid-map@20:0.78223
[1500]	valid-map@20:0.78263
[1600]	valid-map@20:0.78287
[1700]	valid-map@20:0.78337
[1800]	valid-map@20:0.78353
[1900]	valid-map@20:0.78362
[1999]	valid-map@20:0.78401


3it [15:31, 310.60s/it]

[0]	valid-map@20:0.75691
[100]	valid-map@20:0.76487
[200]	valid-map@20:0.76844
[300]	valid-map@20:0.77147
[400]	valid-map@20:0.77547
[500]	valid-map@20:0.77880
[600]	valid-map@20:0.78107
[700]	valid-map@20:0.78250
[800]	valid-map@20:0.78325
[900]	valid-map@20:0.78418
[1000]	valid-map@20:0.78519
[1100]	valid-map@20:0.78609
[1200]	valid-map@20:0.78698
[1300]	valid-map@20:0.78774
[1400]	valid-map@20:0.78806
[1500]	valid-map@20:0.78877
[1600]	valid-map@20:0.78909
[1700]	valid-map@20:0.78943
[1800]	valid-map@20:0.78972
[1900]	valid-map@20:0.78996
[1999]	valid-map@20:0.79013


4it [20:42, 310.92s/it]

[0]	valid-map@20:0.75701
[100]	valid-map@20:0.76560
[200]	valid-map@20:0.76825
[300]	valid-map@20:0.77179
[400]	valid-map@20:0.77467
[500]	valid-map@20:0.77864
[600]	valid-map@20:0.78103
[700]	valid-map@20:0.78237
[800]	valid-map@20:0.78284
[900]	valid-map@20:0.78352
[1000]	valid-map@20:0.78440
[1100]	valid-map@20:0.78522
[1200]	valid-map@20:0.78591
[1300]	valid-map@20:0.78635
[1400]	valid-map@20:0.78679
[1500]	valid-map@20:0.78716
[1600]	valid-map@20:0.78768
[1700]	valid-map@20:0.78809
[1800]	valid-map@20:0.78833
[1900]	valid-map@20:0.78853
[1999]	valid-map@20:0.78850


5it [25:52, 310.54s/it]
[32m[I 2023-01-29 08:51:13,997][0m Trial 1 finished with value: 0.6567070463186524 and parameters: {'lambda': 0.0009634010918507909, 'alpha': 0.00013223636622598109, 'max_depth': 7, 'eta': 0.010095500053950705, 'gamma': 0.00010263638195744149, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.6567070463186524.[0m
0it [00:00, ?it/s]

[0]	valid-map@20:0.75483
[100]	valid-map@20:0.77902
[200]	valid-map@20:0.78514
[300]	valid-map@20:0.78717
[400]	valid-map@20:0.78810
[500]	valid-map@20:0.78874
[600]	valid-map@20:0.78902
[700]	valid-map@20:0.78902
[800]	valid-map@20:0.78913
[900]	valid-map@20:0.78914
[1000]	valid-map@20:0.78915
[1100]	valid-map@20:0.78911
[1200]	valid-map@20:0.78923
[1300]	valid-map@20:0.78901
[1342]	valid-map@20:0.78908


1it [03:36, 216.03s/it]

[0]	valid-map@20:0.76101
[100]	valid-map@20:0.77973
[200]	valid-map@20:0.78509
[300]	valid-map@20:0.78752
[400]	valid-map@20:0.78856
[500]	valid-map@20:0.78950
[600]	valid-map@20:0.78965
[700]	valid-map@20:0.78970
[800]	valid-map@20:0.78981
[900]	valid-map@20:0.78954
[923]	valid-map@20:0.78938


2it [06:11, 180.43s/it]

[0]	valid-map@20:0.75966
[100]	valid-map@20:0.77591
[200]	valid-map@20:0.78185
[300]	valid-map@20:0.78471
[400]	valid-map@20:0.78534
[500]	valid-map@20:0.78629
[600]	valid-map@20:0.78665
[700]	valid-map@20:0.78682
[800]	valid-map@20:0.78716
[900]	valid-map@20:0.78698
[1000]	valid-map@20:0.78703
[1100]	valid-map@20:0.78707
[1158]	valid-map@20:0.78713


3it [09:22, 185.10s/it]

[0]	valid-map@20:0.76165
[100]	valid-map@20:0.78108
[200]	valid-map@20:0.78722
[300]	valid-map@20:0.78990
[400]	valid-map@20:0.79111
[500]	valid-map@20:0.79143
[600]	valid-map@20:0.79132
[700]	valid-map@20:0.79157
[800]	valid-map@20:0.79166
[900]	valid-map@20:0.79139
[1000]	valid-map@20:0.79151
[1100]	valid-map@20:0.79165
[1183]	valid-map@20:0.79156


4it [12:36, 188.58s/it]

[0]	valid-map@20:0.76351
[100]	valid-map@20:0.78100
[200]	valid-map@20:0.78652
[300]	valid-map@20:0.78889
[400]	valid-map@20:0.78964
[500]	valid-map@20:0.79008
[600]	valid-map@20:0.79047
[700]	valid-map@20:0.79058
[800]	valid-map@20:0.79090
[900]	valid-map@20:0.79063
[989]	valid-map@20:0.79057


5it [15:21, 184.24s/it]
[32m[I 2023-01-29 09:07:01,926][0m Trial 2 finished with value: 0.6550210917769822 and parameters: {'lambda': 7.492602274207669e-05, 'alpha': 0.008440159064482585, 'max_depth': 8, 'eta': 0.047707696019001995, 'gamma': 0.97855643199289, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.6567070463186524.[0m
0it [00:00, ?it/s]

[0]	valid-map@20:0.75483
[100]	valid-map@20:0.77309
[200]	valid-map@20:0.78040
[300]	valid-map@20:0.78385
[400]	valid-map@20:0.78606
[500]	valid-map@20:0.78723
[600]	valid-map@20:0.78804
[700]	valid-map@20:0.78836
[800]	valid-map@20:0.78850
[900]	valid-map@20:0.78890
[1000]	valid-map@20:0.78916
[1100]	valid-map@20:0.78901
[1144]	valid-map@20:0.78899


1it [03:05, 185.18s/it]

[0]	valid-map@20:0.76101
[100]	valid-map@20:0.77460
[200]	valid-map@20:0.78168
[300]	valid-map@20:0.78426
[400]	valid-map@20:0.78630
[500]	valid-map@20:0.78760
[600]	valid-map@20:0.78817
[700]	valid-map@20:0.78876
[800]	valid-map@20:0.78919
[900]	valid-map@20:0.78961
[1000]	valid-map@20:0.78977
[1100]	valid-map@20:0.78987
[1200]	valid-map@20:0.79003
[1300]	valid-map@20:0.79016
[1400]	valid-map@20:0.79001
[1500]	valid-map@20:0.78995
[1569]	valid-map@20:0.78966


2it [07:11, 220.98s/it]

[0]	valid-map@20:0.75966
[100]	valid-map@20:0.77135
[200]	valid-map@20:0.77732
[300]	valid-map@20:0.78054
[400]	valid-map@20:0.78311
[500]	valid-map@20:0.78427
[600]	valid-map@20:0.78540
[700]	valid-map@20:0.78542
[800]	valid-map@20:0.78577
[900]	valid-map@20:0.78625
[1000]	valid-map@20:0.78656
[1100]	valid-map@20:0.78660
[1200]	valid-map@20:0.78669
[1300]	valid-map@20:0.78684
[1400]	valid-map@20:0.78690
[1500]	valid-map@20:0.78682
[1600]	valid-map@20:0.78713
[1700]	valid-map@20:0.78698
[1800]	valid-map@20:0.78716
[1900]	valid-map@20:0.78719
[1999]	valid-map@20:0.78724


3it [12:16, 259.69s/it]

[0]	valid-map@20:0.76165
[100]	valid-map@20:0.77591
[200]	valid-map@20:0.78299
[300]	valid-map@20:0.78652
[400]	valid-map@20:0.78839
[500]	valid-map@20:0.78990
[600]	valid-map@20:0.79092
[700]	valid-map@20:0.79121
[800]	valid-map@20:0.79113
[900]	valid-map@20:0.79150
[1000]	valid-map@20:0.79163
[1100]	valid-map@20:0.79178
[1200]	valid-map@20:0.79192
[1300]	valid-map@20:0.79207
[1400]	valid-map@20:0.79221
[1500]	valid-map@20:0.79235
[1600]	valid-map@20:0.79211
[1700]	valid-map@20:0.79226
[1760]	valid-map@20:0.79209


4it [16:48, 264.53s/it]

[0]	valid-map@20:0.76154
[100]	valid-map@20:0.77568
[200]	valid-map@20:0.78288
[300]	valid-map@20:0.78617
[400]	valid-map@20:0.78762
[500]	valid-map@20:0.78872
[600]	valid-map@20:0.78950
[700]	valid-map@20:0.78984
[800]	valid-map@20:0.78997
[900]	valid-map@20:0.79015
[1000]	valid-map@20:0.79042
[1100]	valid-map@20:0.79063
[1200]	valid-map@20:0.79087
[1300]	valid-map@20:0.79088
[1400]	valid-map@20:0.79089
[1427]	valid-map@20:0.79088


5it [20:33, 246.76s/it]
[32m[I 2023-01-29 09:28:02,766][0m Trial 3 finished with value: 0.6554859434407349 and parameters: {'lambda': 0.003304436094342764, 'alpha': 0.0016404381771581151, 'max_depth': 8, 'eta': 0.0280840420053276, 'gamma': 2.7892538280087338e-08, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.6567070463186524.[0m
0it [00:00, ?it/s]

[0]	valid-map@20:0.74916
[100]	valid-map@20:0.75793
[200]	valid-map@20:0.75938
[300]	valid-map@20:0.76331
[400]	valid-map@20:0.76850
[500]	valid-map@20:0.77243
[600]	valid-map@20:0.77382
[700]	valid-map@20:0.77511
[800]	valid-map@20:0.77646
[900]	valid-map@20:0.77713
[1000]	valid-map@20:0.77829
[1100]	valid-map@20:0.77937
[1200]	valid-map@20:0.78036
[1300]	valid-map@20:0.78111
[1400]	valid-map@20:0.78164
[1500]	valid-map@20:0.78208
[1600]	valid-map@20:0.78244
[1700]	valid-map@20:0.78271
[1800]	valid-map@20:0.78304
[1900]	valid-map@20:0.78348
[1999]	valid-map@20:0.78379


1it [04:45, 285.60s/it]

[0]	valid-map@20:0.74933
[100]	valid-map@20:0.75793
[200]	valid-map@20:0.75953
[300]	valid-map@20:0.76386
[400]	valid-map@20:0.76909
[500]	valid-map@20:0.77320
[600]	valid-map@20:0.77436
[700]	valid-map@20:0.77545
[800]	valid-map@20:0.77645
[900]	valid-map@20:0.77749
[1000]	valid-map@20:0.77877
[1100]	valid-map@20:0.77978
[1200]	valid-map@20:0.78053
[1300]	valid-map@20:0.78128
[1400]	valid-map@20:0.78178
[1500]	valid-map@20:0.78241
[1600]	valid-map@20:0.78281
[1700]	valid-map@20:0.78328
[1800]	valid-map@20:0.78344
[1900]	valid-map@20:0.78375
[1999]	valid-map@20:0.78395


2it [09:32, 286.57s/it]

[0]	valid-map@20:0.74635
[100]	valid-map@20:0.75645
[200]	valid-map@20:0.75720
[300]	valid-map@20:0.76160
[400]	valid-map@20:0.76622
[500]	valid-map@20:0.76994
[600]	valid-map@20:0.77093
[700]	valid-map@20:0.77207
[800]	valid-map@20:0.77358
[900]	valid-map@20:0.77451
[1000]	valid-map@20:0.77562
[1100]	valid-map@20:0.77665
[1200]	valid-map@20:0.77723
[1300]	valid-map@20:0.77808
[1400]	valid-map@20:0.77867
[1500]	valid-map@20:0.77916
[1600]	valid-map@20:0.77956
[1700]	valid-map@20:0.77998
[1800]	valid-map@20:0.78046
[1900]	valid-map@20:0.78097
[1999]	valid-map@20:0.78124


3it [14:19, 286.46s/it]

[0]	valid-map@20:0.75056
[100]	valid-map@20:0.75822
[200]	valid-map@20:0.76077
[300]	valid-map@20:0.76622
[400]	valid-map@20:0.77155
[500]	valid-map@20:0.77557
[600]	valid-map@20:0.77615
[700]	valid-map@20:0.77736
[800]	valid-map@20:0.77820
[900]	valid-map@20:0.77915
[1000]	valid-map@20:0.78003
[1100]	valid-map@20:0.78113
[1200]	valid-map@20:0.78252
[1300]	valid-map@20:0.78312
[1400]	valid-map@20:0.78413
[1500]	valid-map@20:0.78460
[1600]	valid-map@20:0.78502
[1700]	valid-map@20:0.78560
[1800]	valid-map@20:0.78607
[1900]	valid-map@20:0.78653
[1999]	valid-map@20:0.78681


4it [19:07, 287.10s/it]

[0]	valid-map@20:0.75122
[100]	valid-map@20:0.76108
[200]	valid-map@20:0.76206
[300]	valid-map@20:0.76664
[400]	valid-map@20:0.77004
[500]	valid-map@20:0.77408
[600]	valid-map@20:0.77511
[700]	valid-map@20:0.77628
[800]	valid-map@20:0.77730
[900]	valid-map@20:0.77840
[1000]	valid-map@20:0.77939
[1100]	valid-map@20:0.78058
[1200]	valid-map@20:0.78147
[1300]	valid-map@20:0.78231
[1400]	valid-map@20:0.78301
[1500]	valid-map@20:0.78358
[1600]	valid-map@20:0.78399
[1700]	valid-map@20:0.78468
[1800]	valid-map@20:0.78503
[1900]	valid-map@20:0.78537
[1999]	valid-map@20:0.78558


5it [23:52, 286.60s/it]
[32m[I 2023-01-29 09:52:22,543][0m Trial 4 finished with value: 0.6564711514445092 and parameters: {'lambda': 3.3819759217424226e-08, 'alpha': 0.003868248022545784, 'max_depth': 6, 'eta': 0.00964532776467944, 'gamma': 2.724573290260195e-07, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.6567070463186524.[0m
0it [00:00, ?it/s]

[0]	valid-map@20:0.76077


0it [00:13, ?it/s]
[32m[I 2023-01-29 09:52:36,358][0m Trial 5 pruned. Trial was pruned at iteration 19.[0m
0it [00:00, ?it/s]

In [77]:
print("Best trial:")
trial = study.best_trial

Best trial:


In [56]:
val_df_valid_targets_orders = val_df_valid_targets.filter(pl.col("type") == "orders")

In [71]:
(
    val_df_valid_targets_orders
    .with_columns([pl.col("ground_truth").arr.head(2).alias("labels")])
    .with_column(pl.col("ground_truth").arr.lengths().clip(0, 20).alias("gt_count"))
)

20

In [None]:
import optuna

import sklearn.datasets
import xgboost as xgb


def objective(trial):
    train_x, train_y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    dtrain = xgb.DMatrix(train_x, label=train_y)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-auc")
    history = xgb.cv(param, dtrain, num_boost_round=100, callbacks=[pruning_callback])

    mean_auc = history["test-auc-mean"].values[-1]
    return mean_auc


if __name__ == "__main__":
    pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
    study = optuna.create_study(pruner=pruner, direction="maximize")
    study.optimize(objective, n_trials=100)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [2]:
import xgboost as xgb

In [10]:
import numpy as np


X = np.random.randn(1000000, 20)
y = np.random.randint(2, size=(1000000))

In [14]:
from sklearn.model_selection import GroupKFold, StratifiedKFold

skf = GroupKFold(n_splits=5)
skf = StratifiedKFold(n_splits=5)

In [16]:
for fold,(train_idx, valid_idx) in enumerate(skf.split(X, y)):

    X_train = X[train_idx, :]
    y_train = y[train_idx]
    X_valid = X[valid_idx, :]
    y_valid = y[valid_idx]

    # IF YOU HAVE 50 CANDIDATE WE USE 50 BELOW
#     dtrain = xgb.DMatrix(X_train, y_train, group=[50] * (len(train_idx)//50) ) 
#     dvalid = xgb.DMatrix(X_valid, y_valid, group=[50] * (len(valid_idx)//50) ) 
    dtrain = xgb.DMatrix(X_train, y_train, ) 
    dvalid = xgb.DMatrix(X_valid, y_valid)

    xgb_params = {'objective':'rank:pairwise', 'tree_method':'gpu_hist'}
    xgb_params = {'objective':'binary:logistic', 'tree_method':'gpu_hist'}
    model = xgb.train(xgb_params, 
        dtrain=dtrain,
        evals=[(dtrain,'train'),(dvalid,'valid')],
        num_boost_round=1000,
        verbose_eval=100)

[0]	train-logloss:0.69296	valid-logloss:0.69314
[100]	train-logloss:0.67820	valid-logloss:0.69609
[200]	train-logloss:0.66506	valid-logloss:0.69876
[300]	train-logloss:0.65249	valid-logloss:0.70157
[400]	train-logloss:0.64069	valid-logloss:0.70406
[500]	train-logloss:0.62922	valid-logloss:0.70631
[600]	train-logloss:0.61819	valid-logloss:0.70854
[700]	train-logloss:0.60735	valid-logloss:0.71085
[800]	train-logloss:0.59710	valid-logloss:0.71314
[900]	train-logloss:0.58684	valid-logloss:0.71549
[999]	train-logloss:0.57724	valid-logloss:0.71763
[0]	train-logloss:0.69299	valid-logloss:0.69318
[100]	train-logloss:0.67841	valid-logloss:0.69607
[200]	train-logloss:0.66529	valid-logloss:0.69851
[300]	train-logloss:0.65313	valid-logloss:0.70072
[400]	train-logloss:0.64115	valid-logloss:0.70329
[500]	train-logloss:0.62992	valid-logloss:0.70526
[600]	train-logloss:0.61905	valid-logloss:0.70752
[700]	train-logloss:0.60828	valid-logloss:0.70984
[800]	train-logloss:0.59792	valid-logloss:0.71230
[900

In [None]:
import optuna