In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
from pathlib import Path
from collections import Counter

from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import dill
import pandas as pd
import polars as pl
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import GroupKFold

import lightgbm as lgb

import pytorch_lightning as L
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns

from tasks.data.dataset.mappers import EntityEncoder
from tasks.jobs import Splitter

In [3]:
from otto_utils import *
from otto_features import *
from otto_candidates_covisit import *
from otto_lgbm_utils import *

# Load data

## test

In [4]:
# Read the preprocessed train and test event logs.
# TRAIN_PROCESSED / TEST_PROCESSED are not defined in this notebook; presumably they
# come from one of the star-imported otto_* modules.
df_train, df_test = (
    pl.read_parquet(parquet_path, use_pyarrow=True)
    for parquet_path in (TRAIN_PROCESSED, TEST_PROCESSED)
)

# Features

## test

### user features

In [5]:
# Input for the user features: test events with exact duplicate rows removed,
# ordered by session and then by timestamp.
df = df_test.unique().sort(["session", "ts"])

In [6]:
# Session-level feature tables. Both helpers are defined outside this notebook
# (presumably in otto_features — confirm); their output columns are not visible here.
df_user_action_stats_features = user_action_stats_features(df)
df_user_time_distr_features = user_time_distr_features(df)

2022-08-28 22:00:00.278000
2022-09-04 21:59:51.563000


In [7]:
# One row per session holding the last aid seen for each action type.
# The pivot produces one column per type code; codes 0/1/2 are renamed to
# click/cart/order, matching the type filters used for top_clicks/top_carts/top_orders.
# Sessions that lack an action type end up with a null in that column.
# NOTE(review): pl.last("aid") only means "most recent" if the groupby keeps the
# ts ordering of `df` inside each group — confirm for the polars version in use.
df_user_last_type_actions = (
    df
    .groupby(["session", "type"])
    .agg([
        pl.last("aid")
    ])
    .pivot(values="aid", index="session", columns="type")
    .rename({
        "0": "last_click_aid",
        "1": "last_cart_aid",
        "2": "last_order_aid",
    })
    .sort("session")
)

### user item features

In [8]:
# Input for the user-item features: deduplicated test events ordered by session, then timestamp.
df = df_test.unique().sort(["session", "ts"])

In [9]:
# Session-item history features; later joined onto candidates on ["session", "aid"].
# The helper is defined outside this notebook.
df_user_item_history_features = user_item_history_features(df)

### item features

In [10]:
# Item statistics are computed on the test events plus only the recent tail of train:
# events with ts on or after 2022-08-08 00:00 (naive local time, converted to milliseconds).
recent_train_cutoff_ms = datetime(2022, 8, 8).timestamp() * 1000

df = (
    pl.concat([
        df_train.filter(pl.col("ts") >= recent_train_cutoff_ms),
        df_test,
    ])
    .unique()
    .sort(["session", "ts"])
)

In [11]:
# Item-level feature tables; later joined onto candidates on "aid".
# Both helpers are defined outside this notebook, so their output columns are not visible here.
df_item_action_stats_features = item_action_stats_features(df)
df_item_time_distr_features = item_time_distr_features(df)

2022-08-08 00:00:00.265000
2022-09-04 21:59:51.563000


In [12]:
# Item-level table from an external helper; later joined onto candidates on "aid".
df_item_n_sess_multiple_action = item_n_sessions_with_repeated_actions(df)

# Candidates

In [13]:
# Candidate generation works on the test events only: deduplicated, ordered by session then timestamp.
# This rebinds `df`, which the cells below (popular items, session dict) read.
df = df_test.unique().sort(["session", "ts"])

In [14]:
# Use top X for clicks, carts and orders
# These are the per-aid row limits passed to get_top: clicks_th for the clicks table,
# carts_th for the carts_orders table and orders_th for the buys2buys table.
clicks_th = 15
carts_th  = 20
orders_th = 20

def get_top(df, th):
    """Keep at most `th` rows per `aid`, in the order the rows already appear.

    A running counter per `aid` is stored in a new `rank` column (1 for the first
    row of each aid, 2 for the second, ...) and rows with `rank` above `th` are
    dropped. The helper column `ones` used to build the counter stays in the result.

    NOTE(review): this is only a "top-th" selection if `df` is already sorted by
    relevance within each aid — confirm against how the covisit tables are written.
    """
    with_counter_seed = df.with_column(pl.lit(1).alias("ones"))
    with_rank = with_counter_seed.with_column(
        pl.col("ones").cumsum().over("aid").alias("rank")
    )
    within_threshold = pl.col("rank") <= th
    return with_rank.filter(within_threshold)

# TOPK_RECOMMEND: size of the popular-item lists and of the first-stage submission per session/type.
# TOPK_RERANK: base size for the reranker candidate pool (candidate generation requests TOPK_RERANK * 5).
TOPK_RECOMMEND = 20
TOPK_RERANK = 40

## test

In [15]:
# Precomputed co-visitation tables. Judging by the joins further down, each one
# has at least the columns `aid`, `aid_right` and `weight`.
carts_orders = pl.read_parquet("__subm__covisit_carts_orders_all_v1.parquet")
buys2buys = pl.read_parquet("__subm__covisit_buys2buys_all_v1.parquet")
clicks = pl.read_parquet("__subm__covisit_clicks_all_v1.parquet")

In [16]:
# Truncate each co-visitation table to its per-aid row limit before handing it to the recommender.
carts_orders_top = get_top(carts_orders, carts_th)
buys2buys_top = get_top(buys2buys, orders_th)
clicks_top = get_top(clicks, clicks_th)

In [17]:
# The TOPK_RECOMMEND most frequent aids in `df` for each action type
# (type 0 -> clicks, 1 -> carts, 2 -> orders); passed to the recommender below.
top_clicks, top_carts, top_orders = (
    df.filter(pl.col("type") == type_code)["aid"]
    .value_counts(sort=True)[:TOPK_RECOMMEND]["aid"]
    .to_list()
    for type_code in (0, 1, 2)
)

In [18]:
# First-stage recommender built from the truncated co-visitation tables and the
# popular-item lists. CovisitationRecommender is defined outside this notebook
# (presumably in otto_candidates_covisit — confirm).
covisit_rec = CovisitationRecommender(
    df_top_k_buys=carts_orders_top,
    df_top_k_buy2buy=buys2buys_top,
    df_top_k_clicks=clicks_top,
    top_carts=top_carts,
    top_orders=top_orders,
    top_clicks=top_clicks,
)

In [19]:
# Map every session id to its (aid list, type list) pair for the recommender loops.
# (Validation variant kept for reference: test_df = val_df_valid_input.sort(["session", "ts"]))
test_df = df.unique().sort(["session", "ts"])
sessions_grouped = test_df.groupby('session').agg([pl.list("aid"), pl.list("type")])
test_sessions_dict = {
    session_key: (aid_history, type_history)
    for session_key, aid_history, type_history in zip(
        sessions_grouped["session"].to_list(),
        sessions_grouped["aid"].to_list(),
        sessions_grouped["type"].to_list(),
    )
}

In [20]:
# Ask the co-visitation recommender for TOPK_RERANK * 5 candidates per session and
# action type, and record each candidate's 1-based position in the returned list as `rank`.
types = ["clicks", "carts", "orders"]
topk = TOPK_RERANK * 5
# topk = TOPK_RERANK

candidates_dict = {"session": [], "type": [], "candidates": [], "rank": []}

for session_id, (session_aid_list, session_type_list) in tqdm(test_sessions_dict.items()):
    # Same order as `types`: clicks, carts, orders.
    candidates = [
        covisit_rec.recommend_clicks(session_aid_list, session_type_list, topk),
        covisit_rec.recommend_carts(session_aid_list, session_type_list, topk),
        covisit_rec.recommend_buys(session_aid_list, session_type_list, topk),
    ]
    ranks = [list(range(1, len(rec_items) + 1)) for rec_items in candidates]

    candidates_dict["session"].extend([session_id] * len(types))
    candidates_dict["type"].extend(types)
    candidates_dict["candidates"].extend(candidates)
    candidates_dict["rank"].extend(ranks)

# One row per (session, type); `candidates` and `rank` are parallel list columns.
df_candidates = pl.DataFrame(candidates_dict)

100%|██████████| 1671803/1671803 [02:23<00:00, 11610.88it/s]


In [None]:
# Preview the carts candidates exploded to one row per (session, candidate aid).
(
    df_candidates
    .filter(pl.col("type") == "carts")
    # df_candidates is built with a list column named "rank"; "ranks" does not exist
    # and made this cell fail.
    .explode(["candidates", "rank"])
    .rename({"candidates": "aid"})
)

In [21]:
# First-stage (co-visitation only) submission: TOPK_RECOMMEND aids per session for
# every action type, written as "<session>_<type>" -> space-separated aids.
types = ["clicks", "carts", "orders"]
topk = TOPK_RECOMMEND

submission_dict = {"session_type": [], "labels": []}

for session_id, (session_aid_list, session_type_list) in tqdm(test_sessions_dict.items()):
    # Same order as `types`: clicks, carts, orders.
    recs_by_type = (
        covisit_rec.recommend_clicks(session_aid_list, session_type_list, topk),
        covisit_rec.recommend_carts(session_aid_list, session_type_list, topk),
        covisit_rec.recommend_buys(session_aid_list, session_type_list, topk),
    )
    for type_name, rec_items in zip(types, recs_by_type):
        submission_dict["session_type"].append(f"{session_id}_{type_name}")
        submission_dict["labels"].append(" ".join(map(str, rec_items)))

df_submission_valid = pl.DataFrame(submission_dict)

100%|██████████| 1671803/1671803 [02:04<00:00, 13410.15it/s]


# Reranker

## carts

### dataset for pred and join features

In [None]:
# Carts reranker input: explode the carts candidate lists into one row per (session, aid)
# together with the candidate's first-stage rank.
# If a session lists the same aid more than once, a single row is kept (keep="last").
df_stage_2_dataset_for_pred = (
    df_candidates
    .filter(pl.col("type") == "carts")
    .drop("type")
    .explode(["candidates", "rank"])
    .rename({"candidates": "aid"})
    .unique(subset=["session", "aid"], keep="last")
)

In [41]:
# Attach co-visitation weights between each candidate aid and the session's last
# clicked / carted / ordered aid: one feature per (last-action kind, covisit table) pair.
covisit_tables = (
    ("carts_orders", carts_orders),
    ("buy2buy", buys2buys),
    ("click", clicks),
)
last_action_kinds = ("click", "cart", "order")

stage_2_with_weights = df_stage_2_dataset_for_pred.join(
    df_user_last_type_actions, on="session", how="left"
)
# The nesting fixes the feature column order: action kinds outermost, tables innermost.
for action_kind in last_action_kinds:
    for table_tag, covisit_table in covisit_tables:
        weight_col = f"user_last_{action_kind}_aid_{table_tag}_weight"
        stage_2_with_weights = stage_2_with_weights.join(
            covisit_table.rename({"weight": weight_col}),
            left_on=["aid", f"last_{action_kind}_aid"],
            right_on=["aid", "aid_right"],
            how="left",
        )

# The last-action aids were only join keys; pairs without a covisit entry get weight 0.
df_stage_2_dataset_for_pred = (
    stage_2_with_weights
    .drop([f"last_{action_kind}_aid" for action_kind in last_action_kinds])
    .fill_null(0)
)

In [42]:
# Join the session-level, item-level and session-item feature tables onto every candidate.
# Candidates without a matching feature row get 0 through fill_null.
# The two commented-out joins (time-distribution features) are currently disabled.
df_stage_2_dataset_for_pred = (
    df_stage_2_dataset_for_pred
    .join(df_user_action_stats_features, on="session", how="left")
#     .join(df_user_time_distr_features, on="session", how="left")
    .join(df_item_action_stats_features, on="aid", how="left")
#     .join(df_item_time_distr_features, on="aid", how="left")
    .join(df_item_n_sess_multiple_action, on="aid", how="left")
    .join(df_user_item_history_features, on=["session", "aid"], how="left")
    .sort("session")
    .fill_null(0)
)

In [43]:
# Preview the assembled carts feature table (keys, rank and joined features).
df_stage_2_dataset_for_pred

session,aid,rank,user_last_click_aid_carts_orders_weight,user_last_click_aid_buy2buy_weight,user_last_click_aid_click_weight,user_last_cart_aid_carts_orders_weight,user_last_cart_aid_buy2buy_weight,user_last_cart_aid_click_weight,user_last_order_aid_carts_orders_weight,user_last_order_aid_buy2buy_weight,user_last_order_aid_click_weight,user_lifetime_days,user_n_actions,user_n_uniq_items,user_buys_rate,user_uniq_clicks,user_uniq_carts,user_uniq_orders,cl_cnt,ca_cnt,or_cnt,user_ca_cl_ratio,user_or_cl_ratio,user_or_ca_ratio,item_lifetime_days,item_n_actions,item_n_uniq_users,item_buys_rate,item_uniq_clicks,item_uniq_orders,item_uniq_carts,cl_cnt_right,or_cnt_right,ca_cnt_right,item_ca_cl_ratio,item_or_cl_ratio,item_or_ca_ratio,item_n_sess_multi_clicks,item_n_sess_multi_carts,item_n_sess_multi_buys,user_item_log_recency_score,user_item_type_weighted_log_recency_score,user_item_is_in_history
i64,i64,i64,f64,i32,f64,f64,i32,f64,f64,i32,f64,f64,u32,u32,f64,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,u32,u32,f64,u32,u32,u32,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,i32
12899779,59625,1,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,18.275758,8,1,0.0,7,0,0,8,0,0,0.0,0.0,0.0,1,0,0,1.0,1.0,1
12899779,1253524,2,1.0,0,6.427448,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,24.013138,223,1,0.026906,163,0,6,217,0,6,0.02765,0.0,0.0,38,0,0,0.0,0.0,0
12899779,737445,3,1.0,0,4.967441,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.754624,759,1,0.019763,475,0,11,744,0,15,0.020161,0.0,0.0,140,4,0,0.0,0.0,0
12899779,438191,4,1.0,0,4.208226,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.719658,2498,1,0.035228,1566,4,68,2414,4,80,0.03314,0.001657,0.05,376,8,0,0.0,0.0,0
12899779,731692,5,1.0,0,4.122878,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.48988,93,1,0.172043,45,2,12,79,2,12,0.151899,0.025316,0.166667,12,0,0,0.0,0.0,0
12899779,1790770,6,1.0,0,3.518963,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,18.141994,23,1,0.0,21,0,0,23,0,0,0.0,0.0,0.0,2,0,0,0.0,0.0,0
12899779,1660529,7,0.5,0,3.294204,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,24.97091,94,1,0.042553,68,0,4,90,0,4,0.044444,0.0,0.0,16,0,0,0.0,0.0,0
12899779,94230,8,0.5,0,3.293742,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.585517,485,1,0.030928,322,1,12,471,1,13,0.027601,0.002123,0.076923,73,1,0,0.0,0.0,0
12899779,1660089,9,0.5,0,3.218493,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.537446,471,1,0.038217,350,1,15,454,1,16,0.035242,0.002203,0.0625,70,1,0,0.0,0.0,0
12899779,339846,10,0.5,0,3.133652,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.833971,2287,1,0.024923,1397,1,48,2231,1,55,0.024653,0.000448,0.018182,374,7,0,0.0,0.0,0


### predict

In [46]:
# Model inputs: `rank` followed by every column after the first three (session, aid, rank).
# NOTE(review): the feature matrix is later passed to LightGBM as a bare numpy array, so
# features are matched by position. The displayed schemas of the carts and orders tables
# differ in column order (item_uniq_orders/item_uniq_carts and or_cnt_right/ca_cnt_right
# are swapped) — confirm this order matches the one each model was trained with.
feature_cols = ["rank"] + df_stage_2_dataset_for_pred.columns[3:]
feature_cols

['rank',
 'user_last_click_aid_carts_orders_weight',
 'user_last_click_aid_buy2buy_weight',
 'user_last_click_aid_click_weight',
 'user_last_cart_aid_carts_orders_weight',
 'user_last_cart_aid_buy2buy_weight',
 'user_last_cart_aid_click_weight',
 'user_last_order_aid_carts_orders_weight',
 'user_last_order_aid_buy2buy_weight',
 'user_last_order_aid_click_weight',
 'user_lifetime_days',
 'user_n_actions',
 'user_n_uniq_items',
 'user_buys_rate',
 'user_uniq_clicks',
 'user_uniq_carts',
 'user_uniq_orders',
 'cl_cnt',
 'ca_cnt',
 'or_cnt',
 'user_ca_cl_ratio',
 'user_or_cl_ratio',
 'user_or_ca_ratio',
 'item_lifetime_days',
 'item_n_actions',
 'item_n_uniq_users',
 'item_buys_rate',
 'item_uniq_clicks',
 'item_uniq_orders',
 'item_uniq_carts',
 'cl_cnt_right',
 'or_cnt_right',
 'ca_cnt_right',
 'item_ca_cl_ratio',
 'item_or_cl_ratio',
 'item_or_ca_ratio',
 'item_n_sess_multi_clicks',
 'item_n_sess_multi_carts',
 'item_n_sess_multi_buys',
 'user_item_log_recency_score',
 'user_item_type_w

In [None]:
# Key columns that the per-fold scores get attached to, and the feature matrix for LightGBM.
# The numpy matrix carries no column names; its column order is exactly `feature_cols`.
df_valid_preds = df_stage_2_dataset_for_pred.select(["session", "aid"])
X_test = df_stage_2_dataset_for_pred[feature_cols].to_numpy()

In [47]:
# Score every carts candidate with each of the 5 fold models and collect the scores
# as columns scores_fold0..scores_fold4 next to the (session, aid) keys.
n_candidate_rows = df_stage_2_dataset_for_pred.height
if X_test.shape[0] != n_candidate_rows:
    # Guards against a stale X_test left over from another section of the notebook.
    raise ValueError(
        f"X_test has {X_test.shape[0]} rows but df_stage_2_dataset_for_pred has "
        f"{n_candidate_rows}; rebuild X_test from the current dataset before predicting."
    )

fold_score_columns = []
for fold in tqdm(range(5)):
    model_file = f"__model__carts_covisit_top200_reranker_rank+42feat_fold{fold}.lgb"
    gbm_ranking = lgb.Booster(model_file=model_file)
    scores = gbm_ranking.predict(X_test)
    fold_score_columns.append(pl.Series(scores).alias(f"scores_fold{fold}"))

# Scores are row-aligned with df_stage_2_dataset_for_pred, so they are attached by
# position instead of through five key joins. Rebuilding the key frame here also makes
# the cell safe to re-run and independent of any earlier df_valid_preds.
df_valid_preds = (
    df_stage_2_dataset_for_pred
    .select(["session", "aid"])
    .with_columns(fold_score_columns)
)

100%|██████████| 5/5 [12:20<00:00, 148.11s/it]


In [None]:
# Persist the per-fold carts scores so the submission step can run without re-predicting.
df_valid_preds.write_parquet("__subm__carts_candidates_preds_covisit_top200_reranker_rank+42feat.parquet")


### Make submission

In [34]:
# Reload the saved per-fold carts scores (columns: session, aid, scores_fold0..scores_fold4).
df_valid_preds = pl.read_parquet("__subm__carts_candidates_preds_covisit_top200_reranker_rank+42feat.parquet")


In [35]:
# Ensemble the folds by taking the row-wise mean of the five fold scores.
fold_score_names = [f"scores_fold{fold_idx}" for fold_idx in range(5)]
scores = df_valid_preds.select(fold_score_names).mean(axis=1)

# Sort all candidates by mean score (best first), then collect each session's
# aids and scores into lists.
df_valid_preds_sorted = (
    df_valid_preds
    .with_column(scores.alias("score"))
    .select(["session", "aid", "score"])
    .sort("score", reverse=True)
    .groupby("session")
    .agg([pl.list("aid"), pl.list("score")])
)

In [36]:
# Preview: one row per session with its candidate aids and mean scores as lists.
df_valid_preds_sorted

session,aid,score
i64,list[i64],list[f64]
13235520,"[1311747, 1178814, ... 485256]","[3.945397, 1.733589, ... -6.130174]"
12956032,"[1841938, 199256, ... 485256]","[3.106247, 1.243602, ... -6.297891]"
13018624,"[1699757, 637586, ... 33343]","[4.069848, 1.339004, ... -5.78944]"
13227328,"[435177, 1622133, ... 485256]","[3.771995, 0.827353, ... -6.072482]"
14509888,"[262314, 638085, ... 152547]","[3.821347, 0.820153, ... -5.75965]"
13722432,"[636732, 1238813, ... 485256]","[2.9975, 1.286138, ... -5.8114]"
13823296,"[1235182, 1811296, ... 485256]","[3.227237, 0.437216, ... -6.033522]"
13979584,"[1196256, 688503, ... 660655]","[3.911405, 1.068237, ... -5.208064]"
12918400,"[906935, 263056, ... 485256]","[3.267979, -0.311764, ... -6.033522]"
14514176,"[96467, 1709883, ... 485256]","[4.069719, 0.673662, ... -5.99104]"


In [37]:
# stage 2 carts reranked - recall@20 optimized
# Each session contributes one "<session>_carts" row holding its first 20 reranked aids,
# space-separated.
types = ["carts"]
topk = 20

submission_dict = {"session_type": [], "labels": []}

# Rows are (session, aid list, score list); the scores are not needed here.
for session_id, ranked_aids, _ranked_scores in tqdm(df_valid_preds_sorted.rows()):
    labels = " ".join(map(str, ranked_aids[:topk]))
    for type_name in types:
        submission_dict["session_type"].append(f"{session_id}_{type_name}")
        submission_dict["labels"].append(labels)

100%|██████████| 1671803/1671803 [00:06<00:00, 262400.56it/s]


In [38]:
# Reranked carts rows (session_type, labels); merged into the final submission at the end.
df_submission_carts_reranked = pl.DataFrame(submission_dict)

## orders

### dataset for pred and join features

In [22]:
# Orders reranker input: explode the orders candidate lists into one row per (session, aid)
# together with the candidate's first-stage rank.
# If a session lists the same aid more than once, a single row is kept (keep="last").
df_stage_2_dataset_for_pred = (
    df_candidates
    .filter(pl.col("type") == "orders")
    .drop("type")
    .explode(["candidates", "rank"])
    .rename({"candidates": "aid"})
    .unique(subset=["session", "aid"], keep="last")
)

In [23]:
# Attach co-visitation weights between each candidate aid and the session's last
# clicked / carted / ordered aid: one feature per (last-action kind, covisit table) pair.
covisit_tables = (
    ("carts_orders", carts_orders),
    ("buy2buy", buys2buys),
    ("click", clicks),
)
last_action_kinds = ("click", "cart", "order")

stage_2_with_weights = df_stage_2_dataset_for_pred.join(
    df_user_last_type_actions, on="session", how="left"
)
# The nesting fixes the feature column order: action kinds outermost, tables innermost.
for action_kind in last_action_kinds:
    for table_tag, covisit_table in covisit_tables:
        weight_col = f"user_last_{action_kind}_aid_{table_tag}_weight"
        stage_2_with_weights = stage_2_with_weights.join(
            covisit_table.rename({"weight": weight_col}),
            left_on=["aid", f"last_{action_kind}_aid"],
            right_on=["aid", "aid_right"],
            how="left",
        )

# The last-action aids were only join keys; pairs without a covisit entry get weight 0.
df_stage_2_dataset_for_pred = (
    stage_2_with_weights
    .drop([f"last_{action_kind}_aid" for action_kind in last_action_kinds])
    .fill_null(0)
)

In [24]:
# Join the session-level, item-level and session-item feature tables onto every candidate.
# Candidates without a matching feature row get 0 through fill_null.
# The two commented-out joins (time-distribution features) are currently disabled.
df_stage_2_dataset_for_pred = (
    df_stage_2_dataset_for_pred
    .join(df_user_action_stats_features, on="session", how="left")
#     .join(df_user_time_distr_features, on="session", how="left")
    .join(df_item_action_stats_features, on="aid", how="left")
#     .join(df_item_time_distr_features, on="aid", how="left")
    .join(df_item_n_sess_multiple_action, on="aid", how="left")
    .join(df_user_item_history_features, on=["session", "aid"], how="left")
    .sort("session")
    .fill_null(0)
)

In [25]:
# Preview the assembled orders feature table (keys, rank and joined features).
df_stage_2_dataset_for_pred

session,aid,rank,user_last_click_aid_carts_orders_weight,user_last_click_aid_buy2buy_weight,user_last_click_aid_click_weight,user_last_cart_aid_carts_orders_weight,user_last_cart_aid_buy2buy_weight,user_last_cart_aid_click_weight,user_last_order_aid_carts_orders_weight,user_last_order_aid_buy2buy_weight,user_last_order_aid_click_weight,user_lifetime_days,user_n_actions,user_n_uniq_items,user_buys_rate,user_uniq_clicks,user_uniq_carts,user_uniq_orders,cl_cnt,ca_cnt,or_cnt,user_ca_cl_ratio,user_or_cl_ratio,user_or_ca_ratio,item_lifetime_days,item_n_actions,item_n_uniq_users,item_buys_rate,item_uniq_clicks,item_uniq_carts,item_uniq_orders,cl_cnt_right,ca_cnt_right,or_cnt_right,item_ca_cl_ratio,item_or_cl_ratio,item_or_ca_ratio,item_n_sess_multi_clicks,item_n_sess_multi_carts,item_n_sess_multi_buys,user_item_log_recency_score,user_item_type_weighted_log_recency_score,user_item_is_in_history
i64,i64,i64,f64,i32,f64,f64,i32,f64,f64,i32,f64,f64,u32,u32,f64,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,u32,u32,f64,u32,u32,u32,u32,u32,u32,f64,f64,f64,u32,u32,u32,f64,f64,i32
12899779,59625,1,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,18.275758,8,1,0.0,7,0,0,8,0,0,0.0,0.0,0.0,1,0,0,1.0,1.0,1
12899779,689970,2,0.5,0,1.329271,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.044139,26,1,0.038462,20,1,0,25,1,0,0.04,0.0,0.0,4,0,0,0.0,0.0,0
12899779,397451,3,0.5,0,1.328555,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,19.943631,86,1,0.22093,50,12,2,69,15,2,0.217391,0.028986,0.133333,8,3,0,0.0,0.0,0
12899779,1493965,4,0.5,0,1.329107,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,26.406428,148,1,0.047297,110,7,0,141,7,0,0.049645,0.0,0.0,20,0,0,0.0,0.0,0
12899779,469285,5,0.5,0,1.075815,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,11.129111,32,1,0.0625,17,2,0,30,2,0,0.066667,0.0,0.0,6,0,0,0.0,0.0,0
12899779,1253524,6,1.0,0,6.427448,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,24.013138,223,1,0.026906,163,6,0,217,6,0,0.02765,0.0,0.0,38,0,0,0.0,0.0,0
12899779,737445,7,1.0,0,4.967441,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.754624,759,1,0.019763,475,11,0,744,15,0,0.020161,0.0,0.0,140,4,0,0.0,0.0,0
12899779,1790770,8,1.0,0,3.518963,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,18.141994,23,1,0.0,21,0,0,23,0,0,0.0,0.0,0.0,2,0,0,0.0,0.0,0
12899779,438191,9,1.0,0,4.208226,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.719658,2498,1,0.035228,1566,68,4,2414,80,4,0.03314,0.001657,0.05,376,8,0,0.0,0.0,0
12899779,731692,10,1.0,0,4.122878,0.0,0,0.0,0.0,0,0.0,0.0,1,1,0.0,1,0,0,1,0,0,0.0,0.0,0.0,27.48988,93,1,0.172043,45,12,2,79,12,2,0.151899,0.025316,0.166667,12,0,0,0.0,0.0,0


### predict

In [26]:
# Model inputs: `rank` followed by every column after the first three (session, aid, rank).
# NOTE(review): the feature matrix is later passed to LightGBM as a bare numpy array, so
# features are matched by position. The displayed schemas of the carts and orders tables
# differ in column order (item_uniq_orders/item_uniq_carts and or_cnt_right/ca_cnt_right
# are swapped) — confirm this order matches the one each model was trained with.
feature_cols = ["rank"] + df_stage_2_dataset_for_pred.columns[3:]
feature_cols

['rank',
 'user_last_click_aid_carts_orders_weight',
 'user_last_click_aid_buy2buy_weight',
 'user_last_click_aid_click_weight',
 'user_last_cart_aid_carts_orders_weight',
 'user_last_cart_aid_buy2buy_weight',
 'user_last_cart_aid_click_weight',
 'user_last_order_aid_carts_orders_weight',
 'user_last_order_aid_buy2buy_weight',
 'user_last_order_aid_click_weight',
 'user_lifetime_days',
 'user_n_actions',
 'user_n_uniq_items',
 'user_buys_rate',
 'user_uniq_clicks',
 'user_uniq_carts',
 'user_uniq_orders',
 'cl_cnt',
 'ca_cnt',
 'or_cnt',
 'user_ca_cl_ratio',
 'user_or_cl_ratio',
 'user_or_ca_ratio',
 'item_lifetime_days',
 'item_n_actions',
 'item_n_uniq_users',
 'item_buys_rate',
 'item_uniq_clicks',
 'item_uniq_carts',
 'item_uniq_orders',
 'cl_cnt_right',
 'ca_cnt_right',
 'or_cnt_right',
 'item_ca_cl_ratio',
 'item_or_cl_ratio',
 'item_or_ca_ratio',
 'item_n_sess_multi_clicks',
 'item_n_sess_multi_carts',
 'item_n_sess_multi_buys',
 'user_item_log_recency_score',
 'user_item_type_w

In [27]:
# Key columns that the per-fold scores get attached to, and the feature matrix for LightGBM.
# The numpy matrix carries no column names; its column order is exactly `feature_cols`.
df_valid_preds = df_stage_2_dataset_for_pred.select(["session", "aid"])
X_test = df_stage_2_dataset_for_pred[feature_cols].to_numpy()

In [28]:
# Score every orders candidate with each of the 5 fold models and collect the scores
# as columns scores_fold0..scores_fold4 next to the (session, aid) keys.
n_candidate_rows = df_stage_2_dataset_for_pred.height
if X_test.shape[0] != n_candidate_rows:
    # Guards against a stale X_test left over from another section of the notebook.
    raise ValueError(
        f"X_test has {X_test.shape[0]} rows but df_stage_2_dataset_for_pred has "
        f"{n_candidate_rows}; rebuild X_test from the current dataset before predicting."
    )

fold_score_columns = []
for fold in tqdm(range(5)):
    model_file = f"__model__orders_covisit_top200_reranker_rank+42feat_fold{fold}.lgb"
    gbm_ranking = lgb.Booster(model_file=model_file)
    scores = gbm_ranking.predict(X_test)
    fold_score_columns.append(pl.Series(scores).alias(f"scores_fold{fold}"))

# Scores are row-aligned with df_stage_2_dataset_for_pred, so they are attached by
# position instead of through five key joins. Rebuilding the key frame here also makes
# the cell safe to re-run and independent of any earlier df_valid_preds.
df_valid_preds = (
    df_stage_2_dataset_for_pred
    .select(["session", "aid"])
    .with_columns(fold_score_columns)
)

100%|██████████| 5/5 [05:41<00:00, 68.27s/it]


In [29]:
# Persist the per-fold orders scores so the submission step can run without re-predicting.
df_valid_preds.write_parquet("__subm__orders_candidates_preds_covisit_top200_reranker_rank+42feat.parquet")

### Make submission

In [None]:
# Reload the saved per-fold orders scores (columns: session, aid, scores_fold0..scores_fold4).
df_valid_preds = pl.read_parquet("__subm__orders_candidates_preds_covisit_top200_reranker_rank+42feat.parquet")

In [30]:
# Ensemble the folds by taking the row-wise mean of the five fold scores.
fold_score_names = [f"scores_fold{fold_idx}" for fold_idx in range(5)]
scores = df_valid_preds.select(fold_score_names).mean(axis=1)

# Sort all candidates by mean score (best first), then collect each session's
# aids and scores into lists.
df_valid_preds_sorted = (
    df_valid_preds
    .with_column(scores.alias("score"))
    .select(["session", "aid", "score"])
    .sort("score", reverse=True)
    .groupby("session")
    .agg([pl.list("aid"), pl.list("score")])
)

In [31]:
# Preview: one row per session with its candidate aids and mean scores as lists.
df_valid_preds_sorted

session,aid,score
i64,list[i64],list[f64]
14488256,"[1594735, 1428102, ... 508883]","[3.455723, 0.801064, ... -6.796847]"
13802240,"[1643015, 924195, ... 166037]","[2.280398, 0.098136, ... -7.281384]"
14225280,"[1765898, 1742560, ... 166037]","[2.185088, -0.691654, ... -7.281384]"
13820928,"[1332942, 1796573, ... 227109]","[2.148335, 0.075526, ... -8.750716]"
13518208,"[890058, 1713285, ... 159789]","[2.868863, 1.419302, ... -8.305559]"
14016576,"[552595, 726316, ... 1236775]","[1.813848, 1.553693, ... -6.055742]"
14190336,"[389584, 490397, ... 166037]","[4.936186, 0.173489, ... -7.383453]"
13832832,"[1509329, 244481, ... 1236775]","[2.821979, 0.949956, ... -6.897627]"
13441920,"[1277535, 510924, ... 166037]","[0.702853, -0.308974, ... -7.281384]"
14176384,"[925386, 376484, ... 1043508]","[3.397152, 3.089667, ... -5.63228]"


In [32]:
# stage 2 orders reranked - recall@20 optimized
# Each session contributes one "<session>_orders" row holding its first 20 reranked aids,
# space-separated.
types = ["orders"]
topk = 20

submission_dict = {"session_type": [], "labels": []}

# Rows are (session, aid list, score list); the scores are not needed here.
for session_id, ranked_aids, _ranked_scores in tqdm(df_valid_preds_sorted.rows()):
    labels = " ".join(map(str, ranked_aids[:topk]))
    for type_name in types:
        submission_dict["session_type"].append(f"{session_id}_{type_name}")
        submission_dict["labels"].append(labels)

100%|██████████| 1671803/1671803 [00:06<00:00, 252865.15it/s]


In [33]:
# Reranked orders rows (session_type, labels); merged into the final submission below.
df_submission_orders_reranked = pl.DataFrame(submission_dict)

In [39]:
# Overlay the reranked carts/orders labels on the first-stage submission. Rows with no
# reranked counterpart (the clicks rows) keep their first-stage labels.
df_submission_reranked = pl.concat([
    df_submission_carts_reranked,
    df_submission_orders_reranked,
])

# After the left join the reranked labels arrive as `labels_right`; use them when present.
prefer_reranked_labels = (
    pl.when(pl.col("labels_right").is_null())
    .then(pl.col("labels"))
    .otherwise(pl.col("labels_right"))
    .alias("labels_final")
)

df_submission_reranked_all = (
    df_submission_valid
    .join(df_submission_reranked, on="session_type", how="left")
    .with_columns([prefer_reranked_labels])
    .select(["session_type", "labels_final"])
    .rename({"labels_final": "labels"})
)

In [40]:
# LB - 0.587
# Write the merged submission (session_type, labels) as a gzip-compressed CSV via pandas,
# without the index column.
(
    df_submission_reranked_all.to_pandas()
    .to_csv("covisit_top200_reranker_rank+42feat_submission.csv.gz",
            compression="gzip", index=False)
)