In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
from pathlib import Path
from collections import Counter

from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import dill
import pandas as pd
import polars as pl
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import GroupKFold

import lightgbm as lgb

import pytorch_lightning as L
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns

from tasks.data.dataset.mappers import EntityEncoder
from tasks.jobs import Splitter

In [3]:
from otto_utils import *
from otto_features import *
from otto_candidates_covisit import *
from otto_lgbm_utils import *
from otto_implicit import *

In [4]:
%env PYTHONHASHSEED=1
from otto_word2vec import *

w2vec = Word2Vec.load("__subm__word2vec_window=10_negative=20.w2v")

env: PYTHONHASHSEED=1


# Load data

In [5]:
# Read the preprocessed train and test event logs. The path constants are not defined
# in this notebook; presumably they come from one of the star-imported otto_* modules.
df_train, df_test = (
    pl.read_parquet(parquet_path, use_pyarrow=True)
    for parquet_path in (TRAIN_PROCESSED, TEST_PROCESSED)
)

# Features

## user features

In [6]:
df = df_test.unique().sort(["session", "ts"])

In [7]:
df_user_action_stats_features = user_action_stats_features(df)

In [None]:
# df_user_time_distr_features = user_time_distr_features(df)

## user item features

In [8]:
df = df_test.unique().sort(["session", "ts"])

In [9]:
df_user_item_history_features = user_item_history_features(df)

In [10]:
# One row per session with the last aid seen for each action type. `df` is sorted by
# (session, ts) in the cell above, so "last" within a group is the most recent event.
# Sessions that never performed a given action type get a null in that column.
df_user_last_type_actions = (
    df.groupby(["session", "type"])
    .agg(pl.col("aid").last())
    .pivot(values="aid", index="session", columns="type")
    .rename(
        {
            # pivot names its columns after the type codes: "0", "1", "2"
            str(type_code): f"last_{action}_aid"
            for type_code, action in enumerate(["click", "cart", "order"])
        }
    )
    .sort("session")
)

## item features

In [10]:
# Events used for the item-level features: train events whose ts is on or after the
# 2022-08-08 cutoff, plus all test events, de-duplicated and ordered by (session, ts).
# NOTE(review): datetime(2022, 8, 8) is a naive datetime, so .timestamp() interprets it
# in the local timezone of the machine running the notebook; the cutoff therefore
# shifts with the host TZ. Confirm this matches how the features were built for training.
# assumes "ts" is stored in epoch milliseconds (hence the * 1000) — TODO confirm
df = pl.concat([
#     df_train,  # disabled alternative: use the whole train history instead of the tail
    df_train.filter(pl.col("ts") >= datetime(2022, 8, 8).timestamp() * 1000),
    df_test
]).unique().sort(["session", "ts"])

In [11]:
df_item_action_stats_features = item_action_stats_features(df)

In [13]:
# df_item_time_distr_features = item_time_distr_features(df)

In [12]:
df_item_n_sess_multiple_action = item_n_sessions_with_repeated_actions(df)

# Candidates

In [13]:
df = df_test.unique().sort(["session", "ts"])

In [14]:
test_df = df
# Collapse every test session into two parallel lists (aids and action types) ...
test_sessions_dict = test_df.groupby("session").agg([pl.list("aid"), pl.list("type")])
# ... and index them by session id: {session: (aid_list, type_list)}.
test_sessions_dict = {
    session_id: (aid_list, type_list)
    for session_id, aid_list, type_list in zip(
        test_sessions_dict["session"].to_list(),
        test_sessions_dict["aid"].to_list(),
        test_sessions_dict["type"].to_list(),
    )
}

## covisitation top200

In [17]:
# Use top X for clicks, carts and orders
clicks_th = 15
carts_th  = 20
orders_th = 20

def get_top(df, th):
    """Keep, for every ``aid``, only its first ``th`` rows in the frame's current order.

    The helper columns ``ones`` (constant 1) and ``rank`` (1-based running position
    of the row within its ``aid`` group) are left in the returned frame.
    """
    ones = pl.lit(1).alias("ones")
    rank_within_aid = pl.col("ones").cumsum().over("aid").alias("rank")
    ranked = df.with_column(ones).with_column(rank_within_aid)
    return ranked.filter(pl.col("rank") <= th)

TOPK_RECOMMEND = 20
TOPK_RERANK = 40

In [18]:
carts_orders = pl.read_parquet("__subm__covisit_carts_orders_all_v1.parquet")
buys2buys = pl.read_parquet("__subm__covisit_buys2buys_all_v1.parquet")
clicks = pl.read_parquet("__subm__covisit_clicks_all_v1.parquet")

In [18]:
carts_orders_top = get_top(carts_orders, carts_th)
buys2buys_top = get_top(buys2buys, orders_th)
clicks_top = get_top(clicks, clicks_th)

In [19]:
# Most frequent aids in `df` per action type (0 = clicks, 1 = carts, 2 = orders),
# truncated to TOPK_RECOMMEND entries each.
top_clicks, top_carts, top_orders = (
    df.filter(pl.col("type") == type_code)["aid"]
    .value_counts(sort=True)[:TOPK_RECOMMEND]["aid"]
    .to_list()
    for type_code in (0, 1, 2)
)

In [20]:
covisit_rec = CovisitationRecommender(
    df_top_k_buys=carts_orders_top,
    df_top_k_buy2buy=buys2buys_top,
    df_top_k_clicks=clicks_top,
    top_carts=top_carts,
    top_orders=top_orders,
    top_clicks=top_clicks,
)

In [21]:
# Column-wise accumulators: one output row per (session, action type), holding the
# list of candidate aids and the parallel list of their ranks.
candidates_dict = {
    "session": [],
    "type": [],
    "candidates": [],
    "rank": [],
}

# Order must match the order of the `candidates` list built inside the loop.
types = ["clicks", "carts", "orders"]
# types = ["orders"]
# Number of items requested from each recommender per session (TOPK_RERANK * 5).
topk = TOPK_RERANK * 5
# topk = TOPK_RERANK

for session_id, (session_aid_list, session_type_list) in tqdm(test_sessions_dict.items()):
    rec_items_clicks = covisit_rec.recommend_clicks(session_aid_list, session_type_list, topk)
    rec_items_carts = covisit_rec.recommend_carts(session_aid_list, session_type_list, topk)
    rec_items_buys = covisit_rec.recommend_buys(session_aid_list, session_type_list, topk)

    candidates = [rec_items_clicks, rec_items_carts, rec_items_buys]
#     candidates = [rec_items_buys]
    # A candidate's rank is its 1-based position in the list returned for that type.
    ranks = [
        np.arange(1, len(rec_items) + 1).tolist()
        for rec_items in candidates
    ]
    
    # Three rows per session, one for each entry of `types`.
    candidates_dict["session"].extend([session_id] * len(types))
    candidates_dict["type"].extend(types)
    candidates_dict["candidates"].extend(candidates)
    candidates_dict["rank"].extend(ranks)

# List-valued columns "candidates" and "rank"; they are exploded later when merged.
df_candidates_covisit = pl.DataFrame(candidates_dict)

100%|██████████| 1671803/1671803 [02:19<00:00, 11945.40it/s]


## implicit i2i k=100 top100

In [22]:
from implicit.nearest_neighbours import CosineRecommender

### old weights

In [23]:
df = pl.concat([df_train, df_test]).unique().sort(["session", "ts"])
df = implicit_old_weight_interactions(df)

In [24]:
train_data = make_sparse_matrix(df)
i2i = CosineRecommender(K=100)
i2i.fit(train_data)

  0%|          | 0/1855603 [00:00<?, ?it/s]

In [25]:
df_candidates_i2i_old = implicit_batch_candidates_for_all_types(
    model=i2i, model_name="i2i_old",
    train_data=train_data, test_users=list(test_sessions_dict.keys()),
    topk=100,
)

100%|██████████| 1672/1672 [02:25<00:00, 11.52it/s]


### new weights

In [26]:
df = pl.concat([df_train, df_test]).unique().sort(["session", "ts"])
df = implicit_new_weight_interactions(df)

In [27]:
train_data = make_sparse_matrix(df)
i2i = CosineRecommender(K=100)
i2i.fit(train_data)

  0%|          | 0/1855603 [00:00<?, ?it/s]

In [28]:
df_candidates_i2i_new = implicit_batch_candidates_for_all_types(
    model=i2i, model_name="i2i_new",
    train_data=train_data, test_users=list(test_sessions_dict.keys()),
    topk=100,
)

100%|██████████| 1672/1672 [02:27<00:00, 11.34it/s]


## Initial covisit top20 

In [21]:
# Baseline submission built directly from the covisitation recommender (no reranking):
# one row per "<session>_<type>" with a space-separated list of recommended aids.
submission_dict = {
    "session_type": [],
    "labels": [],
}

# Order must match the order of the recommendation lists joined into `labels_list`.
types = ["clicks", "carts", "orders"]
# NOTE: this rebinds the notebook-level `topk` also set by the candidate-generation cell.
topk = TOPK_RECOMMEND

for session_id, (session_aid_list, session_type_list) in tqdm(test_sessions_dict.items()):
    rec_items_clicks = covisit_rec.recommend_clicks(session_aid_list, session_type_list, topk)
    rec_items_carts = covisit_rec.recommend_carts(session_aid_list, session_type_list, topk)
    rec_items_buys = covisit_rec.recommend_buys(session_aid_list, session_type_list, topk)

    session_types = [f"{session_id}_{t}" for t in types]
    labels_list = [
        " ".join(str(aid) for aid in rec_items)
        for rec_items in [rec_items_clicks, rec_items_carts, rec_items_buys]
    ]
    
    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

# NOTE(review): the name says "valid" but the loop iterates test_sessions_dict — confirm naming.
df_submission_valid = pl.DataFrame(submission_dict)

100%|██████████| 1671803/1671803 [02:04<00:00, 13410.15it/s]


# Reranker

In [29]:
# Merge every candidate source into a single frame keyed by (session, aid):
#   * covisitation candidates for orders / carts / clicks, each exploded to one row per
#     candidate and contributing its own rank column (rank_orders / rank_carts / rank_clicks);
#   * implicit item-to-item candidates ("new" and "old" interaction weights), each
#     contributing its own score column.
# Outer joins keep a candidate when at least one source produced it.
# NOTE(review): fill_null(999) is applied to every column, so it marks "not proposed by
# this source" for the rank columns AND for the i2i score columns (see the 999.0 values
# in the displayed frame). Whether a large value is the intended missing marker for a
# similarity score should be confirmed against how the rerankers were trained.
df_candidates_for_orders_all = (
    df_candidates_covisit
    .filter(pl.col("type") == "orders")
    .drop("type")
    .explode(["candidates", "rank"])
    .rename({"candidates": "aid", "rank": "rank_orders"})
    .join(
        (
            df_candidates_covisit
            .filter(pl.col("type") == "carts")
            .drop("type")
            .explode(["candidates", "rank"])
            .rename({"candidates": "aid", "rank": "rank_carts"})
        ),
        on=["session", "aid"],
        how="outer"
    )
    .join(
        (
            df_candidates_covisit
            .filter(pl.col("type") == "clicks")
            .drop("type")
            .explode(["candidates", "rank"])
            .rename({"candidates": "aid", "rank": "rank_clicks"})
        ),
        on=["session", "aid"],
        how="outer"
    )
    .join(
        (
            df_candidates_i2i_new
            .explode(["aid", "i2i_new_score"])
        ),
        on=["session", "aid"],
        how="outer"
    )
    .join(
        (
            df_candidates_i2i_old
            .explode(["aid", "i2i_old_score"])
        ),
        on=["session", "aid"],
        how="outer"
    )
    .fill_null(999)
    # Guard against repeated (session, aid) pairs; the last occurrence wins.
    .unique(subset=["session", "aid"], keep="last")
    .sort("session")
)

In [30]:
df_candidates_for_orders_all

session,aid,rank_orders,rank_carts,rank_clicks,i2i_new_score,i2i_old_score
i64,i64,i64,i64,i64,f64,f64
12899779,1615858,999,999,999,0.013027,999.0
12899779,110837,999,999,999,0.013067,0.339032
12899779,779973,999,999,999,0.013039,999.0
12899779,678095,999,999,999,0.013587,999.0
12899779,1515928,999,999,999,0.013443,999.0
12899779,39615,999,999,999,0.016761,999.0
12899779,1328718,999,999,999,0.013632,0.479463
12899779,687401,999,999,999,0.013863,999.0
12899779,475447,999,999,999,0.014789,999.0
12899779,1648283,999,999,999,0.014888,999.0


In [63]:
df_candidates_for_orders_all.write_parquet("__subm__candidates_reranking_v3_1.parquet")

In [19]:
##

df_candidates_for_orders_all = pl.read_parquet("__subm__candidates_reranking_v3_1.parquet")

In [20]:
def cand_w2v_features(df_candidates):
    """Word2vec cosine similarity between each candidate and the session's last actions.

    For every (session, aid) candidate, computes the similarity (via ``w2v_cosine_sim``
    with the notebook-level ``w2vec`` model) between the candidate aid and the
    session's last clicked / carted / ordered aid taken from
    ``df_user_last_type_actions``. Rows whose session has no last action of a given
    type get -999 in the corresponding column.

    Returns a frame with columns ``session``, ``aid`` and the three
    ``w2v_cosine_sim_last_*_aid`` columns.
    """
    last_aid_cols = ["last_click_aid", "last_cart_aid", "last_order_aid"]

    # De-duplicate candidates and attach the per-session "last aid" columns.
    features = (
        df_candidates
        .sort("session")
        .unique(subset=["session", "aid"], keep="last")
        .fill_null(0)
        .join(df_user_last_type_actions, on="session", how="left")
    )

    for last_col in last_aid_cols:
        sim_col = f"w2v_cosine_sim_{last_col}"

        # Only pairs that have a reference aid can be scored.
        # aid == -1 is excluded as well (presumably a placeholder id — TODO confirm).
        scorable = pl.col(last_col).is_not_null() & (pl.col("aid") != -1)
        pairs = features.filter(scorable).select(["session", "aid", last_col])

        similarities = w2v_cosine_sim(
            w2vec, pairs["aid"].to_numpy(), pairs[last_col].to_numpy()
        )
        pairs = pairs.with_column(pl.Series(similarities).alias(sim_col))

        # Bring the scores back; rows that were not scored become -999.
        features = (
            features
            .join(pairs, on=["session", "aid", last_col], how="left")
            .with_column(pl.col(sim_col).fill_null(pl.lit(-999)))
        )

    return features.select(
        ["session", "aid"] + [f"w2v_cosine_sim_{last_col}" for last_col in last_aid_cols]
    )

In [21]:
def cand_item_item_features(df_candidates):
    """Attach covisitation weights between each candidate and the session's last actions.

    For every (session, aid) candidate and each of the session's last clicked /
    carted / ordered aids, looks up the pair (aid, last aid) in the three
    notebook-level covisitation tables (``carts_orders``, ``buys2buys``, ``clicks``)
    and adds the matching ``weight`` as ``user_<last col>_<table>_weight``.
    Pairs absent from a table get 0. All input columns are kept.
    """
    last_aid_cols = ["last_click_aid", "last_cart_aid", "last_order_aid"]
    # (name used in the output column, covisitation table), in output-column order
    covisit_tables = [
        ("carts_orders", carts_orders),
        ("buy2buy", buys2buys),
        ("click", clicks),
    ]

    # De-duplicate candidates and attach the per-session "last aid" columns.
    features = (
        df_candidates
        .sort("session")
        .unique(subset=["session", "aid"], keep="last")
        .fill_null(0)
        .join(df_user_last_type_actions, on="session", how="left")
    )

    print("join item item weights")
    for last_col in last_aid_cols:
        for table_name, table in covisit_tables:
            weight_col = f"user_{last_col}_{table_name}_weight"
            features = features.join(
                table.rename({"weight": weight_col}),
                left_on=["aid", last_col],
                right_on=["aid", "aid_right"],
                how="left",
            )

    # The last-aid columns were only needed as join keys.
    return features.drop(last_aid_cols).fill_null(0)

In [22]:
df_candidates_item_item_features = cand_item_item_features(df_candidates_for_orders_all)

join item item weights


In [23]:
(
    df_candidates_item_item_features
    .write_parquet("__subm__features_item_item_weights_candidates_reranking_v3_1.parquet")
)

In [24]:
df_candidates_item_item_features

session,aid,rank_orders,rank_carts,rank_clicks,i2i_new_score,i2i_old_score,user_last_click_aid_carts_orders_weight,user_last_click_aid_buy2buy_weight,user_last_click_aid_click_weight,user_last_cart_aid_carts_orders_weight,user_last_cart_aid_buy2buy_weight,user_last_cart_aid_click_weight,user_last_order_aid_carts_orders_weight,user_last_order_aid_buy2buy_weight,user_last_order_aid_click_weight
i64,i64,i64,i64,i64,f64,f64,f64,i32,f64,f64,i32,f64,f64,i32,f64
12899779,1615858,999,999,999,0.013027,999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0
12899779,110837,999,999,999,0.013067,0.339032,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0
12899779,779973,999,999,999,0.013039,999.0,0.5,0,2.514567,0.0,0,0.0,0.0,0,0.0
12899779,678095,999,999,999,0.013587,999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0
12899779,1515928,999,999,999,0.013443,999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0
12899779,39615,999,999,999,0.016761,999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0
12899779,1328718,999,999,999,0.013632,0.479463,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0
12899779,687401,999,999,999,0.013863,999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0
12899779,475447,999,999,999,0.014789,999.0,0.5,0,1.959381,0.0,0,0.0,0.0,0,0.0
12899779,1648283,999,999,999,0.014888,999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0


In [28]:
def cand_other_features(df_stage_2_dataset_train):
    """Left-join the precomputed user, item and user-item feature tables onto candidates.

    Join order is kept fixed because clashing column names from later tables receive
    a ``_right`` suffix. Missing values are replaced by 0 and the result is sorted by
    ``session``.
    """
    # Per-session features.
    enriched = df_stage_2_dataset_train.join(
        df_user_action_stats_features, on="session", how="left"
    )
    # df_user_time_distr_features is deliberately not joined (disabled).

    # Per-item features.
    # df_item_time_distr_features is deliberately not joined: redundant feature.
    for item_table in (df_item_action_stats_features, df_item_n_sess_multiple_action):
        enriched = enriched.join(item_table, on="aid", how="left")

    # Per (session, item) history features.
    enriched = enriched.join(
        df_user_item_history_features, on=["session", "aid"], how="left"
    )
    return enriched.sort("session").fill_null(0)

In [None]:
df_candidates_other_features = cand_other_features(df_candidates_item_item_features.select(["session", "aid"]))

In [None]:
# Persist the user / item / user-item feature table to its own file.
# This cell used to reuse the item-item weights file name, which would overwrite that
# cache with a different schema and break the later read in "batch predictions".
(
    df_candidates_other_features
    .write_parquet("__subm__features_other_candidates_reranking_v3_1.parquet")
)

In [None]:
# Compute the word2vec similarity features batch by batch over the test sessions.
test_users = list(test_sessions_dict.keys())

# Number of sessions per batch (presumably chosen to limit peak memory — TODO confirm).
b_sz = 100000
# Collects one feature frame per batch; concatenated in a later cell.
df_candidates_w2v_features = []
for test_session_start in tqdm(range(0, len(test_users), b_sz)):
    test_sessions = test_users[test_session_start : test_session_start + b_sz]
    # Candidates belonging to the sessions of this batch only.
    df_candidates_batch = df_candidates_for_orders_all.filter(pl.col("session").is_in(test_sessions))
    df_cand_w2v_features = cand_w2v_features(df_candidates_batch)
    df_candidates_w2v_features.append(df_cand_w2v_features)

In [23]:
# Stack the per-batch frames into a single DataFrame.
# NOTE(review): this rebinds the name from a list to a DataFrame, so the cell is not
# idempotent — re-running it without re-running the batching loop hands pl.concat a
# DataFrame instead of a list of frames.
df_candidates_w2v_features = pl.concat(df_candidates_w2v_features)

In [25]:
(
    df_candidates_w2v_features
    .write_parquet("__subm__features_w2v_cosine_sim_candidates_reranking_v3_1.parquet")
)

## batch predictions

In [15]:
df_candidates_item_item_features = pl.read_parquet("__subm__features_item_item_weights_candidates_reranking_v3_1.parquet")
df_candidates_w2v_features = pl.read_parquet("__subm__features_w2v_cosine_sim_candidates_reranking_v3_1.parquet")

In [21]:
def cand_other_features(df_stage_2_dataset_train):
    """Left-join the precomputed user, item and user-item feature tables onto candidates.

    Join order is kept fixed because clashing column names from later tables receive
    a ``_right`` suffix. Missing values are replaced by 0 and the result is sorted by
    ``session``.

    NOTE(review): a function with this name and the same body is also defined in an
    earlier cell; presumably it is repeated so this section can run after only
    loading the cached feature files. Consider keeping a single definition.
    """
    # Per-session features.
    enriched = df_stage_2_dataset_train.join(
        df_user_action_stats_features, on="session", how="left"
    )
    # df_user_time_distr_features is deliberately not joined (disabled).

    # Per-item features.
    # df_item_time_distr_features is deliberately not joined: redundant feature.
    for item_table in (df_item_action_stats_features, df_item_n_sess_multiple_action):
        enriched = enriched.join(item_table, on="aid", how="left")

    # Per (session, item) history features.
    enriched = enriched.join(
        df_user_item_history_features, on=["session", "aid"], how="left"
    )
    return enriched.sort("session").fill_null(0)


In [17]:
df_candidates_item_item_features.head()

session,aid,rank_orders,rank_carts,rank_clicks,i2i_new_score,i2i_old_score,user_last_click_aid_carts_orders_weight,user_last_click_aid_buy2buy_weight,user_last_click_aid_click_weight,user_last_cart_aid_carts_orders_weight,user_last_cart_aid_buy2buy_weight,user_last_cart_aid_click_weight,user_last_order_aid_carts_orders_weight,user_last_order_aid_buy2buy_weight,user_last_order_aid_click_weight
i64,i64,i64,i64,i64,f64,f64,f64,i32,f64,f64,i32,f64,f64,i32,f64
12899779,1615858,999,999,999,0.013027,999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0
12899779,110837,999,999,999,0.013067,0.339032,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0
12899779,779973,999,999,999,0.013039,999.0,0.5,0,2.514567,0.0,0,0.0,0.0,0,0.0
12899779,678095,999,999,999,0.013587,999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0
12899779,1515928,999,999,999,0.013443,999.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0


In [18]:
df_candidates_w2v_features.head()

session,aid,w2v_cosine_sim_last_click_aid,w2v_cosine_sim_last_cart_aid,w2v_cosine_sim_last_order_aid
i64,i64,f64,f64,f64
12899779,1615858,0.810406,-999.0,-999.0
12899779,110837,0.787047,-999.0,-999.0
12899779,779973,0.717006,-999.0,-999.0
12899779,678095,0.871848,-999.0,-999.0
12899779,1515928,0.742414,-999.0,-999.0


In [19]:
def lgb_cv_folds_predictions(df, model_file_tmplt, action_type, n_folds=5):
    """Score candidates with every per-fold LightGBM model and average the scores.

    Parameters
    ----------
    df : pl.DataFrame
        Candidate rows. Must contain ``session`` and ``aid``; every other column is
        passed to the models as a feature, in the frame's column order.
    model_file_tmplt : str
        ``str.format`` template for the model path with ``{act_type}`` and ``{fold}``
        placeholders.
    action_type : str
        Substituted for ``{act_type}`` and used to name the output column.
    n_folds : int, default 5
        Number of fold models to load (folds ``0 .. n_folds - 1``).

    Returns
    -------
    pl.DataFrame
        Columns ``session``, ``aid`` and ``f"{action_type}_score"`` (mean of the fold
        predictions), with rows in the same order as ``df``.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError(f"n_folds must be >= 1, got {n_folds}")

    feature_cols = df.drop(["session", "aid"]).columns
    X_test = df[feature_cols].to_numpy()

    # Predictions come back in the row order of X_test, i.e. of `df`, so they can be
    # accumulated positionally. No per-fold join on (session, aid) is needed, and a
    # join would multiply rows if a key pair were ever repeated.
    score_sum = np.zeros(X_test.shape[0], dtype=np.float64)
    for fold in tqdm(range(n_folds)):
        model_file = model_file_tmplt.format(act_type=action_type, fold=fold)
        gbm_ranking = lgb.Booster(model_file=model_file)
        score_sum += gbm_ranking.predict(X_test)

    mean_scores = score_sum / n_folds
    return (
        df.select(["session", "aid"])
        .with_columns([pl.Series(mean_scores).alias(f"{action_type}_score")])
    )

In [None]:
# Plain ``str.format`` template, like the carts/orders templates: the placeholders
# {act_type} and {fold} are filled in later via .format(act_type=..., fold=...).
# It must not be an f-string: that would evaluate the names at definition time
# (NameError if undefined) and leave nothing for .format() to substitute.
clicks_model_file_tmplt = (
    "__model__{act_type}"
    "_covisit_all_types_merged_top200_reranker_rank+42feat_fold{fold}.lgb"
)

In [22]:
# Path template of the per-fold carts reranker; {act_type} and {fold} are filled in
# with str.format when a model is loaded.
carts_model_file_tmplt = "".join([
    "__model__{act_type}",
    "_covisit_all_types_merged_top200+i2i_old_new_k=100_top100",
    "_reranker_rank+i2i_score+w2v_cosine+42feat_fold{fold}.lgb",
])

In [23]:
# Path template of the per-fold orders reranker; {act_type} and {fold} are filled in
# with str.format when a model is loaded.
orders_model_file_tmplt = "".join([
    "__model__{act_type}",
    "_covisit_all_types_merged_top200+i2i_old_new_k=100_top100",
    "_reranker_rank+i2i_score+w2v_cosine+42feat_fold{fold}.lgb",
])

In [32]:
def collect_features(df_candidates_batch):
    """Assemble the reranker input frame for one batch of (session, aid) candidates.

    Joins the batch (on ``session`` and ``aid``, default join type) with the cached
    item-item weight features, the cached word2vec similarity features and the
    user / item / user-item features computed for this batch, then selects
    ``session``, ``aid`` and the feature columns in a fixed order.
    """
    last_actions = ("click", "cart", "order")

    # The groups below are concatenated in exactly this order; the order of the
    # selected columns is the order in which features reach the models.
    candidate_source_cols = [
        'rank_orders', 'rank_carts', 'rank_clicks', 'i2i_new_score', 'i2i_old_score',
    ]
    w2v_cols = [f"w2v_cosine_sim_last_{action}_aid" for action in last_actions]
    covisit_weight_cols = [
        f"user_last_{action}_aid_{table}_weight"
        for action in last_actions
        for table in ("carts_orders", "buy2buy", "click")
    ]
    user_cols = [
        'user_lifetime_days',
        'user_n_actions',
        'user_n_uniq_items',
        'user_buys_rate',
        'user_uniq_clicks',
        'user_uniq_carts',
        'user_uniq_orders',
        'cl_cnt',
        'ca_cnt',
        'or_cnt',
        'user_ca_cl_ratio',
        'user_or_cl_ratio',
        'user_or_ca_ratio',
    ]
    item_cols = [
        'item_lifetime_days',
        'item_n_actions',
        'item_n_uniq_users',
        'item_buys_rate',
        'item_uniq_orders',
        'item_uniq_carts',
        'item_uniq_clicks',
        # "_right" suffix: same-named columns already exist from the user table
        'or_cnt_right',
        'ca_cnt_right',
        'cl_cnt_right',
        'item_ca_cl_ratio',
        'item_or_cl_ratio',
        'item_or_ca_ratio',
        'item_n_sess_multi_clicks',
        'item_n_sess_multi_carts',
        'item_n_sess_multi_buys',
    ]
    user_item_cols = [
        'user_item_log_recency_score',
        'user_item_type_weighted_log_recency_score',
        'user_item_is_in_history',
    ]
    feature_cols = (
        candidate_source_cols
        + w2v_cols
        + covisit_weight_cols
        + user_cols
        + item_cols
        + user_item_cols
    )

    other_features = cand_other_features(df_candidates_batch)
    key = ["session", "aid"]
    return (
        df_candidates_batch
        .join(df_candidates_item_item_features, on=key)
        .join(df_candidates_w2v_features, on=key)
        .join(other_features, on=key)
        .select(key + feature_cols)
    )

In [37]:
# Score all test candidates batch by batch with the carts and orders rerankers.
test_users = list(test_sessions_dict.keys())

# Number of sessions per batch.
b_sz = 100000
# df_candidates_scores = df_candidates_item_item_features.select(["session", "aid"])

# One score frame per batch and action type; no concatenation happens in this cell.
df_candidates_carts_scores = []
df_candidates_orders_scores = []

for test_session_start in tqdm(range(0, len(test_users), b_sz)):
    test_sessions = test_users[test_session_start : test_session_start + b_sz]
    # (session, aid) keys of the candidates belonging to this batch of sessions.
    df_candidates_batch = (
        df_candidates_item_item_features.filter(pl.col("session").is_in(test_sessions))
        .select(["session", "aid"])
    )

    # collect features for batch
    df_stage_2_batch = collect_features(df_candidates_batch)
    
    # NOTE: the printed messages mention df_candidates_scores, but the results are
    # appended to df_candidates_carts_scores / df_candidates_orders_scores.
    print("predict carts -> save scores to df_candidates_scores")
    df_carts_scores = lgb_cv_folds_predictions(df_stage_2_batch, carts_model_file_tmplt, "carts")
    df_candidates_carts_scores.append(df_carts_scores)
    
    print("predict orders -> save scores to df_candidates_scores")
    df_orders_scores = lgb_cv_folds_predictions(df_stage_2_batch, orders_model_file_tmplt, "orders")
    df_candidates_orders_scores.append(df_orders_scores)


  0%|          | 0/17 [00:00<?, ?it/s]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:48, 12.16s/it][A
 40%|████      | 2/5 [00:22<00:33, 11.02s/it][A
 60%|██████    | 3/5 [00:34<00:23, 11.55s/it][A
 80%|████████  | 4/5 [00:53<00:14, 14.63s/it][A
100%|██████████| 5/5 [01:08<00:00, 13.67s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:12, 18.04s/it][A
 40%|████      | 2/5 [00:26<00:37, 12.63s/it][A
 60%|██████    | 3/5 [00:40<00:26, 13.28s/it][A
 80%|████████  | 4/5 [00:48<00:11, 11.07s/it][A
100%|██████████| 5/5 [00:59<00:00, 11.98s/it][A
  6%|▌         | 1/17 [02:24<38:27, 144.24s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:50, 12.63s/it][A
 40%|████      | 2/5 [00:23<00:34, 11.34s/it][A
 60%|██████    | 3/5 [00:35<00:23, 11.84s/it][A
 80%|████████  | 4/5 [00:54<00:14, 14.84s/it][A
100%|██████████| 5/5 [01:09<00:00, 13.85s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:12, 18.05s/it][A
 40%|████      | 2/5 [00:27<00:38, 12.92s/it][A
 60%|██████    | 3/5 [00:41<00:27, 13.53s/it][A
 80%|████████  | 4/5 [00:49<00:11, 11.35s/it][A
100%|██████████| 5/5 [01:01<00:00, 12.28s/it][A
 12%|█▏        | 2/17 [04:50<36:20, 145.38s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:50, 12.54s/it][A
 40%|████      | 2/5 [00:22<00:33, 11.22s/it][A
 60%|██████    | 3/5 [00:35<00:23, 11.73s/it][A
 80%|████████  | 4/5 [00:54<00:14, 14.74s/it][A
100%|██████████| 5/5 [01:08<00:00, 13.79s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:12, 18.15s/it][A
 40%|████      | 2/5 [00:27<00:39, 13.12s/it][A
 60%|██████    | 3/5 [00:41<00:26, 13.45s/it][A
 80%|████████  | 4/5 [00:49<00:11, 11.41s/it][A
100%|██████████| 5/5 [01:01<00:00, 12.31s/it][A
 18%|█▊        | 3/17 [07:16<34:01, 145.82s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:13<00:53, 13.44s/it][A
 40%|████      | 2/5 [00:24<00:35, 11.97s/it][A
 60%|██████    | 3/5 [00:37<00:25, 12.64s/it][A
 80%|████████  | 4/5 [00:58<00:15, 15.62s/it][A
100%|██████████| 5/5 [01:13<00:00, 14.71s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:19<01:16, 19.07s/it][A
 40%|████      | 2/5 [00:29<00:41, 13.90s/it][A
 60%|██████    | 3/5 [00:44<00:28, 14.39s/it][A
 80%|████████  | 4/5 [00:53<00:12, 12.17s/it][A
100%|██████████| 5/5 [01:05<00:00, 13.13s/it][A
 24%|██▎       | 4/17 [09:51<32:22, 149.45s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:51, 12.92s/it][A
 40%|████      | 2/5 [00:23<00:34, 11.66s/it][A
 60%|██████    | 3/5 [00:36<00:24, 12.13s/it][A
 80%|████████  | 4/5 [00:55<00:15, 15.04s/it][A
100%|██████████| 5/5 [01:10<00:00, 14.13s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:13, 18.26s/it][A
 40%|████      | 2/5 [00:27<00:39, 13.11s/it][A
 60%|██████    | 3/5 [00:41<00:27, 13.54s/it][A
 80%|████████  | 4/5 [00:49<00:11, 11.37s/it][A
100%|██████████| 5/5 [01:01<00:00, 12.34s/it][A
 29%|██▉       | 5/17 [12:20<29:48, 149.03s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:13<00:52, 13.13s/it][A
 40%|████      | 2/5 [00:23<00:35, 11.79s/it][A
 60%|██████    | 3/5 [00:36<00:24, 12.31s/it][A
 80%|████████  | 4/5 [00:56<00:15, 15.23s/it][A
100%|██████████| 5/5 [01:11<00:00, 14.33s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:14, 18.66s/it][A
 40%|████      | 2/5 [00:28<00:40, 13.42s/it][A
 60%|██████    | 3/5 [00:42<00:27, 13.92s/it][A
 80%|████████  | 4/5 [00:51<00:11, 11.73s/it][A
100%|██████████| 5/5 [01:03<00:00, 12.68s/it][A
 35%|███▌      | 6/17 [14:50<27:26, 149.67s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:49, 12.36s/it][A
 40%|████      | 2/5 [00:22<00:33, 11.06s/it][A
 60%|██████    | 3/5 [00:35<00:23, 11.73s/it][A
 80%|████████  | 4/5 [00:54<00:14, 14.62s/it][A
100%|██████████| 5/5 [01:08<00:00, 13.70s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:12, 18.21s/it][A
 40%|████      | 2/5 [00:27<00:38, 12.94s/it][A
 60%|██████    | 3/5 [00:41<00:27, 13.55s/it][A
 80%|████████  | 4/5 [00:49<00:11, 11.34s/it][A
100%|██████████| 5/5 [01:01<00:00, 12.28s/it][A
 41%|████      | 7/17 [17:17<24:45, 148.51s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:50, 12.51s/it][A
 40%|████      | 2/5 [00:23<00:34, 11.37s/it][A
 60%|██████    | 3/5 [00:35<00:23, 11.91s/it][A
 80%|████████  | 4/5 [00:54<00:14, 14.81s/it][A
100%|██████████| 5/5 [01:09<00:00, 13.83s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:12, 18.03s/it][A
 40%|████      | 2/5 [00:27<00:38, 12.90s/it][A
 60%|██████    | 3/5 [00:41<00:26, 13.34s/it][A
 80%|████████  | 4/5 [00:49<00:11, 11.22s/it][A
100%|██████████| 5/5 [01:00<00:00, 12.15s/it][A
 47%|████▋     | 8/17 [19:43<22:09, 147.73s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:51, 12.96s/it][A
 40%|████      | 2/5 [00:23<00:35, 11.68s/it][A
 60%|██████    | 3/5 [00:36<00:24, 12.24s/it][A
 80%|████████  | 4/5 [00:56<00:15, 15.18s/it][A
100%|██████████| 5/5 [01:11<00:00, 14.21s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:13, 18.47s/it][A
 40%|████      | 2/5 [00:28<00:39, 13.32s/it][A
 60%|██████    | 3/5 [00:42<00:27, 13.77s/it][A
 80%|████████  | 4/5 [00:50<00:11, 11.61s/it][A
100%|██████████| 5/5 [01:02<00:00, 12.54s/it][A
 53%|█████▎    | 9/17 [22:12<19:46, 148.28s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:51, 12.84s/it][A
 40%|████      | 2/5 [00:23<00:34, 11.51s/it][A
 60%|██████    | 3/5 [00:36<00:24, 12.04s/it][A
 80%|████████  | 4/5 [00:55<00:14, 14.99s/it][A
100%|██████████| 5/5 [01:10<00:00, 14.06s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:13, 18.27s/it][A
 40%|████      | 2/5 [00:27<00:39, 13.22s/it][A
 60%|██████    | 3/5 [00:42<00:27, 13.62s/it][A
 80%|████████  | 4/5 [00:50<00:11, 11.51s/it][A
100%|██████████| 5/5 [01:02<00:00, 12.41s/it][A
 59%|█████▉    | 10/17 [24:40<17:17, 148.22s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:49, 12.37s/it][A
 40%|████      | 2/5 [00:22<00:33, 11.19s/it][A
 60%|██████    | 3/5 [00:35<00:23, 11.78s/it][A
 80%|████████  | 4/5 [00:54<00:14, 14.71s/it][A
100%|██████████| 5/5 [01:08<00:00, 13.72s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:11, 17.87s/it][A
 40%|████      | 2/5 [00:27<00:38, 12.74s/it][A
 60%|██████    | 3/5 [00:40<00:26, 13.21s/it][A
 80%|████████  | 4/5 [00:48<00:11, 11.08s/it][A
100%|██████████| 5/5 [01:00<00:00, 12.01s/it][A
 65%|██████▍   | 11/17 [27:05<14:42, 147.11s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:51, 12.89s/it][A
 40%|████      | 2/5 [00:23<00:34, 11.51s/it][A
 60%|██████    | 3/5 [00:36<00:24, 12.08s/it][A
 80%|████████  | 4/5 [00:55<00:15, 15.03s/it][A
100%|██████████| 5/5 [01:10<00:00, 14.12s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:13, 18.39s/it][A
 40%|████      | 2/5 [00:28<00:39, 13.32s/it][A
 60%|██████    | 3/5 [00:42<00:27, 13.71s/it][A
 80%|████████  | 4/5 [00:50<00:11, 11.53s/it][A
100%|██████████| 5/5 [01:02<00:00, 12.45s/it][A
 71%|███████   | 12/17 [29:34<12:18, 147.60s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:50, 12.66s/it][A
 40%|████      | 2/5 [00:23<00:34, 11.39s/it][A
 60%|██████    | 3/5 [00:35<00:23, 11.99s/it][A
 80%|████████  | 4/5 [00:55<00:14, 14.90s/it][A
100%|██████████| 5/5 [01:09<00:00, 13.95s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:13, 18.25s/it][A
 40%|████      | 2/5 [00:27<00:39, 13.08s/it][A
 60%|██████    | 3/5 [00:41<00:27, 13.57s/it][A
 80%|████████  | 4/5 [00:50<00:11, 11.50s/it][A
100%|██████████| 5/5 [01:01<00:00, 12.40s/it][A
 76%|███████▋  | 13/17 [32:01<09:50, 147.63s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:51, 12.97s/it][A
 40%|████      | 2/5 [00:23<00:34, 11.63s/it][A
 60%|██████    | 3/5 [00:36<00:24, 12.18s/it][A
 80%|████████  | 4/5 [00:56<00:15, 15.15s/it][A
100%|██████████| 5/5 [01:10<00:00, 14.17s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:13, 18.40s/it][A
 40%|████      | 2/5 [00:28<00:39, 13.30s/it][A
 60%|██████    | 3/5 [00:42<00:27, 13.85s/it][A
 80%|████████  | 4/5 [00:50<00:11, 11.61s/it][A
100%|██████████| 5/5 [01:02<00:00, 12.55s/it][A
 82%|████████▏ | 14/17 [34:31<07:24, 148.21s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:51, 12.95s/it][A
 40%|████      | 2/5 [00:23<00:34, 11.61s/it][A
 60%|██████    | 3/5 [00:36<00:24, 12.23s/it][A
 80%|████████  | 4/5 [00:56<00:15, 15.11s/it][A
100%|██████████| 5/5 [01:10<00:00, 14.13s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:14, 18.74s/it][A
 40%|████      | 2/5 [00:28<00:41, 13.74s/it][A
 60%|██████    | 3/5 [00:43<00:28, 14.14s/it][A
 80%|████████  | 4/5 [00:52<00:11, 11.89s/it][A
100%|██████████| 5/5 [01:03<00:00, 12.77s/it][A
 88%|████████▊ | 15/17 [37:01<04:57, 148.89s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:48, 12.01s/it][A
 40%|████      | 2/5 [00:21<00:31, 10.59s/it][A
 60%|██████    | 3/5 [00:33<00:22, 11.22s/it][A
 80%|████████  | 4/5 [00:52<00:14, 14.20s/it][A
100%|██████████| 5/5 [01:06<00:00, 13.25s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:10, 17.62s/it][A
 40%|████      | 2/5 [00:26<00:37, 12.41s/it][A
 60%|██████    | 3/5 [00:39<00:25, 12.83s/it][A
 80%|████████  | 4/5 [00:47<00:10, 10.68s/it][A
100%|██████████| 5/5 [00:58<00:00, 11.63s/it][A
 94%|█████████▍| 16/17 [39:22<02:26, 146.34s/it]

predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:09<00:36,  9.19s/it][A
 40%|████      | 2/5 [00:16<00:24,  8.18s/it][A
 60%|██████    | 3/5 [00:25<00:17,  8.59s/it][A
 80%|████████  | 4/5 [00:39<00:10, 10.64s/it][A
100%|██████████| 5/5 [00:49<00:00,  9.97s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:13<00:52, 13.02s/it][A
 40%|████      | 2/5 [00:19<00:27,  9.32s/it][A
 60%|██████    | 3/5 [00:29<00:19,  9.65s/it][A
 80%|████████  | 4/5 [00:35<00:08,  8.17s/it][A
100%|██████████| 5/5 [00:44<00:00,  8.81s/it][A
100%|██████████| 17/17 [41:10<00:00, 145.31s/it]


In [38]:
# Stack the carts-score frames collected by the prediction loop into a single DataFrame
# (presumably a list with one frame per processed chunk — confirm against the loop above).
# The name is rebound from "collection of frames" to "one frame", so the guard makes a
# re-run of this cell a no-op instead of calling pl.concat on an already-built DataFrame.
if not isinstance(df_candidates_carts_scores, pl.DataFrame):
    df_candidates_carts_scores = pl.concat(df_candidates_carts_scores)

In [39]:
# Persist the concatenated carts scores; the "Make submission" section reads this same file back.
df_candidates_carts_scores.write_parquet("__subm__carts_scores_candidates_reranking_v3_1.parquet")

In [40]:
# Stack the orders-score frames collected by the prediction loop into a single DataFrame
# (presumably a list with one frame per processed chunk — confirm against the loop above).
# The name is rebound from "collection of frames" to "one frame", so the guard makes a
# re-run of this cell a no-op instead of calling pl.concat on an already-built DataFrame.
if not isinstance(df_candidates_orders_scores, pl.DataFrame):
    df_candidates_orders_scores = pl.concat(df_candidates_orders_scores)

In [41]:
# Persist the concatenated orders scores; the "Make submission" section reads this same file back.
df_candidates_orders_scores.write_parquet("__subm__orders_scores_candidates_reranking_v3_1.parquet")

## Make submission

In [None]:
# Reload the carts scores from the parquet file written earlier in this notebook.
# NOTE(review): presumably only needed when resuming without re-running prediction — confirm.
df_candidates_carts_scores = pl.read_parquet("__subm__carts_scores_candidates_reranking_v3_1.parquet")

In [None]:
# Reload the orders scores from the parquet file written earlier in this notebook.
# NOTE(review): presumably only needed when resuming without re-running prediction — confirm.
df_candidates_orders_scores = pl.read_parquet("__subm__orders_scores_candidates_reranking_v3_1.parquet")

### carts

In [44]:
# One row per session holding its candidate aids and their carts scores as lists.
# The frame is sorted by carts_score (descending) before grouping so each list comes out
# best-first. NOTE(review): this relies on groupby keeping the row order inside each group —
# confirm for the installed polars version.
carts_list_aggs = [pl.list("aid"), pl.list("carts_score")]
df_valid_preds_sorted = (
    df_candidates_carts_scores
    .sort("carts_score", reverse=True)
    .groupby("session")
    .agg(carts_list_aggs)
)

In [45]:
# Inspect the per-session carts ranking built in the previous cell.
df_valid_preds_sorted

session,aid,carts_score
i64,list[i64],list[f64]
13904512,"[1499058, 539232, ... 1446321]","[4.468547, 1.071693, ... -8.090623]"
13223424,"[1681602, 756239, ... 1077258]","[4.199444, 1.262667, ... -9.297122]"
13067328,"[270299, 1397613, ... 705091]","[2.861832, 2.11989, ... -9.426415]"
13412224,"[1549424, 567649, ... 381558]","[2.218644, 1.884741, ... -7.32574]"
13450240,"[193902, 187188, ... 837267]","[4.56709, 2.21168, ... -7.913151]"
14074368,"[279169, 347824, ... 1286876]","[3.971112, 1.672568, ... -7.569641]"
14269760,"[43683, 1256092, ... 208610]","[3.628027, -1.107034, ... -8.725884]"
13822720,"[878290, 1789123, ... 1810805]","[4.132432, 1.370438, ... -9.273899]"
14194944,"[434744, 627426, ... 1621972]","[2.82526, 1.350362, ... -7.724257]"
14160960,"[290668, 1392535, ... 1595238]","[2.589168, 1.644011, ... -8.742621]"


In [46]:
# Stage 2, carts: turn the reranked candidates into submission rows, keeping the
# top-`topk` aids per session (recall@20 is the target metric).
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["carts"]
topk = 20

# `df_valid_preds_sorted` is reused for both the carts and the orders ranking; fail fast if
# this cell is executed while the name holds a frame without carts scores.
assert "carts_score" in df_valid_preds_sorted.columns, (
    "df_valid_preds_sorted does not hold the carts ranking; re-run the carts sorting cell"
)

# Only `session` and `aid` are consumed below, so the score lists are dropped before
# `.rows()` converts the frame to Python tuples.
for session_id, ranked_aids in tqdm(df_valid_preds_sorted.select(["session", "aid"]).rows()):
    labels = " ".join(str(aid) for aid in ranked_aids[:topk])
    # Emit one (session_type, labels) pair per entry of `types` so both lists always stay
    # the same length, even if `types` is extended.
    for t in types:
        submission_dict["session_type"].append(f"{session_id}_{t}")
        submission_dict["labels"].append(labels)

100%|██████████| 1671803/1671803 [00:06<00:00, 257944.64it/s]


In [47]:
# Freeze the carts rows (columns: session_type, labels) into a DataFrame.
# `submission_dict` is re-created by the orders loop further down, so this must run before it.
df_submission_carts_reranked = pl.DataFrame(submission_dict)

### orders

In [49]:
# One row per session holding its candidate aids and their orders scores as lists.
# The frame is sorted by orders_score (descending) before grouping so each list comes out
# best-first. NOTE(review): this relies on groupby keeping the row order inside each group —
# confirm for the installed polars version.
# This rebinds `df_valid_preds_sorted`, replacing the carts ranking built earlier.
orders_list_aggs = [pl.list("aid"), pl.list("orders_score")]
df_valid_preds_sorted = (
    df_candidates_orders_scores
    .sort("orders_score", reverse=True)
    .groupby("session")
    .agg(orders_list_aggs)
)

In [50]:
# Peek at the first rows of the per-session orders ranking built in the previous cell.
df_valid_preds_sorted.head()

session,aid,orders_score
i64,list[i64],list[f64]
13265216,"[788882, 881836, ... 1018756]","[6.173031, 0.443758, ... -9.036292]"
14303744,"[1237195, 412500, ... 1501890]","[3.522048, -0.000175, ... -11.114448]"
13724480,"[382659, 775612, ... 708896]","[4.208566, 1.833268, ... -8.829755]"
13318464,"[1545490, 88754, ... 630417]","[3.629433, 1.646683, ... -11.168585]"
13322560,"[1163362, 496188, ... 152547]","[2.009353, 1.545776, ... -10.996039]"


In [51]:
# Stage 2, orders: turn the reranked candidates into submission rows, keeping the
# top-`topk` aids per session (recall@20 is the target metric).
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["orders"]
topk = 20

# `df_valid_preds_sorted` is reused for both the carts and the orders ranking; fail fast if
# this cell is executed while the name holds a frame without orders scores.
assert "orders_score" in df_valid_preds_sorted.columns, (
    "df_valid_preds_sorted does not hold the orders ranking; re-run the orders sorting cell"
)

# Only `session` and `aid` are consumed below, so the score lists are dropped before
# `.rows()` converts the frame to Python tuples.
for session_id, ranked_aids in tqdm(df_valid_preds_sorted.select(["session", "aid"]).rows()):
    labels = " ".join(str(aid) for aid in ranked_aids[:topk])
    # Emit one (session_type, labels) pair per entry of `types` so both lists always stay
    # the same length, even if `types` is extended.
    for t in types:
        submission_dict["session_type"].append(f"{session_id}_{t}")
        submission_dict["labels"].append(labels)

100%|██████████| 1671803/1671803 [00:06<00:00, 258567.45it/s]


In [52]:
# Freeze the orders rows (columns: session_type, labels) into a DataFrame.
# `submission_dict` must be the one filled by the orders loop directly above.
df_submission_orders_reranked = pl.DataFrame(submission_dict)

### Altogether

In [53]:
# Preview the baseline submission file that the reranked carts/orders labels are merged into below.
pd.read_csv("covisit_top200_reranker_rank+42feat_submission.csv.gz", compression="gzip")

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1253524 737445 438191 731692 1790770 166...
1,12899779_carts,59625 731692 1253524 1790770 737445 438191 166...
2,12899779_orders,59625 689970 731692 1790770 397451 1253524 469...
3,12899780_clicks,1142000 736515 973453 582732 889686 487136 141...
4,12899780_carts,1142000 736515 582732 973453 760500 1360606 17...
...,...,...
5015404,14571580_carts,202353 1314576 433425 1231403 888228 679257 68...
5015405,14571580_orders,202353 433425 1314576 1231403 888228 891417 92...
5015406,14571581_clicks,1100210 1684953 462056 1158237 1401429 622489 ...
5015407,14571581_carts,1100210 1684953 1401429 1072049 622489 462056 ...


In [54]:
# Overlay the reranked carts/orders labels on the baseline submission.
# Every baseline row is kept (left join on session_type); a row takes the reranked labels
# when the join found a match (`labels_right` is not null) and keeps its baseline labels
# otherwise. Only session_type and the resolved labels are kept in the result.
df_submission_reranked_all = (
    pl.from_pandas(
        pd.read_csv("covisit_top200_reranker_rank+42feat_submission.csv.gz", compression="gzip")
    )
    .join(
        pl.concat([df_submission_carts_reranked, df_submission_orders_reranked]),
        on="session_type",
        how="left",
    )
    .select([
        pl.col("session_type"),
        (
            pl.when(pl.col("labels_right").is_not_null())
            .then(pl.col("labels_right"))
            .otherwise(pl.col("labels"))
        ).alias("labels"),
    ])
)


In [57]:
# Spot-check the first rows of the merged submission.
df_submission_reranked_all.head()

session_type,labels
str,str
"""12899779_click...","""59625 1253524 ..."
"""12899779_carts...","""59625 731692 1..."
"""12899779_order...","""59625 1253524 ..."
"""12899780_click...","""1142000 736515..."
"""12899780_carts...","""1142000 582732..."


In [56]:
# Write the merged submission as a gzip-compressed CSV, without the pandas index column.
# LB - 0.588 (presumably the leaderboard score obtained with this file — TODO confirm)
(
    df_submission_reranked_all.to_pandas()
    .to_csv("covisit_top200_all_i2i_top_100_old_new+rank_w2v_42feat_submission.csv.gz",
            compression="gzip", index=False)
)