In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from otto_utils import *
from otto_features import *
from otto_candidates_covisit import *
from otto_lgbm_utils import *
from otto_implicit import *
from otto_reranker import *
from otto_jobs_candidates import *
from otto_jobs_datasets import *

In [3]:
# Load the processed test-set parquet file.
df_test = pl.read_parquet(TEST_PROCESSED, use_pyarrow=True)

# Sorted list of the distinct session ids; the scoring loops slice this list into batches.
test_users = (
    df_test.get_column("session")
    .unique()
    .sort()
    .to_list()
)

# Submissions

## carts + orders

In [6]:
# Run configuration for the carts + orders scoring pass.
mode = "subm"
action_type = "orders"

# Feature groups requested from gather_dataset_features.
feature_groups = [
    "cand_user_item",
    "w2v_item2item",
    "covisit_item2item",
    "other",
]

# Version tag: candidate-source names joined with "+"; it appears in the model
# and score file names built further down.
version = "+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"))

In [16]:
def lgb_cv_folds_predictions(
    df,
    model_file_tmplt,
    action_type,
    save_path: Path = Path("reranker_finale"),
    n_folds: int = 5,
):
    """Score candidates with every cross-validation fold model and average the scores.

    Args:
        df: polars DataFrame holding the key columns "session" and "aid"; every
            other column is fed to the models as a feature, in frame column
            order (assumed to match the training-time order -- TODO confirm).
        model_file_tmplt: file-name template; it is formatted with the keyword
            arguments ``action_type`` and ``fold`` to obtain each fold's file.
        action_type: used when formatting the template and to name the output
            column ``f"{action_type}_score"``.
        save_path: directory that contains the saved LightGBM model files.
        n_folds: number of fold models to load (folds ``0 .. n_folds - 1``).

    Returns:
        DataFrame with the columns "session", "aid" and
        ``f"{action_type}_score"`` (mean of the per-fold predictions), one row
        per input row, in input order.

    Raises:
        ValueError: if ``n_folds`` is smaller than 1.
    """
    # Local import keeps the function self-contained regardless of which names
    # the notebook's star imports happen to expose.
    import numpy as np

    if n_folds < 1:
        raise ValueError(f"n_folds must be >= 1, got {n_folds}")

    feature_cols = df.drop(["session", "aid"]).columns
    X_test = df[feature_cols].to_numpy()

    # Predictions come back in the row order of X_test, i.e. the row order of
    # df, so they can be accumulated positionally; no per-fold join on
    # (session, aid) is needed (such a join would also multiply rows if a key
    # pair were ever duplicated).
    score_sum = np.zeros(df.height, dtype=np.float64)
    for fold in tqdm(range(n_folds)):
        model_file = model_file_tmplt.format(action_type=action_type, fold=fold)
        gbm_ranking = lgb.Booster(model_file=(save_path / model_file).as_posix())
        score_sum += np.asarray(gbm_ranking.predict(X_test), dtype=np.float64)

    mean_scores = score_sum / n_folds

    df_valid_preds = (
        df.select(["session", "aid"])
        .with_columns([pl.Series(mean_scores).alias(f"{action_type}_score")])
    )
    return df_valid_preds

In [10]:
# File-name templates for the saved fold models. The doubled braces keep a
# literal "{fold}" placeholder in the string so the fold index can be filled in
# later with str.format.

# Orders reranker.
action_type, model_version = "orders", "v2"
orders_model_file_tmplt = (
    f"__model__{action_type}_{version}"
    f"_reranker_fold{{fold}}_{model_version}.lgb"
)
print("orders tmplt: ", orders_model_file_tmplt)

# Carts reranker. Note that action_type and model_version are left at the
# carts values once this cell has run.
action_type, model_version = "carts", "v0"
carts_model_file_tmplt = (
    f"__model__{action_type}_{version}"
    f"_reranker_fold{{fold}}_{model_version}.lgb"
)
print("carts tmplt: ", carts_model_file_tmplt)

orders tmplt:  __model__orders_covisit_all+tdidf_new_top_100+tdidf_old_top_100_reranker_fold{fold}_v2.lgb
carts tmplt:  __model__carts_covisit_all+tdidf_new_top_100+tdidf_old_top_100_reranker_fold{fold}_v0.lgb


In [18]:
b_sz = 100000  # number of test sessions scored per batch

# Per-batch score frames are collected in these lists.
df_candidates_clicks_scores = []
df_candidates_carts_scores = []
df_candidates_orders_scores = []

# Carts and orders share one stage-2 dataset, which is gathered for "orders".
action_type = "orders"

batch_starts = range(0, len(test_users), b_sz)
for test_session_start in tqdm(batch_starts):
    test_session_stop = test_session_start + b_sz
    test_sessions = test_users[test_session_start:test_session_stop]

    df_stage_2_dataset_batch = gather_dataset_features(
        mode=mode,
        action_type=action_type,
        version=version,
        feature_groups=feature_groups,
        batch_users=test_sessions,
    )

    # Clicks are not scored in this loop; the "clicks" section of the notebook
    # has its own scoring loop.

    print("predict carts -> save scores to df_candidates_scores")
    df_carts_scores = lgb_cv_folds_predictions(
        df_stage_2_dataset_batch,
        carts_model_file_tmplt,
        "carts",
    )
    df_candidates_carts_scores.append(df_carts_scores)

    print("predict orders -> save scores to df_candidates_scores")
    df_orders_scores = lgb_cv_folds_predictions(
        df_stage_2_dataset_batch,
        orders_model_file_tmplt,
        "orders",
    )
    df_candidates_orders_scores.append(df_orders_scores)


  0%|          | 0/17 [00:00<?, ?it/s]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:09, 17.48s/it][A
 40%|████      | 2/5 [00:32<00:48, 16.30s/it][A
 60%|██████    | 3/5 [00:51<00:34, 17.44s/it][A
 80%|████████  | 4/5 [01:16<00:20, 20.17s/it][A
100%|██████████| 5/5 [01:34<00:00, 18.94s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:38<02:33, 38.26s/it][A
 40%|████      | 2/5 [00:54<01:15, 25.14s/it][A
 60%|██████    | 3/5 [01:36<01:05, 32.99s/it][A
 80%|████████  | 4/5 [02:23<00:38, 38.54s/it][A
100%|██████████| 5/5 [02:49<00:00, 33.88s/it][A
  6%|▌         | 1/17 [05:30<1:28:07, 330.48s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:12, 18.13s/it][A
 40%|████      | 2/5 [00:34<00:51, 17.10s/it][A
 60%|██████    | 3/5 [00:53<00:35, 17.99s/it][A
 80%|████████  | 4/5 [01:18<00:20, 20.64s/it][A
100%|██████████| 5/5 [01:37<00:00, 19.44s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:38<02:32, 38.02s/it][A
 40%|████      | 2/5 [00:53<01:15, 25.04s/it][A
 60%|██████    | 3/5 [01:36<01:06, 33.04s/it][A
 80%|████████  | 4/5 [02:22<00:38, 38.14s/it][A
100%|██████████| 5/5 [02:48<00:00, 33.68s/it][A
 12%|█▏        | 2/17 [11:02<1:22:48, 331.26s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:11, 17.89s/it][A
 40%|████      | 2/5 [00:34<00:51, 17.06s/it][A
 60%|██████    | 3/5 [00:53<00:36, 18.05s/it][A
 80%|████████  | 4/5 [01:18<00:20, 20.57s/it][A
100%|██████████| 5/5 [01:36<00:00, 19.40s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:38<02:34, 38.60s/it][A
 40%|████      | 2/5 [00:54<01:15, 25.21s/it][A
 60%|██████    | 3/5 [01:36<01:05, 32.87s/it][A
 80%|████████  | 4/5 [02:22<00:37, 37.90s/it][A
100%|██████████| 5/5 [02:47<00:00, 33.58s/it][A
 18%|█▊        | 3/17 [16:33<1:17:18, 331.31s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:11, 17.76s/it][A
 40%|████      | 2/5 [00:33<00:50, 16.70s/it][A
 60%|██████    | 3/5 [00:52<00:35, 17.77s/it][A
 80%|████████  | 4/5 [01:17<00:20, 20.42s/it][A
100%|██████████| 5/5 [01:36<00:00, 19.22s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:37<02:31, 37.89s/it][A
 40%|████      | 2/5 [00:52<01:12, 24.22s/it][A
 60%|██████    | 3/5 [01:34<01:04, 32.38s/it][A
 80%|████████  | 4/5 [02:20<00:37, 37.64s/it][A
100%|██████████| 5/5 [02:45<00:00, 33.08s/it][A
 24%|██▎       | 4/17 [22:02<1:11:34, 330.33s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:11, 17.83s/it][A
 40%|████      | 2/5 [00:33<00:50, 16.76s/it][A
 60%|██████    | 3/5 [00:52<00:35, 17.75s/it][A
 80%|████████  | 4/5 [01:16<00:20, 20.29s/it][A
100%|██████████| 5/5 [01:35<00:00, 19.13s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:37<02:30, 37.56s/it][A
 40%|████      | 2/5 [00:53<01:14, 24.72s/it][A
 60%|██████    | 3/5 [01:35<01:05, 32.52s/it][A
 80%|████████  | 4/5 [02:20<00:37, 37.59s/it][A
100%|██████████| 5/5 [02:46<00:00, 33.21s/it][A
 29%|██▉       | 5/17 [27:31<1:05:57, 329.78s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:10, 17.62s/it][A
 40%|████      | 2/5 [00:33<00:49, 16.51s/it][A
 60%|██████    | 3/5 [00:52<00:35, 17.54s/it][A
 80%|████████  | 4/5 [01:15<00:19, 20.00s/it][A
100%|██████████| 5/5 [01:34<00:00, 18.86s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:37<02:29, 37.40s/it][A
 40%|████      | 2/5 [00:52<01:13, 24.35s/it][A
 60%|██████    | 3/5 [01:34<01:04, 32.44s/it][A
 80%|████████  | 4/5 [02:19<00:37, 37.45s/it][A
100%|██████████| 5/5 [02:45<00:00, 33.07s/it][A
 35%|███▌      | 6/17 [32:57<1:00:15, 328.70s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:10, 17.63s/it][A
 40%|████      | 2/5 [00:33<00:49, 16.39s/it][A
 60%|██████    | 3/5 [00:51<00:34, 17.38s/it][A
 80%|████████  | 4/5 [01:15<00:20, 20.01s/it][A
100%|██████████| 5/5 [01:34<00:00, 18.84s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:37<02:29, 37.34s/it][A
 40%|████      | 2/5 [00:52<01:13, 24.35s/it][A
 60%|██████    | 3/5 [01:34<01:04, 32.18s/it][A
 80%|████████  | 4/5 [02:18<00:37, 37.19s/it][A
100%|██████████| 5/5 [02:44<00:00, 32.87s/it][A
 41%|████      | 7/17 [38:23<54:37, 327.71s/it]  

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:09, 17.48s/it][A
 40%|████      | 2/5 [00:32<00:48, 16.20s/it][A
 60%|██████    | 3/5 [00:51<00:34, 17.22s/it][A
 80%|████████  | 4/5 [01:14<00:19, 19.76s/it][A
100%|██████████| 5/5 [01:32<00:00, 18.58s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:36<02:27, 36.87s/it][A
 40%|████      | 2/5 [00:51<01:12, 24.00s/it][A
 60%|██████    | 3/5 [01:33<01:03, 31.87s/it][A
 80%|████████  | 4/5 [02:17<00:36, 36.87s/it][A
100%|██████████| 5/5 [02:42<00:00, 32.50s/it][A
 47%|████▋     | 8/17 [43:46<48:54, 326.04s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:10, 17.56s/it][A
 40%|████      | 2/5 [00:33<00:49, 16.59s/it][A
 60%|██████    | 3/5 [00:52<00:35, 17.60s/it][A
 80%|████████  | 4/5 [01:16<00:20, 20.03s/it][A
100%|██████████| 5/5 [01:34<00:00, 18.96s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:37<02:29, 37.39s/it][A
 40%|████      | 2/5 [00:52<01:13, 24.36s/it][A
 60%|██████    | 3/5 [01:34<01:04, 32.29s/it][A
 80%|████████  | 4/5 [02:19<00:37, 37.28s/it][A
100%|██████████| 5/5 [02:44<00:00, 32.91s/it][A
 53%|█████▎    | 9/17 [49:12<43:30, 326.27s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:11, 17.91s/it][A
 40%|████      | 2/5 [00:33<00:50, 16.68s/it][A
 60%|██████    | 3/5 [00:52<00:35, 17.72s/it][A
 80%|████████  | 4/5 [01:16<00:20, 20.22s/it][A
100%|██████████| 5/5 [01:35<00:00, 19.05s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:37<02:29, 37.35s/it][A
 40%|████      | 2/5 [00:52<01:12, 24.30s/it][A
 60%|██████    | 3/5 [01:33<01:04, 32.13s/it][A
 80%|████████  | 4/5 [02:18<00:37, 37.21s/it][A
100%|██████████| 5/5 [02:44<00:00, 32.88s/it][A
 59%|█████▉    | 10/17 [54:39<38:04, 326.42s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:13, 18.33s/it][A
 40%|████      | 2/5 [00:34<00:51, 17.04s/it][A
 60%|██████    | 3/5 [00:53<00:36, 18.03s/it][A
 80%|████████  | 4/5 [01:18<00:20, 20.56s/it][A
100%|██████████| 5/5 [01:37<00:00, 19.46s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:38<02:32, 38.04s/it][A
 40%|████      | 2/5 [00:53<01:14, 24.83s/it][A
 60%|██████    | 3/5 [01:35<01:05, 32.77s/it][A
 80%|████████  | 4/5 [02:21<00:37, 37.81s/it][A
100%|██████████| 5/5 [02:47<00:00, 33.48s/it][A
 65%|██████▍   | 11/17 [1:00:11<32:48, 328.11s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:10, 17.62s/it][A
 40%|████      | 2/5 [00:33<00:49, 16.57s/it][A
 60%|██████    | 3/5 [00:52<00:35, 17.61s/it][A
 80%|████████  | 4/5 [01:16<00:20, 20.14s/it][A
100%|██████████| 5/5 [01:35<00:00, 19.01s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:37<02:30, 37.74s/it][A
 40%|████      | 2/5 [00:53<01:14, 24.68s/it][A
 60%|██████    | 3/5 [01:34<01:04, 32.46s/it][A
 80%|████████  | 4/5 [02:20<00:37, 37.53s/it][A
100%|██████████| 5/5 [02:45<00:00, 33.14s/it][A
 71%|███████   | 12/17 [1:05:39<27:20, 328.14s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:12, 18.03s/it][A
 40%|████      | 2/5 [00:34<00:50, 16.86s/it][A
 60%|██████    | 3/5 [00:53<00:36, 18.11s/it][A
 80%|████████  | 4/5 [01:18<00:20, 20.70s/it][A
100%|██████████| 5/5 [01:37<00:00, 19.41s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:37<02:30, 37.73s/it][A
 40%|████      | 2/5 [00:53<01:14, 24.77s/it][A
100%|██████████| 5/5 [02:46<00:00, 33.39s/it][A
 76%|███████▋  | 13/17 [1:11:11<21:56, 329.23s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:15, 18.83s/it][A
 40%|████      | 2/5 [00:35<00:52, 17.63s/it][A
 60%|██████    | 3/5 [00:55<00:37, 18.69s/it][A
 80%|████████  | 4/5 [01:20<00:21, 21.13s/it][A
100%|██████████| 5/5 [01:40<00:00, 20.05s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:38<02:33, 38.47s/it][A
 40%|████      | 2/5 [00:54<01:16, 25.54s/it][A
 60%|██████    | 3/5 [01:37<01:06, 33.17s/it][A
 80%|████████  | 4/5 [02:23<00:38, 38.21s/it][A
100%|██████████| 5/5 [02:49<00:00, 33.97s/it][A
 82%|████████▏ | 14/17 [1:16:49<16:35, 331.87s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:12, 18.01s/it][A
 40%|████      | 2/5 [00:33<00:50, 16.82s/it][A
 60%|██████    | 3/5 [00:52<00:35, 17.72s/it][A
 80%|████████  | 4/5 [01:16<00:20, 20.21s/it][A
100%|██████████| 5/5 [01:35<00:00, 19.06s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:37<02:29, 37.41s/it][A
 40%|████      | 2/5 [00:52<01:13, 24.43s/it][A
 60%|██████    | 3/5 [01:34<01:04, 32.30s/it][A
 80%|████████  | 4/5 [02:19<00:37, 37.24s/it][A
100%|██████████| 5/5 [02:44<00:00, 32.97s/it][A
 88%|████████▊ | 15/17 [1:22:17<11:01, 330.62s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:08, 17.24s/it][A
 40%|████      | 2/5 [00:32<00:48, 16.30s/it][A
 60%|██████    | 3/5 [00:51<00:34, 17.21s/it][A
 80%|████████  | 4/5 [01:14<00:19, 19.70s/it][A
100%|██████████| 5/5 [01:32<00:00, 18.56s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:36<02:26, 36.69s/it][A
 40%|████      | 2/5 [00:51<01:11, 23.89s/it][A
 60%|██████    | 3/5 [01:32<01:03, 31.72s/it][A
 80%|████████  | 4/5 [02:17<00:36, 36.77s/it][A
100%|██████████| 5/5 [02:42<00:00, 32.41s/it][A
 94%|█████████▍| 16/17 [1:27:39<05:28, 328.16s/it]

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
predict carts -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:49, 12.39s/it][A
 40%|████      | 2/5 [00:23<00:34, 11.59s/it][A
 60%|██████    | 3/5 [00:36<00:24, 12.26s/it][A
 80%|████████  | 4/5 [00:53<00:14, 14.07s/it][A
100%|██████████| 5/5 [01:06<00:00, 13.24s/it][A


predict orders -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:26<01:45, 26.37s/it][A
 40%|████      | 2/5 [00:37<00:51, 17.12s/it][A
 60%|██████    | 3/5 [01:06<00:45, 22.63s/it][A
 80%|████████  | 4/5 [01:37<00:26, 26.12s/it][A
100%|██████████| 5/5 [01:55<00:00, 23.08s/it][A
100%|██████████| 17/17 [1:31:46<00:00, 323.92s/it]


In [19]:
# Stack the per-batch carts score frames into one frame and write it to parquet.
# The name is rebound from a list of frames to a single frame, so concatenate
# only while it is still a list; that keeps this cell safe to re-run.
if isinstance(df_candidates_carts_scores, list):
    df_candidates_carts_scores = pl.concat(df_candidates_carts_scores)
df_candidates_carts_scores.write_parquet(f"reranker_finale/__submission__carts_{version}_v0.parquet")

In [20]:
# Stack the per-batch orders score frames into one frame and write it to parquet.
# The name is rebound from a list of frames to a single frame, so concatenate
# only while it is still a list; that keeps this cell safe to re-run.
if isinstance(df_candidates_orders_scores, list):
    df_candidates_orders_scores = pl.concat(df_candidates_orders_scores)
df_candidates_orders_scores.write_parquet(f"reranker_finale/__submission__orders_{version}_v2.parquet")

## clicks

In [4]:
# Run configuration for the clicks scoring pass.
mode = "subm"
action_type = "clicks"

# Feature groups requested from gather_dataset_features.
feature_groups = [
    "cand_user_item",
    "w2v_item2item",
    "covisit_item2item",
    "other",
]

# Version tag: candidate-source names joined with "+"; it appears in the model
# and score file names built further down.
version = "+".join(("tdidf_new_top_100", "tdidf_old_top_100"))

In [5]:
def lgb_cv_folds_predictions(
    df,
    model_file_tmplt,
    action_type,
    save_path: Path = Path("reranker_finale"),
    n_folds: int = 5,
):
    """Score candidates with every cross-validation fold model and average the scores.

    Args:
        df: polars DataFrame holding the key columns "session" and "aid"; every
            other column is fed to the models as a feature, in frame column
            order (assumed to match the training-time order -- TODO confirm).
        model_file_tmplt: file-name template; it is formatted with the keyword
            arguments ``action_type`` and ``fold`` to obtain each fold's file.
        action_type: used when formatting the template and to name the output
            column ``f"{action_type}_score"``.
        save_path: directory that contains the saved LightGBM model files.
        n_folds: number of fold models to load (folds ``0 .. n_folds - 1``).

    Returns:
        DataFrame with the columns "session", "aid" and
        ``f"{action_type}_score"`` (mean of the per-fold predictions), one row
        per input row, in input order.

    Raises:
        ValueError: if ``n_folds`` is smaller than 1.
    """
    # Local import keeps the function self-contained regardless of which names
    # the notebook's star imports happen to expose.
    import numpy as np

    if n_folds < 1:
        raise ValueError(f"n_folds must be >= 1, got {n_folds}")

    feature_cols = df.drop(["session", "aid"]).columns
    X_test = df[feature_cols].to_numpy()

    # Predictions come back in the row order of X_test, i.e. the row order of
    # df, so they can be accumulated positionally; no per-fold join on
    # (session, aid) is needed (such a join would also multiply rows if a key
    # pair were ever duplicated).
    score_sum = np.zeros(df.height, dtype=np.float64)
    for fold in tqdm(range(n_folds)):
        model_file = model_file_tmplt.format(action_type=action_type, fold=fold)
        gbm_ranking = lgb.Booster(model_file=(save_path / model_file).as_posix())
        score_sum += np.asarray(gbm_ranking.predict(X_test), dtype=np.float64)

    mean_scores = score_sum / n_folds

    df_valid_preds = (
        df.select(["session", "aid"])
        .with_columns([pl.Series(mean_scores).alias(f"{action_type}_score")])
    )
    return df_valid_preds

In [6]:
# File-name template for the saved clicks fold models. The doubled braces keep
# a literal "{fold}" placeholder in the string so the fold index can be filled
# in later with str.format.
action_type, model_version = "clicks", "v0"
clicks_model_file_tmplt = (
    f"__model__{action_type}_{version}"
    f"_reranker_fold{{fold}}_{model_version}.lgb"
)
print("clicks tmplt: ", clicks_model_file_tmplt)

clicks tmplt:  __model__clicks_tdidf_new_top_100+tdidf_old_top_100_reranker_fold{fold}_v0.lgb


In [8]:
b_sz = 100000  # number of test sessions scored per batch

# Per-batch score frames are collected in these lists; only the clicks list is
# filled by this loop, the other two are just reset to empty.
df_candidates_clicks_scores = []
df_candidates_carts_scores = []
df_candidates_orders_scores = []

action_type = "clicks"

batch_starts = range(0, len(test_users), b_sz)
for test_session_start in tqdm(batch_starts):
    test_session_stop = test_session_start + b_sz
    test_sessions = test_users[test_session_start:test_session_stop]

    df_stage_2_dataset_batch = gather_dataset_features(
        mode=mode,
        action_type=action_type,
        version=version,
        feature_groups=feature_groups,
        batch_users=test_sessions,
    )

    print("predict clicks -> save scores to df_candidates_scores")
    df_clicks_scores = lgb_cv_folds_predictions(
        df_stage_2_dataset_batch,
        clicks_model_file_tmplt,
        "clicks",
    )
    df_candidates_clicks_scores.append(df_clicks_scores)


  0%|          | 0/17 [00:00<?, ?it/s]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:27<01:51, 27.80s/it][A
 40%|████      | 2/5 [00:43<01:02, 20.97s/it][A
 60%|██████    | 3/5 [01:05<00:42, 21.23s/it][A
 80%|████████  | 4/5 [01:35<00:24, 24.70s/it][A
100%|██████████| 5/5 [01:58<00:00, 23.68s/it][A
  6%|▌         | 1/17 [03:54<1:02:35, 234.72s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:27<01:50, 27.73s/it][A
 40%|████      | 2/5 [00:43<01:02, 20.92s/it][A
 60%|██████    | 3/5 [01:05<00:42, 21.39s/it][A
 80%|████████  | 4/5 [01:35<00:24, 24.70s/it][A
100%|██████████| 5/5 [01:58<00:00, 23.67s/it][A
 12%|█▏        | 2/17 [06:35<47:45, 191.01s/it]  

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:28<01:52, 28.17s/it][A
 40%|████      | 2/5 [00:44<01:02, 20.95s/it][A
 60%|██████    | 3/5 [01:06<00:42, 21.44s/it][A
 80%|████████  | 4/5 [01:35<00:24, 24.69s/it][A
100%|██████████| 5/5 [01:58<00:00, 23.70s/it][A
 18%|█▊        | 3/17 [09:15<41:19, 177.14s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:28<01:52, 28.14s/it][A

predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:27<01:51, 27.79s/it][A
 40%|████      | 2/5 [00:43<01:02, 20.88s/it][A
 60%|██████    | 3/5 [01:05<00:42, 21.22s/it][A
 80%|████████  | 4/5 [01:35<00:24, 24.56s/it][A
100%|██████████| 5/5 [01:57<00:00, 23.56s/it][A
 29%|██▉       | 5/17 [14:37<33:24, 167.07s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:28<01:52, 28.15s/it][A
 40%|████      | 2/5 [00:44<01:02, 20.98s/it][A
 60%|██████    | 3/5 [01:05<00:42, 21.25s/it][A
 80%|████████  | 4/5 [01:35<00:24, 24.66s/it][A
100%|██████████| 5/5 [01:58<00:00, 23.67s/it][A
 35%|███▌      | 6/17 [17:18<30:14, 164.98s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:27<01:51, 27.94s/it][A
 40%|████      | 2/5 [00:44<01:02, 20.97s/it][A
 60%|██████    | 3/5 [01:05<00:42, 21.36s/it][A
 80%|████████  | 4/5 [01:35<00:24, 24.58s/it][A
100%|██████████| 5/5 [01:58<00:00, 23.61s/it][A
 41%|████      | 7/17 [19:58<27:15, 163.58s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:28<01:52, 28.11s/it][A
 40%|████      | 2/5 [00:44<01:03, 21.24s/it][A
 60%|██████    | 3/5 [01:06<00:43, 21.66s/it][A
 80%|████████  | 4/5 [01:36<00:24, 24.97s/it][A
100%|██████████| 5/5 [01:59<00:00, 23.95s/it][A
 47%|████▋     | 8/17 [22:41<24:29, 163.23s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:28<01:52, 28.09s/it][A
 40%|████      | 2/5 [00:44<01:02, 20.95s/it][A
 60%|██████    | 3/5 [01:05<00:42, 21.22s/it][A
 80%|████████  | 4/5 [01:35<00:24, 24.52s/it][A
100%|██████████| 5/5 [01:57<00:00, 23.53s/it][A
 53%|█████▎    | 9/17 [25:21<21:38, 162.30s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:28<01:52, 28.03s/it][A
 40%|████      | 2/5 [00:44<01:03, 21.23s/it][A
 60%|██████    | 3/5 [01:06<00:43, 21.55s/it][A
 80%|████████  | 4/5 [01:36<00:24, 24.87s/it][A
100%|██████████| 5/5 [01:59<00:00, 23.84s/it][A
 59%|█████▉    | 10/17 [28:03<18:55, 162.22s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:27<01:51, 27.80s/it][A
 40%|████      | 2/5 [00:43<01:01, 20.66s/it][A
 60%|██████    | 3/5 [01:04<00:42, 21.06s/it][A
 80%|████████  | 4/5 [01:34<00:24, 24.45s/it][A
100%|██████████| 5/5 [01:57<00:00, 23.44s/it][A
 65%|██████▍   | 11/17 [30:43<16:09, 161.56s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:28<01:52, 28.08s/it][A
 40%|████      | 2/5 [00:44<01:03, 21.26s/it][A
 60%|██████    | 3/5 [01:06<00:43, 21.54s/it][A
 80%|████████  | 4/5 [01:36<00:24, 24.83s/it][A
100%|██████████| 5/5 [01:59<00:00, 23.87s/it][A
 71%|███████   | 12/17 [33:26<13:29, 161.81s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:28<01:52, 28.22s/it][A
 40%|████      | 2/5 [00:44<01:03, 21.03s/it][A
 60%|██████    | 3/5 [01:06<00:42, 21.40s/it][A
 80%|████████  | 4/5 [01:35<00:24, 24.76s/it][A
100%|██████████| 5/5 [01:58<00:00, 23.75s/it][A
 76%|███████▋  | 13/17 [36:07<10:47, 161.78s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:28<01:53, 28.37s/it][A
 40%|████      | 2/5 [00:44<01:04, 21.35s/it][A
 60%|██████    | 3/5 [01:06<00:43, 21.65s/it][A
 80%|████████  | 4/5 [01:36<00:24, 24.95s/it][A
100%|██████████| 5/5 [02:00<00:00, 24.02s/it][A
 82%|████████▏ | 14/17 [38:50<08:06, 162.11s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:27<01:51, 27.86s/it][A
 40%|████      | 2/5 [00:43<01:02, 20.88s/it][A
 60%|██████    | 3/5 [01:05<00:42, 21.22s/it][A
 80%|████████  | 4/5 [01:35<00:24, 24.69s/it][A
100%|██████████| 5/5 [01:58<00:00, 23.60s/it][A
 88%|████████▊ | 15/17 [41:31<05:23, 161.79s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:27<01:50, 27.59s/it][A
 40%|████      | 2/5 [00:43<01:02, 20.69s/it][A
 60%|██████    | 3/5 [01:04<00:42, 21.02s/it][A
 80%|████████  | 4/5 [01:34<00:24, 24.41s/it][A
100%|██████████| 5/5 [01:56<00:00, 23.38s/it][A
 94%|█████████▍| 16/17 [44:12<02:41, 161.30s/it]

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
predict clicks -> save scores to df_candidates_scores



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:20<01:20, 20.03s/it][A
 40%|████      | 2/5 [00:31<00:45, 15.02s/it][A
 60%|██████    | 3/5 [00:47<00:30, 15.32s/it][A
 80%|████████  | 4/5 [01:08<00:17, 17.70s/it][A
100%|██████████| 5/5 [01:24<00:00, 16.97s/it][A
100%|██████████| 17/17 [46:19<00:00, 163.48s/it]


In [9]:
# Merge the click score frames collected by the prediction loop into a single
# DataFrame. The same name is rebound from a collection of frames to the merged
# frame, so skip the concat when the cell is re-run and the name already holds
# a DataFrame (re-concatenating it would not give the intended result).
if not isinstance(df_candidates_clicks_scores, pl.DataFrame):
    df_candidates_clicks_scores = pl.concat(df_candidates_clicks_scores)

# Persist the merged scores so they can be reloaded from disk later without
# repeating the prediction loop.
df_candidates_clicks_scores.write_parquet(f"reranker_finale/__submission__clicks_{version}_v0.parquet")

# Submission

## clicks

In [21]:
# Rebuild the version tag that is embedded in the file name, then reload the
# click scores that were saved to disk.
version = "+".join(("tdidf_new_top_100", "tdidf_old_top_100"))
clicks_scores_path = f"reranker_finale/__submission__clicks_{version}_v0.parquet"
df_candidates_clicks_scores = pl.read_parquet(clicks_scores_path)

In [10]:
# Order all click candidates by model score (highest first), then gather each
# session's aids and scores into lists; the next cell slices the first `topk`
# aids from these lists.
per_session_lists = [pl.list(name) for name in ("aid", "clicks_score")]
df_valid_preds_sorted = (
    df_candidates_clicks_scores
    .sort("clicks_score", reverse=True)
    .groupby("session")
    .agg(per_session_lists)
)

In [11]:
# Build the clicks rows of the submission: for every session, one
# "<session>_<type>" key per entry in `types`, paired with the first `topk`
# reranked aids joined by spaces.
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks"]
topk = 20

for row in tqdm(df_valid_preds_sorted.rows()):
    # Row layout follows the aggregation above: (session, aids, scores).
    session_id, ranked_aids = row[0], row[1]
    labels = " ".join(str(aid) for aid in ranked_aids[:topk])

    # Emit one label string per type so both columns always have the same
    # length, even if `types` is extended beyond a single entry.
    submission_dict["session_type"].extend(f"{session_id}_{t}" for t in types)
    submission_dict["labels"].extend([labels] * len(types))

100%|██████████| 1671803/1671803 [00:06<00:00, 258656.70it/s]


In [12]:
# Turn the session_type / labels lists filled by the preceding loop into a
# polars DataFrame holding the clicks part of the submission.
df_submission_clicks_reranked = pl.DataFrame(submission_dict)

In [13]:
# Preview the first rows to sanity-check the session_type / labels layout.
df_submission_clicks_reranked.head()

session_type,labels
str,str
"""12990016_click...","""1678423 453451..."
"""13136960_click...","""824944 1460571..."
"""12977024_click...","""1650609 167637..."
"""14140032_click...","""1679462 184957..."
"""13433728_click...","""1180465 439222..."


## carts

In [14]:
# Rebuild the version tag that is embedded in the file name, then load the
# saved carts scores from disk.
version = "+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"))
carts_scores_path = f"reranker_finale/__submission__carts_{version}_v0.parquet"
df_candidates_carts_scores = pl.read_parquet(carts_scores_path)

In [15]:
# Order all carts candidates by model score (highest first), then gather each
# session's aids and scores into lists; a later cell slices the first `topk`
# aids from these lists.
per_session_lists = [pl.list(name) for name in ("aid", "carts_score")]
df_valid_preds_sorted = (
    df_candidates_carts_scores
    .sort("carts_score", reverse=True)
    .groupby("session")
    .agg(per_session_lists)
)

In [16]:
# Display the per-session aid / carts_score lists to check that the scores
# come out in descending order within each session.
df_valid_preds_sorted

session,aid,carts_score
i64,list[i64],list[f64]
14350144,"[337471, 1466369, ... 984617]","[1.814851, -0.477041, ... -10.662985]"
13535488,"[1692706, 1142507, ... 1243358]","[3.183787, 0.517683, ... -11.981055]"
13165696,"[633760, 309703, ... 1679400]","[2.96021, 1.857757, ... -38.552639]"
14139072,"[378765, 1840567, ... 477922]","[2.778367, -0.397153, ... -11.058259]"
14086208,"[1768879, 671101, ... 361995]","[1.535539, 0.840866, ... -8.710822]"
13757440,"[859334, 1117208, ... 121534]","[3.208259, 0.84079, ... -10.911554]"
13039040,"[749432, 268669, ... 1688832]","[2.950695, 1.282091, ... -9.715829]"
13259456,"[215649, 1322823, ... 125524]","[4.219167, 3.040282, ... -7.666617]"
13976192,"[62659, 1039611, ... 1823626]","[3.15935, 1.95204, ... -10.939]"
13635008,"[696431, 1581568, ... 1673678]","[4.744767, 1.841567, ... -8.920913]"


In [17]:
# Build the carts rows of the submission: for every session, one
# "<session>_<type>" key per entry in `types`, paired with the first `topk`
# reranked aids joined by spaces.
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["carts"]
topk = 20

for row in tqdm(df_valid_preds_sorted.rows()):
    # Row layout follows the aggregation above: (session, aids, scores).
    session_id, ranked_aids = row[0], row[1]
    labels = " ".join(str(aid) for aid in ranked_aids[:topk])

    # Emit one label string per type so both columns always have the same
    # length, even if `types` is extended beyond a single entry.
    submission_dict["session_type"].extend(f"{session_id}_{t}" for t in types)
    submission_dict["labels"].extend([labels] * len(types))

100%|██████████| 1671803/1671803 [00:06<00:00, 260266.38it/s]


In [18]:
# Turn the session_type / labels lists filled by the preceding loop into a
# polars DataFrame holding the carts part of the submission.
df_submission_carts_reranked = pl.DataFrame(submission_dict)

In [19]:
# Preview the first rows to sanity-check the session_type / labels layout.
df_submission_carts_reranked.head()

session_type,labels
str,str
"""14350144_carts...","""337471 1466369..."
"""13535488_carts...","""1692706 114250..."
"""13165696_carts...","""633760 309703 ..."
"""14139072_carts...","""378765 1840567..."
"""14086208_carts...","""1768879 671101..."


## orders

In [20]:
# Rebuild the version tag that is embedded in the file name, then load the
# saved orders scores from disk (note the `_v2` suffix on this file).
version = "+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"))
orders_scores_path = f"reranker_finale/__submission__orders_{version}_v2.parquet"
df_candidates_orders_scores = pl.read_parquet(orders_scores_path)

In [21]:
# Order all orders candidates by model score (highest first), then gather each
# session's aids and scores into lists; a later cell slices the first `topk`
# aids from these lists.
per_session_lists = [pl.list(name) for name in ("aid", "orders_score")]
df_valid_preds_sorted = (
    df_candidates_orders_scores
    .sort("orders_score", reverse=True)
    .groupby("session")
    .agg(per_session_lists)
)

In [22]:
# Preview the per-session aid / orders_score lists for a few sessions.
df_valid_preds_sorted.head()

session,aid,orders_score
i64,list[i64],list[f64]
14424768,"[885917, 350578, ... 152547]","[3.154448, 1.613243, ... -10.283263]"
13432832,"[469367, 180672, ... 152547]","[3.047045, -0.751042, ... -10.226279]"
13355392,"[1286372, 554623, ... 152547]","[4.787866, 0.194021, ... -9.799678]"
14112832,"[335640, 882058, ... 152547]","[2.586109, 1.370388, ... -9.899193]"
13729472,"[394862, 1127676, ... 152547]","[3.593447, 1.387175, ... -10.165423]"


In [23]:
# Build the orders rows of the submission: for every session, one
# "<session>_<type>" key per entry in `types`, paired with the first `topk`
# reranked aids joined by spaces.
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["orders"]
topk = 20

for row in tqdm(df_valid_preds_sorted.rows()):
    # Row layout follows the aggregation above: (session, aids, scores).
    session_id, ranked_aids = row[0], row[1]
    labels = " ".join(str(aid) for aid in ranked_aids[:topk])

    # Emit one label string per type so both columns always have the same
    # length, even if `types` is extended beyond a single entry.
    submission_dict["session_type"].extend(f"{session_id}_{t}" for t in types)
    submission_dict["labels"].extend([labels] * len(types))

100%|██████████| 1671803/1671803 [00:06<00:00, 260444.13it/s]


In [24]:
# Turn the session_type / labels lists filled by the preceding loop into a
# polars DataFrame holding the orders part of the submission.
df_submission_orders_reranked = pl.DataFrame(submission_dict)

In [25]:
# Preview the first rows to sanity-check the session_type / labels layout.
df_submission_orders_reranked.head()

session_type,labels
str,str
"""14424768_order...","""885917 350578 ..."
"""13432832_order...","""469367 180672 ..."
"""13355392_order...","""1286372 554623..."
"""14112832_order...","""335640 882058 ..."
"""13729472_order...","""394862 1127676..."


## Combine

### clicks, carts, orders

In [26]:
# Start from a previously saved submission CSV and, for every session_type
# that also has a reranked row, replace its labels with the reranked ones.
# Rows without a reranked counterpart keep the labels from the CSV.
previous_submission_path = "covisit_top200_reranker_rank+42feat_submission.csv.gz"
reranked_parts = [
    df_submission_clicks_reranked,
    df_submission_carts_reranked,
    df_submission_orders_reranked,
]

# After the left join the reranked labels arrive as "labels_right"; it is null
# where the join found no match, in which case the original labels are kept.
prefer_reranked_labels = (
    pl.when(pl.col("labels_right").is_null())
    .then(pl.col("labels"))
    .otherwise(pl.col("labels_right"))
).alias("labels_final")

df_submission_reranked_all = (
    pl.from_pandas(pd.read_csv(previous_submission_path, compression="gzip"))
    .join(pl.concat(reranked_parts), on="session_type", how="left")
    .with_columns([prefer_reranked_labels])
    .select(["session_type", "labels_final"])
    .rename({"labels_final": "labels"})
)


In [27]:
# Preview the combined submission (clicks, carts and orders rows per session).
df_submission_reranked_all.head()

session_type,labels
str,str
"""12899779_click...","""59625 737445 1..."
"""12899779_carts...","""59625 731692 1..."
"""12899779_order...","""59625 731692 6..."
"""12899780_click...","""1142000 736515..."
"""12899780_carts...","""1142000 582732..."


In [28]:
# LB - 0.595 (BEST)
# Write the combined submission as a gzip-compressed CSV without the pandas
# index column.
df_submission_reranked_all.to_pandas().to_csv(
    "reranker_finale/__b__submission_finale.csv.gz",
    index=False,
    compression="gzip",
)

In [38]:
# LB - 0.595
# Write the combined submission as a gzip-compressed CSV without the pandas
# index column.
# NOTE(review): this cell's execution count (38) is out of sequence with the
# cells above, so the frame written here may differ from the one built in this
# notebook state - confirm before relying on this file.
df_submission_reranked_all.to_pandas().to_csv(
    "reranker_finale/__a__submission_finale.csv.gz",
    index=False,
    compression="gzip",
)