In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
from pathlib import Path
from collections import Counter

from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import dill
import pandas as pd
import polars as pl
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import GroupKFold

import lightgbm as lgb
import xgboost as xgb

import pytorch_lightning as L
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns

from tasks.data.dataset.mappers import EntityEncoder
from tasks.jobs import Splitter

In [4]:
from otto_utils import *
from otto_features import *
from otto_candidates_covisit import *
from otto_lgbm_utils import *
from otto_implicit import *
from otto_reranker import *
from otto_jobs_candidates import *

In [5]:
%env PYTHONHASHSEED=1
from otto_word2vec import *

w2vec = Word2Vec.load("__valid__word2vec_window=10_negative=20.w2v")

env: PYTHONHASHSEED=1


# Load data

In [6]:
val_df_train = pl.read_parquet(VALIDATION_PATH / "train.parquet", use_pyarrow=True)
val_df_valid_input = pl.read_parquet(VALIDATION_PATH / "valid.parquet", use_pyarrow=True)
val_df_valid_targets = pl.read_parquet(VALIDATION_PATH / "test_labels.parquet", use_pyarrow=True)

# Candidates (Clicks)

In [7]:
ACT_TYPE = "clicks"

## Prepare the target and the candidate set for sessions that have this target in the holdout (sessions without this action type do not change the metric)

In [8]:
# Positive labels for the current ACT_TYPE: the ground-truth lists are exploded to one aid
# per row and every row is tagged with target == 1.
df_act_target = (
    val_df_valid_targets
    .filter(pl.col("type") == ACT_TYPE)
    # keep only sessions that also occur in the validation input
    .join(val_df_valid_input.select(["session"]).unique(), on="session", how="inner")
    .drop("type")
    .explode("ground_truth")
    .with_column(pl.lit(1).alias("target"))
    .rename({"ground_truth": "aid"})
)

In [9]:
df = val_df_valid_input.unique().sort(["session", "ts"])

In [10]:
# Events of the sessions that have at least one label in df_act_target, sorted by session and ts.
df_test_users = (
    df
    .join(df_act_target.select(["session"]).unique(), on="session", how="inner")
    .unique().sort(["session", "ts"])
)
# Collapse to one row per session, then to a plain dict: session -> (list of aids, list of types).
# NOTE(review): presumably the lists keep the ts order set by the sort above — confirm for groupby/agg.
test_sessions_dict = df_test_users.groupby('session').agg([pl.list("aid"), pl.list("type")])
test_sessions_dict = dict(zip(
    test_sessions_dict["session"].to_list(),
    tuple(zip(test_sessions_dict["aid"].to_list(), test_sessions_dict["type"].to_list()))
))

## covisitation top200

In [11]:
# Use top X for clicks, carts and orders
clicks_th = 15
carts_th  = 20
orders_th = 20

def get_top(df, th):
    """Keep, for every ``aid``, only the first ``th`` rows of ``df`` in its current row order.

    The returned frame carries two extra columns: the constant helper ``ones`` and
    ``rank``, the 1-based position of the row within its ``aid`` group.
    """
    # A running sum of a constant 1 inside each aid group numbers the rows 1, 2, 3, ...
    with_ones = df.with_column(pl.lit(1).alias("ones"))
    position_in_aid = pl.col("ones").cumsum().over("aid").alias("rank")
    ranked = with_ones.with_column(position_in_aid)
    within_threshold = pl.col("rank") <= th
    return ranked.filter(within_threshold)

TOPK_RECOMMEND = 20
TOPK_RERANK = 40

In [12]:
carts_orders = pl.read_parquet("__valid__covisit_carts_orders_all_v3.parquet")
buys2buys = pl.read_parquet("__valid__covisit_buys2buys_all_v4.parquet")
clicks = pl.read_parquet("__valid__covisit_clicks_all_v3.parquet")

In [13]:
carts_orders_top = get_top(carts_orders, carts_th)
buys2buys_top = get_top(buys2buys, orders_th)
clicks_top = get_top(clicks, clicks_th)

In [14]:
top_clicks = df.filter(pl.col("type") == 0)["aid"].value_counts(sort=True)[:TOPK_RECOMMEND]["aid"].to_list()
top_carts = df.filter(pl.col("type") == 1)["aid"].value_counts(sort=True)[:TOPK_RECOMMEND]["aid"].to_list()
top_orders = df.filter(pl.col("type") == 2)["aid"].value_counts(sort=True)[:TOPK_RECOMMEND]["aid"].to_list()

In [15]:
covisit_rec = CovisitationRecommender(
    df_top_k_buys=carts_orders_top,
    df_top_k_buy2buy=buys2buys_top,
    df_top_k_clicks=clicks_top,
    top_carts=top_carts,
    top_orders=top_orders,
    top_clicks=top_clicks,
)

In [16]:
# Column-oriented accumulator: one entry per (session, candidate type) holding the list of
# recommended aids and the list of their 1-based positions.
candidates_dict = {
    "session": [],
    "type": [],
    "candidates": [],
    "rank": [],
}

types = ["clicks", "carts", "orders"]
topk = TOPK_RERANK * 5  # number of candidates requested from each recommender call

for session_id, (session_aid_list, session_type_list) in tqdm(test_sessions_dict.items()):
    rec_items_clicks = covisit_rec.recommend_clicks(session_aid_list, session_type_list, topk)
    rec_items_carts = covisit_rec.recommend_carts(session_aid_list, session_type_list, topk)
    rec_items_buys = covisit_rec.recommend_buys(session_aid_list, session_type_list, topk)

    # Order must match `types` above: both lists are appended in parallel below.
    candidates = [rec_items_clicks, rec_items_carts, rec_items_buys]
#     candidates = [rec_items_buys]
    # 1-based position of each candidate inside its own recommendation list.
    ranks = [
        np.arange(1, len(rec_items) + 1).tolist()
        for rec_items in candidates
    ]
    
    candidates_dict["session"].extend([session_id] * len(types))
    candidates_dict["type"].extend(types)
    candidates_dict["candidates"].extend(candidates)
    candidates_dict["rank"].extend(ranks)

df_candidates_covisit = pl.DataFrame(candidates_dict)

100%|██████████| 1265979/1265979 [02:05<00:00, 10049.34it/s]


In [17]:
# Merge the orders / carts / clicks covisitation lists into one frame keyed by (session, aid)
# with one rank_* column per list, then attach the target label from df_act_target.
# Contains the target column and all test users.
df_candidates_covisit_all = (
    df_candidates_covisit
    .filter(pl.col("type") == "orders")
    .drop("type")
    .explode(["candidates", "rank"])
    .rename({"candidates": "aid", "rank": "rank_orders"})
    .join(
        (
            df_candidates_covisit
            .filter(pl.col("type") == "carts")
            .drop("type")
            .explode(["candidates", "rank"])
            .rename({"candidates": "aid", "rank": "rank_carts"})
        ),
        on=["session", "aid"],
        how="outer"
    )
    .join(
        (
            df_candidates_covisit
            .filter(pl.col("type") == "clicks")
            .drop("type")
            .explode(["candidates", "rank"])
            .rename({"candidates": "aid", "rank": "rank_clicks"})
        ),
        on=["session", "aid"],
        how="outer"
    )
    # nulls left by the outer joins (aid missing from one of the lists) become the sentinel 999
    .fill_null(999)
    .filter(pl.col("aid") != -1)  # some strange items from implicit
    .unique(subset=["session", "aid"], keep="last")
    .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
    .sort("session")
    # rows with no match in df_act_target get target 0
    .fill_null(0)
)

In [19]:
print_candidates_hit_rate(df_candidates_covisit_all)

sessions with positives in candidates:  742940
sessions with positives in test dataframe:  1265979
hit rate:  0.5868501768196787


## Implicit

In [20]:
from implicit.nearest_neighbours import CosineRecommender
from implicit.nearest_neighbours import TFIDFRecommender

In [21]:
df = pl.concat([val_df_train, val_df_valid_input]).unique().sort(["session", "ts"])
df = implicit_new_weight_interactions(df)

In [22]:
train_data = make_sparse_matrix(df)
tfidf_new = TFIDFRecommender(K=200)
tfidf_new.fit(train_data)

  0%|          | 0/1855603 [00:00<?, ?it/s]

In [23]:
df_candidates_tfidf_new = implicit_batch_candidates_for_all_types(
    model=tfidf_new, model_name="tfidf_new",
    train_data=train_data, test_users=list(test_sessions_dict.keys()),
    topk=100,
)

100%|██████████| 1266/1266 [02:00<00:00, 10.54it/s]


In [26]:
# Flatten the per-session candidate lists to one row per (session, aid) and attach the label.
# NOTE(review): this cell overwrites its own input name, so it is not idempotent — re-running it
# without re-running the candidate-generation cell operates on the already-flattened frame.
df_candidates_tfidf_new = (
    df_candidates_tfidf_new
    .explode(["aid", "tfidf_new_score"])
    .filter(pl.col("aid") != -1)  # some strange items from implicit
    .unique(subset=["session", "aid"], keep="last")
    .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
    .sort("session")
    # rows with no match in df_act_target get target 0 (any other remaining null is also set to 0)
    .fill_null(0)
)

In [38]:
def print_candidates_hit_rate(df_candidates):
    """Print candidate hit-rate statistics and return the rows of the sessions that were hit.

    A session counts as hit when the sum of ``target`` over its candidate rows is > 0.

    Args:
        df_candidates: polars DataFrame with at least the columns ``session`` and ``target``.

    Returns:
        The rows of ``df_candidates`` whose session has at least one positive target.
    """
    df_sessions_with_positives = (
        df_candidates
        .groupby(["session"]).agg(pl.sum("target"))
        .filter(pl.col("target") > 0)
        .select(["session"])
    )

    # Compute each count once and reuse it for both the printout and the ratio.
    n_sessions_with_positives = df_sessions_with_positives["session"].n_unique()
    n_sessions_total = df_candidates["session"].n_unique()
    # An empty candidates frame has no sessions; report 0.0 instead of raising ZeroDivisionError.
    hit_rate = n_sessions_with_positives / n_sessions_total if n_sessions_total else 0.0

    print("sessions with positives in candidates: ", n_sessions_with_positives)
    # NOTE(review): despite the label, this is the number of distinct sessions in df_candidates.
    print("sessions with positives in test dataframe: ", n_sessions_total)
    print("hit rate: ", hit_rate)

    df_candidates_with_positives = df_candidates.join(df_sessions_with_positives, on="session", how="inner")
    print(df_candidates_with_positives["target"].value_counts())

    return df_candidates_with_positives

In [28]:
print_candidates_hit_rate(df_candidates_tfidf_new)

sessions with positives in candidates:  760393
sessions with positives in test dataframe:  1265979
hit rate:  0.6006363454686057


In [29]:
df_candidates_tfidf_new_top_200 = implicit_batch_candidates_for_all_types(
    model=tfidf_new, model_name="tfidf_new",
    train_data=train_data, test_users=list(test_sessions_dict.keys()),
    topk=200,
)

100%|██████████| 1266/1266 [02:21<00:00,  8.97it/s]


In [30]:
df_candidates_tfidf_new_top_200 = (
    df_candidates_tfidf_new_top_200
    .explode(["aid", "tfidf_new_score"])
    .filter(pl.col("aid") != -1)  # some strange items from implicit
    .unique(subset=["session", "aid"], keep="last")
    .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
    .sort("session")
    .fill_null(0)
)

In [41]:
df_candidates_tfidf_new_top_200_with_positives = print_candidates_hit_rate(df_candidates_tfidf_new_top_200)

sessions with positives in candidates:  813370
sessions with positives in test dataframe:  1265979
hit rate:  0.6424830111715913
shape: (2, 2)
┌────────┬───────────┐
│ target ┆ counts    │
│ ---    ┆ ---       │
│ i32    ┆ u32       │
╞════════╪═══════════╡
│ 0      ┆ 160571535 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 813370    │
└────────┴───────────┘


In [36]:
# Union of the covisitation and TF-IDF (topk=200) candidates; only the shared columns are kept
# so that the frames can be stacked, and exact duplicate (session, aid, target) rows are dropped.
df_candidates_covisit_all_tfidf_new_top_200 = pl.concat([
    df_candidates_covisit_all.select(["session", "aid", "target"]),
    df_candidates_tfidf_new_top_200.select(["session", "aid", "target"]),
]).unique()

In [39]:
df_candidates_covisit_all_tfidf_new_top_200_with_positives = print_candidates_hit_rate(
    df_candidates_covisit_all_tfidf_new_top_200
)

sessions with positives in candidates:  852314
sessions with positives in test dataframe:  1265979
hit rate:  0.6732449748376553
shape: (2, 2)
┌────────┬───────────┐
│ target ┆ counts    │
│ ---    ┆ ---       │
│ i32    ┆ u32       │
╞════════╪═══════════╡
│ 0      ┆ 224152550 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 852314    │
└────────┴───────────┘


# Candidates (Carts)

In [42]:
ACT_TYPE = "carts"

## Prepare the target and the candidate set for sessions that have this target in the holdout (sessions without this action type do not change the metric)

In [43]:
df_act_target = (
    val_df_valid_targets
    .filter(pl.col("type") == ACT_TYPE)
    .join(val_df_valid_input.select(["session"]).unique(), on="session", how="inner")
    .drop("type")
    .explode("ground_truth")
    .with_column(pl.lit(1).alias("target"))
    .rename({"ground_truth": "aid"})
)

In [44]:
df = val_df_valid_input.unique().sort(["session", "ts"])

In [45]:
df_test_users = (
    df
    .join(df_act_target.select(["session"]).unique(), on="session", how="inner")
    .unique().sort(["session", "ts"])
)
test_sessions_dict = df_test_users.groupby('session').agg([pl.list("aid"), pl.list("type")])
test_sessions_dict = dict(zip(
    test_sessions_dict["session"].to_list(),
    tuple(zip(test_sessions_dict["aid"].to_list(), test_sessions_dict["type"].to_list()))
))

## covisitation top200

In [11]:
# Use top X for clicks, carts and orders
clicks_th = 15
carts_th  = 20
orders_th = 20

def get_top(df, th):
    """Keep, for every ``aid``, only the first ``th`` rows of ``df`` in its current row order.

    The returned frame carries two extra columns: the constant helper ``ones`` and
    ``rank``, the 1-based position of the row within its ``aid`` group.
    """
    # NOTE(review): this definition repeats the get_top defined earlier in this notebook.
    # A running sum of a constant 1 inside each aid group numbers the rows 1, 2, 3, ...
    with_ones = df.with_column(pl.lit(1).alias("ones"))
    position_in_aid = pl.col("ones").cumsum().over("aid").alias("rank")
    ranked = with_ones.with_column(position_in_aid)
    within_threshold = pl.col("rank") <= th
    return ranked.filter(within_threshold)

TOPK_RECOMMEND = 20
TOPK_RERANK = 40

In [12]:
carts_orders = pl.read_parquet("__valid__covisit_carts_orders_all_v3.parquet")
buys2buys = pl.read_parquet("__valid__covisit_buys2buys_all_v4.parquet")
clicks = pl.read_parquet("__valid__covisit_clicks_all_v3.parquet")

In [13]:
carts_orders_top = get_top(carts_orders, carts_th)
buys2buys_top = get_top(buys2buys, orders_th)
clicks_top = get_top(clicks, clicks_th)

In [14]:
top_clicks = df.filter(pl.col("type") == 0)["aid"].value_counts(sort=True)[:TOPK_RECOMMEND]["aid"].to_list()
top_carts = df.filter(pl.col("type") == 1)["aid"].value_counts(sort=True)[:TOPK_RECOMMEND]["aid"].to_list()
top_orders = df.filter(pl.col("type") == 2)["aid"].value_counts(sort=True)[:TOPK_RECOMMEND]["aid"].to_list()

In [15]:
covisit_rec = CovisitationRecommender(
    df_top_k_buys=carts_orders_top,
    df_top_k_buy2buy=buys2buys_top,
    df_top_k_clicks=clicks_top,
    top_carts=top_carts,
    top_orders=top_orders,
    top_clicks=top_clicks,
)

In [46]:
candidates_dict = {
    "session": [],
    "type": [],
    "candidates": [],
    "rank": [],
}

types = ["clicks", "carts", "orders"]
topk = TOPK_RERANK * 5

for session_id, (session_aid_list, session_type_list) in tqdm(test_sessions_dict.items()):
    rec_items_clicks = covisit_rec.recommend_clicks(session_aid_list, session_type_list, topk)
    rec_items_carts = covisit_rec.recommend_carts(session_aid_list, session_type_list, topk)
    rec_items_buys = covisit_rec.recommend_buys(session_aid_list, session_type_list, topk)

    candidates = [rec_items_clicks, rec_items_carts, rec_items_buys]
#     candidates = [rec_items_buys]
    ranks = [
        np.arange(1, len(rec_items) + 1).tolist()
        for rec_items in candidates
    ]
    
    candidates_dict["session"].extend([session_id] * len(types))
    candidates_dict["type"].extend(types)
    candidates_dict["candidates"].extend(candidates)
    candidates_dict["rank"].extend(ranks)

df_candidates_covisit = pl.DataFrame(candidates_dict)

100%|██████████| 265206/265206 [00:33<00:00, 7975.94it/s]


In [47]:
# contains target column and all test users
df_candidates_covisit_all = (
    df_candidates_covisit
    .filter(pl.col("type") == "orders")
    .drop("type")
    .explode(["candidates", "rank"])
    .rename({"candidates": "aid", "rank": "rank_orders"})
    .join(
        (
            df_candidates_covisit
            .filter(pl.col("type") == "carts")
            .drop("type")
            .explode(["candidates", "rank"])
            .rename({"candidates": "aid", "rank": "rank_carts"})
        ),
        on=["session", "aid"],
        how="outer"
    )
    .join(
        (
            df_candidates_covisit
            .filter(pl.col("type") == "clicks")
            .drop("type")
            .explode(["candidates", "rank"])
            .rename({"candidates": "aid", "rank": "rank_clicks"})
        ),
        on=["session", "aid"],
        how="outer"
    )
    .fill_null(999)
    .filter(pl.col("aid") != -1)  # some strange items from implicit
    .unique(subset=["session", "aid"], keep="last")
    .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
    .sort("session")
    .fill_null(0)
)

In [51]:
df_candidates_covisit_all_with_positives = print_candidates_hit_rate(df_candidates_covisit_all)

sessions with positives in candidates:  192741
sessions with positives in test dataframe:  265206
hit rate:  0.7267595755752132
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 23475373 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 252550   │
└────────┴──────────┘


## Implicit

In [20]:
from implicit.nearest_neighbours import CosineRecommender
from implicit.nearest_neighbours import TFIDFRecommender

### old

In [58]:
df = pl.concat([val_df_train, val_df_valid_input]).unique().sort(["session", "ts"])
df = implicit_old_weight_interactions(df)
train_data = make_sparse_matrix(df)

In [59]:
tfidf_old = TFIDFRecommender(K=200)
tfidf_old.fit(train_data)

  0%|          | 0/1855603 [00:00<?, ?it/s]

In [67]:
i2i_old = CosineRecommender(K=100)
i2i_old.fit(train_data)

  0%|          | 0/1855603 [00:00<?, ?it/s]

In [60]:
df_candidates_tfidf_old = implicit_batch_candidates_for_all_types(
    model=tfidf_old, model_name="tfidf_old",
    train_data=train_data, test_users=list(test_sessions_dict.keys()),
    topk=100,
)

100%|██████████| 266/266 [00:29<00:00,  8.89it/s]


In [68]:
df_candidates_i2i_old = implicit_batch_candidates_for_all_types(
    model=i2i_old, model_name="i2i_old",
    train_data=train_data, test_users=list(test_sessions_dict.keys()),
    topk=100,
)

100%|██████████| 266/266 [00:24<00:00, 10.95it/s]


In [61]:
df_candidates_tfidf_old = (
    df_candidates_tfidf_old
    .explode(["aid", "tfidf_old_score"])
    .filter(pl.col("aid") != -1)  # some strange items from implicit
    .unique(subset=["session", "aid"], keep="last")
    .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
    .sort("session")
    .fill_null(0)
)

In [62]:
df_candidates_tfidf_old_with_positives = print_candidates_hit_rate(df_candidates_tfidf_old)

sessions with positives in candidates:  188605
sessions with positives in test dataframe:  265206
hit rate:  0.7111641516406114
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 18563809 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 242361   │
└────────┴──────────┘


In [69]:
df_candidates_i2i_old = (
    df_candidates_i2i_old
    .explode(["aid", "i2i_old_score"])
    .filter(pl.col("aid") != -1)  # some strange items from implicit
    .unique(subset=["session", "aid"], keep="last")
    .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
    .sort("session")
    .fill_null(0)
)

In [70]:
df_candidates_i2i_old_with_positives = print_candidates_hit_rate(df_candidates_i2i_old)

sessions with positives in candidates:  176272
sessions with positives in test dataframe:  265206
hit rate:  0.6646606788685022
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 17354499 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 218371   │
└────────┴──────────┘


### new

In [71]:
df = pl.concat([val_df_train, val_df_valid_input]).unique().sort(["session", "ts"])
df = implicit_new_weight_interactions(df)
train_data = make_sparse_matrix(df)

In [22]:
tfidf_new = TFIDFRecommender(K=200)
tfidf_new.fit(train_data)

  0%|          | 0/1855603 [00:00<?, ?it/s]

In [72]:
i2i_new = CosineRecommender(K=200)
i2i_new.fit(train_data)

  0%|          | 0/1855603 [00:00<?, ?it/s]

In [49]:
df_candidates_tfidf_new = implicit_batch_candidates_for_all_types(
    model=tfidf_new, model_name="tfidf_new",
    train_data=train_data, test_users=list(test_sessions_dict.keys()),
    topk=100,
)

100%|██████████| 266/266 [00:32<00:00,  8.30it/s]


In [73]:
# Cosine item-to-item candidates (new interaction weights) for the sessions in
# test_sessions_dict, the same session list that the other candidate generators receive.
df_candidates_i2i_new = implicit_batch_candidates_for_all_types(
    model=i2i_new, model_name="i2i_new",
    train_data=train_data, test_users=list(test_sessions_dict.keys()),
    topk=100,
)

100%|██████████| 266/266 [00:33<00:00,  7.90it/s]


In [50]:
df_candidates_tfidf_new = (
    df_candidates_tfidf_new
    .explode(["aid", "tfidf_new_score"])
    .filter(pl.col("aid") != -1)  # some strange items from implicit
    .unique(subset=["session", "aid"], keep="last")
    .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
    .sort("session")
    .fill_null(0)
)

In [74]:
df_candidates_i2i_new = (
    df_candidates_i2i_new
    .explode(["aid", "i2i_new_score"])
    .filter(pl.col("aid") != -1)  # some strange items from implicit
    .unique(subset=["session", "aid"], keep="last")
    .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
    .sort("session")
    .fill_null(0)
)

In [52]:
df_candidates_tfidf_new_with_positives = print_candidates_hit_rate(df_candidates_tfidf_new)

sessions with positives in candidates:  188028
sessions with positives in test dataframe:  265206
hit rate:  0.7089884844234293
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 18508030 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 240440   │
└────────┴──────────┘


In [75]:
df_candidates_i2i_new_with_positives = print_candidates_hit_rate(df_candidates_i2i_new)

sessions with positives in candidates:  175969
sessions with positives in test dataframe:  265206
hit rate:  0.66351817078045
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 17325477 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 217093   │
└────────┴──────────┘


In [63]:
df_candidates_tfidf_old_new_top_100 = pl.concat([
    df_candidates_tfidf_old.select(["session", "aid", "target"]),
    df_candidates_tfidf_new.select(["session", "aid", "target"]),
]).unique()

In [64]:
df_candidates_tfidf_old_new_top_100_with_positives = print_candidates_hit_rate(
    df_candidates_tfidf_old_new_top_100
)

sessions with positives in candidates:  192544
sessions with positives in test dataframe:  265206
hit rate:  0.7260167567852914
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 24362998 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 250224   │
└────────┴──────────┘


In [76]:
df_candidates_tfidf_i2i_old_new_top_100 = pl.concat([
    df_candidates_tfidf_old.select(["session", "aid", "target"]),
    df_candidates_tfidf_new.select(["session", "aid", "target"]),
    df_candidates_i2i_old.select(["session", "aid", "target"]),
    df_candidates_i2i_new.select(["session", "aid", "target"]),
]).unique()

In [77]:
df_candidates_tfidf_i2i_old_new_top_100_with_positives = print_candidates_hit_rate(
    df_candidates_tfidf_i2i_old_new_top_100
)

sessions with positives in candidates:  193285
sessions with positives in test dataframe:  265206
hit rate:  0.728810811218449
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 35347915 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 251858   │
└────────┴──────────┘


In [53]:
df_candidates_tfidf_new_top_200 = implicit_batch_candidates_for_all_types(
    model=tfidf_new, model_name="tfidf_new",
    train_data=train_data, test_users=list(test_sessions_dict.keys()),
    topk=200,
)

100%|██████████| 266/266 [00:31<00:00,  8.44it/s]


In [54]:
df_candidates_tfidf_new_top_200 = (
    df_candidates_tfidf_new_top_200
    .explode(["aid", "tfidf_new_score"])
    .filter(pl.col("aid") != -1)  # some strange items from implicit
    .unique(subset=["session", "aid"], keep="last")
    .join(df_act_target, on=["session", "aid"], how="left")  # if using rank column as a feature
    .sort("session")
    .fill_null(0)
)

In [55]:
df_candidates_tfidf_new_top_200_with_positives = print_candidates_hit_rate(df_candidates_tfidf_new_top_200)

sessions with positives in candidates:  195930
sessions with positives in test dataframe:  265206
hit rate:  0.7387841904029321
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 38713692 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 257752   │
└────────┴──────────┘


In [56]:
df_candidates_covisit_all_tfidf_new_top_200 = pl.concat([
    df_candidates_covisit_all.select(["session", "aid", "target"]),
    df_candidates_tfidf_new_top_200.select(["session", "aid", "target"]),
]).unique()

In [57]:
df_candidates_covisit_all_tfidf_new_top_200_with_positives = print_candidates_hit_rate(
    df_candidates_covisit_all_tfidf_new_top_200
)

sessions with positives in candidates:  204472
sessions with positives in test dataframe:  265206
hit rate:  0.7709931147862417
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 57356795 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 278081   │
└────────┴──────────┘


In [65]:
df_candidates_covisit_all_tfidf_old_new_top_100 = pl.concat([
    df_candidates_covisit_all.select(["session", "aid", "target"]),
    df_candidates_tfidf_old_new_top_100.select(["session", "aid", "target"]),
]).unique()

In [66]:
df_candidates_covisit_all_tfidf_old_new_top_100_with_positives = print_candidates_hit_rate(
    df_candidates_covisit_all_tfidf_old_new_top_100
)

sessions with positives in candidates:  202061
sessions with positives in test dataframe:  265206
hit rate:  0.7619020685806505
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 42828456 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 272271   │
└────────┴──────────┘


In [81]:
df_candidates_covisit_all_tfidf_i2i_old_new_top_100 = pl.concat([
    df_candidates_covisit_all.select(["session", "aid", "target"]),
    df_candidates_i2i_old.select(["session", "aid", "target"]),
    df_candidates_i2i_new.select(["session", "aid", "target"]),
    df_candidates_tfidf_old.select(["session", "aid", "target"]),
    df_candidates_tfidf_new.select(["session", "aid", "target"]),
]).unique()

In [82]:
df_candidates_covisit_all_tfidf_i2i_old_new_top_100_with_positives = print_candidates_hit_rate(
    df_candidates_covisit_all_tfidf_i2i_old_new_top_100
)

sessions with positives in candidates:  202507
sessions with positives in test dataframe:  265206
hit rate:  0.7635837801558034
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 54014228 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 273327   │
└────────┴──────────┘


## current candidates

In [78]:
df_candidates_covisit_all_i2i_old_new_top_100 = pl.concat([
    df_candidates_covisit_all.select(["session", "aid", "target"]),
    df_candidates_i2i_old.select(["session", "aid", "target"]),
    df_candidates_i2i_new.select(["session", "aid", "target"]),
]).unique()

In [79]:
df_candidates_covisit_all_i2i_old_new_top_100_with_positives = print_candidates_hit_rate(
    df_candidates_covisit_all_i2i_old_new_top_100
)

sessions with positives in candidates:  198261
sessions with positives in test dataframe:  265206
hit rate:  0.7475735843080473
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 46329774 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 263968   │
└────────┴──────────┘
