In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from collections import Counter

from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import dill
import pandas as pd
import polars as pl
import numpy as np
from scipy.sparse import csr_matrix

# boosting
import lightgbm as lgb


import pytorch_lightning as L
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns

from tasks.data.dataset.mappers import EntityEncoder
from tasks.jobs import Splitter

In [3]:
from otto_utils import *
from otto_candidates_covisit import *

# Create covisitation matrices for submission

In [None]:
# v1
# params:
#   clicks_th = 15
#   carts_th  = 20
#   orders_th = 20
# 
#   carts_orders:
#     weights_func=weights_func_carts_order,
#     drop_dupli_func=drop_dupli_cols_for_carts_orders,
#     chunk_size=200_000,
#     attention_period=24*60*60*1000,
#     attention_type=None,
#     drop_th_sess_num=30,
#     save_top_k=carts_th,
# 
#   buys2buys:
#     weights_func=weights_func_orders,
#     drop_dupli_func=drop_dupli_cols_for_carts_orders,
#     chunk_size=200_000,
#     attention_period=14*24*60*60*1000,  # v4
#     attention_type=[1, 2],
#     drop_th_sess_num=30,
#     save_top_k=orders_th
# 
#   clicks:
#     weights_func=weights_func_clicks,
#     weights_func_params={"_type": "valid"},
#     drop_dupli_func=drop_dupli_cols_for_clicks,
#     chunk_size=200_000,
#     attention_period=24*60*60*1000,
#     attention_type=None,
#     drop_th_sess_num=30,
#     save_top_k=clicks_th,

In [4]:
df_train = pl.read_parquet(TRAIN_PROCESSED, use_pyarrow=True)
df_test = pl.read_parquet(TEST_PROCESSED, use_pyarrow=True)

In [5]:
df = pl.concat([df_train, df_test])

In [6]:
# Use top X for clicks, carts and orders
clicks_th = 15
carts_th  = 20
orders_th = 20

In [11]:
carts_orders, carts_orders_top = co_visitation_matrix(
    df,
    weights_func=weights_func_carts_order,
    drop_dupli_func=drop_dupli_cols_for_carts_orders,
    chunk_size=200_000,
    attention_period=24*60*60*1000,
    attention_type=None,
    drop_th_sess_num=30,
    save_top_k=carts_th,
)

100%|██████████| 73/73 [32:11<00:00, 26.46s/it]


In [12]:
carts_orders.write_parquet("__subm__covisit_carts_orders_all_v1.parquet")

In [7]:
buys2buys, buys2buys_top = co_visitation_matrix(
    df,
    weights_func=weights_func_orders,
    drop_dupli_func=drop_dupli_cols_for_carts_orders,
    chunk_size=200_000,
#     attention_period=7*24*60*60*1000,  # v3
    attention_period=14*24*60*60*1000,  # v4
    attention_type=[1, 2],
    drop_th_sess_num=30,
    save_top_k=orders_th
)

100%|██████████| 73/73 [05:04<00:00,  4.18s/it]


In [8]:
buys2buys.write_parquet("__subm__covisit_buys2buys_all_v1.parquet")

In [9]:
clicks, clicks_top = co_visitation_matrix(
    df,
    weights_func=weights_func_clicks,
    weights_func_params={"_type": "test"},
    drop_dupli_func=drop_dupli_cols_for_clicks,
    chunk_size=200_000,
    attention_period=24*60*60*1000,
    attention_type=None,
    drop_th_sess_num=30,
    save_top_k=clicks_th,
)

100%|██████████| 73/73 [31:55<00:00, 26.24s/it]


In [10]:
clicks.write_parquet("__subm__covisit_clicks_all_v1.parquet")

# Valid

In [9]:
# load
val_df_train = pl.read_parquet(VALIDATION_PATH / "train.parquet", use_pyarrow=True)
val_df_valid_input = pl.read_parquet(VALIDATION_PATH / "valid.parquet", use_pyarrow=True)
val_df_valid_targets = pl.read_parquet(VALIDATION_PATH / "test_labels.parquet", use_pyarrow=True)

In [10]:
val_df_valid_input.sort(["session", "ts"], reverse=[False, True])

session,aid,ts,type
i64,i64,i64,i64
11107743,1243310,1661391126514,0
11107743,731062,1661126418250,0
11107743,1243310,1661126400192,0
11107744,1556644,1661126463911,0
11107744,1637748,1661126441080,0
11107744,1556644,1661126439064,1
11107744,220086,1661126401190,0
11107745,670066,1661126813401,0
11107745,1682122,1661126776415,0
11107745,670066,1661126651614,0


In [11]:
df_train = pl.read_parquet(TRAIN_PROCESSED, use_pyarrow=True)
df_test = pl.read_parquet(TEST_PROCESSED, use_pyarrow=True)

In [6]:
df = pl.concat([val_df_train, val_df_valid_input])

In [7]:
df["session"].n_unique()

12411098

In [8]:
val_df_train["session"].max(), val_df_valid_input["session"].min(), val_df_valid_input["session"].max()

(11107742, 11107743, 12899777)

In [17]:
val_df_valid_input.groupby("session").agg([pl.count("aid")])["aid"].value_counts().sort("aid")

aid,counts
u32,u32
1,473109
2,233168
3,139059
4,91769
5,64497
6,48172
7,36971
8,29171
9,23410
10,19633


In [16]:
df_test.groupby("session").agg([pl.count("aid")])["aid"].value_counts().sort("aid")

aid,counts
u32,u32
1,750099
2,294443
3,160436
4,101337
5,69364
6,49942
7,37946
8,29605
9,23411
10,19412


In [5]:
import numpy as np

np.logspace(0.1,1, 30, base=2, endpoint=True) - 1

array([0.07177346, 0.09507864, 0.11889059, 0.14322031, 0.16807907,
       0.19347836, 0.21942996, 0.24594586, 0.27303833, 0.30071992,
       0.32900343, 0.35790195, 0.38742885, 0.41759781, 0.44842277,
       0.479918  , 0.51209808, 0.54497791, 0.57857268, 0.61289796,
       0.64796963, 0.68380391, 0.72041739, 0.75782701, 0.79605009,
       0.83510431, 0.87500774, 0.91577885, 0.95743651, 1.        ])

# utils 

In [9]:
def weights_func_carts_order(df, _type_weight=None):
    if _type_weight is None:
        _type_weight = {"type_right": [0, 1, 2], "weight": [0.5, 9, 0.5]}
    df_type_weight = pl.DataFrame(_type_weight)
    df = df.join(df_type_weight, on="type_right")
    return df

def weights_func_orders(df):
    df = df.with_column(pl.lit(1).alias("weight"))
    return df

def weights_func_clicks(df, _type="valid"):
    if _type == "valid":
        df = df.with_column(
            (1 + 3 * (pl.col("ts") - 1659304800025) / (1661723998621 - 1659304800025)).alias("weight")
        )
    elif _type == "test":
        df = df.with_column(
            (1 + 3 * (pl.col("ts") - 1659304800025) / (1662328791563 - 1659304800025)).alias("weight")
        )
    # 1659304800025 : minimum timestamp
    # 1661723998621 : maximum timestamp valid
    # 1662328791563 : maximum timestamp test
    return df

In [10]:
def drop_dupli_cols_for_carts_orders(df):
    df = (
        df.select(['session', 'aid', 'aid_right', 'type_right']).unique()
    )
    return df

def drop_dupli_cols_for_clicks(df):
    df = (
        df
        .groupby(['session', 'aid', 'aid_right'])
        .agg([pl.first('ts').alias("ts")])
    )
    return df

In [11]:
def co_visitation_matrix(
    df,
    weights_func,
    drop_dupli_func,
    weights_func_params=None,
    chunk_size=200_000,
    drop_th_sess_num=30,
    attention_period=24*60*60*1000,
    attention_type=None,
    save_top_k=20,
):
    for session in tqdm(range(0, df["session"].max(), chunk_size)):
        s_start = session
        s_end = session + chunk_size - 1

        df_chunk = df.filter(pl.col("session").is_between(s_start, s_end))
        if attention_type is not None:
            df_chunk = df_chunk.filter(pl.col("type").is_in(attention_type))
        df_chunk = (
            df_chunk
            .sort(["session", "ts"], reverse=[False, True])
            .with_column(pl.lit(1).alias("ones"))
            .with_column(pl.col("ones").cumsum().over("session").alias("rank"))
            .select([pl.all().exclude("ones"),])
            .filter(pl.col("rank") <= drop_th_sess_num)
        )
        
        # create pairs
        df_chunk = df_chunk.join(df_chunk, on="session", how="inner")
        df_chunk = (
            df_chunk
            .filter(( (pl.col("ts") - pl.col("ts_right")) < attention_period) & 
                    (pl.col("aid") != pl.col("aid_right")))
        )
        
        df_chunk = drop_dupli_func(df_chunk)
#         print(df_chunk)
        if weights_func_params is not None:
            df_chunk = weights_func(df_chunk, **weights_func_params)
        else:
            df_chunk = weights_func(df_chunk)

        df_chunk = (
            df_chunk
            .groupby(['aid', 'aid_right'])
            .agg([pl.sum("weight")])
        )

        if session == 0: tmp = df_chunk
        else: tmp = pl.concat([tmp, df_chunk]).groupby(['aid', 'aid_right']).agg([pl.sum("weight")])

    tmp = tmp.sort(['aid','weight'], reverse=[False, True])
    
    tmp_top_k = (
        tmp
        .with_column(pl.lit(1).alias("ones"))
        .with_column(pl.col("ones").cumsum().over("aid").alias("rank"))
        .filter(pl.col("rank") <= save_top_k)
    )
    return tmp, tmp_top_k

# new pipeline

In [12]:
# Use top X for clicks, carts and orders
clicks_th = 15
carts_th  = 20
orders_th = 20

In [13]:
carts_orders, carts_orders_top = co_visitation_matrix(
    df,
    weights_func=weights_func_carts_order,
    drop_dupli_func=drop_dupli_cols_for_carts_orders,
    chunk_size=200_000,
    attention_period=24*60*60*1000,
    attention_type=None,
    drop_th_sess_num=30,
    save_top_k=carts_th,
)

  0%|          | 0/65 [00:00<?, ?it/s]

In [14]:
carts_orders.write_parquet("__valid__covisit_carts_orders_all_v3.parquet")

In [51]:
buys2buys, buys2buys_top = co_visitation_matrix(
    df,
    weights_func=weights_func_orders,
    drop_dupli_func=drop_dupli_cols_for_carts_orders,
    chunk_size=200_000,
#     attention_period=7*24*60*60*1000,  # v3
    attention_period=14*24*60*60*1000,  # v4
    attention_type=[1, 2],
    drop_th_sess_num=30,
    save_top_k=orders_th
)

  0%|          | 0/65 [00:00<?, ?it/s]

In [52]:
buys2buys.write_parquet("__valid__covisit_buys2buys_all_v4.parquet")

In [38]:
clicks, clicks_top = co_visitation_matrix(
    df,
    weights_func=weights_func_clicks,
    weights_func_params={"_type": "valid"},
    drop_dupli_func=drop_dupli_cols_for_clicks,
    chunk_size=200_000,
    attention_period=24*60*60*1000,
    attention_type=None,
    drop_th_sess_num=30,
    save_top_k=clicks_th,
)

  0%|          | 0/65 [00:00<?, ?it/s]

In [39]:
clicks.write_parquet("__valid__covisit_clicks_all_v3.parquet")

In [26]:
carts_orders.write_parquet("__valid__covisit_carts_orders_all_v2.parquet")
buys2buys.write_parquet("__valid__covisit_buys2buys_all_v2.parquet")
clicks.write_parquet("__valid__covisit_clicks_all_v2.parquet")

In [18]:
clicks = pl.read_parquet("__valid__covisit_clicks_all_v2.parquet")
clicks_top = (
    clicks
    .with_column(pl.lit(1).alias("ones"))
    .with_column(pl.col("ones").cumsum().over("aid").alias("rank"))
    .filter(pl.col("rank") <= clicks_th)
)

In [53]:
def pldf_to_dict(df):
    df = df.groupby("aid").agg([pl.list("aid_right")])
    return dict(zip(df["aid"].to_list(), df["aid_right"].to_list()))

top_k_buys = pldf_to_dict(carts_orders_top)
top_k_buy2buy = pldf_to_dict(buys2buys_top)
top_k_clicks = pldf_to_dict(clicks_top)

In [54]:
top_clicks = val_df_valid_input.filter(pl.col("type") == 0)["aid"].value_counts(sort=True)[:20]["aid"].to_list()
top_carts = val_df_valid_input.filter(pl.col("type") == 1)["aid"].value_counts(sort=True)[:20]["aid"].to_list()
top_orders = val_df_valid_input.filter(pl.col("type") == 2)["aid"].value_counts(sort=True)[:20]["aid"].to_list()

In [55]:
import itertools

type_weight_multipliers = {0: 0.5, 1: 9, 2: 0.5}

In [56]:
def recommend_clicks(session_aid_list, session_type_list, topk=20):
    aids = session_aid_list
    types = session_type_list
    unique_aids = list(dict.fromkeys(aids[::-1]))

    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= topk:
        weights=np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids, weights, types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        sorted_aids = [k for k,v in aids_temp.most_common(topk)]
        return sorted_aids

    # USE "CLICKS" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_k_clicks[aid] for aid in unique_aids if aid in top_k_clicks]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2).most_common(topk) if aid2 not in unique_aids]    
    result = unique_aids + top_aids2[:topk - len(unique_aids)]
    set_result = set(result)  # remove duplicates
    return result + [i for i in top_clicks if i not in set_result][:topk - len(result)]
    
    # USE TOP20 TEST CLICKS
#     return result + list(top_clicks)[:topk-len(result)]

In [57]:
def recommend_carts(session_aid_list, session_type_list, topk=20):
    aids = session_aid_list
    types = session_type_list
    unique_aids = list(dict.fromkeys(aids[::-1]))

    buy_aids = [aid for i, aid in enumerate(aids) if types[i] != 2]
    buy_types = [t for i, t in enumerate(types) if types[i] != 2]
    unique_buys = list(dict.fromkeys(buy_aids[::-1]))

    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= topk:
        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids, weights, types): 
            aids_temp[aid] += w * type_weight_multipliers[t]

        # Rerank candidates using"top_20_carts" co-visitation matrix
        aids2 = list(itertools.chain(*[top_k_buys[aid] for aid in unique_buys if aid in top_k_buys]))
        for aid in aids2: aids_temp[aid] += 0.1
        sorted_aids = [k for k,v in aids_temp.most_common(20)]
        return sorted_aids

    # Use "cart order" and "clicks" co-visitation matrices
    aids1 = list(itertools.chain(*[top_k_clicks[aid] for aid in unique_aids if aid in top_k_clicks]))
    aids2 = list(itertools.chain(*[top_k_buys[aid] for aid in unique_aids if aid in top_k_buys]))

    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids1+aids2).most_common(topk) if aid2 not in unique_aids]    
    result = unique_aids + top_aids2[:topk - len(unique_aids)]
    set_result = set(result)  # remove duplicates
    return result + [i for i in top_carts if i not in set_result][:topk - len(result)]
    # USE TOP20 TEST CLICKS
#     return result + list(top_clicks)[:topk-len(result)]

In [58]:
def recommend_buys(session_aid_list, session_type_list, topk=20):
    aids = session_aid_list
    types = session_type_list
    unique_aids = list(dict.fromkeys(aids[::-1]))
    
    buy_aids = [aid for i, aid in enumerate(aids) if types[i] != 0]
    buy_types = [t for i, t in enumerate(types) if types[i] != 0]
    unique_buys = list(dict.fromkeys(buy_aids[::-1]))

    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids)>=topk:
        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
        aids3 = list(itertools.chain(*[top_k_buy2buy[aid] for aid in unique_buys if aid in top_k_buy2buy]))
        for aid in aids3: aids_temp[aid] += 0.1
        sorted_aids = [k for k,v in aids_temp.most_common(topk)]
        return sorted_aids

    # USE "CART ORDER" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_k_buys[aid] for aid in unique_aids if aid in top_k_buys]))
    # USE "BUY2BUY" CO-VISITATION MATRIX
    aids3 = list(itertools.chain(*[top_k_buy2buy[aid] for aid in unique_buys if aid in top_k_buy2buy]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2+aids3).most_common(topk) if aid2 not in unique_aids] 
    result = unique_aids + top_aids2[:topk - len(unique_aids)]
    set_result = set(result)  # remove duplicates
    return result + [i for i in top_orders if i not in set_result][:topk - len(result)]

    # USE TOP20 TEST ORDERS
#     return result + list(top_orders)[:topk-len(result)]

In [59]:
test_df = val_df_valid_input.sort(["session", "ts"])
test_session_dict = test_df.groupby('session').agg([pl.list("aid"), pl.list("type")])
test_session_dict = dict(zip(test_session_dict["session"].to_list(),
                             tuple(zip(test_session_dict["aid"].to_list(), test_session_dict["type"].to_list()))
                        ))

In [60]:
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks", "carts", "orders"][:1]
topk = 20


for session_id, (session_aid_list, session_type_list) in tqdm(test_session_dict.items()):
    rec_items = recommend_clicks(session_aid_list, session_type_list, topk)
    
    session_types = [f"{session_id}_{t}" for t in types]
    labels = " ".join(str(aid) for aid in rec_items)
    labels_list = [labels] * len(types)
    
    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

df_submission_clicks = pl.DataFrame(submission_dict)

  0%|          | 0/1303355 [00:00<?, ?it/s]

In [61]:
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks", "carts", "orders"][1:2]
topk = 20


for session_id, (session_aid_list, session_type_list) in tqdm(test_session_dict.items()):
    rec_items = recommend_carts(session_aid_list, session_type_list, topk)
    
    session_types = [f"{session_id}_{t}" for t in types]
    labels = " ".join(str(aid) for aid in rec_items)
    labels_list = [labels] * len(types)
    
    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

df_submission_carts = pl.DataFrame(submission_dict)

  0%|          | 0/1303355 [00:00<?, ?it/s]

In [62]:
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks", "carts", "orders"][2:]
topk = 20


for session_id, (session_aid_list, session_type_list) in tqdm(test_session_dict.items()):
    rec_items = recommend_buys(session_aid_list, session_type_list, topk)
    
    session_types = [f"{session_id}_{t}" for t in types]
    labels = " ".join(str(aid) for aid in rec_items)
    labels_list = [labels] * len(types)
    
    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

df_submission_orders = pl.DataFrame(submission_dict)

  0%|          | 0/1303355 [00:00<?, ?it/s]

In [63]:
# LB (???) - small preprocess fix
df_submission = pl.concat([df_submission_clicks, df_submission_carts, df_submission_orders])
valid_stats = calc_valid_score(df_submission, topk=20)

validation score: 0.5553375772931675
recall per type: type
carts    0.3932
clicks   0.5155
orders   0.6430
dtype: float64


In [50]:
# LB (???) - small preprocess fix
df_submission = pl.concat([df_submission_clicks, df_submission_carts, df_submission_orders])
valid_stats = calc_valid_score(df_submission, topk=20)

validation score: 0.5553063559127661
recall per type: type
carts    0.3932
clicks   0.5155
orders   0.6430
dtype: float64


In [31]:
0.5553 + 0.021

0.5763

In [29]:
# LB (???) - small preprocess fix
df_submission = pl.concat([df_submission_clicks, df_submission_carts, df_submission_orders])
valid_stats = calc_valid_score(df_submission, topk=20)

validation score: 0.5553075399331157
recall per type: type
carts    0.3932
clicks   0.5154
orders   0.6430
dtype: float64


In [69]:
# LB (???)
df_submission = pl.concat([df_submission_clicks, df_submission_carts, df_submission_orders])
valid_stats = calc_valid_score(df_submission, topk=20)

validation score: 0.5527826139371954
recall per type: type
carts    0.3912
clicks   0.5154
orders   0.6398
dtype: float64


In [44]:
# LB (???)
df_submission = pl.concat([df_submission_clicks, df_submission_carts, df_submission_orders])
valid_stats = calc_valid_score(df_submission, topk=20)

validation score: 0.5527566258603513
recall per type: type
carts    0.3910
clicks   0.5154
orders   0.6399
dtype: float64


# buys

In [34]:
chunk_size = 200_000
type_weight = {
    0:0.5,
    1:9,
    2:0.5
}
type_map = {
    "clicks": 0,
    "carts": 1,
    "orders": 2
}
df_type_weight = pl.DataFrame({"type": [0, 1, 2], "weight": [1, 3, 6]})

for session in tqdm(range(0, val_df_valid_input["session"].max(), chunk_size)):
    s_start = session
    s_end = session + chunk_size - 1
    
    df_chunk = df.filter(pl.col("session").is_between(s_start, s_end))
    
    df_chunk = (
        df_chunk
        .sort(["session", "ts"], reverse=[False, True])
        .with_column(pl.lit(1).alias("ones"))
        .with_column(pl.col("ones").cumsum().over("session").alias("rank"))
        .select([
            pl.all().exclude("ones"),
        ])
        .filter(pl.col("rank") <= 30)
    )
    
    # create pairs
    df_chunk = df_chunk.join(df_chunk, on="session", how="inner")
    df_chunk = (
        df_chunk
        .filter((pl.col("ts") - pl.col("ts_right") < 24 * 60 * 60 * 1000) & 
                (pl.col("aid") != pl.col("aid_right")))
    )
    
    # ASSIGN WEIGHTS
    df_chunk = (
        df_chunk
        .groupby(['session', 'aid', 'aid_right'])
        .agg([pl.first('type_right').alias("type")])
        .join(df_type_weight, on="type")
        .groupby(['aid', 'aid_right'])
        .agg([pl.sum("weight")])
    )
    
    if session == 0: tmp = df_chunk
    else: tmp = pl.concat([tmp, df_chunk]).groupby(['aid', 'aid_right']).agg([pl.sum("weight")])

tmp = (
    tmp
    .sort(['aid','weight'], reverse=[False, True])
    .with_column(pl.lit(1).alias("ones"))
    .with_column(pl.col("ones").cumsum().over("aid").alias("rank"))
    .filter(pl.col("rank") <= 15)
)

  0%|          | 0/65 [00:00<?, ?it/s]

In [36]:
tmp.drop("ones").write_parquet("__valid__top_15_carts_orders.parquet")

# Buy2Buy

In [37]:
chunk_size = 200_000
type_weight = {0:1, 1:6, 2:3}
type_map = {
    "clicks": 0,
    "carts": 1,
    "orders": 2
}
df_type_weight = pl.DataFrame({"type": [1, 2], "weight": [1, 1]})

for session in tqdm(range(0, val_df_valid_input["session"].max(), chunk_size)):
    s_start = session
    s_end = session + chunk_size - 1
    
    df_chunk = df.filter(pl.col("session").is_between(s_start, s_end))
    
    df_chunk = (
        df_chunk
        .filter(pl.col("type").is_in([1, 2]))
        .sort(["session", "ts"], reverse=[False, True])
        .with_column(pl.lit(1).alias("ones"))
        .with_column(pl.col("ones").cumsum().over("session").alias("rank"))
        .select([
            pl.all().exclude("ones"),
        ])
        .filter(pl.col("rank") <= 30)
    )
    
    # create pairs
    df_chunk = df_chunk.join(df_chunk, on="session", how="inner")
    df_chunk = (
        df_chunk
        .filter((pl.col("ts") - pl.col("ts_right") < 14 * 24 * 60 * 60 * 1000) &  # 14 DAYS
                (pl.col("aid") != pl.col("aid_right")))
    )
    
    # ASSIGN WEIGHTS
    df_chunk = (
        df_chunk
        .groupby(['session', 'aid', 'aid_right'])
        .agg([pl.first('type_right').alias("type")])
        .join(df_type_weight, on="type")
        .groupby(['aid', 'aid_right'])
        .agg([pl.sum("weight")])
    )
    
    if session == 0: tmp2 = df_chunk
    else: tmp2 = pl.concat([tmp2, df_chunk]).groupby(['aid', 'aid_right']).agg([pl.sum("weight")])

tmp2 = (
    tmp2
    .sort(['aid','weight'], reverse=[False, True])
    .with_column(pl.lit(1).alias("ones"))
    .with_column(pl.col("ones").cumsum().over("aid").alias("rank"))
    .filter(pl.col("rank") <= 15)
)

  0%|          | 0/65 [00:00<?, ?it/s]

In [39]:
tmp2.drop("ones").write_parquet("__valid__top_15_buy2buy.parquet")

# Clicks discounted time

In [49]:
chunk_size = 200_000
type_weight = {0:1, 1:6, 2:3}
type_map = {
    "clicks": 0,
    "carts": 1,
    "orders": 2
}
df_type_weight = pl.DataFrame({"type": [1, 2], "weight": [1, 1]})

for session in tqdm(range(0, val_df_valid_input["session"].max(), chunk_size)):
    s_start = session
    s_end = session + chunk_size - 1
    
    df_chunk = df.filter(pl.col("session").is_between(s_start, s_end))
    
    df_chunk = (
        df_chunk
        .sort(["session", "ts"], reverse=[False, True])
        .with_column(pl.lit(1).alias("ones"))
        .with_column(pl.col("ones").cumsum().over("session").alias("rank"))
        .select([
            pl.all().exclude("ones"),
        ])
        .filter(pl.col("rank") <= 30)
    )
    
    # create pairs
    df_chunk = df_chunk.join(df_chunk, on="session", how="inner")
    df_chunk = (
        df_chunk
        .filter((pl.col("ts") - pl.col("ts_right") < 24 * 60 * 60 * 1000) & 
                (pl.col("aid") != pl.col("aid_right")))
    )
    
    # ASSIGN WEIGHTS
#     df['wgt'] = 1 + 3*(df.ts_x - 1659304800)/(1662328791-1659304800)
    df_chunk = (
        df_chunk
        .groupby(['session', 'aid', 'aid_right'])
        .agg([pl.first('ts').alias("ts")])
        .with_column((1 + 3 * (pl.col("ts") - 1659304800025) / (1661723998621 - 1659304800025)).alias("weight"))
        .groupby(['aid', 'aid_right'])
        .agg([pl.sum("weight")])
    )
    
    if session == 0: tmp3 = df_chunk
    else: tmp3 = pl.concat([tmp3, df_chunk]).groupby(['aid', 'aid_right']).agg([pl.sum("weight")])

tmp3 = (
    tmp3
    .sort(['aid','weight'], reverse=[False, True])
    .with_column(pl.lit(1).alias("ones"))
    .with_column(pl.col("ones").cumsum().over("aid").alias("rank"))
    .filter(pl.col("rank") <= 20)
)

  0%|          | 0/65 [00:00<?, ?it/s]

In [50]:
tmp3.drop("ones").write_parquet("__valid__top_20_clicks.parquet")

In [42]:
(1662328791-1659304800 ) / 86400

34.99989583333333

In [44]:
df["ts"].min(), df["ts"].max()

(1659304800025, 1661723998621)

In [45]:
df_train["ts"].min(), df_test["ts"].max()

(1659304800025, 1662328791563)

In [55]:
def pldf_to_dict(df):
    df = df.groupby("aid").agg([pl.list("aid_right")])
    return dict(zip(df["aid"].to_list(), df["aid_right"].to_list()))

top_20_buys = pldf_to_dict(tmp)
top_20_buy2buy = pldf_to_dict(tmp2)
top_20_clicks = pldf_to_dict(tmp3)

top_clicks = val_df_valid_input.filter(pl.col("type") == 0)["aid"].value_counts(sort=True)[:20]["aid"].to_list()
top_orders = val_df_valid_input.filter(pl.col("type") == 2)["aid"].value_counts(sort=True)[:20]["aid"].to_list()

In [79]:
top_clicks = val_df_valid_input.filter(pl.col("type") == 0)["aid"].value_counts(sort=True)[:20]["aid"].to_list()
top_orders = val_df_valid_input.filter(pl.col("type") == 2)["aid"].value_counts(sort=True)[:20]["aid"].to_list()

In [63]:
test_df = val_df_valid_input.sort(["session", "ts"])
test_session_dict = test_df.groupby('session').agg([pl.list("aid"), pl.list("type")])
test_session_dict = dict(zip(test_session_dict["session"].to_list(),
                             tuple(zip(test_session_dict["aid"].to_list(), test_session_dict["type"].to_list()))
                        ))

In [80]:
import itertools

type_weight_multipliers = {0: 1, 1: 3, 2: 6}

def recommend_clicks(session_aid_list, session_type_list, topk=20):
    aids = session_aid_list
    types = session_type_list
    unique_aids = list(dict.fromkeys(aids[::-1]))

    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= topk:
        weights=np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids, weights, types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        sorted_aids = [k for k,v in aids_temp.most_common(topk)]
        return sorted_aids

    # USE "CLICKS" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2).most_common(topk) if aid2 not in unique_aids]    
    result = unique_aids + top_aids2[:topk - len(unique_aids)]
    # USE TOP20 TEST CLICKS
    return result + list(top_clicks)[:topk-len(result)]

def recommend_buys(session_aid_list, session_type_list, topk=20):
    aids = session_aid_list
    types = session_type_list
    unique_aids = list(dict.fromkeys(aids[::-1]))
    
    buy_aids = [aid for i, aid in enumerate(aids) if types[i] != 0]
    buy_types = [t for i, t in enumerate(types) if types[i] != 0]
    unique_buys = list(dict.fromkeys(buy_aids[::-1]))

    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids)>=topk:
        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
        aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
        for aid in aids3: aids_temp[aid] += 0.1
        sorted_aids = [k for k,v in aids_temp.most_common(topk)]
        return sorted_aids
    # USE "CART ORDER" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_buys[aid] for aid in unique_aids if aid in top_20_buys]))
    # USE "BUY2BUY" CO-VISITATION MATRIX
    aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2+aids3).most_common(topk) if aid2 not in unique_aids] 
    result = unique_aids + top_aids2[:topk - len(unique_aids)]
    # USE TOP20 TEST ORDERS
    return result + list(top_orders)[:topk-len(result)]

In [82]:
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks", "carts", "orders"][:1]
topk = 20


for session_id, (session_aid_list, session_type_list) in tqdm(test_session_dict.items()):
    rec_items = recommend_clicks(session_aid_list, session_type_list, topk)
    
    session_types = [f"{session_id}_{t}" for t in types]
    labels = " ".join(str(aid) for aid in rec_items)
    labels_list = [labels] * len(types)
    
    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

  0%|          | 0/1303355 [00:00<?, ?it/s]

In [83]:
df_submission_clicks = pl.DataFrame(submission_dict)

In [84]:
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks", "carts", "orders"][1:]
topk = 20


for session_id, (session_aid_list, session_type_list) in tqdm(test_session_dict.items()):
    rec_items = recommend_buys(session_aid_list, session_type_list, topk)
    
    session_types = [f"{session_id}_{t}" for t in types]
    labels = " ".join(str(aid) for aid in rec_items)
    labels_list = [labels] * len(types)
    
    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

  0%|          | 0/1303355 [00:00<?, ?it/s]

In [85]:
df_submission_carts_orders = pl.DataFrame(submission_dict)

In [86]:
# LB 0.574
df_submission = pl.concat([df_submission_clicks, df_submission_carts_orders])
valid_stats = calc_valid_score(df_submission, topk=20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels["ground_truth"].loc[labels_null_idx] = (


validation score: 0.5533798707316397
recall per type: type
carts    0.3894
clicks   0.5207
orders   0.6408
dtype: float64


In [37]:
0.5207*0.1 + 0.3932*0.3 + 0.6430 * 0.6

0.55583

# for submission

In [13]:
df = pl.concat([df_train, df_test])

In [88]:
chunk_size = 200_000
type_weight = {0:1, 1:6, 2:3}
type_map = {
    "clicks": 0,
    "carts": 1,
    "orders": 2
}
df_type_weight = pl.DataFrame({"type": [0, 1, 2], "weight": [1, 3, 6]})

for session in tqdm(range(0, df_test["session"].max(), chunk_size)):
    s_start = session
    s_end = session + chunk_size - 1
    
    df_chunk = df.filter(pl.col("session").is_between(s_start, s_end))
    
    df_chunk = (
        df_chunk
        .sort(["session", "ts"], reverse=[False, True])
        .with_column(pl.lit(1).alias("ones"))
        .with_column(pl.col("ones").cumsum().over("session").alias("rank"))
        .select([
            pl.all().exclude("ones"),
        ])
        .filter(pl.col("rank") <= 30)
    )
    
    # create pairs
    df_chunk = df_chunk.join(df_chunk, on="session", how="inner")
    df_chunk = (
        df_chunk
        .filter((pl.col("ts") - pl.col("ts_right") < 24 * 60 * 60 * 1000) & 
                (pl.col("aid") != pl.col("aid_right")))
    )
    
    # ASSIGN WEIGHTS
    df_chunk = (
        df_chunk
        .groupby(['session', 'aid', 'aid_right'])
        .agg([pl.first('type_right').alias("type")])
        .join(df_type_weight, on="type")
        .groupby(['aid', 'aid_right'])
        .agg([pl.sum("weight")])
    )
    
    if session == 0: tmp = df_chunk
    else: tmp = pl.concat([tmp, df_chunk]).groupby(['aid', 'aid_right']).agg([pl.sum("weight")])

tmp = (
    tmp
    .sort(['aid','weight'], reverse=[False, True])
    .with_column(pl.lit(1).alias("ones"))
    .with_column(pl.col("ones").cumsum().over("aid").alias("rank"))
#     .filter(pl.col("rank") <= 15)
)

  0%|          | 0/73 [00:00<?, ?it/s]

In [89]:
tmp.drop("ones").write_parquet("__subm__all_carts_orders.parquet")
tmp.drop("ones").filter(pl.col("rank") <= 15).write_parquet("__subm__top_15_carts_orders.parquet")

In [90]:
chunk_size = 200_000
type_weight = {0:1, 1:6, 2:3}
type_map = {
    "clicks": 0,
    "carts": 1,
    "orders": 2
}
df_type_weight = pl.DataFrame({"type": [1, 2], "weight": [1, 1]})

for session in tqdm(range(0, df_test["session"].max(), chunk_size)):
    s_start = session
    s_end = session + chunk_size - 1
    
    df_chunk = df.filter(pl.col("session").is_between(s_start, s_end))
    
    df_chunk = (
        df_chunk
        .filter(pl.col("type").is_in([1, 2]))
        .sort(["session", "ts"], reverse=[False, True])
        .with_column(pl.lit(1).alias("ones"))
        .with_column(pl.col("ones").cumsum().over("session").alias("rank"))
        .select([
            pl.all().exclude("ones"),
        ])
        .filter(pl.col("rank") <= 30)
    )
    
    # create pairs
    df_chunk = df_chunk.join(df_chunk, on="session", how="inner")
    df_chunk = (
        df_chunk
        .filter((pl.col("ts") - pl.col("ts_right") < 14 * 24 * 60 * 60 * 1000) &  # 14 DAYS
                (pl.col("aid") != pl.col("aid_right")))
    )
    
    # ASSIGN WEIGHTS
    df_chunk = (
        df_chunk
        .groupby(['session', 'aid', 'aid_right'])
        .agg([pl.first('type_right').alias("type")])
        .join(df_type_weight, on="type")
        .groupby(['aid', 'aid_right'])
        .agg([pl.sum("weight")])
    )
    
    if session == 0: tmp2 = df_chunk
    else: tmp2 = pl.concat([tmp2, df_chunk]).groupby(['aid', 'aid_right']).agg([pl.sum("weight")])

tmp2 = (
    tmp2
    .sort(['aid','weight'], reverse=[False, True])
    .with_column(pl.lit(1).alias("ones"))
    .with_column(pl.col("ones").cumsum().over("aid").alias("rank"))
#     .filter(pl.col("rank") <= 15)
)

  0%|          | 0/73 [00:00<?, ?it/s]

In [91]:
tmp2.drop("ones").write_parquet("__subm__all_buy2buy.parquet")
tmp2.drop("ones").filter(pl.col("rank") <= 15).write_parquet("__subm__top_15_buy2buy.parquet")

In [92]:
chunk_size = 200_000
type_weight = {0:1, 1:6, 2:3}
type_map = {
    "clicks": 0,
    "carts": 1,
    "orders": 2
}
df_type_weight = pl.DataFrame({"type": [1, 2], "weight": [1, 1]})

for session in tqdm(range(0, df_test["session"].max(), chunk_size)):
    s_start = session
    s_end = session + chunk_size - 1
    
    df_chunk = df.filter(pl.col("session").is_between(s_start, s_end))
    
    df_chunk = (
        df_chunk
        .sort(["session", "ts"], reverse=[False, True])
        .with_column(pl.lit(1).alias("ones"))
        .with_column(pl.col("ones").cumsum().over("session").alias("rank"))
        .select([
            pl.all().exclude("ones"),
        ])
        .filter(pl.col("rank") <= 30)
    )
    
    # create pairs
    df_chunk = df_chunk.join(df_chunk, on="session", how="inner")
    df_chunk = (
        df_chunk
        .filter((pl.col("ts") - pl.col("ts_right") < 24 * 60 * 60 * 1000) & 
                (pl.col("aid") != pl.col("aid_right")))
    )
    
    # ASSIGN WEIGHTS
#     df['wgt'] = 1 + 3*(df.ts_x - 1659304800)/(1662328791-1659304800)
    df_chunk = (
        df_chunk
        .groupby(['session', 'aid', 'aid_right'])
        .agg([pl.first('ts').alias("ts")])
        .with_column((1 + 3 * (pl.col("ts") - 1659304800025) / (1662328791563 - 1659304800025)).alias("weight"))
        .groupby(['aid', 'aid_right'])
        .agg([pl.sum("weight")])
    )
    
    if session == 0: tmp3 = df_chunk
    else: tmp3 = pl.concat([tmp3, df_chunk]).groupby(['aid', 'aid_right']).agg([pl.sum("weight")])

tmp3 = (
    tmp3
    .sort(['aid','weight'], reverse=[False, True])
    .with_column(pl.lit(1).alias("ones"))
    .with_column(pl.col("ones").cumsum().over("aid").alias("rank"))
#     .filter(pl.col("rank") <= 20)
)

  0%|          | 0/73 [00:00<?, ?it/s]

In [97]:
tmp3.drop("ones").write_parquet("__subm__all_clicks.parquet")
tmp3.drop("ones").filter(pl.col("rank") <= 20).write_parquet("__subm__top_20_clicks.parquet")

In [11]:
def pldf_to_dict(df):
    df = df.groupby("aid").agg([pl.list("aid_right")])
    return dict(zip(df["aid"].to_list(), df["aid_right"].to_list()))

top_20_buys = pldf_to_dict(pl.read_parquet("__subm__top_15_carts_orders.parquet"))
top_20_buy2buy = pldf_to_dict(pl.read_parquet("__subm__top_15_buy2buy.parquet"))
top_20_clicks = pldf_to_dict(pl.read_parquet("__subm__top_20_clicks.parquet"))

top_clicks = df_test.filter(pl.col("type") == 0)["aid"].value_counts(sort=True)[:20]["aid"].to_list()
top_orders = df_test.filter(pl.col("type") == 2)["aid"].value_counts(sort=True)[:20]["aid"].to_list()

In [14]:
test_df = df_test.sort(["session", "ts"])
test_session_dict = test_df.groupby('session').agg([pl.list("aid"), pl.list("type")])
test_session_dict = dict(zip(test_session_dict["session"].to_list(),
                             tuple(zip(test_session_dict["aid"].to_list(), test_session_dict["type"].to_list()))
                        ))

In [15]:
import itertools

type_weight_multipliers = {0: 1, 1: 3, 2: 6}

def recommend_clicks(session_aid_list, session_type_list, topk=20):
    aids = session_aid_list
    types = session_type_list
    unique_aids = list(dict.fromkeys(aids[::-1]))

    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= topk:
        weights=np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids, weights, types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        sorted_aids = [k for k,v in aids_temp.most_common(topk)]
        return sorted_aids

    
    # USE "CLICKS" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2).most_common(topk) if aid2 not in unique_aids]    
    result = unique_aids + top_aids2[:topk - len(unique_aids)]
    # USE TOP20 TEST CLICKS
    return result + list(top_clicks)[:topk-len(result)]

def recommend_buys(session_aid_list, session_type_list, topk=20):
    aids = session_aid_list
    types = session_type_list
    unique_aids = list(dict.fromkeys(aids[::-1]))
    
    buy_aids = [aid for i, aid in enumerate(aids) if types[i] != 0]
    buy_types = [t for i, t in enumerate(types) if types[i] != 0]
    unique_buys = list(dict.fromkeys(buy_aids[::-1]))

    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids)>=topk:
        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
        aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
        for aid in aids3: aids_temp[aid] += 0.1
        sorted_aids = [k for k,v in aids_temp.most_common(topk)]
        return sorted_aids
    # USE "CART ORDER" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_buys[aid] for aid in unique_aids if aid in top_20_buys]))
    # USE "BUY2BUY" CO-VISITATION MATRIX
    aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2+aids3).most_common(topk) if aid2 not in unique_aids] 
    result = unique_aids + top_aids2[:topk - len(unique_aids)]
    # USE TOP20 TEST ORDERS
    return result + list(top_orders)[:topk-len(result)]

In [16]:
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks", "carts", "orders"][:1]
topk = 20


for session_id, (session_aid_list, session_type_list) in tqdm(test_session_dict.items()):
    rec_items = recommend_clicks(session_aid_list, session_type_list, topk)
    
    session_types = [f"{session_id}_{t}" for t in types]
    labels = " ".join(str(aid) for aid in rec_items)
    labels_list = [labels] * len(types)
    
    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

  0%|          | 0/1671803 [00:00<?, ?it/s]

In [17]:
df_submission_clicks = pl.DataFrame(submission_dict)

In [18]:
submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks", "carts", "orders"][1:]
topk = 20


for session_id, (session_aid_list, session_type_list) in tqdm(test_session_dict.items()):
    rec_items = recommend_buys(session_aid_list, session_type_list, topk)
    
    session_types = [f"{session_id}_{t}" for t in types]
    labels = " ".join(str(aid) for aid in rec_items)
    labels_list = [labels] * len(types)
    
    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

  0%|          | 0/1671803 [00:00<?, ?it/s]

In [19]:
df_submission_carts_orders = pl.DataFrame(submission_dict)

In [20]:
df_submission = pl.concat([df_submission_clicks, df_submission_carts_orders])

In [105]:
# was an error
df_submission.write_csv("covisitation_submission.csv")

In [22]:
# LB 0.574
df_submission.to_pandas().to_csv("covisitation_submission.csv.gz", compression="gzip", index=False)