In [1]:
%autosave 60
%reload_ext autoreload
%autoreload 2

import sys

sys.path.insert(0, "..")

Autosaving every 60 seconds


In [2]:
import hydra
from omegaconf import DictConfig
from hydra import compose, initialize


import polars as pl
from tools.retrievers import PopularItemsRetriever

In [3]:
def main(config_path, config_name):
    with initialize(config_path=config_path, version_base=None):
        cfg = compose(config_name=config_name)

        return cfg

In [4]:
test = main("../configs/retrieval_models", "popular")

In [5]:
from pathlib import Path

DATA_DIR = Path.cwd().parent / "data" / "avito_ml_cup"

df_test_users = pl.read_parquet(f"{DATA_DIR}/test_users.pq")
df_clickstream = pl.read_parquet(f"{DATA_DIR}/clickstream.pq", n_rows=100_000)

df_cat_features = pl.read_parquet(f"{DATA_DIR}/cat_features.pq", n_rows=100_000)
df_text_features = pl.read_parquet(f"{DATA_DIR}/text_features.pq", n_rows=100_000)
df_event = pl.read_parquet(f"{DATA_DIR}/events.pq")

In [6]:
popul_items_retriever = PopularItemsRetriever(test)

In [7]:
popul_items_retriever.set_data(df_clickstream, df_cat_features)

In [8]:
test_df = popul_items_retriever.recommend(df_test_users.head(10)["cookie"].to_list())

In [97]:
test_df.sort("score_popular_items_by_category_7d", nulls_last=True, descending=True).sample(5)

cookie,node,score_popular_items_1d,score_popular_items_by_category_2d,score_popular_items_by_category_7d
i64,i64,f64,f64,f64
66590,130592,0.702788,0.946395,1.0
105000,147620,0.427663,0.885622,
143535,71524,0.52444,0.658265,0.648548
111026,158112,,,0.315465
129902,155666,0.496328,1.0,1.0


In [11]:
test["methods"]

[{'id': 'popular_items_1d', 'function': 'get_popular_items', 'output_column': 'score_popular_items_1d', 'params': {'days': 1, 'top_k': 200, 'use_cache': True}}, {'id': 'popular_items_2d', 'function': 'get_popular_items_by_category', 'output_column': 'score_popular_items_by_category_2d', 'params': {'days': 2, 'top_k': 10, 'use_cache': True}}]

In [28]:
import json
from collections import defaultdict
from tqdm import tqdm

In [31]:
attrs_items = defaultdict(set)

for data_json in tqdm(df_cat_features["clean_params"]):
    for items in json.loads(data_json):
        attrs_items.setdefault(items["attr"], set()).add(items["value"])

  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [00:00<00:00, 122579.14it/s]


In [43]:
df_cat_features.group_by("node").agg(
    pl.col("location").len()
).sort("location", descending=True)

node,location
u32,u32
170538,2066
151453,1377
71514,849
71511,603
71546,584
…,…
272618,1
356631,1
70475,1
234893,1


In [14]:
df_cat_features[["node", "category"]].unique().sort("node")

node,category
u32,i64
7,30
8,30
9,30
10,30
11,30
…,…
423206,51
423237,51
423278,51
423445,51
