In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:

import pandas as pd
import polars as pl
import numpy as np
from tqdm import tqdm

import os
import threadpoolctl
import warnings

warnings.filterwarnings('ignore')

from datetime import timedelta
from gc import collect

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from rectools.dataset import Dataset
from rectools.models import (
    ImplicitALSWrapperModel,
    LightFMWrapperModel,
    ImplicitItemKNNWrapperModel,
)
from lightfm import LightFM
import catboost as cb

# For vector models optimized ranking
os.environ["OPENBLAS_NUM_THREADS"] = "1"
threadpoolctl.threadpool_limits(1, "blas");


In [4]:
import sys

sys.path.append('./src')

In [5]:
from data_load import get_train_val, get_cand_ranker, get_clickstream
from features import load_text_embeddings_node, load_text_embeddings, \
    load_cat_df, get_user_cat_features, get_user_loc_features, \
    get_node_loc_cat_features, join_features, add_dist_similarity
from ranker import SecondStageRanker
from tools import recall_at, reduce_memory_usage_pl



In [6]:
data_dir = './data/'
df_train, df_eval, _ = get_train_val(data_dir = data_dir)
df_ranker, df_cand = get_cand_ranker(df_train, cand_days=7)

Memory usage of dataframe df_test_users is 0.7 MB
Memory usage of dataframe df_test_users became 0.35 MB
Memory usage of dataframe df_clickstream is 3412.17 MB
Memory usage of dataframe df_clickstream became 1509.23 MB
Memory usage of dataframe df_cat_features is 6535.28 MB
Memory usage of dataframe df_cat_features became 2992.98 MB
Memory usage of dataframe df_train is 1490.49 MB
Memory usage of dataframe df_train became 1185.86 MB
Memory usage of dataframe df_eval is 2.4 MB
Memory usage of dataframe df_eval became 2.4 MB


In [7]:
df_cat = load_cat_df(data_dir)

Memory usage of dataframe df_cat_features is 604.76 MB
Memory usage of dataframe df_cat_features became 237.6 MB


In [16]:
df_event = pl.read_parquet(f'{data_dir}/events.pq')

In [10]:
ranker_users = df_ranker['cookie'].unique().to_list()
eval_users = df_eval['cookie'].unique().to_list()


df_user_loc = get_user_loc_features(df_cand)
df_user_cat = get_user_cat_features(df_cand)
df_node = get_node_loc_cat_features(df_cat)

# nodes embeddings
df_node_emb = load_text_embeddings_node(data_dir)

In [8]:
df_train = df_train.with_columns(
    pl.col("is_contact").alias("is_target")
)

In [9]:
ranker_preds1 = pl.read_parquet('./data/candidates/cand/preds_als_features_200.pq')
ranker_preds2 = pl.read_parquet('./data/candidates/cand/preds_als_no_feats_200.pq')
ranker_preds3 = pl.read_parquet('./data/candidates/cand/preds_lightfm_200.pq')

all_preds = pl.concat([ranker_preds1, ], how='vertical')
all_preds = all_preds.unique(['cookie', 'node'])
all_preds = all_preds.sort(by=['cookie'])

In [20]:
df_eval = df_eval.join(df_event, on='event', how='left')

In [22]:
eval_preds = pl.read_parquet('./data/candidates/train/preds_als_no_feats_200.pq')

In [26]:
df_eval = df_eval.with_columns(pl.col("is_contact").alias("is_target"))

In [30]:
ranker_preds = all_preds

# add targets
df_targets = df_ranker[['cookie', 'node', 'is_target']].sort(
    by = ['is_target'], descending=True
).unique(['cookie', 'node' ], keep='first', maintain_order=True)

ranker_preds = ranker_preds.join(df_targets, on = ['cookie', 'node'], how = 'left').with_columns(pl.col('is_target').cast(int))
ranker_preds = ranker_preds.with_columns(pl.col('is_target').fill_null(0))

In [31]:
ranker_preds['is_target'].mean()

0.001100686085127844

In [36]:
ranker_preds

cookie,node,is_target
i64,i64,i64
0,53194,0
0,214240,0
0,79110,0
0,192797,0
0,115741,0
…,…,…
149999,214294,0
149999,1908,0
149999,336243,0
149999,130602,0


In [32]:
# add targets
df_targets = df_eval[['cookie', 'node', 'is_target']].sort(
    by = ['is_target'], descending=True
).unique(['cookie', 'node' ], keep='first', maintain_order=True)

eval_preds = eval_preds.join(df_targets, on = ['cookie', 'node'], how = 'left').with_columns(pl.col('is_target').cast(int))
eval_preds = eval_preds.with_columns(pl.col('is_target').fill_null(0))

In [33]:
eval_preds['is_target'].mean()

0.00433325466131697

In [35]:
eval_preds.filter(pl.col("is_target").over("cookie").sum()> 0 )

cookie,node,is_target,is_target_right
i64,i64,i64,i64
0,115834,0,
0,214234,0,
0,214235,0,
0,115713,0,
0,115704,0,
…,…,…,…
149998,153018,0,
149998,229316,0,
149998,152684,0,
149998,334913,0,


In [10]:
from consts import nodes, feats, cat

In [14]:
# prepare data for catboost
ranker_preds = join_features(ranker_preds, df_node, df_user_cat, df_user_loc)
ranker_preds = ranker_preds.with_columns([pl.col(c).fill_null(-1).cast(int) for c in cat])


# add nodes distances from embeddings
ranker_preds  = add_dist_similarity(ranker_preds, df_node_emb, nodes)


Adding cosine similarity for node_last_contact_CAT
Adding cosine similarity for most_freq_node_contact_CAT
Adding cosine similarity for node_last_contact_LOC
Adding cosine similarity for most_freq_node_contact_LOC


In [16]:
cat = [
 'node_category',
 'most_freq_surface_CAT',
 'most_freq_event_CAT',
 'most_freq_event_contact_CAT',
]

In [20]:
ranker = SecondStageRanker(df=ranker_preds)
ranker.split_data(features=feats, cat_features=cat, eval_ratio=0.2)

train: 63055, eval: 21017, test: 21017
train: (23887689, 24), eval: (7963452, 24), test: (7967978, 24)


In [22]:
del df_train , df_eval, df_ranker, df_cand

In [25]:
del df_cat, df_node, df_user_loc

In [26]:
collect()

0

In [None]:
params = {
    "boosting_type": "Plain",
    "early_stopping_rounds": 10,
    "eval_metric": "RecallAt:top=40",
    # "learning_rate": 0.1,
    "max_ctr_complexity": 1,
    "nan_mode": "Min",
    "num_trees": 250,
    "objective": "PairLogitPairwise:max_pairs=50",
    "random_state": 42,
    "task_type": "CPU",
    "thread_count": 16,
}

model = cb.CatBoost(params = params)
ranker.model = model
ranker.model.fit(X= ranker.train_pool, verbose=10, eval_set=ranker.eval_pool)

Pairwise scoring loss functions on CPU do not support one hot features. OneHotMaxSize set to 1
0:	learn: 0.8408320	test: 0.8380029	best: 0.8380029 (0)	total: 35.9s	remaining: 2h 29m 4s


In [178]:
ranker.model.eval_metrics(
    ranker.test_pool,
    metrics=["AUC", "Accuracy", "PrecisionAt:top=40", "RecallAt:top=40"],ntree_start=ranker.model.tree_count_ -1
)

{'AUC': [0.7148534978094978],
 'Accuracy': [0.4608474495622383],
 'PrecisionAt:top=40': [0.002162637990102777],
 'RecallAt:top=40': [0.9108772349969533]}

In [180]:
ranker.save("models/catboost_0_91_emb_3sets")

Model saved as catboost_ranker_model.cbm
Ranker saved to models/catboost_0_91_emb.


### More features..

In [16]:
from features import get_user_features, get_node_features
from feature_pool import FeaturePool

In [None]:
# df_user = get_user_features(df_train)
# df_node = get_node_features(df_train, df_cat)

df_user_ranker = get_user_features(df_cand)
df_node_ranker = get_node_features(df_cand, df_cat)

user_pool = FeaturePool(key='cookie')
user_pool.add_features(df_user_ranker, feature_set='simple')

item_pool = FeaturePool(key='node')
item_pool.add_features(df_node_ranker, feature_set='simple')

user_params = dict(
    add_categorical=0, add_numerical=0, add_ratios=1, add_embeddings=0, 
    filter_by_name=('node',),
    # add_by_name = ('category_last_contact', 'most_freq_category') # most_freq_category
)
item_params = dict(
    add_categorical=1, add_numerical=0, add_ratios=1, add_embeddings=0, 
    filter_by_name=('surface', 'location', 'event')
)
user_df = user_pool.get_features(**user_params)
node_df = item_pool.get_features(**item_params)

In [23]:
item_pool.get_features(**item_params)

node,ctr,category
u32,f32,i8
311972,0.0,35
183127,0.0,32
337663,0.107143,51
251021,0.0,51
11412,0.0,28
…,…,…
405423,0.0,51
395595,0.0,51
394279,0.0,51
117305,0.0,19
