In [3]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import polars as pl
import pickle
import os
from pathlib import Path
import logger
import time
import pyarrow.parquet as pq
import scipy
import implicit
import bisect

from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from tqdm.notebook import tqdm

print(f'numpy={np.__version__}')
print(f'pandas={pd.__version__}')
print(f'polars={pl.__version__}')

numpy=1.23.5
pandas=1.5.3
polars=0.16.16


In [9]:
# Расположение папок с исходными данными
CONFIG_ORIG_TARGET_PATH = 'data/public_train.pqt'
CONFIG_ORIG_SUBMISSION_PATH = 'data/submit_2.pqt'

# Расположение рабочих папок
CONFIG_DATA_ENCODED_LIGHT_PARQUET_PATH = 'data_encoded_light_parquet'
CONFIG_DICT_PATH = 'dicts'
CONFIG_EMBEDDINGS_PATH = 'embeddings'

In [10]:
from functools import wraps
import time

def logger(function):
    @wraps(function)
    def wrapper(*args, **kwargs):
        """wrapper documentation"""
        start = time.perf_counter()
        print(f'{function.__name__}:begin:')        
        output = function(*args, **kwargs)
        end = time.perf_counter()        
        print(f'{function.__name__}:end: took {end - start:.6f} seconds to complete')
        return output
    return wrapper

@logger
def add_two_numbers(a, b):
    """this function adds two numbers"""
    return a + b

# Загрузка target

In [11]:
@logger
def get_targets():
    targets = pl.read_parquet(CONFIG_ORIG_TARGET_PATH, columns=['age', 'is_male', 'user_id'])
    print(targets.shape)
    print(targets.head())

    male_target = targets.filter((pl.col('is_male').is_in(['0', '1']))).select([pl.col('user_id').cast(pl.Int32()), pl.col('is_male')=='1'])
    print(male_target.shape)
    print(male_target.head())
    print(male_target['is_male'].value_counts())

    import bisect
    age_target = targets.filter(~pl.col('age').is_null()).filter(pl.col('age') >=19).select([pl.col('user_id').cast(pl.Int32()), pl.col('age').cast(pl.Int32())])
    age_target = age_target.with_columns(pl.col("age").apply(lambda x: bisect.bisect_left([25,35,45,55,65], x)).alias("age_bins")).with_columns((pl.col('age_bins')+1).alias('age_bins_pred'))
    print(age_target.groupby(['age_bins', 'age_bins_pred']).agg([pl.col('age').min().alias('min'), pl.col('age').max().alias('max'), pl.col('age').count().alias('count')]).sort('age_bins'))

    targets = male_target.join(age_target, on='user_id', how='inner')
    print(targets.shape)
    print(targets.head())
    print(targets['is_male'].value_counts())
    print(targets.groupby(['age_bins', 'age_bins_pred']).agg([pl.col('age').min().alias('min'), pl.col('age').max().alias('max'), pl.col('age').count().alias('count')]).sort('age_bins'))
    
    return targets

targets = get_targets()

get_targets:begin:
(270000, 3)
shape: (5, 3)
┌──────┬─────────┬─────────┐
│ age  ┆ is_male ┆ user_id │
│ ---  ┆ ---     ┆ ---     │
│ f64  ┆ str     ┆ i64     │
╞══════╪═════════╪═════════╡
│ 31.0 ┆ 1       ┆ 350459  │
│ 35.0 ┆ 1       ┆ 188276  │
│ 41.0 ┆ 0       ┆ 99002   │
│ 33.0 ┆ 0       ┆ 155506  │
│ 54.0 ┆ 0       ┆ 213873  │
└──────┴─────────┴─────────┘
(264326, 2)
shape: (5, 2)
┌─────────┬─────────┐
│ user_id ┆ is_male │
│ ---     ┆ ---     │
│ i32     ┆ bool    │
╞═════════╪═════════╡
│ 350459  ┆ true    │
│ 188276  ┆ true    │
│ 99002   ┆ false   │
│ 155506  ┆ false   │
│ 213873  ┆ false   │
└─────────┴─────────┘
shape: (2, 2)
┌─────────┬────────┐
│ is_male ┆ counts │
│ ---     ┆ ---    │
│ bool    ┆ u32    │
╞═════════╪════════╡
│ false   ┆ 128994 │
│ true    ┆ 135332 │
└─────────┴────────┘
shape: (6, 5)
┌──────────┬───────────────┬─────┬─────┬───────┐
│ age_bins ┆ age_bins_pred ┆ min ┆ max ┆ count │
│ ---      ┆ ---           ┆ --- ┆ --- ┆ ---   │
│ i64      ┆ i64         

# Сохранение/загрузка Embeddings

In [12]:
@logger
def save_embedding(embedding, name, size):
    print(f'save_embedding: {embedding.shape} {name=} {size=}')
    os.makedirs(CONFIG_EMBEDDINGS_PATH, exist_ok = True)    
    file_path = Path(CONFIG_EMBEDDINGS_PATH) / f'embedding_{name}_{size:03n}.pickle'
    print(f'save_embedding: {file_path=}')
    
    with open(file_path, 'wb') as f:
        pickle.dump(embedding, f)

@logger        
def load_embedding(name, size):
    print(f'load_embedding: {name=} {size=}')
    file_path = Path(CONFIG_EMBEDDINGS_PATH) / f'embedding_{name}_{size:03n}.pickle'
    print(f'load_embedding: {file_path=}')
    
    with open(file_path, 'rb') as f:
        embedding = pickle.load(f)
        
    print(f'load_embedding: {embedding.shape}')
    return embedding

test_embedding = np.zeros((64,64), dtype=np.float32)
save_embedding(test_embedding,'test',64)
test_embedding2 = load_embedding('test',64)

assert np.sum(test_embedding != test_embedding2) == 0

save_embedding:begin:
save_embedding: (64, 64) name='test' size=64
save_embedding: file_path=PosixPath('/home/jupyter/mnt/s3/mtsmlcup/embeddings_final/embedding_test_064.pickle')
save_embedding:end: took 0.033478 seconds to complete
load_embedding:begin:
load_embedding: name='test' size=64
load_embedding: file_path=PosixPath('/home/jupyter/mnt/s3/mtsmlcup/embeddings_final/embedding_test_064.pickle')
load_embedding: (64, 64)
load_embedding:end: took 0.001206 seconds to complete


# Создадим матрицу для ALS

In [13]:
@logger
def get_url_host_id_map():
    url_dict = pl.read_parquet(Path(CONFIG_DICT_PATH) / 'category_dict_url_host.parquet')
    url_dict = url_dict.with_columns(pl.col('url_host').apply(lambda x: x.encode('idna').decode('idna')).alias('url_host'))
    print(url_dict.shape)

    # Новая колонка для очищенных URL
    url_dict = url_dict.with_columns(pl.col('url_host').alias('url_host_clean'))

    # Заменяем цифры на N (только в URL)
    url_dict = url_dict.with_columns(pl.when(pl.col('url_host_clean').str.contains(r'^.*\.[a-z]*$')).then(pl.col('url_host_clean').str.replace_all(r'\d+', 'N')).otherwise(pl.col('url_host_clean')).alias('url_host_clean'))

    # Заменяем URL, который был меньше чем у N пользователей на 'lessthanNusers'
    N = 2
    url_dict = url_dict.with_columns(pl.when(pl.col('user_id_count') < N).then(pl.lit('lessthanNusers')).otherwise(pl.col('url_host_clean')).alias('url_host_clean'))
    print(url_dict)
    
    url_dict = url_dict.groupby('url_host_clean').agg([pl.col('url_host_id')]).with_columns(pl.arange(low=0, high=pl.count()).cast(pl.Int32()).alias('url_host_clean_id'))
    url_dict = url_dict.select(['url_host_id', 'url_host_clean_id'])
    print(url_dict.shape)
    print(f"url_host_clean_id min={url_dict['url_host_clean_id'].min()} min={url_dict['url_host_clean_id'].max()} n_unique={url_dict['url_host_clean_id'].n_unique()}")

    url_dict = url_dict.explode('url_host_id')
    print(url_dict.shape)
    
    return url_dict

get_url_host_id_map()

get_url_host_id_map:begin:
(199683, 5)
shape: (199683, 6)
┌────────────────────┬──────────┬───────────────┬───────────────┬─────────────┬────────────────────┐
│ url_host           ┆ count    ┆ user_id_count ┆ file_id_count ┆ url_host_id ┆ url_host_clean     │
│ ---                ┆ ---      ┆ ---           ┆ ---           ┆ ---         ┆ ---                │
│ str                ┆ u32      ┆ u32           ┆ u32           ┆ i32         ┆ str                │
╞════════════════════╪══════════╪═══════════════╪═══════════════╪═════════════╪════════════════════╡
│ googleads.g.double ┆ 22013466 ┆ 394562        ┆ 10            ┆ 0           ┆ googleads.g.double │
│ click.net          ┆          ┆               ┆               ┆             ┆ click.net          │
│ yandex.ru          ┆ 19007657 ┆ 386405        ┆ 10            ┆ 1           ┆ yandex.ru          │
│ i.ytimg.com        ┆ 16901446 ┆ 381268        ┆ 10            ┆ 2           ┆ i.ytimg.com        │
│ vk.com             ┆ 16695251 ┆

url_host_id,url_host_clean_id
i32,i32
131412,0
17789,1
66485,2
111951,3
133359,4
86152,5
90821,6
24856,7
92874,8
68394,9


In [None]:
@logger
def get_url_stat_by_user(n_files=100):
    """
    История посещения пользователем url. 
    Количество посещений URL считается суммой за все даты, поделенной на количество дат, за которые у пользователя были посещения
    """
    url_host_id_map = get_url_host_id_map()
        
    data = []
    for i, file_path in enumerate(list(Path(CONFIG_DATA_ENCODED_LIGHT_PARQUET_PATH).glob('*.parquet'))[:n_files]):
        print(f'reading {file_path}')
        data_item = pl.scan_parquet(file_path)
        
        # Вычищаем часть URL через url_host_id_map
        data_item = data_item.join(url_host_id_map.lazy(), on='url_host_id').select(pl.exclude('url_host_id')).rename({'url_host_clean_id':'url_host_id'})        

        data_item = data_item.groupby(['user_id', 'url_host_id']).agg([
            pl.col('request_cnt').sum().alias('request_cnt_total')
        ])
                
        data.append(data_item)
        del data_item
    data = pl.collect_all(data)
    data = pl.concat(data)
    return data

url_stat_by_user = get_url_stat_by_user(n_files=100) 
print(url_stat_by_user.groupby('user_id').agg([pl.count()]).select([
    pl.col('count').min().alias('min'),
    pl.col('count').mean().alias('mean'),
   pl.col('count').max().alias('max')
]))

In [16]:
mat = scipy.sparse.coo_matrix(
    (
        url_stat_by_user['request_cnt_total'].to_numpy(),
        (
            url_stat_by_user['user_id'].to_numpy(),
            url_stat_by_user['url_host_id'].to_numpy())
    ),
    shape=(url_stat_by_user['user_id'].n_unique(), url_stat_by_user['url_host_id'].n_unique())
)

In [20]:
mat = mat.tocsr()

In [None]:
embedding_size = 256

# Calculate approximate_als
import logging
import sys

handler = logging.StreamHandler(stream=sys.stdout)
log_implicit = logging.getLogger("implicit")
log_implicit.setLevel(logging.INFO)
log_implicit.addHandler(handler)

als = implicit.approximate_als.FaissAlternatingLeastSquares(factors = embedding_size, iterations = 60, use_gpu = True, calculate_training_loss = True, regularization = 0.1)
als.fit(mat, show_progress=True)

print(type(als.model.user_factors))
print(type(als.model.user_factors))

if (type(als.model.user_factors) == np.ndarray): # non GPU
    save_embedding(als.model.user_factors, 'user_id_from_user_id_vs_url_host_id2', embedding_size)
    save_embedding(als.model.item_factors, 'url_host_id2_from_user_id_vs_url_host_id2', embedding_size)
else: # GPU
    save_embedding(als.model.user_factors.to_numpy(), 'user_id_from_user_id_vs_url_host_id2', embedding_size)
    save_embedding(als.model.item_factors.to_numpy(), 'url_host_id2_from_user_id_vs_url_host_id2', embedding_size)

# Сравним качество ALS embedding с baseline как в примере (CatBoost)

Саму модель не сохраняем, т.к. она чуть лучше бейзлайна, но embedding пользователя добавим как один из embedding в финальную модель

In [28]:
from sklearn.model_selection import train_test_split
targets_train, targets_valid = train_test_split(targets)
print(targets_train)
print(targets_valid)

shape: (197631, 5)
┌─────────┬─────────┬─────┬──────────┬───────────────┐
│ user_id ┆ is_male ┆ age ┆ age_bins ┆ age_bins_pred │
│ ---     ┆ ---     ┆ --- ┆ ---      ┆ ---           │
│ i32     ┆ bool    ┆ i32 ┆ i64      ┆ i64           │
╞═════════╪═════════╪═════╪══════════╪═══════════════╡
│ 354920  ┆ true    ┆ 34  ┆ 1        ┆ 2             │
│ 259504  ┆ true    ┆ 49  ┆ 3        ┆ 4             │
│ 220373  ┆ false   ┆ 29  ┆ 1        ┆ 2             │
│ 50011   ┆ true    ┆ 51  ┆ 3        ┆ 4             │
│ …       ┆ …       ┆ …   ┆ …        ┆ …             │
│ 158168  ┆ true    ┆ 23  ┆ 0        ┆ 1             │
│ 188208  ┆ true    ┆ 59  ┆ 4        ┆ 5             │
│ 265003  ┆ false   ┆ 38  ┆ 2        ┆ 3             │
│ 255601  ┆ false   ┆ 37  ┆ 2        ┆ 3             │
└─────────┴─────────┴─────┴──────────┴───────────────┘
shape: (65877, 5)
┌─────────┬─────────┬─────┬──────────┬───────────────┐
│ user_id ┆ is_male ┆ age ┆ age_bins ┆ age_bins_pred │
│ ---     ┆ ---     ┆ --- ┆ 

In [23]:
def get_male_X_y(data, user_embedding):
    data = data.with_columns(pl.col('user_id').apply(lambda x: user_embedding[x].tolist()).alias('user_embedding'))
    X = np.vstack(data['user_embedding'].to_numpy())
    y = data['is_male'].to_numpy()
    return X, y

def get_age_X_y(data, user_embedding):
    data = data.with_columns(pl.col('user_id').apply(lambda x: user_embedding[x].tolist()).alias('user_embedding'))
    X = np.vstack(data['user_embedding'].to_numpy())
    y = data['age_bins'].to_numpy()
    return X, y

In [30]:
embedding_size = 256
user_embedding = load_embedding('user_id_from_user_id_vs_url_host_id2', embedding_size)

x_train, y_train = get_male_X_y(targets_train, user_embedding)
x_valid, y_valid = get_male_X_y(targets_valid, user_embedding)
assert x_train.shape != x_valid.shape

params = {
    'loss_function': 'CrossEntropy', 
    'eval_metric': 'NormalizedGini',
    'logging_level': 'Verbose', 
    'use_best_model': False, 
    'random_seed': 0,
    'iterations':30000,
    'od_wait':300,
    'learning_rate':0.1,
    'allow_writing_files':False
}
cb_model_male = CatBoostClassifier(**params)
cb_model_male.fit(Pool(x_train, y_train.astype(int)), eval_set=Pool(x_valid, y_valid.astype(int)), verbose = 100)

from sklearn.metrics import roc_auc_score
print(f'GINI по полу {2 * roc_auc_score(y_valid, cb_model_male.predict_proba(x_valid)[:,1]) - 1:2.3f}')

load_embedding:begin: name='user_id_from_user_id_vs_url_host_id2' size=256
load_embedding: file_path=PosixPath('/home/jupyter/mnt/s3/mtsmlcup/embeddings/embedding_user_id_from_user_id_vs_url_host_id2_256.pickle')
load_embedding:end: (415317, 256)
0:	test: 0.2582463	best: 0.2582463 (0)	total: 155ms	remaining: 1h 17m 15s
100:	test: 0.5717788	best: 0.5717788 (100)	total: 13.9s	remaining: 1h 8m 43s
200:	test: 0.6127856	best: 0.6127856 (200)	total: 27s	remaining: 1h 6m 49s
300:	test: 0.6309550	best: 0.6309550 (300)	total: 40.3s	remaining: 1h 6m 19s
400:	test: 0.6422982	best: 0.6422982 (400)	total: 53.3s	remaining: 1h 5m 32s
500:	test: 0.6500639	best: 0.6500639 (500)	total: 1m 6s	remaining: 1h 5m 9s
600:	test: 0.6558995	best: 0.6558995 (600)	total: 1m 19s	remaining: 1h 4m 38s
700:	test: 0.6609755	best: 0.6609755 (700)	total: 1m 32s	remaining: 1h 4m 21s
800:	test: 0.6640153	best: 0.6640153 (800)	total: 1m 45s	remaining: 1h 4m 8s
900:	test: 0.6666174	best: 0.6666174 (900)	total: 1m 58s	remaini