In [None]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import polars as pl
import pickle
import os
from pathlib import Path

import time
import pyarrow.parquet as pq
import scipy
import implicit
import bisect

from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from tqdm.notebook import tqdm

import tensorflow as tf
import tensorflow_addons as tfa
import keras_nlp

from typing import Dict, Text

# Record the versions of the main libraries in the cell output.
print(f'numpy={np.__version__}')
print(f'pandas={pd.__version__}')
print(f'polars={pl.__version__}')
print(f'tf={tf.__version__}')
print(f'tfa={tfa.__version__}')
print(f'keras_nlp={keras_nlp.__version__}')

In [None]:
# Locations of the original (input) data
CONFIG_ORIG_DATA_PATH = 'data/competition_data_final_pqt'
CONFIG_ORIG_TARGET_PATH = 'data/public_train.pqt'
CONFIG_ORIG_SUBMISSION_PATH = 'data/submit_2.pqt'

# Locations of the working folders
CONFIG_DATA_ENCODED_LIGHT_PARQUET_PATH = 'data_encoded_light_parquet'
CONFIG_EMBEDDINGS_PATH = 'embeddings'

CONFIG_MODEL_CHECKPOINT_ROOT = 'models'
CONFIG_MODEL_REPORT_ROOT = 'reports'
CONFIG_PREDICTIONS_PATH = 'predictions'

In [None]:
# Hard-coded entity count (presumably the number of distinct user ids in the data — TODO confirm).
user_id_count = 415317
# url_host_id_count = 199683
# cpe_model_name_id_count = 599

In [None]:
from functools import wraps
import time

def logger(function):
    """Decorator that announces each call and reports how long it took.

    Prints ``<name>:begin:`` before calling ``function`` and
    ``<name>:end: took <seconds> seconds to complete`` once it has returned.
    The wrapped function's return value is passed through unchanged and its
    metadata (name, docstring) is preserved via ``functools.wraps``.
    """
    @wraps(function)
    def wrapper(*args, **kwargs):
        started_at = time.perf_counter()
        print(f'{function.__name__}:begin:')
        result = function(*args, **kwargs)
        elapsed = time.perf_counter() - started_at
        print(f'{function.__name__}:end: took {elapsed:.6f} seconds to complete')
        return result

    return wrapper

@logger
def add_two_numbers(a, b):
    """Return ``a + b``; a minimal function used to try out the ``logger`` decorator."""
    total = a + b
    return total

In [None]:
@logger
def save_embedding(embedding, name, size):
    """Pickle ``embedding`` into the embeddings folder.

    The file is written to ``CONFIG_EMBEDDINGS_PATH`` as
    ``embedding_<name>_<size>.pickle`` (``size`` is zero-padded to three
    digits and is only used in the file name). The folder is created when it
    does not exist yet.
    """
    print(f'save_embedding: {embedding.shape} {name=} {size=}')

    target_dir = Path(CONFIG_EMBEDDINGS_PATH)
    os.makedirs(target_dir, exist_ok=True)
    file_path = target_dir / f'embedding_{name}_{size:03n}.pickle'
    print(f'save_embedding: {file_path=}')

    with file_path.open('wb') as sink:
        pickle.dump(embedding, sink)

@logger
def load_embedding(name, size):
    """Load an embedding previously written by ``save_embedding``.

    Reads ``embedding_<name>_<size>.pickle`` from ``CONFIG_EMBEDDINGS_PATH``
    and returns the unpickled object.
    """
    print(f'load_embedding: {name=} {size=}')
    file_path = Path(CONFIG_EMBEDDINGS_PATH).joinpath(f'embedding_{name}_{size:03n}.pickle')
    print(f'load_embedding: {file_path=}')

    # NOTE: unpickling executes code stored in the file; only load files this notebook produced.
    with file_path.open('rb') as source:
        embedding = pickle.load(source)

    print(f'load_embedding: {embedding.shape}')
    return embedding

# Round-trip smoke test: save a zero matrix, load it back and compare.
test_embedding = np.zeros((64,64), dtype=np.float32)
save_embedding(test_embedding,'test',64)
test_embedding2 = load_embedding('test',64)

# No element may differ between the saved and the reloaded array.
assert np.sum(test_embedding != test_embedding2) == 0

# Загрузка target

In [None]:
@logger
def get_targets():
    """Load the training labels and return one row per fully labelled user.

    Reads ``age``, ``is_male`` and ``user_id`` from ``CONFIG_ORIG_TARGET_PATH``.
    Gender rows are kept only when ``is_male`` is ``'0'`` or ``'1'`` and are
    converted to a boolean. Age rows are kept only when ``age`` is present and
    at least 19; ``age_bins`` is the index returned by ``bisect_left`` over the
    edges ``[25, 35, 45, 55, 65]`` and ``age_bins_pred`` is ``age_bins + 1``.
    The two tables are inner-joined on ``user_id``. Shapes, samples and
    per-bin summaries are printed along the way.
    """
    import bisect

    raw = pl.read_parquet(CONFIG_ORIG_TARGET_PATH, columns=['age', 'is_male', 'user_id'])
    print(raw.shape)
    print(raw.head())

    # Gender target: discard anything that is not an explicit '0' / '1'.
    has_gender = pl.col('is_male').is_in(['0', '1'])
    male_target = raw.filter(has_gender).select([
        pl.col('user_id').cast(pl.Int32()),
        pl.col('is_male') == '1',
    ])
    print(male_target.shape)
    print(male_target.head())
    print(male_target['is_male'].value_counts())

    def bin_summary(frame):
        # Min / max / count of age per (age_bins, age_bins_pred) pair, ordered by bin.
        return frame.groupby(['age_bins', 'age_bins_pred']).agg([
            pl.col('age').min().alias('min'),
            pl.col('age').max().alias('max'),
            pl.col('age').count().alias('count'),
        ]).sort('age_bins')

    # Age target: known ages from 19 upwards, bucketed by the edges below.
    age_edges = [25, 35, 45, 55, 65]
    age_target = (
        raw
        .filter(~pl.col('age').is_null())
        .filter(pl.col('age') >= 19)
        .select([pl.col('user_id').cast(pl.Int32()), pl.col('age').cast(pl.Int32())])
    )
    age_target = (
        age_target
        .with_columns(pl.col("age").apply(lambda x: bisect.bisect_left(age_edges, x)).alias("age_bins"))
        .with_columns((pl.col('age_bins') + 1).alias('age_bins_pred'))
    )
    print(bin_summary(age_target))

    # Keep only users that have both a gender and an age label.
    targets = male_target.join(age_target, on='user_id', how='inner')
    print(targets.shape)
    print(targets.head())
    print(targets['is_male'].value_counts())
    print(bin_summary(targets))

    return targets

# targets = get_targets()

In [None]:
from sklearn.model_selection import KFold

class cv_folds_class:
    """Assigns labelled users to cross-validation folds and serves per-fold target frames."""

    def __init__(self, targets, cv_fold_count=5):
        """Keep ``targets`` and load (or create) the user-to-fold assignment."""
        self.targets = targets
        self.folds = self.get_folds(targets, cv_fold_count)

    @logger
    def get_folds(self, targets, cv_fold_count):
        """Return a frame with columns ``user_id`` and ``fold_id``.

        The assignment is cached in ``cv_folds_<cv_fold_count>.parquet`` in the
        working directory. When the file is missing it is built with a shuffled
        ``KFold`` (``random_state=10``) over ``targets['user_id']``; each user
        gets the id of the fold in which it is held out.
        NOTE(review): an existing cache file is reused without checking that it
        matches ``targets``.
        """
        folds_path = f'cv_folds_{cv_fold_count}.parquet'
        if not Path(folds_path).exists():
            user_id = targets['user_id'].to_numpy()
            splitter = KFold(n_splits=cv_fold_count, random_state=10, shuffle=True)
            per_fold = [
                pl.DataFrame({'user_id': user_id[held_out], 'fold_id': fold_no})
                for fold_no, (_, held_out) in enumerate(splitter.split(user_id))
            ]
            pl.concat(per_fold).write_parquet(folds_path)
        return pl.read_parquet(folds_path)

    def _targets_with_folds(self):
        # Targets inner-joined with their fold assignment.
        return self.targets.join(self.folds, on='user_id', how='inner')

    @logger
    def get_train_target_fold(self, fold_id):
        """Return the shuffled targets of every user that is NOT in fold ``fold_id``."""
        print(f'get_train_target_fold: {fold_id=}')
        outside_fold = pl.col('fold_id') != fold_id
        train_target_fold = self._targets_with_folds().filter(outside_fold).sample(frac=1, shuffle=True, seed=10)
        print(f'get_train_target_fold: {train_target_fold.shape=}')
        return train_target_fold

    @logger
    def get_valid_target_fold(self, fold_id):
        """Return the shuffled targets of the users that belong to fold ``fold_id``."""
        print(f'get_valid_target_fold: {fold_id=}')
        inside_fold = pl.col('fold_id') == fold_id
        valid_target_fold = self._targets_with_folds().filter(inside_fold).sample(frac=1, shuffle=True, seed=10)
        print(f'get_valid_target_fold: {valid_target_fold.shape=}')
        return valid_target_fold

    @logger
    def get_full_target(self):
        """Return all targets in shuffled order (fixed seed), without fold information."""
        target = self.targets.sample(frac=1, shuffle=True, seed=10)
        print(f'get_full_target: {target.shape=}')
        return target

    @logger
    def get_test_target(self):
        """Return the submission users with placeholder (-1) values in the target columns."""
        placeholder_columns = [pl.lit(-1).alias(column) for column in ('is_male', 'age', 'age_bins')]
        user_pl = pl.read_parquet(CONFIG_ORIG_SUBMISSION_PATH, columns=['user_id'])
        user_pl = user_pl.select([pl.col('user_id').cast(pl.Int32())] + placeholder_columns)
        print(user_pl)
        return user_pl
    
# Build the fold helper from the freshly loaded targets (loads or creates the 5-fold assignment file).
cv_folds = cv_folds_class(targets=get_targets(), cv_fold_count=5)

# Загрузка данных

In [None]:
@logger
def get_url_host_id_map(url_dict_path='/home/jupyter/mnt/s3/mtsmlcup/dicts/category_dict_url_host.parquet'):
    """Build a mapping from the original ``url_host_id`` to a merged, cleaned host id.

    Hosts that become identical after cleaning share one ``url_host_clean_id``.

    Args:
        url_dict_path: parquet file with the host dictionary; it must provide
            the ``url_host_id`` and ``url_host`` columns. Defaults to the
            location used on the original training machine.

    Returns:
        A polars DataFrame with one row per original ``url_host_id`` and the
        columns ``url_host_id`` and ``url_host_clean_id`` (Int32, numbered from 0).
    """
    url_dict = pl.read_parquet(url_dict_path)
    # Round-trip every host through the 'idna' codec
    # (presumably to turn punycode hosts into their Unicode form — TODO confirm).
    url_dict = url_dict.with_columns(pl.col('url_host').apply(lambda x: x.encode('idna').decode('idna')).alias('url_host'))
    print(url_dict.shape)

    # New column that holds the cleaned host
    url_dict = url_dict.with_columns(pl.col('url_host').alias('url_host_clean'))

    # Replace every run of digits with 'N', but only for hosts that end in a dot
    # followed by nothing except lowercase ASCII letters.
    url_dict = url_dict.with_columns(
        pl.when(pl.col('url_host_clean').str.contains(r'^.*\.[a-z]*$'))
        .then(pl.col('url_host_clean').str.replace_all(r'\d+', 'N'))
        .otherwise(pl.col('url_host_clean'))
        .alias('url_host_clean')
    )

    # Disabled: replace hosts seen by fewer than N users with 'lessthanNusers' ...
#     N = 2
#     url_dict = url_dict.with_columns(pl.when(pl.col('user_id_count') < N).then(pl.lit('lessthanNusers')).otherwise(pl.col('url_host_clean')).alias('url_host_clean'))
#     print(url_dict)

    # ... and drop those 'lessthanNusers' records from the history.
#     url_dict = url_dict.filter(pl.col('url_host_clean') != 'lessthanNusers')
#     print(url_dict)

    # url_dict = url_dict.groupby('url_host_clean').agg([pl.all(), pl.count().alias('url_host_count')]).sort('url_host_count', descending=True)
    # print(url_dict)

    # One row per cleaned host carrying the list of original ids, numbered consecutively.
    url_dict = url_dict.groupby('url_host_clean').agg([pl.col('url_host_id')]).with_columns(pl.arange(low=0, high=pl.count()).cast(pl.Int32()).alias('url_host_clean_id'))
    url_dict = url_dict.select(['url_host_id', 'url_host_clean_id'])
    print(url_dict.shape)
    # Fixed label: the second value is the maximum (it used to be printed as a second "min=").
    print(f"url_host_clean_id min={url_dict['url_host_clean_id'].min()} max={url_dict['url_host_clean_id'].max()} n_unique={url_dict['url_host_clean_id'].n_unique()}")

    # Back to one row per original url_host_id.
    url_dict = url_dict.explode('url_host_id')
    print(url_dict.shape)

    return url_dict

# Smoke run: build the host-id mapping once and display it as the cell output.
get_url_host_id_map()

In [None]:
@logger
def get_url_stat_by_user(n_files=100):
    """Aggregate each user's browsing history per (cleaned) URL host.

    Reads at most ``n_files`` parquet files from
    ``CONFIG_DATA_ENCODED_LIGHT_PARQUET_PATH`` (taken in sorted path order so
    that the selection is reproducible), replaces ``url_host_id`` with the
    cleaned id from ``get_url_host_id_map`` and aggregates per
    ``(user_id, url_host_id)``.

    Returns:
        A polars DataFrame with the columns
        ``user_id``, ``url_host_id``,
        ``request_cnt_total``   - sum of ``request_cnt`` over all dates,
        ``part_of_day_id_mode`` - largest of the most frequent ``part_of_day_id`` values,
        ``part_of_day_code``    - sum of the distinct day-part codes (ids 0..3 map
                                  to 1, 2, 4, 8, so the sum acts as a bit mask),
        ``n_days_url``          - number of distinct dates with this host,
        ``n_days_any_url``      - number of distinct dates with any host for the user.
        Per-day rates are not computed here; callers divide the totals by the
        day counts themselves.
    """
    url_host_id_map = get_url_host_id_map()

    # Loop-invariant: day-part id -> power-of-two code.
    part_of_day_id_2code_mapper = {
        0: 1,
        1: 2,
        2: 4,
        3: 8
    }

    # glob() yields files in arbitrary order; sort so that [:n_files] always selects the same files.
    file_paths = sorted(Path(CONFIG_DATA_ENCODED_LIGHT_PARQUET_PATH).glob('*.parquet'))[:n_files]

    data = []
    for file_path in file_paths:
        print(f'reading {file_path}')
        data_item = pl.scan_parquet(file_path)
        print(data_item.columns)

        # Swap the raw url_host_id for the cleaned id from url_host_id_map
        data_item = data_item.join(url_host_id_map.lazy(), on='url_host_id').select(pl.exclude('url_host_id')).rename({'url_host_clean_id':'url_host_id'})

        data_item = data_item.with_columns(
            pl.col('part_of_day_id').map_dict(part_of_day_id_2code_mapper, default=pl.col("part_of_day_id")).alias('part_of_day_code')
        )

        # Visits over all days and the number of days with visits, per user and host
        data_item_url_stat = data_item.groupby(['user_id', 'url_host_id']).agg([
            pl.col('request_cnt').sum().alias('request_cnt_total'),
            pl.col('part_of_day_id').mode().max().alias('part_of_day_id_mode'),
            pl.col('part_of_day_code').unique().sum().alias('part_of_day_code'),
            pl.col('date_int').n_unique().alias('n_days_url')
        ])

        # Number of active days per user
        data_item_date_stat = data_item.groupby('user_id').agg(pl.col('date_int').n_unique().alias('n_days_any_url'))

        # Attach the per-user day count to every (user, host) row
        data.append(data_item_url_stat.join(data_item_date_stat, on='user_id'))

    # Execute all lazy plans together, then stack the per-file results.
    data = pl.collect_all(data)
    data = pl.concat(data)
    return data

# get_url_stat_by_user(n_files=1).groupby(['part_of_day_code_list', 'part_of_day_code']).agg(pl.count()).sort('part_of_day_code')
# Build the per-(user, host) statistics table used by the rest of the notebook.
url_stat_by_user = get_url_stat_by_user(n_files=100) 
# Distribution of the number of distinct hosts per user (min / mean / max).
print(url_stat_by_user.groupby('user_id').agg([pl.count()]).select([
    pl.col('count').min().alias('min'),
    pl.col('count').mean().alias('mean'),
   pl.col('count').max().alias('max')
])) # max user history len = 1621

In [None]:
# ┌─────┬───────────┬──────┐
# │ min ┆ mean      ┆ max  │
# │ --- ┆ ---       ┆ ---  │
# │ u32 ┆ f64       ┆ u32  │
# ╞═════╪═══════════╪══════╡
# │ 1   ┆ 72.030008 ┆ 1621 │
# └─────┴───────────┴──────┘

In [None]:
@logger
def get_category_feature(column):
    """Summarise one categorical column per user.

    For every parquet file in ``CONFIG_DATA_ENCODED_LIGHT_PARQUET_PATH`` the
    number of distinct dates per ``(user_id, column value)`` is counted. Per
    user the result holds
    ``<column>_primary`` - the first value after ordering by days (descending)
                           and then by value (ascending), and
    ``<column>_count``   - the number of distinct values seen for the user.
    The per-file results are sorted by ``user_id`` and concatenated.
    """
    print(f'get_category_feature:begin {column=}')

    def summarise(lazy_frame):
        # Distinct days per (user, value), most frequent value first within each user.
        days_per_value = (
            lazy_frame
            .groupby(['user_id', column])
            .agg(pl.col('date_int').n_unique().alias('days'))
            .sort(['user_id', 'days', column], descending=[False, True, False])
        )
        return days_per_value.groupby('user_id').agg([
            pl.col(column).first().alias(f'{column}_primary'),
            pl.col(column).len().alias(f'{column}_count'),
        ]).sort('user_id')

    lazy_parts = []
    for file_path in list(Path(CONFIG_DATA_ENCODED_LIGHT_PARQUET_PATH).glob('*.parquet')):
        print(f'reading {file_path}')
        lazy_parts.append(summarise(pl.scan_parquet(file_path)))

    collected = pl.collect_all(lazy_parts)
    return pl.concat(collected)

@logger
def get_category_features():
    """Combine the per-user summaries of all categorical columns into one frame.

    Calls ``get_category_feature`` for each column and glues the results
    together side by side, keeping ``user_id`` from the first result only.
    NOTE(review): the horizontal concat relies on every call returning users
    in the same row order.
    """
    category_columns = [
        'region_name_id',
        'city_name_id',
        'cpe_manufacturer_name_id',
        'cpe_model_name_id',
        'cpe_type_cd_id',
        'cpe_model_os_type_id',
    ]  # 'part_of_day'

    per_column = []
    for column in category_columns:
        print(f'{column=}')
        per_column.append(get_category_feature(column))

    user_ids = per_column[0].select([pl.col('user_id')])
    feature_frames = [frame.select(pl.exclude('user_id')) for frame in per_column]
    return pl.concat([user_ids] + feature_frames, how='horizontal')

# Per-user categorical features (primary value and distinct-value count per column).
cat_features = get_category_features()

In [None]:
@logger
def get_price_features():
    """Return the maximum ``price`` per user.

    Every parquet file in ``CONFIG_DATA_ENCODED_LIGHT_PARQUET_PATH`` is
    aggregated lazily to one row per ``user_id`` (sorted by ``user_id``); the
    per-file results are collected together and concatenated.

    Returns:
        A polars DataFrame with the columns ``user_id`` and ``price``.
    """
    lazy_parts = []
    for file_path in Path(CONFIG_DATA_ENCODED_LIGHT_PARQUET_PATH).glob('*.parquet'):
        print(f'reading {file_path}')
        per_user_max = (
            pl.scan_parquet(file_path)
            .groupby(['user_id'])
            .agg(pl.col('price').max())
            .sort('user_id')
        )
        lazy_parts.append(per_user_max)

    data = pl.concat(pl.collect_all(lazy_parts))
    # Fixed label: this message used to name get_category_feature (copy-paste leftover).
    print('get_price_features:end')
    return data

# Per-user maximum price; printed for a quick visual check.
price_features = get_price_features()
print(price_features)

In [None]:
@logger
def get_normalization_layer(data, name):
    """Create a Keras ``Normalization`` layer with statistics taken from ``data``.

    The mean and variance are computed over all elements of ``data`` with
    numpy and handed to the layer directly (``axis=None``), so no ``adapt``
    call is needed afterwards.
    """
    print(f'get_normalization_layer: {name=}')

    mean, variance = np.mean(data), np.var(data)
    print(f'get_normalization_layer: {mean=} {variance=}')

    return tf.keras.layers.Normalization(axis=None, mean=mean, variance=variance, name=name)

# Demo: normalise a few integers with statistics computed from 0..99.
get_normalization_layer(np.arange(100), 'temp')(tf.constant([0,1,2,3,4,50,99]))

In [None]:
@logger
def get_discretization_layer(data, n_bins, name):
    """Create a Keras ``Discretization`` layer whose boundaries are quantiles of ``data``.

    ``n_bins + 1`` evenly spaced quantiles of ``data`` are computed and
    de-duplicated; the inner boundaries (first and last dropped) become the
    layer's ``bin_boundaries``. The resulting bin occupancy of ``data`` is
    printed for inspection only.
    """
    print(f'get_discretization_layer: {n_bins=} {name=}')

    quantiles = np.linspace(0, 1, n_bins + 1)
    print(quantiles)

    bins = np.quantile(data, quantiles)
    print(f'get_discretization_layer: {bins=} {bins.shape=}')

    # Identical quantile values collapse into one boundary.
    bins = np.unique(bins)
    print(f'get_discretization_layer: {bins=} {bins.shape=}')

    # Diagnostics: how many samples of `data` fall into each bin.
    bin_ids, bin_sizes = np.unique(np.digitize(data, bins, right=True), return_counts=True)
    print(f'get_discretization_layer: bins_value: {bin_ids}')
    print(f'get_discretization_layer: bins_count: {bin_sizes}')

    return tf.keras.layers.Discretization(bin_boundaries=bins[1:-1], epsilon=0.01, name=name)

# Demo: 100 quantile bins over requests-per-day-per-host (request_cnt_total / n_days_url), applied to 0..999.
get_discretization_layer(data=url_stat_by_user['request_cnt_total'].to_numpy() / url_stat_by_user['n_days_url'].to_numpy(), n_bins=100, name='temp1')(np.arange(1000))
    

# Формируем словарь для перекодировки url_host_id

In [None]:
class enconding_config_class:
    """Vocabulary and special-token layout used to encode ``url_host_id`` values.

    Token ids, as set up in ``__init__`` and ``get_tokenizer``:
    0 - padding (raw value -1), 1 - out-of-vocabulary, 2 - mask (raw -2),
    3 - start (raw -3), 4 - end (raw -4); real host ids follow.
    """
    @logger
    def __init__(self):
        """Build the vocabulary from ``url_stat_by_user`` and record the token ids and sizes."""
        self.vocabulary = enconding_config_class.get_encoding_vocabulary()
        print(f'enconding_config_class: {self.vocabulary.shape=}')
        
        self.pad_token = -1
        self.pad_token_id = 0
        print(f'enconding_config_class: {self.pad_token=} {self.pad_token_id=}')           
        
        self.oov_token_id = 1
        print(f'enconding_config_class: {self.oov_token_id=}') 
        
        self.mask_token = -2
        self.mask_token_id = 2
        print(f'enconding_config_class: {self.mask_token=} {self.mask_token_id=}')  
        
        self.start_token = -3
        self.start_token_id = 3
        print(f'enconding_config_class: {self.start_token=} {self.start_token_id=}')   
        
        self.end_token = -4
        self.end_token_id = 4
        print(f'enconding_config_class: {self.end_token=} {self.end_token_id=}')           
        
        # The special tokens go first so that they receive the ids 2, 3 and 4.
        self.vocabulary = np.append([self.mask_token, self.start_token, self.end_token], self.vocabulary)
        print(f'enconding_config_class: adding mask token: {self.vocabulary.shape=}')        
        
        self.vocabulary_size_short = len(self.vocabulary)
        print(f'enconding_config_class: {self.vocabulary_size_short=}')            
        
        # + pad (id 0) + oov (id 1); mask/start/end are already part of the vocabulary.
        self.vocabulary_size_full = self.vocabulary_size_short + 2
        print(f'enconding_config_class: {self.vocabulary_size_full=}')                  
        
    @staticmethod
    @logger        
    def get_encoding_vocabulary():
        """Return the sorted host ids that occur in at least two rows of ``url_stat_by_user``.

        Reads the module-level ``url_stat_by_user`` frame and, as a side effect,
        writes the vocabulary to ``vocabulary.csv``. Declared as a static method
        so that it works both on the class and on an instance.
        """
        url_host_id_all = url_stat_by_user['url_host_id'].to_numpy()
        values, values_count = np.unique(url_host_id_all, return_counts=True)
        # np.unique returns sorted values and the boolean mask keeps that order.
        vocabulary = values[values_count >= 2]
        np.savetxt('vocabulary.csv', vocabulary, delimiter=',', fmt='%d') 
#         vocabulary = values[values_count >= 10]
        return vocabulary        
    
    @logger        
    def get_tokenizer(self):
        """Return an ``IntegerLookup`` layer that maps raw host ids to token ids."""
        tokenizer = tf.keras.layers.IntegerLookup(
            output_mode='int',
            mask_token=self.pad_token, # becomes id 0
            num_oov_indices=1, # becomes id 1
            vocabulary=self.vocabulary # ids start at 2 (mask, start, end, then hosts)
        )  
        return tokenizer            
    
# Shared encoding configuration; the dataset and model cells below read it as a global.
enconding_config = enconding_config_class()

# Создаем datasets

In [None]:
# Fold used for the sample and timing cells below.
cv_fold_id=0

@logger
def get_data_by_target(target):
    """Assemble one row per user in ``target`` with history, categorical and price features.

    The browsing history (``url_stat_by_user``) is ordered by
    ``request_cnt_total`` descending and collapsed into list columns per user;
    the result is then joined with ``cat_features``, ``price_features`` and
    finally ``target`` on ``user_id``. Every intermediate frame is printed.
    """
    print('Данные из истории посещения')
    print(url_stat_by_user)

    print('Группированная история посещения')
    list_columns = ('url_host_id', 'request_cnt_total', 'part_of_day_id_mode', 'part_of_day_code', 'n_days_url')
    aggregations = [pl.col(column_name) for column_name in list_columns]
    aggregations.append(pl.col('n_days_any_url').first())
    aggregations.append(pl.count().alias('url_host_id_count'))
    history = (
        url_stat_by_user
        .sort('request_cnt_total', descending=True)
        .groupby(['user_id'])
        .agg(aggregations)
    )
    print(history)

    print('Добавляем cat_features')
    with_categories = cat_features.join(history, on='user_id')
    print(with_categories)

    print('Добавляем price_features')    
    with_price = price_features.join(with_categories, on='user_id')
    print(with_price)

    print('Добавляем targets')
    labelled = target.join(with_price, on='user_id')
    print(labelled)

    return labelled
        
# Ten-row sample of the training fold, used by the dataset smoke tests below.
train_sample = get_data_by_target(cv_folds.get_train_target_fold(cv_fold_id))[:10]
train_sample

In [None]:
@logger
def get_pretrain_train_valid_df(valid_size=10000):
    """Build unlabelled train / validation frames over all users for pre-training.

    The browsing history is ordered by ``request_cnt_total`` descending and
    collapsed into list columns per user, joined with ``cat_features`` and
    ``price_features``, shuffled with a fixed seed and given placeholder (-1)
    target columns. The last ``valid_size`` rows form the validation frame.

    Args:
        valid_size: number of rows held out for validation (default 10000,
            the value that used to be hard-coded). Must be positive.

    Returns:
        ``(train, valid)`` polars DataFrames.

    Raises:
        ValueError: if ``valid_size`` is not positive.
    """
    if valid_size <= 0:
        raise ValueError(f'valid_size must be positive, got {valid_size}')

    print('Данные из истории посещения')
    data = url_stat_by_user
    print(data)
    
    print('Группированная история посещения')
    data = url_stat_by_user.sort('request_cnt_total', descending=True).groupby(['user_id']).agg([
        pl.col('url_host_id'),
        pl.col('request_cnt_total'),
        pl.col('part_of_day_id_mode'),       
        pl.col('part_of_day_code'),
        pl.col('n_days_url'),
        pl.col('n_days_any_url').first(),
        pl.count().alias('url_host_id_count'),
    ])
    print(data)
    
    print('Добавляем cat_features')
    data = cat_features.join(data, on='user_id')
    print(data)
    
    print('Добавляем price_features')    
    data = price_features.join(data, on='user_id')
    print(data)
    
    print('Перемешиваем и добавляем пустой таргет')
    data = data.sample(frac=1, shuffle=True, seed=10).with_columns([
        pl.lit(-1).alias('is_male'),
        pl.lit(-1).alias('age'),
        pl.lit(-1).alias('age_bins') 
    ])
    print(data)
    
    # Everything except the tail is used for training ...
    print('Выделяем train')
    train = data[:-valid_size]
    print(train)
    
    # ... and the tail is held out for validation.
    print('Выделяем valid')
    valid = data[-valid_size:]
    print(valid) 
    
    return train, valid

In [None]:
@logger
def get_dataset(df):
    """Turn a per-user frame into a ``tf.data.Dataset`` of ``(inputs, targets)`` dict pairs.

    List-valued history columns become ragged tensors whose row lengths come
    from ``url_host_id_count``; scalar columns are passed as numpy arrays.
    ``age_bins`` is one-hot encoded with depth 6.
    """
    print(f'get_dataset:begin {df.shape=}')
    
    @logger
    def get_ragged_tensor(df, column_name):
        # Flatten the list column and re-split it by per-user history length.
        # https://github.com/tensorflow/tensorflow/issues/47853    
        print(f'get_dataset:get_ragged_tensor: {column_name}')        
        ragged = tf.RaggedTensor.from_row_lengths(
            values=np.hstack(df[column_name]),
            row_lengths=df['url_host_id_count'],
        )
        return ragged
    
    dataset = tf.data.Dataset.from_tensor_slices((
        {
            # User
            'user_id':df['user_id'].to_numpy(),
            
            # History fields
            'url_host_id':get_ragged_tensor(df, 'url_host_id'),
            'request_cnt_total':get_ragged_tensor(df, 'request_cnt_total'),    
            'part_of_day_id_mode':get_ragged_tensor(df, 'part_of_day_id_mode'),   
            'part_of_day_code':get_ragged_tensor(df, 'part_of_day_code'),   
            'n_days_url':get_ragged_tensor(df, 'n_days_url'),  
            'n_days_any_url':df['n_days_any_url'].to_numpy(),  
            'url_host_id_count':df['url_host_id_count'].to_numpy(),
            
            # Static fields
            'region_name_id_primary':df['region_name_id_primary'].to_numpy(),
            'city_name_id_primary':df['city_name_id_primary'].to_numpy(),
            'cpe_manufacturer_name_id_primary':df['cpe_manufacturer_name_id_primary'].to_numpy(),
            'cpe_model_name_id_primary':df['cpe_model_name_id_primary'].to_numpy(),
            'cpe_type_cd_id_primary':df['cpe_type_cd_id_primary'].to_numpy(),
            'cpe_model_os_type_id_primary':df['cpe_model_os_type_id_primary'].to_numpy(),
            'price':df['price'].to_numpy(),
            'region_name_id_count':df['region_name_id_count'].to_numpy(),
            'city_name_id_count':df['city_name_id_count'].to_numpy(),
            'cpe_model_name_id_count':df['cpe_model_name_id_count'].to_numpy(),
            # 'url_host_id_count' is already set among the history fields above;
            # the duplicate dict key that used to follow here was removed.
        },
        {
            'is_male':df['is_male'].to_numpy(),
            'age_bins':tf.one_hot(df['age_bins'].to_numpy(), 6),
            'age':df['age'].to_numpy(),     
        }))
    print(f'get_dataset:end')    
    return dataset

@logger
def apply_encode_host_id(dataset):
    """Map raw ``url_host_id`` values of every element to token ids.

    Uses the tokenizer from the global ``enconding_config``; all other
    features and the targets are passed through unchanged.
    """
    tokenizer = enconding_config.get_tokenizer()

    def encode(inputs, target):
        inputs['url_host_id'] = tokenizer(inputs['url_host_id'])
        return inputs, target

    return dataset.map(encode, num_parallel_calls=tf.data.AUTOTUNE)

def ragged_to_tensor(item, target):
    """Replace the ragged history features of ``item`` with tensors padded with -1.

    ``item`` is modified in place and returned together with the untouched
    ``target``.
    """
    history_keys = (
        'url_host_id',
        'request_cnt_total',
        'part_of_day_id_mode',
        'part_of_day_code',
        'n_days_url',
    )
    for key in history_keys:
        item[key] = item[key].to_tensor(default_value=-1)
    return item, target

@logger
def get_train_dataset(df, batch_size, shuffle_len=1024, data_rate=0.75, min_data_size=5):
    """Build the batched, augmented training dataset from a per-user frame.

    Pipeline: ``get_dataset`` -> shuffle with buffer ``shuffle_len`` -> random
    history dropout -> ragged batching -> padding to dense tensors -> host-id
    tokenisation -> prefetch.

    History dropout keeps each history entry when a uniform random draw is
    ``<= data_rate``; when fewer than ``min_data_size`` entries would survive,
    the full history is kept instead. The same mask is applied to all history
    features.
    """
    print(f'get_train_dataset:begin {batch_size=}')

    history_keys = (
        'url_host_id',
        'request_cnt_total',
        'part_of_day_id_mode',
        'part_of_day_code',
        'n_days_url',
    )

    def random_drop(item, target):
        keep = tf.random.uniform(shape=tf.shape(item['url_host_id']))
        keep = keep <= data_rate
        kept_count = tf.reduce_sum(tf.cast(keep, tf.int32))
        # Fall back to the full history when too few entries survive.
        keep = tf.where(kept_count >= min_data_size, keep, tf.ones_like(keep, dtype=tf.bool))
        for key in history_keys:
            item[key] = tf.ragged.boolean_mask(data=item[key], mask=keep)
        return item, target

    dataset = get_dataset(df).shuffle(shuffle_len)
    dataset = dataset.map(random_drop, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    dataset = dataset.apply(tf.data.experimental.dense_to_ragged_batch(batch_size=batch_size))
    dataset = dataset.map(ragged_to_tensor, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    dataset = apply_encode_host_id(dataset)

    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    print(f'get_train_dataset:end')
    return dataset

@logger
def get_valid_dataset(df, batch_size):
    """Build the batched evaluation dataset (no shuffling, no history dropout).

    Pipeline: ``get_dataset`` -> ragged batching -> padding to dense tensors
    -> host-id tokenisation -> prefetch.
    NOTE(review): the padding map runs with ``deterministic=False``, so batch
    order may differ from the row order of ``df`` — confirm that callers align
    predictions via ``user_id``.
    """
    print(f'get_valid_dataset:begin {batch_size=}')
    ragged_batches = get_dataset(df).apply(
        tf.data.experimental.dense_to_ragged_batch(batch_size=batch_size)
    )
    dense_batches = ragged_batches.map(
        ragged_to_tensor,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False,
    )
    encoded = apply_encode_host_id(dense_batches)
    dataset = encoded.prefetch(tf.data.AUTOTUNE)
    print(f'get_valid_dataset:end')
    return dataset

    
###############################################################################

# Smoke test of the dataset builders on the ten-row sample; only the training variant is active.
# for i in get_dataset(train_sample).take(2):
#     print(i[0]['url_host_id'])

# Print the encoded, padded url_host_id tensor of two training batches.
for i in get_train_dataset(train_sample, batch_size=2).take(2):
    print(i[0]['url_host_id'])

# for i in get_valid_dataset(train_sample, batch_size=2).take(1):
#     print(i[0]['url_host_id'])

In [None]:
%%time
# Iterate the full training pipeline once to time it; the bare `i` is a deliberate no-op.
for i in tqdm(get_train_dataset(get_data_by_target(cv_folds.get_train_target_fold(cv_fold_id)), batch_size=32)):
    i

In [None]:
%%time
# Iterate the full validation pipeline once to time it; the bare `i` is a deliberate no-op.
for i in tqdm(get_valid_dataset(get_data_by_target(cv_folds.get_valid_target_fold(cv_fold_id)), batch_size=32)):
    i

# Create model

In [None]:
def get_model_checkpoint_path_old(model_name: str) -> str:
    """Return ``<CONFIG_MODEL_CHECKPOINT_ROOT>/<model_name>/checkpoint`` and print it."""
    path_parts = (CONFIG_MODEL_CHECKPOINT_ROOT, model_name, 'checkpoint')
    checkpoint_path = os.path.join(*path_parts)
    print(f'get_model_checkpoint_path: {checkpoint_path}')
    return checkpoint_path

def get_model_checkpoint_path(model_name: str) -> str:
    """Return the per-epoch checkpoint path template for ``model_name``.

    The checkpoint root folder is created when missing. The returned string
    still contains the literal ``{epoch:02d}`` placeholder; it is not
    formatted here.
    """
    os.makedirs(CONFIG_MODEL_CHECKPOINT_ROOT, exist_ok=True)
    path_parts = (CONFIG_MODEL_CHECKPOINT_ROOT, model_name, '{epoch:02d}_checkpoint')
    checkpoint_template = os.path.join(*path_parts)
    print(f'get_model_checkpoint_path: {checkpoint_template}')
    return checkpoint_template

def get_model_checkpoint_path_by_epoch(model_name: str, epoch: int) -> str:
    """Return the checkpoint path of ``model_name`` for one concrete ``epoch`` (two-digit, zero-padded)."""
    file_name = f'{epoch:02d}_checkpoint'
    checkpoint_path = os.path.join(CONFIG_MODEL_CHECKPOINT_ROOT, model_name, file_name)
    print(f'get_model_checkpoint_path_by_epoch: model_name={model_name} epoch={epoch} -> {checkpoint_path}')
    return checkpoint_path

def get_model_report_path(model_name: str) -> str:
    """Return ``<CONFIG_MODEL_REPORT_ROOT>/<model_name>_report.csv``, creating the report folder when missing."""
    os.makedirs(CONFIG_MODEL_REPORT_ROOT, exist_ok=True)
    report_file = f'{model_name}_report.csv'
    report_path = os.path.join(CONFIG_MODEL_REPORT_ROOT, report_file)
    print(f'get_model_report_path: {report_path}')
    return report_path

In [None]:
import keras.backend as K
def categorical_focal_loss(gamma=2.0, alpha=0.25):
    """Create a multi-class focal loss function.

    Per class the loss is ``-alpha * (1 - p)^gamma * log(p)`` weighted by the
    one-hot label, summed over axis 1.

    Args:
        gamma: focusing parameter of the modulating factor ``(1 - p)^gamma``.
        alpha: constant weighting factor applied to every class.

    Returns:
        A function ``focal_loss(y_true, y_pred)`` that yields one loss value
        per sample.
    """
    def focal_loss(y_true, y_pred):
        # Clip the probabilities away from 0 and 1 so that log() stays finite.
        eps = K.epsilon()
        probs = K.clip(y_pred, eps, 1.0-eps)
        # Plain cross entropy per class.
        ce = -y_true*K.log(probs)
        # Weighting factor times modulating factor, active on the true class only.
        focal_weight = alpha * y_true * K.pow((1-probs), gamma)
        # Focal loss per class, then summed over the class axis.
        return K.sum(focal_weight * ce, axis=1)

    return focal_loss

# Small fixture shared by the focal-loss demo calls: two samples, three classes.
y_true = tf.constant([[0., 1., 0.], [0., 0., 1.]])
y_pred = tf.constant([[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
# Mean focal loss over the two samples; the value recorded from an earlier run follows.
categorical_focal_loss()(y_true, y_pred).numpy().mean()
#0.23315276

In [None]:
from keras import backend_config
# Alias for the Keras backend epsilon getter; called as epsilon() inside categorical_focal_crossentropy.
epsilon = backend_config.epsilon

def categorical_focal_crossentropy(
    target,
    output,
    alpha=0.25,
    gamma=2.0,
    from_logits=False,
    axis=-1,
):
    """Compute the alpha-balanced focal cross-entropy between labels and predictions.

    Following [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf), a focal
    factor down-weights easy examples:
    ``FL(p) = alpha * (1 - p)^gamma * CE``, where ``CE = -target * log(p)``.
    With ``gamma = 0`` and ``alpha = 1`` this is the plain categorical
    cross-entropy.

    Args:
        target: A tensor with the same shape as `output`.
        output: A tensor of probabilities, or of logits when `from_logits` is True.
        alpha: Weight balancing factor, a scalar or a list of floats (one per class).
        gamma: Focusing parameter of the modulating factor.
        from_logits: Whether `output` holds logits. When True a softmax over
            `axis` is applied first. (This flag used to be ignored.)
        axis: The class axis.

    Returns:
        A tensor with the loss per sample (the class axis is reduced).
    """
    target = tf.convert_to_tensor(target)
    output = tf.convert_to_tensor(output)
    target.shape.assert_is_compatible_with(output.shape)

    # Honour `from_logits`: turn logits into probabilities before anything else.
    if from_logits:
        output = tf.nn.softmax(output, axis=axis)

    # Scale the predictions so that the class probabilities of each sample sum to 1.
    output = output / tf.reduce_sum(output, axis=axis, keepdims=True)

    # Clip away from 0 and 1 so that log() stays finite.
    epsilon_ = tf.constant(epsilon(), dtype=output.dtype.base_dtype)
    output = tf.clip_by_value(output, epsilon_, 1.0 - epsilon_)

    # Cross entropy per class
    cce = -target * tf.math.log(output)

    # Focal factors
    modulating_factor = tf.pow(1.0 - output, gamma)
    weighting_factor = tf.multiply(modulating_factor, alpha)

    # Apply the weighting factor and reduce over the class axis
    focal_cce = tf.multiply(weighting_factor, cce)
    focal_cce = tf.reduce_sum(focal_cce, axis=axis)
    return focal_cce

# Mean focal cross-entropy on the y_true / y_pred fixture defined in the earlier cell.
tf.reduce_mean(categorical_focal_crossentropy(y_true, y_pred, alpha=0.25))

In [None]:
def get_categorical_focal_crossentropy_loss(gamma=2.0, alpha=0.25):
    """Return a two-argument loss ``loss_fn(y_true, y_pred)`` with ``gamma`` and ``alpha`` bound.

    The returned function forwards to ``categorical_focal_crossentropy``.
    """
    def loss_fn(y_true, y_pred):
        return categorical_focal_crossentropy(
            y_true,
            y_pred,
            gamma=gamma,
            alpha=alpha,
        )

    return loss_fn

In [None]:
import keras_nlp 

# Epsilon constant (presumably for normalisation layers in the model code below — TODO confirm).
NORM_EPSILON = 1e-5

@logger
def get_encoder_inputs():
    """Create the Keras ``Input`` placeholders of the encoder model.

    Five int32 sequence inputs of variable length (shape ``(None,)``) and one
    int32 scalar input ``n_days_any_url`` (shape ``()``). The dict of inputs,
    keyed by feature name, is printed and returned.
    """
    sequence_features = (
        'url_host_id',
        'request_cnt_total',
        'n_days_url',
        'part_of_day_id_mode',
        'part_of_day_code',
    )
    inputs = {
        feature: tf.keras.Input(name=feature, ragged=False, dtype=tf.int32, shape=(None,))
        for feature in sequence_features
    }
    inputs['n_days_any_url'] = tf.keras.Input(name='n_days_any_url', ragged=False, dtype=tf.int32, shape=())
    print(inputs)
    return inputs

def create_encoder_model(num_layers, num_heads, dropout, url_host_id_emb_dim):
    """Build the Transformer encoder over a user's sequence of visited url hosts.

    Five embeddings of identical width (`url_host_id_emb_dim`) are summed
    position-wise: the url host id, the bucketed ratio
    request_cnt_total / n_days_url, the bucketed ratio
    request_cnt_total / n_days_any_url, the part-of-day mode and the
    part-of-day code. The sum goes through layer normalization and dropout and
    then through `num_layers` `keras_nlp.layers.TransformerEncoder` blocks.

    Args:
        num_layers: Number of TransformerEncoder blocks stacked on the embeddings.
        num_heads: Number of attention heads in every block.
        dropout: Dropout rate used after the embedding sum and inside every block.
        url_host_id_emb_dim: Width of every embedding; the feed-forward
            (`intermediate_dim`) width of each block is twice this value.

    Returns:
        A `tf.keras.Model` named 'encoder_model' that maps the dictionary from
        `get_encoder_inputs()` to the output of the last encoder block.
    """
    print(f'create_encoder_model: {num_layers=} {num_heads=} {dropout=} {url_host_id_emb_dim=}')
    inputs = get_encoder_inputs()

    print('Prepare mask')
    # True wherever url_host_id is non-zero; attached to the embedding sum further down.
    mask = inputs['url_host_id'] != 0

    print('Create url_host_id embedding')
    url_host_id_embedding = tf.keras.layers.Embedding(
        input_dim=enconding_config.vocabulary_size_full, # id layout per the original note: 0=pad, 1=oov, 2=mask, then start/end, then regular tokens - TODO confirm exact ids against enconding_config
        output_dim=url_host_id_emb_dim,
        mask_zero=True,
        name='url_host_id_embedding',
    )(inputs['url_host_id'])

    ###########################################################################

    print('Create part of day embedding')
    # The input is shifted by +1 before the lookup; index 0 is treated as masked (mask_zero=True).
    part_of_day_id_mode_embedding = tf.keras.layers.Embedding(
            input_dim=4+1,
            output_dim=url_host_id_emb_dim,
            mask_zero=True,
            name='part_of_day_id_mode_embedding',
        )(inputs['part_of_day_id_mode'] + 1)

    print('Create part of day code embedding')
    # Same +1 shift, but without masking on index 0.
    part_of_day_code_embedding = tf.keras.layers.Embedding(
            input_dim=16+1,
            output_dim=url_host_id_emb_dim,
            mask_zero=False,
            name='part_of_day_code_embedding',
        )(inputs['part_of_day_code']+1)


    ###########################################################################

    print('Create request_cnt_total_vs_n_days_url embedding')
    request_cnt_total = tf.cast(inputs['request_cnt_total'],tf.float32)
    n_days_url = tf.cast(inputs['n_days_url'],tf.float32)
    request_cnt_total_vs_n_days_url = request_cnt_total /n_days_url

    # Bucketize the ratio with a layer built from the same ratio computed over url_stat_by_user.
    # NOTE(review): n_bins=100 here, but the embedding below has input_dim=31+1; this only works if
    # get_discretization_layer yields at most 31 distinct buckets for this data - confirm.
    request_cnt_total_vs_n_days_url = get_discretization_layer(
        data=url_stat_by_user['request_cnt_total'].to_numpy() / url_stat_by_user['n_days_url'].to_numpy(),
        n_bins=100,
        name='request_cnt_total_vs_n_days_url_bins'
    )(request_cnt_total_vs_n_days_url)

    # Shift bucket ids by +1 and send positions where request_cnt_total == -1 to id 0
    # (-1 presumably marks padding - TODO confirm), which the embedding below masks.
    request_cnt_total_vs_n_days_url = tf.where(request_cnt_total != -1, request_cnt_total_vs_n_days_url+1, tf.zeros_like(request_cnt_total_vs_n_days_url))

    request_cnt_total_vs_n_days_url_embedding = tf.keras.layers.Embedding(
            input_dim=31+1,
            output_dim=url_host_id_emb_dim,
            mask_zero=True,
            name='request_cnt_total_vs_n_days_url_embedding',
        )(request_cnt_total_vs_n_days_url)
    print(f'{request_cnt_total_vs_n_days_url_embedding.shape=}')

    print('Create request_cnt_total_vs_n_days_any_url embedding')
    n_days_any_url = tf.cast(inputs['n_days_any_url'],tf.float32)
    # n_days_any_url is one scalar per example; add a trailing axis so it broadcasts over positions.
    n_days_any_url = tf.expand_dims(n_days_any_url, axis=-1)
    request_cnt_total_vs_n_days_any_url = request_cnt_total / n_days_any_url

    request_cnt_total_vs_n_days_any_url = get_discretization_layer(
        data=url_stat_by_user['request_cnt_total'].to_numpy() / url_stat_by_user['n_days_any_url'].to_numpy(),
        n_bins=32,
        name='request_cnt_total_vs_n_days_any_url_bins'
    )(request_cnt_total_vs_n_days_any_url)

    # Same +1 shift / zero-for-(-1) convention as for the previous ratio.
    request_cnt_total_vs_n_days_any_url = tf.where(request_cnt_total != -1, request_cnt_total_vs_n_days_any_url+1, tf.zeros_like(request_cnt_total_vs_n_days_any_url))

    request_cnt_total_vs_n_days_any_url_embedding = tf.keras.layers.Embedding(
            input_dim=32+1,
            output_dim=url_host_id_emb_dim,
            mask_zero=True,
            name='request_cnt_total_vs_n_days_any_url_embedding',
        )(request_cnt_total_vs_n_days_any_url)
    print(f'{request_cnt_total_vs_n_days_any_url_embedding.shape=}')

    ###########################################################################
    # All five embeddings have the same width, so they are combined by element-wise addition.
    outputs = url_host_id_embedding + request_cnt_total_vs_n_days_url_embedding + request_cnt_total_vs_n_days_any_url_embedding  + part_of_day_id_mode_embedding + part_of_day_code_embedding
    ###########################################################################

    print('Apply layer normalization and dropout to the embedding.')
    outputs = tf.keras.layers.LayerNormalization(epsilon=NORM_EPSILON)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    # Attach the url_host_id-based mask directly to the tensor (private Keras attribute).
    outputs._keras_mask = mask

    # Add a number of encoder blocks
    for i in range(num_layers):
        outputs = keras_nlp.layers.TransformerEncoder(
            intermediate_dim=url_host_id_emb_dim*2,
            num_heads=num_heads,
            dropout=dropout,
            layer_norm_epsilon=NORM_EPSILON,
        )(outputs)  # padding_mask=mask is intentionally not passed; presumably the layer picks up `_keras_mask` set above - TODO confirm

    encoder_model = tf.keras.Model(inputs, outputs, name='encoder_model')
    return encoder_model

def test():
    """Smoke test: build a one-layer encoder, print its summary and render its graph to a PNG."""
    encoder_settings = dict(
        num_layers=1,
        num_heads=4,
        dropout=0.1,
        url_host_id_emb_dim=64,
    )
    encoder_model = create_encoder_model(**encoder_settings)
    encoder_model.summary()
    # Optional plot_model extras: show_shapes=True, show_dtype=True
    tf.keras.utils.plot_model(encoder_model, to_file="encoder_model.png", rankdir='LR')

test()

In [None]:
class FinalMetricCallback(tf.keras.callbacks.Callback):
    """Add the derived gini and combined metric to the Keras `logs` dictionary.

    For a given key prefix ('' for training, 'val_' for validation):

        <prefix>is_male_gini = 2 * <prefix>is_male_auc - 1
        <prefix>metric       = 2 * <prefix>age_bins_f1 + <prefix>is_male_gini

    The values are written into `logs` in place, so callbacks placed after
    this one in the callback list can read them. Derived values are only
    added when their source metrics are present, so a missing `logs`
    dictionary or a run without validation metrics no longer raises.
    """

    @staticmethod
    def _add_final_metrics(logs, prefix=''):
        """Write the derived metrics for `prefix` into `logs` when the source metrics exist."""
        if not logs:
            return
        auc = logs.get(f'{prefix}is_male_auc')
        if auc is None:
            return
        gini = 2 * auc - 1
        logs[f'{prefix}is_male_gini'] = gini
        f1 = logs.get(f'{prefix}age_bins_f1')
        if f1 is not None:
            logs[f'{prefix}metric'] = 2 * f1 + gini

    def on_train_batch_end(self, batch, logs=None):
        """Add the training-side derived metrics after every batch."""
        self._add_final_metrics(logs)

    def on_epoch_end(self, epoch, logs=None):
        """Add the training-side and validation-side derived metrics after every epoch."""
        self._add_final_metrics(logs)
        self._add_final_metrics(logs, prefix='val_')
        
def get_callbacks(model_name):
    """Return the default callbacks: derived metrics, CSV report and per-epoch weight checkpoints.

    Args:
        model_name: Name passed to `get_model_report_path` and
            `get_model_checkpoint_path` to locate the output files.

    Returns:
        list of Keras callbacks, with `FinalMetricCallback` first.
    """
    metric_callback = FinalMetricCallback()
    report_logger = tf.keras.callbacks.CSVLogger(get_model_report_path(model_name))
    # Weights only, written after every epoch regardless of the score.
    checkpoint_saver = tf.keras.callbacks.ModelCheckpoint(
        filepath=get_model_checkpoint_path(model_name),
        verbose=1,
        save_best_only=False,
        save_weights_only=True,
        save_freq='epoch',
    )
    return [metric_callback, report_logger, checkpoint_saver]

In [None]:
# Width of the pre-computed user embedding loaded in get_static_features.
user_id_embedding_size = 256

@logger
def get_static_features(inputs):
    """Build the per-user (non-sequence) feature vector.

    Combines a frozen, pre-computed user embedding, trainable embeddings of the
    primary region / city / device attributes, normalized region and city
    counts, and an embedding of the bucketed device price. All parts are
    concatenated on the last axis and layer-normalized.

    Args:
        inputs: dict of Keras inputs; the keys read here are 'user_id',
            'region_name_id_primary', 'city_name_id_primary',
            'cpe_manufacturer_name_id_primary', 'cpe_model_name_id_primary',
            'cpe_type_cd_id_primary', 'cpe_model_os_type_id_primary',
            'region_name_id_count', 'city_name_id_count' and 'price'.

    Returns:
        The layer-normalized concatenation of all static features.
    """
    user_embedding = load_embedding('user_id_from_user_id_vs_url_host_id2', user_id_embedding_size)

    # Lookup table initialized from the loaded embedding and excluded from training.
    user_id = tf.keras.layers.Embedding(
        user_id_count, user_id_embedding_size, name='user_id_embedding',
        embeddings_initializer=tf.keras.initializers.Constant(user_embedding),
        trainable=False
    )(inputs['user_id'])

    # Trainable embeddings of categorical attributes; the first argument is the table size.
    # NOTE(review): the table sizes are hard-coded - confirm they cover every id in the data.
    region_name_id = tf.keras.layers.Embedding(81, 16, input_shape=(), name='region_name_id_embending')(inputs['region_name_id_primary'])
    city_name_id = tf.keras.layers.Embedding(985, 64, input_shape=(), name='city_name_id_embending')(inputs['city_name_id_primary'])
    cpe_manufacturer_name_id = tf.keras.layers.Embedding(37, 8, input_shape=(), name='cpe_manufacturer_name_id_embending')(inputs['cpe_manufacturer_name_id_primary'])
    cpe_model_name_id = tf.keras.layers.Embedding(599, 64, input_shape=(), name='cpe_model_name_id_embending')(inputs['cpe_model_name_id_primary'])
    cpe_type_cd_id = tf.keras.layers.Embedding(4, 2, input_shape=(), name='cpe_type_cd_id_embending')(inputs['cpe_type_cd_id_primary'])
    cpe_model_os_type_id = tf.keras.layers.Embedding(3, 2, input_shape=(), name='cpe_model_os_type_id_embending')(inputs['cpe_model_os_type_id_primary'])
    # Count features go through layers built from the corresponding cat_features columns.
    region_name_id_count = get_normalization_layer(data=cat_features['region_name_id_count'].to_numpy(), name='region_name_id_count_bins')(inputs['region_name_id_count'])
    city_name_id_count = get_normalization_layer(data=cat_features['city_name_id_count'].to_numpy(), name='city_name_id_count_bins')(inputs['city_name_id_count'])

    # Price is bucketized first and the bucket id is then embedded.
    n_price_bins=32
    price = get_discretization_layer(data=price_features['price'].to_numpy(), n_bins=n_price_bins, name='price_bins')(inputs['price'])
    price = tf.keras.layers.Embedding(n_price_bins, 8, input_shape=(), name='price_embending')(price)

    static_features_list = [
        user_id,
        region_name_id, city_name_id, cpe_manufacturer_name_id, cpe_model_name_id, cpe_type_cd_id, cpe_model_os_type_id,
        region_name_id_count, city_name_id_count,
        price,
    ]
    static_features = tf.concat(static_features_list, axis=-1)
    static_features = tf.keras.layers.LayerNormalization(epsilon=NORM_EPSILON)(static_features)
    return static_features

@logger
def create_full_model(encoder_model, fc_hidden_units, male_fc_hidden_units, age_fc_hidden_units, fc_dropout_rate,):
    """Assemble the multi-task model: sequence encoder + static features + three heads.

    The encoder output is reduced with global average pooling and global max
    pooling, concatenated with the static feature vector from
    `get_static_features`, and passed through a shared stack of
    Dense -> BatchNormalization -> LeakyReLU -> Dropout blocks. Two task-specific
    stacks of the same kind follow, one for gender and one for age.

    Args:
        encoder_model: Encoder returned by `create_encoder_model`.
        fc_hidden_units: Iterable with the Dense sizes of the shared stack.
        male_fc_hidden_units: Iterable with the Dense sizes of the gender stack.
        age_fc_hidden_units: Iterable with the Dense sizes of the age stack.
        fc_dropout_rate: Dropout rate used in all three stacks.

    Returns:
        A `tf.keras.Model` named 'full_model' whose outputs are a dict with
        'is_male' (1 unit, sigmoid), 'age_bins' (6 units, softmax) and
        'age' (1 unit, linear).
    """
    print(f'create_full_model: {fc_hidden_units=} {male_fc_hidden_units=} {age_fc_hidden_units=} {fc_dropout_rate=}')

    ###########################################################################
    # Sequence branch: encode the sequence and pool it in two different ways.
    inputs = get_encoder_inputs()
    encoder_outputs = encoder_model(inputs)
    encoder_outputs1 = tf.keras.layers.GlobalAveragePooling1D()(encoder_outputs)
    encoder_outputs2 = tf.keras.layers.GlobalMaxPool1D()(encoder_outputs)

    ###########################################################################
    # Static branch: per-user inputs added to the same inputs dictionary.
    inputs['user_id'] = tf.keras.Input(name='user_id', dtype=tf.int32, shape=())
    inputs['region_name_id_primary'] = tf.keras.Input(name='region_name_id_primary', dtype=tf.int32, shape=())
    inputs['city_name_id_primary'] = tf.keras.Input(name='city_name_id_primary', dtype=tf.int32, shape=())
    inputs['cpe_manufacturer_name_id_primary'] = tf.keras.Input(name='cpe_manufacturer_name_id_primary', dtype=tf.int32, shape=())
    inputs['cpe_model_name_id_primary'] = tf.keras.Input(name='cpe_model_name_id_primary', dtype=tf.int32, shape=())
    inputs['cpe_type_cd_id_primary'] = tf.keras.Input(name='cpe_type_cd_id_primary', dtype=tf.int32, shape=())
    inputs['cpe_model_os_type_id_primary'] = tf.keras.Input(name='cpe_model_os_type_id_primary', dtype=tf.int32, shape=())
    inputs['region_name_id_count'] = tf.keras.Input(name='region_name_id_count', dtype=tf.int32, shape=(1))
    inputs['city_name_id_count'] = tf.keras.Input(name='city_name_id_count', dtype=tf.int32, shape=(1))
    inputs['cpe_model_name_id_count'] = tf.keras.Input(name='cpe_model_name_id_count', dtype=tf.int32, shape=(1))   ###### !!!!!!!!!!!! Not used: declared as a model input but not read by get_static_features
    inputs['url_host_id_count'] = tf.keras.Input(name='url_host_id_count', dtype=tf.int32, shape=(1))               ###### !!!!!!!!!!!! Not used: declared as a model input but not read by get_static_features
    inputs['price'] = tf.keras.Input(name='price', dtype=tf.float32, shape=())

    static_features = get_static_features(inputs)
    ###########################################################################

    # Join both pooled views of the sequence with the static features.
    x = tf.concat([
        encoder_outputs1,
        encoder_outputs2,
        static_features
    ], axis=-1)

    # Fully-connected layers block
    for layer_id, num_units in enumerate(fc_hidden_units):
        x = tf.keras.layers.Dense(num_units, name=f'final_dense_{layer_id}_{num_units}')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU()(x)
        x = tf.keras.layers.Dropout(fc_dropout_rate)(x)

    # Gender head: its own stack on top of the shared representation.
    male = x
    for layer_id, num_units in enumerate(male_fc_hidden_units):
        male = tf.keras.layers.Dense(num_units, name=f'male_dense_{layer_id}_{num_units}')(male)
        male = tf.keras.layers.BatchNormalization()(male)
        male = tf.keras.layers.LeakyReLU()(male)
        male = tf.keras.layers.Dropout(fc_dropout_rate)(male)
    male_output = tf.keras.layers.Dense(1, activation='sigmoid', name='is_male')(male)    # single sigmoid unit

    # Age head: one stack feeding both the 6-way bin classifier and the scalar regressor.
    age = x
    for layer_id, num_units in enumerate(age_fc_hidden_units):
        age = tf.keras.layers.Dense(num_units, name=f'age_dense_{layer_id}_{num_units}')(age)
        age = tf.keras.layers.BatchNormalization()(age)
        age = tf.keras.layers.LeakyReLU()(age)
        age = tf.keras.layers.Dropout(fc_dropout_rate)(age)
    age_bins_output = tf.keras.layers.Dense(6, activation='softmax', name='age_bins')(age)  # softmax over 6 age bins
    age_output = tf.keras.layers.Dense(1, activation=None, name='age')(age)

    model = tf.keras.Model(inputs=inputs, outputs={
        'is_male':male_output,
        'age_bins':age_bins_output,
        'age':age_output,
    }, name='full_model')
    return model

def test():
    """Smoke test: build the encoder and the full model, plot both and run `predict` on two tiny batches.

    Relies on `train_sample` and `get_train_dataset` being defined earlier in the notebook.
    """
    encoder_num_layers = 1
    encoder_num_heads = 3
    encoder_dropout = 0.1
    encoder_url_host_id_emb_dim = 64

    fc_hidden_units = [256, 128]
    male_fc_hidden_units = [128, 64]
    age_fc_hidden_units = [128, 64]
    fc_dropout_rate = 0.2

    encoder_model = create_encoder_model(
        num_layers=encoder_num_layers,
        num_heads=encoder_num_heads,
        dropout=encoder_dropout,
        url_host_id_emb_dim=encoder_url_host_id_emb_dim,
    )
    # Optional plot_model extras: show_shapes=True, show_dtype=True
    tf.keras.utils.plot_model(encoder_model, to_file="encoder_model.png", rankdir='LR')

    model = create_full_model(
        encoder_model,
        fc_hidden_units,
        male_fc_hidden_units,
        age_fc_hidden_units,
        fc_dropout_rate,
    )
    tf.keras.utils.plot_model(model, to_file="full_model.png", rankdir='LR')
    model.summary()

    # Two batches of two examples are enough to check that the graph executes end to end.
    print(model.predict(get_train_dataset(train_sample, batch_size=2).take(2)))

test()

# Pretrained model

In [None]:
# Maximum number of masked positions generated per sequence; also the length of the 'mask_positions' model input.
PREDICTIONS_PER_SEQ = 32

@logger
def get_lm_masked_dataset(dataset, mask_selection_rate=0.2, mask_token_rate=0.8, random_token_rate=0.1):
    """Turn a (features, target) dataset into a masked-language-model dataset.

    The 'url_host_id' feature is passed through
    `keras_nlp.layers.MaskedLMMaskGenerator`; the original target is dropped.

    Args:
        dataset: `tf.data.Dataset` yielding `(features_dict, target)` pairs.
        mask_selection_rate: Forwarded to the mask generator.
        mask_token_rate: Forwarded to the mask generator.
        random_token_rate: Forwarded to the mask generator.

    Returns:
        Dataset yielding `(features, labels, weights)` where `features` holds
        the masked 'url_host_id' plus 'mask_positions', `labels` are the
        generator's 'mask_ids' and `weights` its 'mask_weights'.
    """
    # https://keras.io/api/keras_nlp/preprocessing_layers/masked_lm_mask_generator/
    never_masked_ids = [
        enconding_config.pad_token_id,
        enconding_config.start_token_id,
        enconding_config.end_token_id,
    ]
    masker = keras_nlp.layers.MaskedLMMaskGenerator(
        vocabulary_size=enconding_config.vocabulary_size_full,
        mask_selection_rate=mask_selection_rate,
        mask_token_id=enconding_config.mask_token_id,
        mask_selection_length=PREDICTIONS_PER_SEQ,
        unselectable_token_ids=never_masked_ids,
        mask_token_rate=mask_token_rate,
        random_token_rate=random_token_rate,
    )

    def apply_mask(inputs, target):
        masked = masker(inputs['url_host_id'])
        features = dict(inputs)
        features.update({
            'url_host_id': masked["token_ids"],
            'mask_positions': masked["mask_positions"],
        })
        # (features, labels, sample weights) - the tuple layout keras.Model.fit() accepts.
        return features, masked["mask_ids"], masked["mask_weights"]

    masked_dataset = dataset.map(apply_mask, num_parallel_calls=tf.data.AUTOTUNE)
    return masked_dataset.prefetch(tf.data.AUTOTUNE)

# Keep only the first 10 entries of each split so the previews in this section run quickly.
pretrain_train, pretrain_valid = get_pretrain_train_valid_df()
pretrain_train, pretrain_valid = pretrain_train[:10], pretrain_valid[:10]

# Print one masked batch to inspect the (features, labels, weights) structure.
masked_preview = get_lm_masked_dataset(get_train_dataset(pretrain_train, batch_size=2)).take(1)
for i in masked_preview:
    print(i)

In [None]:
@logger
def create_pretrain_model(encoder_model):
    """Wrap the encoder with a masked-language-model head for pre-training.

    Args:
        encoder_model: Encoder returned by `create_encoder_model`; it must
            contain a layer named 'url_host_id_embedding'.

    Returns:
        A `tf.keras.Model` named 'masked_model' that takes the encoder inputs
        plus 'mask_positions' and returns the head's un-activated outputs
        (logits). Its summary is printed before it is returned.
    """
    model_inputs = get_encoder_inputs()
    sequence_encoding = encoder_model(model_inputs)

    model_inputs['mask_positions'] = tf.keras.Input(name='mask_positions', ragged=False, dtype=tf.int32, shape=(PREDICTIONS_PER_SEQ,))

    # Predict an output token for each masked position. The head re-uses the
    # input token embedding matrix to project encoded vectors to vocabulary
    # logits, which has been shown to improve training efficiency.
    token_embedding_matrix = encoder_model.get_layer('url_host_id_embedding').embeddings
    lm_head = keras_nlp.layers.MaskedLMHead(
        embedding_weights=token_embedding_matrix,
        # No softmax here: the head returns raw logits.
        activation=None,
    )
    vocabulary_logits = lm_head(inputs=sequence_encoding, mask_positions=model_inputs['mask_positions'])

    pretraining_model = tf.keras.Model(model_inputs, vocabulary_logits, name='masked_model')
    pretraining_model.summary()
    return pretraining_model

def test():
    """Smoke test: build the masked pre-training model, plot it and run `predict` on one tiny batch.

    Relies on the module-level `pretrain_train` sample prepared earlier in the notebook.
    """
    encoder_num_layers = 4
    encoder_num_heads = 4
    encoder_dropout = 0.1
    encoder_url_host_id_emb_dim = 64

    encoder_model = create_encoder_model(
        num_layers=encoder_num_layers,
        num_heads=encoder_num_heads,
        dropout=encoder_dropout,
        url_host_id_emb_dim=encoder_url_host_id_emb_dim,
    )

    model = create_pretrain_model(encoder_model)
    tf.keras.utils.plot_model(model, to_file="pretrain_model.png", rankdir='LR')
    model.summary()

    # A single batch of two sequences is enough to check that the graph executes end to end.
    dataset = get_lm_masked_dataset(get_train_dataset(pretrain_train, batch_size=2)).take(1)
    print(model.predict(dataset))

test()

In [None]:
@logger
def train_pretrain_model(
    model_name,
    encoder_num_layers, encoder_num_heads, encoder_dropout, encoder_url_host_id_emb_dim,
    batch_size, learning_rate, epoch_max,
    data_rate=0.5):
    """Pre-train a freshly created encoder with the masked-token objective.

    Args:
        model_name: Name used to derive the CSV report and checkpoint paths.
        encoder_num_layers: Forwarded to `create_encoder_model` as `num_layers`.
        encoder_num_heads: Forwarded to `create_encoder_model` as `num_heads`.
        encoder_dropout: Forwarded to `create_encoder_model` as `dropout`.
        encoder_url_host_id_emb_dim: Forwarded to `create_encoder_model` as
            `url_host_id_emb_dim`.
        batch_size: Batch size of the training and validation datasets.
        learning_rate: Learning rate of the AdamW optimizer.
        epoch_max: Upper bound on the number of epochs (early stopping may end sooner).
        data_rate: Forwarded to `get_train_dataset`. Defaults to 0.5, the value
            that used to be hard-coded here, so existing positional callers
            behave as before while callers may now pass it explicitly.
    """
    print(f'train_pretrain_model: {model_name=} {batch_size=} {learning_rate=} {epoch_max=}')

    encoder_model = create_encoder_model(
        num_layers=encoder_num_layers,
        num_heads=encoder_num_heads,
        dropout=encoder_dropout,
        url_host_id_emb_dim=encoder_url_host_id_emb_dim,
    )

    model = create_pretrain_model(encoder_model)
    model.summary()

    pretrain_train, pretrain_valid = get_pretrain_train_valid_df()
    dataset_train = (
        get_lm_masked_dataset(get_train_dataset(pretrain_train, batch_size=batch_size, data_rate=data_rate))
        .shuffle(128)
        .prefetch(tf.data.AUTOTUNE)
    )
    # The validation set is cached after masking.
    dataset_valid = get_lm_masked_dataset(get_valid_dataset(pretrain_valid, batch_size=batch_size)).cache()

    model.compile(
        optimizer=tf.keras.optimizers.experimental.AdamW(learning_rate),
        # The masked-LM head is created with activation=None, hence from_logits=True.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        weighted_metrics=tf.keras.metrics.SparseCategoricalAccuracy(),
    )

    def get_callbacks(model_name):
        """CSV report, per-epoch weight checkpoints and early stopping on validation accuracy."""
        return [
            tf.keras.callbacks.CSVLogger(get_model_report_path(model_name)),
            tf.keras.callbacks.ModelCheckpoint(
                filepath=get_model_checkpoint_path(model_name),
                verbose=1,
                save_best_only=False,
                save_weights_only=True,
                save_freq='epoch'),
            tf.keras.callbacks.EarlyStopping(
                monitor='val_sparse_categorical_accuracy',
                mode='max',
                verbose=1,
                patience=2,
                restore_best_weights=True),
        ]

    model.fit(
        x=dataset_train,
        validation_data=dataset_valid,
        epochs=epoch_max,
        callbacks=get_callbacks(model_name)
    )


In [None]:
@logger
def train_pretrain_model_additional(
    model_name,
    encoder_num_layers, encoder_num_heads, encoder_dropout, encoder_url_host_id_emb_dim,
    checkpoint_path,
    batch_size, learning_rate, epoch_max,
    data_rate=0.5):
    """Continue masked-token pre-training from previously saved weights.

    Args:
        model_name: Name used to derive the CSV report and checkpoint paths of this run.
        encoder_num_layers: Forwarded to `create_encoder_model` as `num_layers`.
        encoder_num_heads: Forwarded to `create_encoder_model` as `num_heads`.
        encoder_dropout: Forwarded to `create_encoder_model` as `dropout`.
        encoder_url_host_id_emb_dim: Forwarded to `create_encoder_model` as
            `url_host_id_emb_dim`.
        checkpoint_path: Weights file loaded into the pre-training model before
            training; the architecture arguments must match the saved model.
        batch_size: Batch size of the training and validation datasets.
        learning_rate: Learning rate of the (freshly created) AdamW optimizer.
        epoch_max: Upper bound on the number of epochs (early stopping may end sooner).
        data_rate: Forwarded to `get_train_dataset`.
    """
    # The label now names this function (it used to print 'train_pretrain_model') and
    # includes the two arguments that distinguish a continuation run.
    print(f'train_pretrain_model_additional: {model_name=} {checkpoint_path=} {batch_size=} {learning_rate=} {epoch_max=} {data_rate=}')

    encoder_model = create_encoder_model(
        num_layers=encoder_num_layers,
        num_heads=encoder_num_heads,
        dropout=encoder_dropout,
        url_host_id_emb_dim=encoder_url_host_id_emb_dim,
    )

    model = create_pretrain_model(encoder_model)
    # Weights are restored before compile(); the optimizer below starts from a fresh state.
    model.load_weights(checkpoint_path)
    model.summary()

    pretrain_train, pretrain_valid = get_pretrain_train_valid_df()
    dataset_train = get_lm_masked_dataset(get_train_dataset(pretrain_train, batch_size=batch_size, data_rate=data_rate))
    dataset_valid = get_lm_masked_dataset(get_valid_dataset(pretrain_valid, batch_size=batch_size)).cache()

    model.compile(
        optimizer=tf.keras.optimizers.experimental.AdamW(learning_rate),
        # The masked-LM head is created with activation=None, hence from_logits=True.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        weighted_metrics=tf.keras.metrics.SparseCategoricalAccuracy(),
    )

    def get_callbacks(model_name):
        """CSV report, per-epoch weight checkpoints and early stopping on validation accuracy."""
        return [
            tf.keras.callbacks.CSVLogger(get_model_report_path(model_name)),
            tf.keras.callbacks.ModelCheckpoint(
                filepath=get_model_checkpoint_path(model_name),
                verbose=1,
                save_best_only=False,
                save_weights_only=True,
                save_freq='epoch'),
            tf.keras.callbacks.EarlyStopping(
                monitor='val_sparse_categorical_accuracy',
                mode='max',
                verbose=1,
                patience=2,
                restore_best_weights=True),
        ]

    model.fit(
        x=dataset_train,
        validation_data=dataset_valid,
        epochs=epoch_max,
        callbacks=get_callbacks(model_name)
    )


## Обучим masked модель на коротких последовательностях (половину данных удаляем для скорости)

In [None]:
import tensorflow as tf
import keras_nlp

from typing import Dict, Text

# Record the library versions used for this run.
for _lib_label, _lib in (('numpy', np), ('pandas', pd), ('polars', pl), ('tf', tf), ('keras_nlp', keras_nlp)):
    print(f'{_lib_label}={_lib.__version__}')

# Encoder architecture.
encoder_num_layers = 4
encoder_num_heads = 4
encoder_dropout = 0.1
encoder_url_host_id_emb_dim = 256

# Optimization settings.
learning_rate = 1e-4
batch_size = 32

model_name = f'uhf08_pretarain_url_{encoder_url_host_id_emb_dim}_tf_{encoder_num_layers}_{encoder_num_heads}_adam_{learning_rate}_b_{batch_size}_v11'
print(f'{model_name=}')

# NOTE(review): data_rate is passed explicitly here - make sure the signature of
# train_pretrain_model accepts a `data_rate` keyword.
train_pretrain_model(
    model_name=model_name,
    encoder_num_layers=encoder_num_layers,
    encoder_num_heads=encoder_num_heads,
    encoder_dropout=encoder_dropout,
    encoder_url_host_id_emb_dim=encoder_url_host_id_emb_dim,
    batch_size=batch_size,
    learning_rate=learning_rate,
    epoch_max=30,
    data_rate=0.5,
)

## Дообучим masked модель на полных последовательностях

In [None]:
import tensorflow as tf
import keras_nlp

from typing import Dict, Text

# Record the library versions used for this run.
print(f'numpy={np.__version__}')
print(f'pandas={pd.__version__}')
print(f'polars={pl.__version__}')
print(f'tf={tf.__version__}')
print(f'keras_nlp={keras_nlp.__version__}')

# Encoder architecture (must match the checkpoint that is being continued).
encoder_num_layers = 4
encoder_num_heads = 4
encoder_dropout = 0.1
encoder_url_host_id_emb_dim = 256

learning_rate = 1e-4
batch_size = 32

model_name = f'uhf08_pretarain_url_{encoder_url_host_id_emb_dim}_tf_{encoder_num_layers}_{encoder_num_heads}_adam_{learning_rate}_b_{batch_size}_v12'
print(f'{model_name=}')

# Name of the previous (v11) run whose checkpoint is continued. It has to be a string:
# written as a bare token it is not valid Python and the cell fails with a SyntaxError.
previous_model_name = 'uhf08_pretarain_url_256_tf_4_4_adam_0.0001_b_32_v11'

train_pretrain_model_additional(
    model_name=model_name,
    encoder_num_layers=encoder_num_layers, encoder_num_heads=encoder_num_heads, encoder_dropout=encoder_dropout, encoder_url_host_id_emb_dim=encoder_url_host_id_emb_dim,
    checkpoint_path=get_model_checkpoint_path_by_epoch(previous_model_name, 10),
    batch_size=batch_size, learning_rate=learning_rate, epoch_max=30, data_rate=1.0)

## Дообучим masked модель на полных последовательностях с меньшим learning_rate

In [None]:
#!g1.1
#pragma async

import tensorflow as tf
import keras_nlp

from typing import Dict, Text

# Record the library versions used for this run.
for _lib_label, _lib in (('numpy', np), ('pandas', pd), ('polars', pl), ('tf', tf), ('keras_nlp', keras_nlp)):
    print(f'{_lib_label}={_lib.__version__}')

# Encoder architecture (must match the checkpoint that is being continued).
encoder_num_layers = 4
encoder_num_heads = 4
encoder_dropout = 0.1
encoder_url_host_id_emb_dim = 256

# Lower learning rate than in the previous runs.
learning_rate = 3e-5
batch_size = 32

model_name = f'uhf08_pretarain_url_{encoder_url_host_id_emb_dim}_tf_{encoder_num_layers}_{encoder_num_heads}_adam_{learning_rate}_b_{batch_size}_v13'
print(f'{model_name=}')

# NOTE(review): the checkpoint is addressed by an absolute, machine-specific path instead of
# get_model_checkpoint_path_by_epoch as in the previous cell - confirm this is intended.
train_pretrain_model_additional(
    model_name=model_name,
    encoder_num_layers=encoder_num_layers,
    encoder_num_heads=encoder_num_heads,
    encoder_dropout=encoder_dropout,
    encoder_url_host_id_emb_dim=encoder_url_host_id_emb_dim,
    checkpoint_path='/home/jupyter/mnt/s3/mtsmlcup/models/uhf08_pretarain_url_256_tf_4_4_adam_0.0001_b_32_v12/11_checkpoint',
    batch_size=batch_size,
    learning_rate=learning_rate,
    epoch_max=30,
    data_rate=1.0,
)

# Строим модель на базе предобученного encoder

## Определяем логику обучения
* сначала с большим learning_rate, но заблокированным для обучения encoder
* продолжаем с малым learning_rate и разблокированным для обучения encoder

In [None]:
@logger
def train_full_model_from_pretrain(
    model_name, cv_fold_id,
    encoder_num_layers, encoder_num_heads, encoder_dropout, encoder_url_host_id_emb_dim,
    prepratin_model_name, prepratin_model_epoch,
    fc_hidden_units, male_fc_hidden_units, age_fc_hidden_units,fc_dropout_rate,
    learning_rate1, epoch_max1,
    learning_rate2, epoch_max2,
    batch_size, data_rate):
    """Train the multi-task model on one CV fold, starting from a pre-trained encoder.

    Training runs in two phases that share data, callbacks and loss setup:

    1. the encoder is frozen and the remaining layers are trained with
       `learning_rate1` for up to `epoch_max1` epochs;
    2. the encoder is unfrozen and the whole model is trained with
       `learning_rate2`, with epoch numbering continuing from `epoch_max1`
       up to `epoch_max1 + epoch_max2`.

    Args:
        model_name: Name used to derive the CSV report and checkpoint paths.
        cv_fold_id: Fold index passed to `cv_folds` to select train / validation targets.
        encoder_num_layers: Forwarded to `create_encoder_model` as `num_layers`.
        encoder_num_heads: Forwarded to `create_encoder_model` as `num_heads`.
        encoder_dropout: Forwarded to `create_encoder_model` as `dropout`.
        encoder_url_host_id_emb_dim: Forwarded to `create_encoder_model` as
            `url_host_id_emb_dim`.
        prepratin_model_name: Name of the pre-training run to load weights from.
        prepratin_model_epoch: Epoch of that run whose checkpoint is loaded.
        fc_hidden_units: Forwarded to `create_full_model`.
        male_fc_hidden_units: Forwarded to `create_full_model`.
        age_fc_hidden_units: Forwarded to `create_full_model`.
        fc_dropout_rate: Forwarded to `create_full_model`.
        learning_rate1: AdamW learning rate of phase 1.
        epoch_max1: Maximum number of epochs of phase 1.
        learning_rate2: AdamW learning rate of phase 2.
        epoch_max2: Maximum number of additional epochs of phase 2.
        batch_size: Batch size of the training and validation datasets.
        data_rate: Forwarded to `get_train_dataset`.
    """
    print(f'train_full_model_from_pretrain: {model_name=} {learning_rate1=} {epoch_max1=} {learning_rate2=} {epoch_max2=} {prepratin_model_name=} {prepratin_model_epoch=}')

    ###########################################################################
    # Helper functions

    def get_callbacks(model_name):
        """Fresh callbacks for one phase; the CSV report is appended so both phases share a file."""
        return [
            FinalMetricCallback(),
            tf.keras.callbacks.CSVLogger(get_model_report_path(model_name), append=True),
            tf.keras.callbacks.ModelCheckpoint(
                filepath=get_model_checkpoint_path(model_name),
                verbose=1,
                save_best_only=False,
                save_weights_only=True,
                save_freq='epoch'),
            tf.keras.callbacks.EarlyStopping(
                monitor='val_metric',
                mode='max',
                verbose=1,
                patience=3,
                restore_best_weights=True),
        ]

    def compile_model(learning_rate):
        """(Re)compile `model` with the shared loss / metric setup and the given learning rate."""
        model.compile(
            optimizer=tf.keras.optimizers.experimental.AdamW(learning_rate),
            loss_weights={
                'is_male': 1,
                'age_bins': 5,
                'age': 0.01,  # 0.05
            },
            loss={
                'is_male': tf.keras.losses.BinaryCrossentropy(),  # alternative: tf.keras.losses.BinaryFocalCrossentropy()
                'age_bins': get_categorical_focal_crossentropy_loss(gamma=2.0, alpha=0.25),
                'age': tf.keras.losses.Huber(),
            },
            metrics={
                'is_male': tf.keras.metrics.AUC(name='auc'),
                'age_bins': tfa.metrics.F1Score(average='weighted', num_classes=6, name='f1'),
                'age': tf.keras.metrics.RootMeanSquaredError(name='rmse')
            },
        )

    # Load the data
    dataset_train = get_train_dataset(get_data_by_target(cv_folds.get_train_target_fold(cv_fold_id)), batch_size=batch_size, data_rate=data_rate)
    dataset_valid = get_valid_dataset(get_data_by_target(cv_folds.get_valid_target_fold(cv_fold_id)), batch_size=batch_size)
    # Cache once and share between both phases instead of building a second cache for phase 2.
    dataset_valid_cached = dataset_valid.cache()

    ###########################################################################
    # Create the model on top of the pre-trained encoder

    encoder_model = create_encoder_model(
        num_layers=encoder_num_layers,
        num_heads=encoder_num_heads,
        dropout=encoder_dropout,
        url_host_id_emb_dim=encoder_url_host_id_emb_dim,
    )
    encoder_model.summary()

    # The checkpoint was saved from the pre-training model, so it is restored through a model
    # of that kind built around the same encoder instance that the full model uses below.
    pretrain_model = create_pretrain_model(encoder_model)
    checkpoint_path = get_model_checkpoint_path_by_epoch(prepratin_model_name, prepratin_model_epoch)
    pretrain_model.load_weights(checkpoint_path)

    model = create_full_model(
        encoder_model,
        fc_hidden_units,
        male_fc_hidden_units,
        age_fc_hidden_units,
        fc_dropout_rate,
    )

    ###########################################################################
    # Phase 1: train everything except encoder_model

    encoder_model.trainable = False
    model.summary()

    # compile() is called after changing `trainable` so that the change takes effect.
    compile_model(learning_rate1)

    model.fit(
        x=dataset_train,
        validation_data=dataset_valid_cached,
        epochs=epoch_max1,
        callbacks=get_callbacks(model_name)
    )

    ###########################################################################
    # Phase 2: train everything

    encoder_model.trainable = True
    model.summary()

    compile_model(learning_rate2)

    model.fit(
        x=dataset_train,
        validation_data=dataset_valid_cached,
        # Continue the epoch numbering of phase 1 (also used in checkpoint and report entries).
        initial_epoch=epoch_max1,
        epochs=epoch_max1+epoch_max2,
        callbacks=get_callbacks(model_name)
    )


## Обучаем 5 моделей (CV5)
Распределение данных очень нестабильное:
* Много элементов, которые были у очень небольшого количества данных
* Даты из разных интервалов, количество пользователей между датами сильно отличается

В результате модель, построенная по всем данным, проигрывает усреднённым предсказаниям 5 моделей, построенных по 80% данных (CV5).
Более того, усреднение нескольких моделей с одинаковой архитектурой, построенных по одним и тем же данным, тоже даёт прирост.
Также усреднение двух лучших epoch даёт результат лучше каждой из них.
Итоговый результат — усреднение предсказаний 5 моделей на 80% данных (CV5) × посчитано 5 раз × предсказания с двух лучших epoch (25, 26).

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
from typing import Dict, Text
import keras_nlp

# Record the library versions used for this run.
for _lib_label, _lib in (('numpy', np), ('pandas', pd), ('polars', pl), ('tf', tf), ('tfa', tfa), ('keras_nlp', keras_nlp)):
    print(f'{_lib_label}={_lib.__version__}')

# Encoder architecture (must match the pre-training checkpoint loaded below).
encoder_num_layers = 4
encoder_num_heads = 4
encoder_dropout = 0.1
encoder_url_host_id_emb_dim = 256

# Fully-connected heads.
fc_hidden_units = [256, 128]
male_fc_hidden_units = [128, 64]
age_fc_hidden_units = [128, 64]
fc_dropout_rate = 0.2

batch_size = 32

# Phase 1 (frozen encoder) and phase 2 (full model) learning rates.
learning_rate1 = 1e-4
learning_rate2 = 1e-5

# One model per cross-validation fold.
for cv_fold_id in range(5):
    model_name = f'uhf08_pretrained_url_{encoder_url_host_id_emb_dim}_tf_{encoder_num_layers}_{encoder_num_heads}_adamw_{learning_rate1}_{learning_rate2}_b_{batch_size}_v13_01_08_cv5_{cv_fold_id}'
    print(f'{model_name=}')

    train_full_model_from_pretrain(
        model_name=model_name,
        cv_fold_id=cv_fold_id,
        prepratin_model_name='uhf08_pretarain_url_256_tf_4_4_adam_3e-05_b_32_v13',
        prepratin_model_epoch=1,
        encoder_num_layers=encoder_num_layers,
        encoder_num_heads=encoder_num_heads,
        encoder_dropout=encoder_dropout,
        encoder_url_host_id_emb_dim=encoder_url_host_id_emb_dim,
        fc_hidden_units=fc_hidden_units,
        male_fc_hidden_units=male_fc_hidden_units,
        age_fc_hidden_units=age_fc_hidden_units,
        fc_dropout_rate=fc_dropout_rate,
        batch_size=batch_size,
        learning_rate1=learning_rate1,
        epoch_max1=20,
        learning_rate2=learning_rate2,
        epoch_max2=20,
        data_rate=1.0,  # earlier runs used 0.75
    )

## Предсказываем предобученной моделью

In [None]:
from collections import namedtuple
# Lightweight record identifying one saved checkpoint: the CV fold, the run name and the epoch.
# This notebook-level name is distinct from tf.keras.callbacks.ModelCheckpoint, which is always
# referenced through its fully qualified name in the training code.
ModelCheckpoint = namedtuple('ModelCheckpoint', ['cv_fold_id', 'model_name', 'epoch'])

# NOTE(review): this rebinds CONFIG_PREDICTIONS_PATH, which the configuration cell at the top of the
# notebook sets to 'predictions', to an absolute machine-specific path - confirm this is intended.
CONFIG_PREDICTIONS_PATH = '/home/jupyter/mnt/s3/mtsmlcup/predictions'

@logger
def predict_cv_model_list(model_checkpoint_list=None):
    """Predict the test set with every listed checkpoint and save each prediction.

    One model is built once and its weights are reloaded for each checkpoint.
    For every checkpoint two files are written into ``CONFIG_PREDICTIONS_PATH``:

    * ``<model_name>_epoch_<epoch>.csv`` with ``user_id``, ``is_male``, ``age``;
    * ``<model_name>_epoch_<epoch>_raw.parquet`` with the same columns plus the
      fact columns and ``age_pred`` (the full ``age_bins`` model output per user).

    Args:
        model_checkpoint_list: optional iterable of objects exposing ``model_name``
            and ``epoch`` (e.g. ``ModelCheckpoint``). When ``None`` the default
            ensemble is used: 5 CV folds x 3 runs (``01_08``, ``01_09``, ``01_10``)
            x epochs 24 and 25.

    Raises:
        AssertionError: if any predicted ``age`` class equals 0.
    """
    # Architecture hyperparameters; they have to match the ones the checkpoints were
    # trained with, otherwise load_weights cannot restore them.
    encoder_num_layers = 4
    encoder_num_heads = 4
    encoder_dropout = 0.1
    encoder_url_host_id_emb_dim = 256

    fc_hidden_units = [256, 128]
    male_fc_hidden_units = [128, 64]
    age_fc_hidden_units = [128, 64]
    fc_dropout_rate = 0.2

    # Batch size used for inference only.
    predict_batch_size = 128

    if model_checkpoint_list is None:
        # Same checkpoints, in the same order, as the former hand-written list:
        # fold -> run tag -> epoch.
        model_checkpoint_list = [
            ModelCheckpoint(
                cv_fold_id=cv_fold_id,
                model_name=f'uhf08_pretrained_url_256_tf_4_4_adamw_0.0001_1e-05_b_32_v13_{run_tag}_cv5_{cv_fold_id}',
                epoch=epoch,
            )
            for cv_fold_id in range(5)
            for run_tag in ('01_08', '01_09', '01_10')
            for epoch in (24, 25)
        ]

    # Create an empty model; the weights are loaded per checkpoint below.
    encoder_model = create_encoder_model(
        num_layers=encoder_num_layers,
        num_heads=encoder_num_heads,
        dropout=encoder_dropout,
        url_host_id_emb_dim=encoder_url_host_id_emb_dim,
    )
    encoder_model.summary()

    model = create_full_model(
        encoder_model,
        fc_hidden_units,
        male_fc_hidden_units,
        age_fc_hidden_units,
        fc_dropout_rate,
    )
    model.summary()

    # Load test data once; it is reused for every checkpoint.
    test_data = get_data_by_target(cv_folds.get_test_target())
    test_dataset = get_valid_dataset(test_data, batch_size=predict_batch_size)

    # The output folder does not depend on the checkpoint, so create it once up front.
    prediction_folder = Path(CONFIG_PREDICTIONS_PATH)
    os.makedirs(prediction_folder, exist_ok=True)

    for model_checkpoint in model_checkpoint_list:
        print(f'{model_checkpoint=}')
        model.load_weights(get_model_checkpoint_path_by_epoch(model_checkpoint.model_name, model_checkpoint.epoch))

        pred = model.predict(test_dataset)

        # Keep the full `age_bins` output per user so that several models can be
        # averaged before taking the argmax.
        print('get_embedding_column: age_bins -> age_pred')
        age_pred_column = pl.Series(name='age_pred', values=pred['age_bins'].tolist())

        prediction = test_data.select([
            pl.col('user_id'),
            pl.col('is_male').alias('is_male_fact'),
            pl.lit(pred['is_male'][:, 0]).alias('is_male'),
            pl.col('age_bins').alias('age_bins_fact'),
            age_pred_column,
            # Class index is shifted by one, so a valid `age` is never 0.
            pl.lit(tf.argmax(pred['age_bins'], axis=-1).numpy() + 1).alias('age'),
        ])

        print(f'predict_raw:is_male target distribution')
        print(prediction['is_male'].describe())

        print(f'predict_raw:age target distribution')
        print(prediction.groupby('age').agg(pl.count()).sort('age'))
        # `~any(...)` was always truthy (~True == -2, ~False == -1); use `not`.
        assert not any(prediction['age'] == 0), 'predicted age class 0 is not a valid label'

        prediction_path = prediction_folder / f'{model_checkpoint.model_name}_epoch_{model_checkpoint.epoch}.csv'
        prediction_raw_path = prediction_folder / f'{model_checkpoint.model_name}_epoch_{model_checkpoint.epoch}_raw.parquet'
        print(f'predict_raw:{prediction_path=}, {prediction_raw_path=}')

        prediction.select(pl.col(['user_id', 'is_male', 'age'])).write_csv(prediction_path)
        prediction.write_parquet(prediction_raw_path)
            
# Run inference for every checkpoint of the ensemble; writes one csv and one raw parquet per checkpoint.
predict_cv_model_list() 

## Усредняем предсказания нескольких моделей

In [None]:
import numpy as np

def save_prediction(prediction, name):
    """Validate a prediction frame and write it to ``CONFIG_PREDICTIONS_PATH``.

    Two files are written: ``<name>.csv`` with ``user_id``, ``is_male``, ``age`` and
    ``<name>_raw.parquet`` with the same columns plus ``age_pred``.

    Args:
        prediction: polars DataFrame containing at least the columns
            ``user_id``, ``is_male``, ``age`` and ``age_pred``.
        name: file name stem (without extension) for both output files.

    Raises:
        AssertionError: if any ``age`` value equals 0.
    """
    print(f'save_prediction:begin {name=}, {prediction.shape=}')

    prediction_folder = Path(CONFIG_PREDICTIONS_PATH)
    os.makedirs(prediction_folder, exist_ok=True)

    print(f'save_prediction:is_male target distribution')
    print(prediction['is_male'].describe())

    print(f'save_prediction:age target distribution')
    print(prediction.groupby('age').agg(pl.count()).sort('age'))
    # `~any(...)` was always truthy (~True == -2, ~False == -1), so the check could
    # never fail; `not any(...)` actually rejects an age class of 0.
    assert not any(prediction['age'] == 0), 'predicted age class 0 is not a valid label'

    prediction_path = prediction_folder / f'{name}.csv'
    prediction_raw_path = prediction_folder / f'{name}_raw.parquet'
    print(f'save_prediction:{prediction_path=}, {prediction_raw_path=}')

    prediction.select(pl.col(['user_id', 'is_male', 'age'])).write_csv(prediction_path)
    prediction.select(pl.col(['user_id', 'is_male', 'age', 'age_pred'])).write_parquet(prediction_raw_path)
    
def get_raw_predictions(file_name_list):
    """Read the named parquet files from ``CONFIG_PREDICTIONS_PATH`` and stack them.

    Each path is printed and must exist (checked with an assertion) before it is
    read. The frames are concatenated in the order of ``file_name_list``.
    """
    base_dir = Path(CONFIG_PREDICTIONS_PATH)

    def _load(file_name):
        # Resolve, log, verify and read a single prediction file.
        parquet_path = base_dir / file_name
        print(parquet_path)
        assert parquet_path.exists()
        return pl.read_parquet(parquet_path)

    return pl.concat([_load(file_name) for file_name in file_name_list])

# Raw prediction files to average: 3 runs x 2 epochs x 5 CV folds, in the same order
# as before (run -> epoch -> fold). Generated instead of six copy-pasted lines so
# the set of runs/epochs is edited in one place.
model_name_list = [
    f'uhf08_pretrained_url_256_tf_4_4_adamw_0.0001_1e-05_b_32_v13_{run_tag}_cv5_{cv_fold_id}_epoch_{epoch}_raw.parquet'
    for run_tag in ('01_08', '01_09', '01_10')
    for epoch in (24, 25)
    for cv_fold_id in range(5)
]

# One row per (file, user): all per-checkpoint predictions stacked vertically.
prediction = get_raw_predictions(model_name_list)

# Average per user: the scalar `is_male` directly, and `age_pred` element-wise
# across the collected per-checkpoint vectors.
prediction_mean = prediction.groupby('user_id').agg([
    pl.col('is_male').mean(),
    pl.col('age_pred')
]).with_columns([
    pl.col('age_pred').apply(lambda x: np.mean(np.vstack(np.array(x)), axis=0).tolist())
]).sort('user_id')

# Final age class = argmax of the averaged vector, shifted by one (same convention
# as the per-checkpoint predictions).
age_pred = np.vstack(prediction_mean['age_pred'].to_numpy()).argmax(axis=-1) + 1
prediction_mean = prediction_mean.with_columns(pl.lit(age_pred).alias('age'))
save_prediction(prediction_mean, 'v13_01_10_cv5_3_seeds')
prediction_mean