### Import libraries

In [1]:
import warnings

import numpy as np
import pandas as pd
import random

import torch
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

In [2]:
import sys
import os
import gc
# Environment switches for the numeric backends (presumably to limit threading — confirm).
# NOTE(review): numpy was already imported in the previous cell; if its BLAS backend
# reads these variables at load time they may have no effect here — confirm, or
# set them before the first numpy import.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ["USE_OPENMP"] = '1'

# Put the parent directory on sys.path so the project packages imported below resolve.
sys.path.append(os.path.abspath("../"))
from fe_modules.aggregates import get_agg_count, get_agg_sum, get_top_n_mode, \
                                  get_agg_mean, get_agg_max, get_agg_min, \
                                  get_agg_median, get_agg_std, get_agg_nunique, \
                                  get_price_of_all_cpes, get_ratio_part_of_day
from fe_modules.cyclical_features import generate_cyclical_features
from fe_modules.datetime_features import get_year, get_month, get_day, \
                                         get_timestamp, get_day_of_year, \
                                         get_day_of_week, get_holiday_name, \
                                         part_of_day_to_hour, add_hour_to_date, \
                                         get_relative_time, mean_first_visit, \
                                         mean_last_visit
from fe_modules.encoders import TargetEncoderWrapper
from fe_modules.feature_merges import add_cat_features
from fe_modules.geo_features import map_cities, geo_dist, dist_to_large_cities, \
                                    map_grid, MapGridTransformer
from fe_modules.missing import map_prices
from fe_modules.preprocessing import clean_os_type
from fe_modules.text_manipulation import get_domain
from fe_modules.time_lags import generate_time_lags
from fe_modules.recsys_features import ALSWrapper, RecVAEWrapper

from fe_modules.user_fe import UserFE

from modules.memory_utils import pandas_reduce_mem_usage

from seq2seq_modules.metrics import GENDER_METRIC
from fe_modules.encoders import CatBoostEncoderWrapper

In [3]:
def my_reset(*varnames):
    """
    Clear the interactive namespace while keeping selected variables.

    Runs IPython's ``%reset -f`` (no confirmation prompt, so the notebook can
    be executed non-interactively) and then restores the requested names.

    Parameters
    ----------
    *varnames : str
        Names of global variables to keep. ``my_reset`` itself is always kept.

    Raises
    ------
    KeyError
        If any requested name is not defined. Raised before anything is
        cleared, so a typo never wipes the namespace.
    """
    namespace = globals()
    missing = [name for name in varnames if name not in namespace]
    if missing:
        raise KeyError(f"cannot keep undefined variable(s): {missing}")

    to_save = {name: namespace[name] for name in varnames}
    to_save['my_reset'] = my_reset  # keep this helper available after the reset
    del namespace

    # `.magic()` is deprecated in IPython; run_line_magic is the supported API.
    get_ipython().run_line_magic("reset", "-f")
    globals().update(to_save)

In [4]:
# Single seed shared by every random number generator used in the notebook.
random_state = 69

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) for reproducibility.
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
torch.cuda.manual_seed_all(random_state)

### Read Data

In [5]:
# Directory (relative to the notebook) that holds the raw competition parquet files.
LOCAL_DATA = "../data"

In [14]:
# Read the raw event log and the training targets, passing each through the
# project's memory-reduction helper as it is loaded.
df = pandas_reduce_mem_usage(pd.read_parquet(f'{LOCAL_DATA}/competition_data_final_pqt'))
targets = pandas_reduce_mem_usage(pd.read_parquet(f'{LOCAL_DATA}/public_train.pqt'))

# Left join on user_id: every event row is kept, rows of unlabeled users get NaN targets.
df = pd.merge(df, targets, how="left", on="user_id")

df.head()

Memory usage of dataframe is 29562.33 MB


  0%|          | 0/12 [00:00<?, ?it/s]

Memory usage after optimization is: 26174.98 MB
Decreased by 11.5%
Memory usage of dataframe is 8.24 MB


  0%|          | 0/3 [00:00<?, ?it/s]

Memory usage after optimization is: 7.21 MB
Decreased by 12.5%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,age,is_male
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,,
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,,
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,,
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,,
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,,


In [15]:
# Normalise the gender target to float32, with NaN marking unknown labels.
# Missing values are filled with NaN and the "NA" string placeholder is mapped
# to NaN before the column is cast to float32.
df["is_male"] = df["is_male"].fillna(value=np.nan)
df.loc[df["is_male"] == "NA", "is_male"] = np.nan
df["is_male"] = df["is_male"].astype(np.float32)
df

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,age,is_male
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,,
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,,
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,,
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,,
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322899430,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,avatars.mds.yandex.net,smartphone,Android,16376.0,2021-07-12,morning,1,300964,57.0,0.0
322899431,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,googleads.g.doubleclick.net,smartphone,Android,16376.0,2021-06-20,evening,1,300964,57.0,0.0
322899432,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,online.sberbank.ru,smartphone,Android,16376.0,2021-08-05,day,1,300964,57.0,0.0
322899433,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,s0.2mdn.net,smartphone,Android,16376.0,2021-07-19,evening,1,300964,57.0,0.0


In [16]:
# Apply the project's OS-type cleaning helper (fe_modules.preprocessing) to the event frame.
df = clean_os_type(df)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,age,is_male
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,,
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,,
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,,
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,,
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,,


In [17]:
# Add the year / month / day columns, then run the memory-reduction helper.
df = (
    df.pipe(get_year)
      .pipe(get_month)
      .pipe(get_day)
      .pipe(pandas_reduce_mem_usage)
)
df.head()

Memory usage of dataframe is 39724.38 MB


  0%|          | 0/17 [00:00<?, ?it/s]

Memory usage after optimization is: 33565.56 MB
Decreased by 15.5%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,age,is_male,year,month,day
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,,,2022,6,15
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,,,2022,6,19
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,,,2022,6,12
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,,,2022,5,16
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,,,2022,5,30


In [18]:
# Add the day_of_year / day_of_week columns, then run the memory-reduction helper.
df = (
    df.pipe(get_day_of_year)
      .pipe(get_day_of_week)
      .pipe(pandas_reduce_mem_usage)
)
df.head()

Memory usage of dataframe is 38492.61 MB


  0%|          | 0/19 [00:00<?, ?it/s]

Memory usage after optimization is: 34489.38 MB
Decreased by 10.4%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,age,is_male,year,month,day,day_of_year,day_of_week
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,,,2022,6,15,166,2
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,,,2022,6,19,170,6
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,,,2022,6,12,163,6
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,,,2022,5,16,136,0
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,,,2022,5,30,150,0


In [19]:
# Add the `holiday` column, then run the memory-reduction helper.
df = df.pipe(get_holiday_name).pipe(pandas_reduce_mem_usage)
df.head()

Memory usage of dataframe is 36952.91 MB


  0%|          | 0/20 [00:00<?, ?it/s]

Memory usage after optimization is: 36952.91 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,age,is_male,year,month,day,day_of_year,day_of_week,holiday
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,,,2022,6,15,166,2,Не праздник
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,,,2022,6,19,170,6,Не праздник
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,,,2022,6,12,163,6,День России
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,,,2022,5,16,136,0,Не праздник
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,,,2022,5,30,150,0,Не праздник


In [20]:
# Translate each part_of_day label into an `hour` offset column
# (shown in the output as a timedelta such as "0 days 09:00:00").
df = part_of_day_to_hour(df)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,user_id,age,is_male,year,month,day,day_of_year,day_of_week,holiday,hour
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,...,45098,,,2022,6,15,166,2,Не праздник,0 days 09:00:00
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,...,45098,,,2022,6,19,170,6,Не праздник,0 days 09:00:00
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,...,45098,,,2022,6,12,163,6,День России,0 days 15:00:00
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,...,45098,,,2022,5,16,136,0,Не праздник,0 days 15:00:00
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,...,45098,,,2022,5,30,150,0,Не праздник,0 days 15:00:00


In [21]:
# Add a `datetime` column; per the output it is `date` plus the `hour` offset.
df = add_hour_to_date(df)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,age,is_male,year,month,day,day_of_year,day_of_week,holiday,hour,datetime
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,...,,,2022,6,15,166,2,Не праздник,0 days 09:00:00,2022-06-15 09:00:00
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,...,,,2022,6,19,170,6,Не праздник,0 days 09:00:00,2022-06-19 09:00:00
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,...,,,2022,6,12,163,6,День России,0 days 15:00:00,2022-06-12 15:00:00
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,...,,,2022,5,16,136,0,Не праздник,0 days 15:00:00,2022-05-16 15:00:00
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,...,,,2022,5,30,150,0,Не праздник,0 days 15:00:00,2022-05-30 15:00:00


In [22]:
# `date` and `hour` are no longer needed once `datetime` exists; drop them and
# trigger a garbage-collection pass.
df = df.drop(columns=["date", "hour"])
gc.collect()
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,age,is_male,year,month,day,day_of_year,day_of_week,holiday,datetime
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,morning,1,45098,,,2022,6,15,166,2,Не праздник,2022-06-15 09:00:00
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,morning,1,45098,,,2022,6,19,170,6,Не праздник,2022-06-19 09:00:00
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,day,1,45098,,,2022,6,12,163,6,День России,2022-06-12 15:00:00
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,day,1,45098,,,2022,5,16,136,0,Не праздник,2022-05-16 15:00:00
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,day,1,45098,,,2022,5,30,150,0,Не праздник,2022-05-30 15:00:00


In [23]:
# Add a numeric `timestamp` column (presumably derived from `datetime` — confirm in
# fe_modules.datetime_features) and run the memory reduction on that column only.
df = pandas_reduce_mem_usage(get_timestamp(df), columns=["timestamp"])
df

Memory usage of dataframe is 39416.43 MB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage after optimization is: 39416.43 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,age,is_male,year,month,day,day_of_year,day_of_week,holiday,datetime,timestamp
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,morning,1,...,,,2022,6,15,166,2,Не праздник,2022-06-15 09:00:00,165528360.0
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,morning,1,...,,,2022,6,19,170,6,Не праздник,2022-06-19 09:00:00,165562920.0
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,day,1,...,,,2022,6,12,163,6,День России,2022-06-12 15:00:00,165504600.0
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,day,1,...,,,2022,5,16,136,0,Не праздник,2022-05-16 15:00:00,165271320.0
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,day,1,...,,,2022,5,30,150,0,Не праздник,2022-05-30 15:00:00,165392280.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322899430,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,avatars.mds.yandex.net,smartphone,Android,16376.0,morning,1,...,57.0,0.0,2021,7,12,193,0,Не праздник,2021-07-12 09:00:00,162608040.0
322899431,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,googleads.g.doubleclick.net,smartphone,Android,16376.0,evening,1,...,57.0,0.0,2021,6,20,171,6,Не праздник,2021-06-20 21:00:00,162422280.0
322899432,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,online.sberbank.ru,smartphone,Android,16376.0,day,1,...,57.0,0.0,2021,8,5,217,3,Не праздник,2021-08-05 15:00:00,162817560.0
322899433,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,s0.2mdn.net,smartphone,Android,16376.0,evening,1,...,57.0,0.0,2021,7,19,200,0,Не праздник,2021-07-19 21:00:00,162672840.0


In [24]:
# Drop the `datetime` column now that `timestamp` has been added, then
# trigger a garbage-collection pass.
df = df.drop(columns=["datetime"])
gc.collect()
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,age,is_male,year,month,day,day_of_year,day_of_week,holiday,timestamp
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,morning,1,45098,,,2022,6,15,166,2,Не праздник,165528360.0
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,morning,1,45098,,,2022,6,19,170,6,Не праздник,165562920.0
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,day,1,45098,,,2022,6,12,163,6,День России,165504600.0
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,day,1,45098,,,2022,5,16,136,0,Не праздник,165271320.0
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,day,1,45098,,,2022,5,30,150,0,Не праздник,165392280.0


In [25]:
# Clear the kernel namespace, keeping only `df` (and my_reset itself).
my_reset("df")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [30]:
# Persist the stage-1 frame as gzip-compressed parquet.
df.to_parquet(
    "../sequence_catboost_data/stages/stage_1.parquet.gzip",
    compression="gzip",
)

In [5]:
# Reload the stage-1 checkpoint (the execution count shows this ran in a fresh kernel session).
df = pd.read_parquet("../sequence_catboost_data/stages/stage_1.parquet.gzip")
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,age,is_male,year,month,day,day_of_year,day_of_week,holiday,timestamp
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,morning,1,45098,,,2022,6,15,166,2,Не праздник,165528360.0
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,morning,1,45098,,,2022,6,19,170,6,Не праздник,165562920.0
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,day,1,45098,,,2022,6,12,163,6,День России,165504600.0
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,day,1,45098,,,2022,5,16,136,0,Не праздник,165271320.0
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,day,1,45098,,,2022,5,30,150,0,Не праздник,165392280.0


In [6]:
# Apply the project's price-mapping helper from fe_modules.missing
# (presumably fills missing `price` values — confirm against the helper).
df = map_prices(df)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,age,is_male,year,month,day,day_of_year,day_of_week,holiday,timestamp
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,morning,1,45098,,,2022,6,15,166,2,Не праздник,165528360.0
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,morning,1,45098,,,2022,6,19,170,6,Не праздник,165562920.0
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,day,1,45098,,,2022,6,12,163,6,День России,165504600.0
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,day,1,45098,,,2022,5,16,136,0,Не праздник,165271320.0
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,day,1,45098,,,2022,5,30,150,0,Не праздник,165392280.0


In [7]:
# Build the gender-task frame: remove the `age` target column and keep only
# rows that have a non-missing `is_male` label.
df = df.drop(columns=["age"]).dropna(subset=["is_male"])
df

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,is_male,year,month,day,day_of_year,day_of_week,holiday,timestamp
2272,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,ad.mail.ru,smartphone,Android,74259.0,night,1,79395,1.0,2021,9,26,269,6,Не праздник,163262520.0
2273,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,i.ytimg.com,smartphone,Android,74259.0,day,3,79395,1.0,2021,8,21,233,5,Не праздник,162955800.0
2274,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,node3.online.sberbank.ru,smartphone,Android,74259.0,day,3,79395,1.0,2021,8,20,232,4,Не праздник,162947160.0
2275,Калининградская область,Калининград,Samsung,Galaxy S20+,node3.online.sberbank.ru,smartphone,Android,74259.0,evening,2,79395,1.0,2021,9,9,252,3,Не праздник,163122120.0
2276,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,play.google.com,smartphone,Android,74259.0,day,1,79395,1.0,2021,8,30,242,0,Не праздник,163033560.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322899430,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,avatars.mds.yandex.net,smartphone,Android,16376.0,morning,1,300964,0.0,2021,7,12,193,0,Не праздник,162608040.0
322899431,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,googleads.g.doubleclick.net,smartphone,Android,16376.0,evening,1,300964,0.0,2021,6,20,171,6,Не праздник,162422280.0
322899432,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,online.sberbank.ru,smartphone,Android,16376.0,day,1,300964,0.0,2021,8,5,217,3,Не праздник,162817560.0
322899433,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,s0.2mdn.net,smartphone,Android,16376.0,evening,1,300964,0.0,2021,7,19,200,0,Не праздник,162672840.0


In [8]:
# Clear the kernel namespace, keeping only `df` (and my_reset itself).
my_reset("df")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [9]:
# Persist the labelled gender-task frame as the stage-2 checkpoint (gzip-compressed parquet).
df.to_parquet("../sequence_catboost_data/stages/stage_2_is_male.parquet.gzip",
              compression='gzip')

In [6]:
# Reload the stage-2 checkpoint under the task-specific name `df_is_male`.
df_is_male = pd.read_parquet("../sequence_catboost_data/stages/stage_2_is_male.parquet.gzip")
df_is_male.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,is_male,year,month,day,day_of_year,day_of_week,holiday,timestamp
2272,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,ad.mail.ru,smartphone,Android,74259.0,night,1,79395,1.0,2021,9,26,269,6,Не праздник,163262520.0
2273,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,i.ytimg.com,smartphone,Android,74259.0,day,3,79395,1.0,2021,8,21,233,5,Не праздник,162955800.0
2274,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,node3.online.sberbank.ru,smartphone,Android,74259.0,day,3,79395,1.0,2021,8,20,232,4,Не праздник,162947160.0
2275,Калининградская область,Калининград,Samsung,Galaxy S20+,node3.online.sberbank.ru,smartphone,Android,74259.0,evening,2,79395,1.0,2021,9,9,252,3,Не праздник,163122120.0
2276,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,play.google.com,smartphone,Android,74259.0,day,1,79395,1.0,2021,8,30,242,0,Не праздник,163033560.0


In [9]:
# Load the precomputed per-user feature table and keep only the columns used here.
# NOTE(review): FE_DATA already ends with "/", so the f-string below yields
# "../fe_data//USER_FE.parquet.gzip" (double slash) — confirm this is acceptable.
FE_DATA = "../fe_data/"
userfe = UserFE()
userfe.load(path=f"{FE_DATA}/USER_FE.parquet.gzip")
# NOTE(review): "day" is also a column of the event frame loaded from stage 2;
# merging both frames on user_id will suffix the clashing names.
userfe.df = userfe.df[["user_id", 
                       "evening", 
                       "day", 
                       "night", 
                       "morning", 
                       "timespan", 
                       "url_host_mode_0", 
                       "url_host_mode_1",
                       "als_emb_url_request_sum_53",
                       "als_emb_url_request_sum_27",
                       "als_emb_url_request_sum_15",
                       "als_emb_url_request_sum_17",
                       "als_emb_url_request_sum_81"]]

Memory usage of dataframe is 282.01 MB


  0%|          | 0/143 [00:00<?, ?it/s]

Memory usage after optimization is: 248.74 MB
Decreased by 11.8%


In [10]:
# Attach the per-user features to every event row (left join on user_id).
# Both frames carry a column named "day" (day of month on the event frame, a
# user-level column on userfe.df). With default suffixes the merge would
# silently produce day_x / day_y; explicit suffixes keep the event-level name
# unchanged and mark the user-level column as day_user instead.
df_is_male = df_is_male.merge(
    userfe.df,
    on="user_id",
    how="left",
    suffixes=("", "_user"),
)
df_is_male

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,night,morning,timespan,url_host_mode_0,url_host_mode_1,als_emb_url_request_sum_53,als_emb_url_request_sum_27,als_emb_url_request_sum_15,als_emb_url_request_sum_17,als_emb_url_request_sum_81
0,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,ad.mail.ru,smartphone,Android,74259.0,night,1,...,0.109480,0.211621,43200.0,i.ytimg.com,yandex.ru,0.337429,0.344958,0.111196,0.801757,-0.399860
1,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,i.ytimg.com,smartphone,Android,74259.0,day,3,...,0.109480,0.211621,43200.0,i.ytimg.com,yandex.ru,0.337429,0.344958,0.111196,0.801757,-0.399860
2,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,node3.online.sberbank.ru,smartphone,Android,74259.0,day,3,...,0.109480,0.211621,43200.0,i.ytimg.com,yandex.ru,0.337429,0.344958,0.111196,0.801757,-0.399860
3,Калининградская область,Калининград,Samsung,Galaxy S20+,node3.online.sberbank.ru,smartphone,Android,74259.0,evening,2,...,0.109480,0.211621,43200.0,i.ytimg.com,yandex.ru,0.337429,0.344958,0.111196,0.801757,-0.399860
4,Санкт-Петербург,Санкт-Петербург,Samsung,Galaxy S20+,play.google.com,smartphone,Android,74259.0,day,1,...,0.109480,0.211621,43200.0,i.ytimg.com,yandex.ru,0.337429,0.344958,0.111196,0.801757,-0.399860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206840061,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,avatars.mds.yandex.net,smartphone,Android,16376.0,morning,1,...,0.161725,0.292004,56808.0,yandex.ru,googleads.g.doubleclick.net,0.100901,0.393608,-0.193072,0.951323,-0.193426
206840062,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,googleads.g.doubleclick.net,smartphone,Android,16376.0,evening,1,...,0.161725,0.292004,56808.0,yandex.ru,googleads.g.doubleclick.net,0.100901,0.393608,-0.193072,0.951323,-0.193426
206840063,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,online.sberbank.ru,smartphone,Android,16376.0,day,1,...,0.161725,0.292004,56808.0,yandex.ru,googleads.g.doubleclick.net,0.100901,0.393608,-0.193072,0.951323,-0.193426
206840064,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,s0.2mdn.net,smartphone,Android,16376.0,evening,1,...,0.161725,0.292004,56808.0,yandex.ru,googleads.g.doubleclick.net,0.100901,0.393608,-0.193072,0.951323,-0.193426


In [11]:
# Categorical features are the columns stored with object dtype.
cat_features = df_is_male.columns[df_is_male.dtypes == object].tolist()

In [12]:
# Project wrapper around a CatBoost-style encoder for the object-dtype columns;
# sort_col / n_folds semantics are defined in fe_modules.encoders.
cbe = CatBoostEncoderWrapper(cat_features=cat_features, sort_col="timestamp", n_folds=10)

In [None]:
# Fit the encoder against the is_male target and transform the frame
# (presumably replacing the categorical columns with numeric encodings — confirm).
df_is_male = cbe.fit_transform(df_is_male, target_col="is_male")

0it [00:00, ?it/s]

In [None]:
# Save the fitted encoder (arguments are presumably output directory and artifact name — confirm).
cbe.save("../model_zoo", "is_male_cbe_seq_catboost")

In [None]:
# Clear the kernel namespace, keeping only `df_is_male` (and my_reset itself).
my_reset("df_is_male")

In [None]:
# Persist the encoded frame as the stage-3 checkpoint (gzip-compressed parquet).
df_is_male.to_parquet(
    "../sequence_catboost_data/stages/stage_3_is_male.parquet.gzip",
    compression="gzip",
)

In [5]:
# Reload the stage-3 checkpoint and pass it through the memory-reduction helper.
# NOTE(review): `timestamp` values are around 1.6e8, beyond the range in which
# float32 represents integers exactly; if the helper downcasts this column to
# float32 the values are rounded — confirm that is acceptable for the lag features.
df_is_male = pandas_reduce_mem_usage(pd.read_parquet("../sequence_catboost_data/stages/stage_3_is_male.parquet.gzip"))
df_is_male.head()

Memory usage of dataframe is 22092.90 MB


  0%|          | 0/19 [00:00<?, ?it/s]

Memory usage after optimization is: 13413.55 MB
Decreased by 39.3%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday,timestamp,is_male
72036835,0.492813,0.475856,0.523871,0.527727,0.515649,0.520669,0.534969,14990.0,0.559003,1,88235,2021,6,16,167,2,0.520504,162381248.0,0.0
269481717,0.460119,0.468477,0.505041,0.491774,0.508047,0.520669,0.534969,17985.0,0.559003,1,160508,2021,6,16,167,2,0.520504,162381248.0,0.0
83105451,0.575031,0.585275,0.497987,0.38863,0.522636,0.520669,0.497987,45328.0,0.559003,4,113128,2021,6,16,167,2,0.520504,162381248.0,0.0
210293403,0.515135,0.533519,0.505041,0.470685,0.485715,0.520669,0.534969,28788.0,0.559003,3,66732,2021,6,16,167,2,0.520504,162381248.0,1.0
269479580,0.522969,0.522969,0.505041,0.44339,0.544725,0.520669,0.534969,13190.0,0.559003,2,168799,2021,6,16,167,2,0.520504,162381248.0,1.0


In [6]:
# Add lag_url_host_1..10 with the project helper (lags presumably ordered by
# `timestamp` — confirm in fe_modules.time_lags).
# fillna(-1) runs on the whole returned frame, so every NaN — not only the
# missing lags — becomes -1.
df_is_male = generate_time_lags(df_is_male, 
                                shift_column="url_host", 
                                time_column="timestamp", 
                                n_lags=10).fillna(-1)
df_is_male.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,lag_url_host_1,lag_url_host_2,lag_url_host_3,lag_url_host_4,lag_url_host_5,lag_url_host_6,lag_url_host_7,lag_url_host_8,lag_url_host_9,lag_url_host_10
179123993,0.579224,0.579224,0.50507,0.437066,0.485884,0.520642,0.534937,2990.0,0.510756,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
179124074,0.579238,0.579238,0.505052,0.437486,0.489935,0.520676,0.534976,2990.0,0.510796,1,...,0.485884,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
179124075,0.579238,0.579238,0.505041,0.437431,0.522636,0.520669,0.534969,2990.0,0.513743,1,...,0.489935,0.485884,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
179123969,0.579292,0.579292,0.505097,0.436958,0.522641,0.520681,0.534986,2990.0,0.559007,1,...,0.522636,0.489935,0.485884,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
179124057,0.579241,0.579241,0.505036,0.437167,0.511697,0.520663,0.534955,2990.0,0.525651,1,...,0.522641,0.522636,0.489935,0.485884,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [7]:
# Add lag_timestamp_1..5; remaining NaNs anywhere in the returned frame are filled with 0.
df_is_male = generate_time_lags(df_is_male, 
                                shift_column="timestamp", 
                                time_column="timestamp", 
                                n_lags=5).fillna(0)

In [8]:
# Add lag_city_name_1; remaining NaNs anywhere in the returned frame are filled with 0.
df_is_male = generate_time_lags(df_is_male, 
                                shift_column="city_name", 
                                time_column="timestamp", 
                                n_lags=1).fillna(0)

In [9]:
# Clear the kernel namespace, keeping only `df_is_male` (and my_reset itself).
my_reset("df_is_male")

Once deleted, variables cannot be recovered. Proceed (y/[n])? н
Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [10]:
# Persist the frame with lag features as the stage-4 checkpoint (gzip-compressed parquet).
df_is_male.to_parquet(
    "../sequence_catboost_data/stages/stage_4_is_male.parquet.gzip",
    compression="gzip",
)

# CatBoost

In [1]:
import warnings

import numpy as np
import pandas as pd
import random
import torch
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

import sys
import os
import gc
# Environment switches for the numeric backends (presumably to limit threading — confirm).
# NOTE(review): numpy, pandas and torch are imported earlier in this cell; if the
# BLAS backend reads these variables at load time they may have no effect here —
# confirm, or set them before those imports.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ["USE_OPENMP"] = '1'

from sklearn.metrics import roc_auc_score

sys.path.append(os.path.abspath("../"))
from fe_modules.aggregates import get_agg_count, get_agg_sum, get_top_n_mode, \
                                  get_agg_mean, get_agg_max, get_agg_min, \
                                  get_agg_median, get_agg_std, get_agg_nunique, \
                                  get_price_of_all_cpes, get_ratio_part_of_day
from fe_modules.cyclical_features import generate_cyclical_features
from fe_modules.datetime_features import get_year, get_month, get_day, \
                                         get_timestamp, get_day_of_year, \
                                         get_day_of_week, get_holiday_name, \
                                         part_of_day_to_hour, add_hour_to_date, \
                                         get_relative_time, mean_first_visit, \
                                         mean_last_visit
from fe_modules.encoders import TargetEncoderWrapper
from fe_modules.feature_merges import add_cat_features
from fe_modules.geo_features import map_cities, geo_dist, dist_to_large_cities, \
                                    map_grid, MapGridTransformer
from fe_modules.missing import map_prices
from fe_modules.preprocessing import clean_os_type
from fe_modules.text_manipulation import get_domain
from fe_modules.time_lags import generate_time_lags
from fe_modules.recsys_features import ALSWrapper, RecVAEWrapper

from fe_modules.user_fe import UserFE

from modules.memory_utils import pandas_reduce_mem_usage

from seq2seq_modules.metrics import GENDER_METRIC


def my_reset(*varnames):
    """
    Clear the interactive namespace while keeping selected variables.

    Runs IPython's ``%reset -f`` (no confirmation prompt, so the notebook can
    be executed non-interactively) and then restores the requested names.

    Parameters
    ----------
    *varnames : str
        Names of global variables to keep. ``my_reset`` itself is always kept.

    Raises
    ------
    KeyError
        If any requested name is not defined. Raised before anything is
        cleared, so a typo never wipes the namespace.
    """
    namespace = globals()
    missing = [name for name in varnames if name not in namespace]
    if missing:
        raise KeyError(f"cannot keep undefined variable(s): {missing}")

    to_save = {name: namespace[name] for name in varnames}
    to_save['my_reset'] = my_reset  # keep this helper available after the reset
    del namespace

    # `.magic()` is deprecated in IPython; run_line_magic is the supported API.
    get_ipython().run_line_magic("reset", "-f")
    globals().update(to_save)
    

# Single seed shared by every random number generator used in the notebook.
random_state = 69

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) for reproducibility.
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
torch.cuda.manual_seed_all(random_state)

In [2]:
# Modelling imports for the CatBoost stage.
# pandas and numpy were already imported in the previous cell; re-importing them is harmless.
import pandas as pd
import numpy as np
from catboost import Pool
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from catboost.utils import get_gpu_device_count

from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
from sklearn.inspection import permutation_importance

# NOTE(review): this rebinds GENDER_METRIC, which the previous cell imported from
# seq2seq_modules.metrics — confirm the catboost_modules version is the intended one.
from catboost_modules.metrics import GENDER_METRIC
from seq2seq_modules.utils import numpy_age_bucket

In [3]:
# Load the stage-4 feature frame and pass it through the memory-reduction helper.
df_is_male = pandas_reduce_mem_usage(
    pd.read_parquet("../sequence_catboost_data/stages/stage_4_is_male.parquet.gzip")
)
df_is_male

Memory usage of dataframe is 26038.06 MB


  0%|          | 0/35 [00:00<?, ?it/s]

Memory usage after optimization is: 26038.06 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,lag_url_host_7,lag_url_host_8,lag_url_host_9,lag_url_host_10,lag_timestamp_1,lag_timestamp_2,lag_timestamp_3,lag_timestamp_4,lag_timestamp_5,lag_city_name_1
179123993,0.579224,0.579224,0.505070,0.437066,0.485884,0.520642,0.534937,2990.0,0.510756,1,...,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.000000
179124074,0.579238,0.579238,0.505052,0.437486,0.489935,0.520676,0.534976,2990.0,0.510796,1,...,-1.0,-1.0,-1.0,-1.0,162385568.0,0.0,0.0,0.0,0.0,0.579224
179124075,0.579238,0.579238,0.505041,0.437431,0.522636,0.520669,0.534969,2990.0,0.513743,1,...,-1.0,-1.0,-1.0,-1.0,162385568.0,162385568.0,0.0,0.0,0.0,0.579238
179123969,0.579292,0.579292,0.505097,0.436958,0.522641,0.520681,0.534986,2990.0,0.559007,1,...,-1.0,-1.0,-1.0,-1.0,162387712.0,162385568.0,162385568.0,0.0,0.0,0.579238
179124057,0.579241,0.579241,0.505036,0.437167,0.511697,0.520663,0.534955,2990.0,0.525651,1,...,-1.0,-1.0,-1.0,-1.0,162407168.0,162387712.0,162385568.0,162385568.0,0.0,0.579292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219235306,0.479846,0.502942,0.586680,0.592065,0.522729,0.520675,0.534980,10054.0,0.525675,1,...,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.000000
277230213,0.515381,0.483720,0.523893,0.549565,0.528831,0.520676,0.534976,16077.0,0.513726,1,...,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.000000
316271590,0.395068,0.401789,0.523858,0.489358,0.448803,0.520669,0.534974,7990.0,0.513676,1,...,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.000000
258443456,0.535242,0.581704,0.523790,0.489245,0.522569,0.520642,0.534937,7947.0,0.510756,1,...,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.000000


#### Add seq2seq features

In [4]:
def coss_val_score(model, df, target_feature, metric, n_folds=5, random_state=42, calculate_importance=True):
    """
    Cross-validate `model` on `df` using stratified folds grouped by `user_id`.

    Every fold fits `model` on the training rows (the validation rows are passed
    as `eval_set`), scores the validation rows and stores an independent copy of
    the fitted model.

    Parameters
    ----------
    model : estimator
        Must accept `fit(X, y, eval_set=..., use_best_model=True)` and expose
        `predict_proba`. It is re-fitted in place on every fold, so after the
        call it holds the state of the last fold.
    df : pandas.DataFrame
        Must contain `user_id`, `target_feature` and the feature columns.
    target_feature : str
        Either "age" or "is_male".
    metric : callable
        Called as `metric(predicted_probabilities, y_val)`; used only when
        `target_feature == "age"`. The "is_male" score is always
        `2 * ROC AUC - 1` computed on per-user averaged probabilities.
    n_folds : int
        Number of folds.
    random_state : int
        Seed for the fold shuffling.
    calculate_importance : bool
        When True, the fitted model's `get_feature_importance()` is collected on
        every fold and summarised as mean / std across folds. When False, the
        importance columns stay at zero.

    Returns
    -------
    tuple
        `(models, mean_score, importances, feature_columns)` where `models` is
        the list of per-fold fitted model copies, `mean_score` is the sum of
        fold scores divided by `n_folds`, `importances` is a DataFrame with the
        columns "Feature Id", "Importance", "std" and "good", and
        `feature_columns` is the Index of feature column names.

    Raises
    ------
    ValueError
        If `target_feature` is neither "age" nor "is_male".
    """
    import copy

    if target_feature not in ("age", "is_male"):
        raise ValueError(
            f"Unsupported target_feature {target_feature!r}: expected 'age' or 'is_male'"
        )

    users = df['user_id']
    target = df[target_feature]

    # Select features by column position instead of calling df.drop() inside the
    # loop: drop() copies the whole frame (twice per fold in the old code),
    # whereas iloc[rows, cols] materialises only the rows and columns needed.
    feature_positions = [
        position
        for position, column in enumerate(df.columns)
        if column not in (target_feature, 'user_id')
    ]
    feature_columns = df.columns[feature_positions]

    scores = []
    models = []
    fold_importances = []

    skf = StratifiedGroupKFold(n_splits=n_folds, random_state=random_state, shuffle=True)
    for i, (train_index, val_index) in enumerate(skf.split(df, target, users)):
        X_train = df.iloc[train_index, feature_positions]
        X_val = df.iloc[val_index, feature_positions]
        y_train, y_val = target.iloc[train_index], target.iloc[val_index]

        model.fit(X_train,
                  y_train,
                  eval_set=(X_val, y_val),
                  use_best_model=True)
        # Store a snapshot: appending `model` itself would leave the list with
        # n_folds references to the same object, i.e. the last fold's model.
        models.append(copy.deepcopy(model))

        if target_feature == "age":
            preds = model.predict_proba(X_val)
            score = metric(preds, y_val)
        else:  # target_feature == "is_male"
            preds = pd.DataFrame({
                "user_id": users.iloc[val_index],
                "proba": model.predict_proba(X_val)[:, 1],
                target_feature: y_val,
            })

            # Collapse row-level predictions to one row per user. Taking the
            # first mode keeps the label scalar even if a user's labels tie.
            final_preds = preds.groupby("user_id").agg({
                "proba": "mean",
                target_feature: lambda labels: labels.mode().iloc[0],
            })
            final_preds[target_feature] = final_preds[target_feature].astype(int)

            # Normalised Gini coefficient: 2 * ROC AUC - 1.
            score = 2 * roc_auc_score(
                final_preds[target_feature].to_numpy(),
                final_preds["proba"].to_numpy()
            ) - 1

        scores.append(score)
        print(f"Fold {i}: {score}")

        if calculate_importance:
            fold_importances.append(
                np.asarray(model.get_feature_importance(), dtype=float)
            )

    n_features = len(feature_columns)
    importances = pd.DataFrame({"Feature Id": feature_columns,
                                "Importance": np.zeros(n_features),
                                "std": np.zeros(n_features)})
    if fold_importances:
        stacked = np.vstack(fold_importances)
        importances["Importance"] = stacked.mean(axis=0)
        importances["std"] = stacked.std(axis=0)
    # A feature is "good" when its mean importance stays positive after
    # subtracting two standard deviations.
    importances["good"] = importances["Importance"] - 2 * importances["std"] > 0

    return (models, (sum(scores) / n_folds), importances, feature_columns)

In [5]:
# CatBoost hyper-parameters for the gender (`is_male`) classifier.
# NOTE(review): early_stopping_rounds (100) is larger than iterations (50), so
# early stopping presumably cannot trigger within this budget - confirm the
# intended values.
params = dict(
    loss_function="CrossEntropy",
    eval_metric="NormalizedGini",
    verbose=1,
    iterations=50,
    random_state=42,
    # Train on GPU when get_gpu_device_count() returns a non-zero count.
    task_type="GPU" if get_gpu_device_count() else "CPU",
    early_stopping_rounds=100,
    use_best_model=True,
    bootstrap_type='Poisson',
    subsample=0.3,
)

model_is_male = CatBoostClassifier(**params)

In [None]:
# Cross-validate the gender model on df_is_male. Feature-importance calculation
# is switched off for this run via calculate_importance=False.
is_male_models, scores, is_male_importances, features = coss_val_score(
    model=model_is_male, 
    df=df_is_male, 
    target_feature="is_male", 
    metric=GENDER_METRIC, 
    calculate_importance=False
)

# `scores` is the second element returned by coss_val_score
# (sum of the per-fold scores divided by n_folds).
print(scores)

Default metric period is 5 because NormalizedGini is/are not implemented for GPU
Metric NormalizedGini is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.2216762	best: 0.2216762 (0)	total: 4.43s	remaining: 3m 36s
1:	total: 8.7s	remaining: 3m 28s
2:	total: 13s	remaining: 3m 23s
3:	total: 17.4s	remaining: 3m 19s
4:	total: 21.8s	remaining: 3m 15s
5:	test: 0.3150001	best: 0.3150001 (5)	total: 26.1s	remaining: 3m 11s
6:	total: 30.5s	remaining: 3m 7s
7:	total: 34.9s	remaining: 3m 3s
8:	total: 39.3s	remaining: 2m 59s
9:	total: 43.7s	remaining: 2m 54s
10:	test: 0.3346159	best: 0.3346159 (10)	total: 48.1s	remaining: 2m 50s
11:	total: 52.5s	remaining: 2m 46s
12:	total: 56.9s	remaining: 2m 41s
13:	total: 1m 1s	remaining: 2m 37s
14:	total: 1m 5s	remaining: 2m 33s
15:	test: 0.3506085	best: 0.3506085 (15)	total: 1m 10s	remaining: 2m 29s
16:	total: 1m 14s	remaining: 2m 24s
17:	total: 1m 19s	remaining: 2m 20s
18:	total: 1m 23s	remaining: 2m 16s
19:	total: 1m 27s	remaining: 2m 11s
20:	test: 0.3636302	best: 0.3636302 (20)	total: 1m 32s	remaining: 2m 7s
21:	total: 1m 36s	remaining: 2m 3s
22:	total: 1m 41s	remaining: 1m 58s
23:	total: 1m 45s	rem

Default metric period is 5 because NormalizedGini is/are not implemented for GPU
Metric NormalizedGini is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.2298637	best: 0.2298637 (0)	total: 4.52s	remaining: 3m 41s
1:	total: 8.8s	remaining: 3m 31s
2:	total: 13.1s	remaining: 3m 25s
3:	total: 17.4s	remaining: 3m 20s
4:	total: 21.8s	remaining: 3m 16s
5:	test: 0.3219957	best: 0.3219957 (5)	total: 26.2s	remaining: 3m 11s
6:	total: 30.5s	remaining: 3m 7s
7:	total: 34.9s	remaining: 3m 3s
8:	total: 39.3s	remaining: 2m 58s
9:	total: 43.7s	remaining: 2m 54s
10:	test: 0.3403721	best: 0.3403721 (10)	total: 48.1s	remaining: 2m 50s
11:	total: 52.5s	remaining: 2m 46s
12:	total: 56.9s	remaining: 2m 42s
13:	total: 1m 1s	remaining: 2m 37s
14:	total: 1m 5s	remaining: 2m 33s
15:	test: 0.3568167	best: 0.3568167 (15)	total: 1m 10s	remaining: 2m 29s
16:	total: 1m 14s	remaining: 2m 24s
17:	total: 1m 19s	remaining: 2m 20s
18:	total: 1m 23s	remaining: 2m 16s
19:	total: 1m 27s	remaining: 2m 11s
20:	test: 0.3700080	best: 0.3700080 (20)	total: 1m 32s	remaining: 2m 7s
21:	total: 1m 36s	remaining: 2m 3s
22:	total: 1m 41s	remaining: 1m 58s
23:	total: 1m 45s	r

# Experiments

* 0.24 Фичи начальные
* 0.40 5 лагов url host