In [1]:
import gc
import sys
import os
import warnings

# Register the three local source directories on the import path. Each relative
# name is resolved against the current working directory by os.path.abspath.
sys.path.extend(
    os.path.abspath(package_dir)
    for package_dir in ("modules/", "fe_modules/", "seq2seq_modules/")
)

# Limit OpenBLAS to one thread. This runs before the numpy/implicit imports in the
# next cell (presumably required by `implicit` — confirm).
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

In [3]:
import importlib

import modules
import fe_modules

# Re-execute the two local packages so that edits made on disk after the first
# import are picked up without restarting the kernel.
# NOTE(review): importlib.reload() re-runs only the module object it is given;
# submodules already cached in sys.modules (e.g. modules.memory_utils) are
# presumably not refreshed by this — confirm, or reload the submodules directly.
importlib.reload(modules)
importlib.reload(fe_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat, my_reset
from fe_modules.text_manipulation import get_domain
from fe_modules.datetime import get_timestamp, get_relative_time, part_of_day_to_hour, add_hour_to_date

# Load data

In [4]:
# Root directory of the input parquet data. It is used as an f-string prefix when
# building file paths, so the trailing slash is significant.
LOCAL_DATA_PATH = './data/'
# Seed constant; not referenced in the cells shown here — presumably consumed by
# a later train/test split (TODO confirm).
SPLIT_SEED = 42

In [5]:
# Read the competition event log and pass it straight through the project's
# memory-reduction helper. The path ends in '/', presumably a directory of
# parquet part files.
events_path = f'{LOCAL_DATA_PATH}competition_data_final_pqt/'
df = pandas_reduce_mem_usage(pd.read_parquet(events_path))
df

Memory usage of dataframe is 29562.33 MB


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:03<00:00,  3.28it/s]

Memory usage after optimization is: 24943.21 MB
Decreased by 15.6%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098
...,...,...,...,...,...,...,...,...,...,...,...,...
322899430,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,avatars.mds.yandex.net,smartphone,Android,16376.0,2021-07-12,morning,1,300964
322899431,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,googleads.g.doubleclick.net,smartphone,Android,16376.0,2021-06-20,evening,1,300964
322899432,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,online.sberbank.ru,smartphone,Android,16376.0,2021-08-05,day,1,300964
322899433,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,s0.2mdn.net,smartphone,Android,16376.0,2021-07-19,evening,1,300964


In [6]:
# Read the training labels (the displayed frame has columns age, is_male and
# user_id) and run them through the same memory-reduction helper.
target_path = f'{LOCAL_DATA_PATH}public_train.pqt'
target = pandas_reduce_mem_usage(pd.read_parquet(target_path))
target

Memory usage of dataframe is 8.24 MB


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 871.03it/s]

Memory usage after optimization is: 5.66 MB
Decreased by 31.2%





Unnamed: 0,age,is_male,user_id
350459,31.0,1,350459
188276,35.0,1,188276
99002,41.0,0,99002
155506,33.0,0,155506
213873,54.0,0,213873
...,...,...,...
225374,49.0,1,225374
25776,22.0,1,25776
148131,28.0,0,148131
205570,28.0,1,205570


# Feature engineering

In [7]:
# Region, city and device descriptor columns handed to pandas_string_to_cat.
# In the frame displayed after this cell they appear as integer codes.
device_geo_columns = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "cpe_type_cd",
    "cpe_model_os_type",
]

# Rebinds `df`, so re-running this cell feeds the already-encoded frame back in.
df = pandas_string_to_cat(df, device_geo_columns)
df

Memory usage of dataframe is 24943.21 MB
Memory usage of dataframe is 24.36 GB


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [01:38<00:00, 16.40s/it]

Memory usage after optimization is: 12625.58 MB
Decreased by 49.4%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,21,409,1,589,ad.adriver.ru,2,2,20368.0,2022-06-15,morning,1,45098
1,21,409,1,589,apple.com,2,2,20368.0,2022-06-19,morning,1,45098
2,21,409,1,589,avatars.mds.yandex.net,2,2,20368.0,2022-06-12,day,1,45098
3,21,409,1,589,googleads.g.doubleclick.net,2,2,20368.0,2022-05-16,day,1,45098
4,21,409,1,589,googleads.g.doubleclick.net,2,2,20368.0,2022-05-30,day,1,45098
...,...,...,...,...,...,...,...,...,...,...,...,...
322899430,15,616,27,124,avatars.mds.yandex.net,2,0,16376.0,2021-07-12,morning,1,300964
322899431,15,616,27,124,googleads.g.doubleclick.net,2,0,16376.0,2021-06-20,evening,1,300964
322899432,15,616,27,124,online.sberbank.ru,2,0,16376.0,2021-08-05,day,1,300964
322899433,15,616,27,124,s0.2mdn.net,2,0,16376.0,2021-07-19,evening,1,300964


In [8]:
# Apply the project's get_domain helper (fe_modules.text_manipulation). In the
# frame displayed below, the result carries an extra `domain` column holding
# what looks like the last dot-separated label of url_host (ru, com, net, ...).
df = get_domain(df)
df

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,domain
0,21,409,1,589,ad.adriver.ru,2,2,20368.0,2022-06-15,morning,1,45098,ru
1,21,409,1,589,apple.com,2,2,20368.0,2022-06-19,morning,1,45098,com
2,21,409,1,589,avatars.mds.yandex.net,2,2,20368.0,2022-06-12,day,1,45098,net
3,21,409,1,589,googleads.g.doubleclick.net,2,2,20368.0,2022-05-16,day,1,45098,net
4,21,409,1,589,googleads.g.doubleclick.net,2,2,20368.0,2022-05-30,day,1,45098,net
...,...,...,...,...,...,...,...,...,...,...,...,...,...
322899430,15,616,27,124,avatars.mds.yandex.net,2,0,16376.0,2021-07-12,morning,1,300964,net
322899431,15,616,27,124,googleads.g.doubleclick.net,2,0,16376.0,2021-06-20,evening,1,300964,net
322899432,15,616,27,124,online.sberbank.ru,2,0,16376.0,2021-08-05,day,1,300964,ru
322899433,15,616,27,124,s0.2mdn.net,2,0,16376.0,2021-07-19,evening,1,300964,net


In [11]:
# The two URL-derived text columns handed to pandas_string_to_cat. In the frame
# displayed after this cell they appear as integer codes.
url_columns = ["domain", "url_host"]

# Rebinds `df`; requires the `domain` column added by the get_domain cell.
df = pandas_string_to_cat(df, url_columns)
df

Memory usage of dataframe is 17552.63 MB
Memory usage of dataframe is 17.14 GB


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:23<00:00, 11.56s/it]

Memory usage after optimization is: 14473.22 MB
Decreased by 17.5%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,domain
0,21,409,1,589,5788,2,2,20368.0,2022-06-15,morning,1,45098,761
1,21,409,1,589,12900,2,2,20368.0,2022-06-19,morning,1,45098,549
2,21,409,1,589,17626,2,2,20368.0,2022-06-12,day,1,45098,712
3,21,409,1,589,59366,2,2,20368.0,2022-05-16,day,1,45098,712
4,21,409,1,589,59366,2,2,20368.0,2022-05-30,day,1,45098,712
...,...,...,...,...,...,...,...,...,...,...,...,...,...
322899430,15,616,27,124,17626,2,0,16376.0,2021-07-12,morning,1,300964,712
322899431,15,616,27,124,59366,2,0,16376.0,2021-06-20,evening,1,300964,712
322899432,15,616,27,124,117489,2,0,16376.0,2021-08-05,day,1,300964,761
322899433,15,616,27,124,142334,2,0,16376.0,2021-07-19,evening,1,300964,712


In [None]:
# Run get_timestamp (fe_modules.datetime) over the frame — presumably deriving a
# timestamp from `date`, confirm in the helper — then shrink the result.
# `polars_reduce_mem_usage` is never imported or defined in this notebook, so the
# original call raised NameError. `df` comes from pd.read_parquet and the
# pandas_* helpers, so the pandas helper imported from modules.memory_utils is
# the matching one.
# NOTE(review): assumes get_timestamp returns a pandas DataFrame — confirm.
df = pandas_reduce_mem_usage(get_timestamp(df))
df

In [None]:
# Apply part_of_day_to_hour (fe_modules.datetime) and rebind `df`.
# NOTE(review): presumably maps the part_of_day labels seen in the data
# (morning / day / evening, ...) to an hour value — confirm in the helper.
df = part_of_day_to_hour(df)
df

In [None]:
# Apply add_hour_to_date (fe_modules.datetime) and rebind `df`.
# NOTE(review): presumably combines the hour from the previous cell with the
# date/timestamp — confirm in the helper; depends on the earlier cells having run.
df = add_hour_to_date(df)
df

In [None]:
# Apply get_relative_time (fe_modules.datetime) and rebind `df`.
# NOTE(review): the reference point the time is made relative to is not visible
# here — check the helper before relying on the resulting column.
df = get_relative_time(df)
df

In [None]:
# Remove the `date` column once the time-feature cells above have run.
# pandas' DataFrame.drop treats a bare positional label as a ROW label (axis=0),
# so df.drop("date") would raise KeyError on this frame. `columns=` targets the
# column instead.
# NOTE(review): assumes `df` is still a pandas DataFrame at this point — confirm.
df = df.drop(columns="date")
df