In [1]:
import gc
import sys
import os
import warnings

# Limit OpenBLAS to one thread; this is set here, before the numeric
# libraries are imported in the following cell.
os.environ['OPENBLAS_NUM_THREADS'] = '1'

# Put the project's helper directories on the import path (resolved against
# the current working directory).
sys.path.extend(
    os.path.abspath(helper_dir)
    for helper_dir in ("modules/", "fe_modules/", "seq2seq_modules/")
)

# Silence every warning for the remainder of the session.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

In [3]:
import importlib

import modules
import fe_modules

# Pick up on-disk edits to the helper packages without restarting the kernel.
# importlib.reload() re-executes only the module object it is given and does
# not recurse into submodules, so reloading the packages alone would leave the
# submodules imported below stale. Reload the packages first, then each
# submodule this notebook imports from.
importlib.reload(modules)
importlib.reload(fe_modules)
for _submodule_name in (
    "modules.memory_utils",
    "fe_modules.text_manipulation",
    "fe_modules.datetime",
):
    # import_module returns the already-loaded submodule (or imports it on a
    # fresh kernel); reload then re-executes its source.
    importlib.reload(importlib.import_module(_submodule_name))

from modules.memory_utils import polars_reduce_mem_usage, polars_string_to_cat, my_reset
from fe_modules.text_manipulation import get_domain
from fe_modules.datetime import get_timestamp, get_relative_time, part_of_day_to_hour, add_hour_to_date

# Load data

In [4]:
# Notebook-wide configuration.
# Seed constant (named for data splitting; not used in the cells shown here).
SPLIT_SEED: int = 42
# Directory prefix (with trailing slash) prepended to every input file path.
LOCAL_DATA_PATH: str = './data/'

In [5]:
# Read every parquet shard of the competition log into one polars frame, then
# hand it to the project's memory-reduction helper (which prints the
# before/after memory usage).
df = pl.read_parquet(
    f'{LOCAL_DATA_PATH}competition_data_final_pqt/*.parquet'
).pipe(polars_reduce_mem_usage)
df

Memory usage of dataframe is 59971.98 MB
Memory usage of dataframe is 58.57 GB
utf
utf
utf
utf
utf
utf
utf
flo
dat
utf
int
int
Memory usage after optimization is: 55352.86 MB
Memory usage  after optimization is 54.06 GB
Decreased by 7.7%


region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
str,str,str,str,str,str,str,f32,date,str,i8,i32
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""ad.adriver.ru""","""smartphone""","""iOS""",20368.0,2022-06-15,"""morning""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""apple.com""","""smartphone""","""iOS""",20368.0,2022-06-19,"""morning""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""avatars.mds.ya...","""smartphone""","""iOS""",20368.0,2022-06-12,"""day""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""googleads.g.do...","""smartphone""","""iOS""",20368.0,2022-05-16,"""day""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""googleads.g.do...","""smartphone""","""iOS""",20368.0,2022-05-30,"""day""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""i.ytimg.com""","""smartphone""","""iOS""",20368.0,2022-03-29,"""evening""",2,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""icloud.com""","""smartphone""","""iOS""",20368.0,2022-03-17,"""morning""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""m.avito.ru""","""smartphone""","""iOS""",20368.0,2022-05-19,"""morning""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""relap.io""","""smartphone""","""iOS""",20368.0,2022-03-29,"""night""",1,45098
"""Краснодарский ...","""Краснодар""","""Apple""","""iPhone 7""","""sun9-5.userapi...","""smartphone""","""iOS""",20368.0,2022-06-16,"""day""",1,45098


In [6]:
# Load the labelled training table (age / is_male per user_id) and shrink its
# dtypes with the same memory-reduction helper used for the event log.
target = pl.read_parquet(
    f'{LOCAL_DATA_PATH}public_train.pqt'
).pipe(polars_reduce_mem_usage)
target

Memory usage of dataframe is 8.57 MB
Memory usage of dataframe is 0.01 GB
flo
utf
int
int
Memory usage after optimization is: 5.48 MB
Memory usage  after optimization is 0.01 GB
Decreased by 36.1%


age,is_male,user_id,__index_level_0__
f32,str,i32,i32
31.0,"""1""",350459,350459
35.0,"""1""",188276,188276
41.0,"""0""",99002,99002
33.0,"""0""",155506,155506
54.0,"""0""",213873,213873
63.0,"""0""",212300,212300
33.0,"""1""",268755,268755
39.0,"""1""",350740,350740
27.0,"""0""",357123,357123
66.0,"""0""",113057,113057


# Feature engineering

In [7]:
# Swap the listed string columns (geography and device descriptors) for
# compact integer codes via the project helper; url_host and part_of_day
# stay as strings at this stage.
df = df.pipe(
    polars_string_to_cat,
    [
        "region_name",
        "city_name",
        "cpe_manufacturer_name",
        "cpe_model_name",
        "cpe_type_cd",
        "cpe_model_os_type",
    ],
)
df

Memory usage of dataframe is 55352.86 MB
Memory usage of dataframe is 54.06 GB
<class 'int'> <class 'int'> 0 80
<class 'int'> <class 'int'> 0 984
<class 'int'> <class 'int'> 0 36
<class 'int'> <class 'int'> 0 598
<class 'int'> <class 'int'> 0 3
<class 'int'> <class 'int'> 0 2
Memory usage after optimization is: 17602.04 MB
Memory usage  after optimization is 17.19 GB
Decreased by 68.2%


region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
i8,i16,i8,i16,str,i8,i8,f32,date,str,i8,i32
0,0,0,0,"""ad.adriver.ru""",0,0,20368.0,2022-06-15,"""morning""",1,45098
0,0,0,0,"""apple.com""",0,0,20368.0,2022-06-19,"""morning""",1,45098
0,0,0,0,"""avatars.mds.ya...",0,0,20368.0,2022-06-12,"""day""",1,45098
0,0,0,0,"""googleads.g.do...",0,0,20368.0,2022-05-16,"""day""",1,45098
0,0,0,0,"""googleads.g.do...",0,0,20368.0,2022-05-30,"""day""",1,45098
0,0,0,0,"""i.ytimg.com""",0,0,20368.0,2022-03-29,"""evening""",2,45098
0,0,0,0,"""icloud.com""",0,0,20368.0,2022-03-17,"""morning""",1,45098
0,0,0,0,"""m.avito.ru""",0,0,20368.0,2022-05-19,"""morning""",1,45098
0,0,0,0,"""relap.io""",0,0,20368.0,2022-03-29,"""night""",1,45098
0,0,0,0,"""sun9-5.userapi...",0,0,20368.0,2022-06-16,"""day""",1,45098


In [8]:
# Append a string `domain` column produced by the project's get_domain helper
# (presumably derived from url_host — confirm in fe_modules.text_manipulation).
df = df.pipe(get_domain)
df

region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,domain
i8,i16,i8,i16,str,i8,i8,f32,date,str,i8,i32,str
0,0,0,0,"""ad.adriver.ru""",0,0,20368.0,2022-06-15,"""morning""",1,45098,"""ru"""
0,0,0,0,"""apple.com""",0,0,20368.0,2022-06-19,"""morning""",1,45098,"""com"""
0,0,0,0,"""avatars.mds.ya...",0,0,20368.0,2022-06-12,"""day""",1,45098,"""net"""
0,0,0,0,"""googleads.g.do...",0,0,20368.0,2022-05-16,"""day""",1,45098,"""net"""
0,0,0,0,"""googleads.g.do...",0,0,20368.0,2022-05-30,"""day""",1,45098,"""net"""
0,0,0,0,"""i.ytimg.com""",0,0,20368.0,2022-03-29,"""evening""",2,45098,"""com"""
0,0,0,0,"""icloud.com""",0,0,20368.0,2022-03-17,"""morning""",1,45098,"""com"""
0,0,0,0,"""m.avito.ru""",0,0,20368.0,2022-05-19,"""morning""",1,45098,"""ru"""
0,0,0,0,"""relap.io""",0,0,20368.0,2022-03-29,"""night""",1,45098,"""io"""
0,0,0,0,"""sun9-5.userapi...",0,0,20368.0,2022-06-16,"""day""",1,45098,"""com"""


In [None]:
# Encode the two remaining URL-related string columns with the same
# string-to-category helper used for the device/geography columns.
df = df.pipe(polars_string_to_cat, ["domain", "url_host"])
df

Memory usage of dataframe is 20874.76 MB
Memory usage of dataframe is 20.39 GB
<class 'int'> <class 'int'> 0 868


In [None]:
# Run the project's get_timestamp step (presumably derives a timestamp from
# `date` — confirm in fe_modules.datetime), then re-run the memory-reduction
# helper on its result.
df = polars_reduce_mem_usage(df.pipe(get_timestamp))
df

In [None]:
# Apply the project's part_of_day_to_hour transform (presumably maps the
# part_of_day label to an hour value — confirm in fe_modules.datetime).
df = df.pipe(part_of_day_to_hour)
df

In [None]:
# Apply the project's add_hour_to_date transform (presumably combines the
# derived hour with the date — confirm in fe_modules.datetime).
df = df.pipe(add_hour_to_date)
df

In [None]:
# Apply the project's get_relative_time transform (exact output columns are
# defined in fe_modules.datetime — not visible from this notebook).
df = df.pipe(get_relative_time)
df

In [None]:
# Remove the `date` column now that the time-feature cells above have run.
# Note: this raises if the cell is re-run after `date` is already gone.
df = df.drop(["date"])
df