In [1]:
import gc
import sys
import os
import warnings

# Put the project's helper directories on the import path (absolute paths, appended in this order).
for package_dir in ("modules/", "fe_modules/", "seq2seq_modules/"):
    sys.path.append(os.path.abspath(package_dir))

# Suppress all warnings for the session and set the OpenBLAS thread-count variable
# before the numeric libraries are imported in the next cell.
warnings.filterwarnings('ignore')
os.environ['OPENBLAS_NUM_THREADS'] = '1'

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

In [3]:
import importlib

import modules
import fe_modules

# Re-execute the project packages so edits made since the first import are picked up.
# NOTE(review): importlib.reload() re-runs only the module object it is given;
# submodules that were already imported stay cached — confirm this is sufficient.
for project_package in (modules, fe_modules):
    importlib.reload(project_package)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat, my_reset
from fe_modules.text_manipulation import get_domain
from fe_modules.datetime_features import get_timestamp, get_relative_time, part_of_day_to_hour, add_hour_to_date
from fe_modules.preprocessing import clean_os_type
from fe_modules.missing import fill_price

# Load data

In [4]:
# Root directory of the input data; used as the prefix of the parquet path read below.
LOCAL_DATA_PATH = './data/'
# Seed constant, presumably for train/validation splitting.
# NOTE(review): not referenced by any cell visible in this section — confirm it is still needed.
SPLIT_SEED = 42

In [5]:
# Read the competition parquet data and hand it straight to the project's memory
# reducer; chaining with .pipe() avoids binding the raw frame to a second name.
competition_path = f'{LOCAL_DATA_PATH}competition_data_final_pqt/'
df = pd.read_parquet(competition_path).pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 29562.33 MB


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:03<00:00,  3.40it/s]

Memory usage after optimization is: 24943.21 MB
Decreased by 15.6%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098
...,...,...,...,...,...,...,...,...,...,...,...,...
322899430,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,avatars.mds.yandex.net,smartphone,Android,16376.0,2021-07-12,morning,1,300964
322899431,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,googleads.g.doubleclick.net,smartphone,Android,16376.0,2021-06-20,evening,1,300964
322899432,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,online.sberbank.ru,smartphone,Android,16376.0,2021-08-05,day,1,300964
322899433,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,s0.2mdn.net,smartphone,Android,16376.0,2021-07-19,evening,1,300964


# Feature engineering

## Cleaning

In [6]:
# Run the project's OS-type cleaning step on the frame and preview the first rows.
df = df.pipe(clean_os_type)
df.head(n=5)

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098


In [8]:
# Run the project's price-filling step, then display the (truncated) frame.
df = df.pipe(fill_price)
df

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098
...,...,...,...,...,...,...,...,...,...,...,...,...
322899430,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,avatars.mds.yandex.net,smartphone,Android,16376.0,2021-07-12,morning,1,300964
322899431,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,googleads.g.doubleclick.net,smartphone,Android,16376.0,2021-06-20,evening,1,300964
322899432,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,online.sberbank.ru,smartphone,Android,16376.0,2021-08-05,day,1,300964
322899433,Калужская область,Обнинск,Samsung,Galaxy A30s Dual,s0.2mdn.net,smartphone,Android,16376.0,2021-07-19,evening,1,300964


## Feature generation

In [10]:
# Geography and device columns handed to the project's string-to-category encoder
# (the preview below shows them as integer codes afterwards).
geo_and_device_columns = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "cpe_type_cd",
    "cpe_model_os_type",
]
df = df.pipe(pandas_string_to_cat, geo_and_device_columns)
df.head(n=5)

Memory usage of dataframe is 24943.21 MB
Memory usage of dataframe is 24.36 GB


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [01:31<00:00, 15.25s/it]

Memory usage after optimization is: 12625.58 MB
Decreased by 49.4%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,21,409,1,589,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098
1,21,409,1,589,apple.com,2,1,20368.0,2022-06-19,morning,1,45098
2,21,409,1,589,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098
3,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098
4,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098


In [11]:
# Add the `domain` column via the project's get_domain helper and preview the result.
df = df.pipe(get_domain)
df.head(n=5)

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,domain
0,21,409,1,589,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,ru
1,21,409,1,589,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,com
2,21,409,1,589,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,net
3,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,net
4,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,net


In [12]:
# Encode the two URL-derived text columns with the same string-to-category helper.
url_columns = ["domain", "url_host"]
df = df.pipe(pandas_string_to_cat, url_columns)
df.head(n=5)

Memory usage of dataframe is 17552.63 MB
Memory usage of dataframe is 17.14 GB


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:23<00:00, 11.87s/it]

Memory usage after optimization is: 14473.22 MB
Decreased by 17.5%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,domain
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,morning,1,45098,761
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,morning,1,45098,549
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,day,1,45098,712
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,day,1,45098,712
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,day,1,45098,712


In [13]:
# Inspect the per-column dtypes after the encoding steps above.
df.dtypes

region_name                uint8
city_name                 uint16
cpe_manufacturer_name      uint8
cpe_model_name            uint16
url_host                  uint32
cpe_type_cd                uint8
cpe_model_os_type          uint8
price                    float32
date                      object
part_of_day               object
request_cnt                 int8
user_id                    int32
domain                    uint16
dtype: object

In [14]:
# Add the `timestamp` column with the project helper, then re-run the memory reducer
# on the widened frame.
df = df.pipe(get_timestamp).pipe(pandas_reduce_mem_usage)
df.head(n=5)

Memory usage of dataframe is 16936.75 MB


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:02<00:00,  6.58it/s]

Memory usage after optimization is: 15704.99 MB
Decreased by 7.3%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,domain,timestamp
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,morning,1,45098,761,165525120.0
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,morning,1,45098,549,165559680.0
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,day,1,45098,712,165499200.0
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,day,1,45098,712,165265920.0
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,day,1,45098,712,165386880.0


In [15]:
# Derive the `hour` offset column from `part_of_day` using the project helper.
df = df.pipe(part_of_day_to_hour)
df.head(n=5)

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,domain,timestamp,hour
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,morning,1,45098,761,165525120.0,0 days 09:00:00
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,morning,1,45098,549,165559680.0,0 days 09:00:00
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,day,1,45098,712,165499200.0,0 days 15:00:00
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,day,1,45098,712,165265920.0,0 days 15:00:00
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,day,1,45098,712,165386880.0,0 days 15:00:00


In [16]:
# Build the `datetime` column (date plus hour offset) with the project helper.
df = df.pipe(add_hour_to_date)
df.head(n=5)

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,domain,timestamp,hour,datetime
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,morning,1,45098,761,165525120.0,0 days 09:00:00,2022-06-15 09:00:00
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,morning,1,45098,549,165559680.0,0 days 09:00:00,2022-06-19 09:00:00
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,day,1,45098,712,165499200.0,0 days 15:00:00,2022-06-12 15:00:00
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,day,1,45098,712,165265920.0,0 days 15:00:00,2022-05-16 15:00:00
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,day,1,45098,712,165386880.0,0 days 15:00:00,2022-05-30 15:00:00


In [17]:
# Add the `relative_date` column; the helper is asked for its "timestamp" return dtype.
df = df.pipe(get_relative_time, return_dtype="timestamp")
df.head(n=5)

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,domain,timestamp,hour,datetime,relative_date
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,morning,1,45098,761,165525120.0,0 days 09:00:00,2022-06-15 09:00:00,3888.0
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,morning,1,45098,549,165559680.0,0 days 09:00:00,2022-06-19 09:00:00,432.0
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,day,1,45098,712,165499200.0,0 days 15:00:00,2022-06-12 15:00:00,6264.0
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,day,1,45098,712,165265920.0,0 days 15:00:00,2022-05-16 15:00:00,29592.0
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,day,1,45098,712,165386880.0,0 days 15:00:00,2022-05-30 15:00:00,17496.0


In [18]:
# `part_of_day` is encoded last because the hour derivation above needed its text values.
df = df.pipe(pandas_string_to_cat, ["part_of_day"])
df.head(n=5)

Memory usage of dataframe is 23095.57 MB
Memory usage of dataframe is 22.55 GB


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.03s/it]

Memory usage after optimization is: 20939.98 MB
Decreased by 9.3%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,domain,timestamp,hour,datetime,relative_date
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,2,1,45098,761,165525120.0,0 days 09:00:00,2022-06-15 09:00:00,3888.0
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,2,1,45098,549,165559680.0,0 days 09:00:00,2022-06-19 09:00:00,432.0
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,0,1,45098,712,165499200.0,0 days 15:00:00,2022-06-12 15:00:00,6264.0
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,0,1,45098,712,165265920.0,0 days 15:00:00,2022-05-16 15:00:00,29592.0
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,0,1,45098,712,165386880.0,0 days 15:00:00,2022-05-30 15:00:00,17496.0


In [19]:
# The raw date and the intermediate hour/datetime columns are no longer needed:
# `timestamp` and `relative_date` carry the time information from here on.
df = df.drop(columns=["date", "hour", "datetime"])
df.head(n=5)

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,domain,timestamp,relative_date
0,21,409,1,589,5788,2,1,20368.0,2,1,45098,761,165525120.0,3888.0
1,21,409,1,589,12900,2,1,20368.0,2,1,45098,549,165559680.0,432.0
2,21,409,1,589,17626,2,1,20368.0,0,1,45098,712,165499200.0,6264.0
3,21,409,1,589,59366,2,1,20368.0,0,1,45098,712,165265920.0,29592.0
4,21,409,1,589,59366,2,1,20368.0,0,1,45098,712,165386880.0,17496.0


## Save

In [20]:
# Persist the engineered event table for the seq2seq section of the notebook.
# The output directory is created first so a fresh checkout does not fail on the
# write after all the expensive processing above has already run.
version_1_path = "seq2seq_data/version_1.parquet.gzip"
os.makedirs(os.path.dirname(version_1_path), exist_ok=True)
df.to_parquet(version_1_path,
              compression='gzip')

In [21]:
# Record the (rows, columns) shape of the frame that was just saved.
df.shape

(322899435, 14)

# seq2seq

In [3]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

# Put the project's helper directories on the import path (absolute paths, appended in this order).
for package_dir in ("modules/", "fe_modules/", "seq2seq_modules/"):
    sys.path.append(os.path.abspath(package_dir))

# Suppress all warnings and set the OpenBLAS thread-count variable before the
# numeric libraries are imported in the next cell.
warnings.filterwarnings('ignore')
os.environ['OPENBLAS_NUM_THREADS'] = '1'

# Register tqdm's pandas integration; `progress_apply` is used further down.
tqdm.pandas()

In [4]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from transformers import get_constant_schedule

In [5]:
import importlib

import modules
import fe_modules
import seq2seq_modules

# Re-execute the project packages so edits made since the first import are picked up.
# NOTE(review): importlib.reload() re-runs only the module object it is given;
# submodules that were already imported stay cached — confirm this is sufficient.
for project_package in (modules, fe_modules, seq2seq_modules):
    importlib.reload(project_package)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat, my_reset
from seq2seq_modules.preprocessing import PandasPreprocessor
from seq2seq_modules.models import LSTMModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import AGE_METRIC

## Read and process

In [6]:
# Root directory of the input data; used as the prefix when reading public_train.pqt below.
LOCAL_DATA_PATH = './data/'
# Seed constant, presumably for train/validation splitting.
# NOTE(review): not referenced by any cell visible in this section — confirm it is still needed.
SPLIT_SEED = 42

In [7]:
# Column groups for the seq2seq pipeline.
# `request_cnt` is listed as a continuous feature so this definition agrees with the
# one in the "Make torch Dataset" section, which is the one fed to the model
# (its vocabulary table covers nine categorical columns, without `request_cnt`).

# Key identifying the user each event row belongs to.
ids = ["user_id"]

# Integer-encoded categorical columns.
cat_features = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "url_host",
    "cpe_type_cd",
    "cpe_model_os_type",
    "part_of_day",
    "domain"
]

# Numeric columns passed through as continuous values.
continous_features = [
    "request_cnt",
    "price",
    "timestamp",
    "relative_date"
]

In [6]:
# Reload the engineered event table written by the feature-engineering section and
# pass it through the project's memory reducer.
df = pd.read_parquet("seq2seq_data/version_1.parquet.gzip").pipe(pandas_reduce_mem_usage)
df.head(n=5)

Memory usage of dataframe is 13549.40 MB


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:03<00:00,  3.91it/s]

Memory usage after optimization is: 12317.64 MB
Decreased by 9.1%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,domain,timestamp,relative_date
0,21,409,1,589,5788,2,1,20368.0,2,1,45098,761,165525120.0,3888.0
1,21,409,1,589,12900,2,1,20368.0,2,1,45098,549,165559680.0,432.0
2,21,409,1,589,17626,2,1,20368.0,0,1,45098,712,165499200.0,6264.0
3,21,409,1,589,59366,2,1,20368.0,0,1,45098,712,165265920.0,29592.0
4,21,409,1,589,59366,2,1,20368.0,0,1,45098,712,165386880.0,17496.0


In [7]:
# Load only the user id and age columns of the public training table, then run the
# project's memory reducer on them.
label_columns = ["user_id", "age"]
target = pd.read_parquet(
    f'{LOCAL_DATA_PATH}public_train.pqt',
    columns=label_columns,
).pipe(pandas_reduce_mem_usage)
target

Memory usage of dataframe is 6.18 MB


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 478.31it/s]

Memory usage after optimization is: 3.60 MB
Decreased by 41.7%





Unnamed: 0,user_id,age
350459,350459,31.0
188276,188276,35.0
99002,99002,41.0
155506,155506,33.0
213873,213873,54.0
...,...,...
225374,225374,49.0
25776,25776,22.0
148131,148131,28.0
205570,205570,28.0


In [8]:
# Attach the age label to every event row and keep only rows of labelled users.
#
# An inner join against the label table (with missing ages removed first) selects
# the same rows, in the same left-frame order, as a left join followed by
# dropna(subset=["age"]) — but it never materialises the unlabelled event rows,
# which a left join builds only to throw away.
# Note: the result carries a fresh contiguous index instead of an index with gaps.
labelled_target = target.dropna(subset=["age"])
df = pandas_reduce_mem_usage(df.merge(labelled_target, how="inner", on="user_id"))
df.head()


KeyboardInterrupt



In [None]:
# Replace every row's age with the value returned by the project's `age_bucket`
# helper (presumably an age-group id — confirm against seq2seq_modules.utils).
# `progress_apply` is the tqdm-instrumented Series.apply enabled by tqdm.pandas().
df["age"] = df["age"].progress_apply(age_bucket)
# Re-run the memory reducer now that `age` holds the bucketed values.
df = pandas_reduce_mem_usage(df)
df.head()

In [None]:
# Persist the labelled event table consumed by the "Make torch Dataset" section.
# The output directory is created first so the write cannot fail on a missing
# folder after the expensive join and bucketing above.
labelled_path = "seq2seq_data/version_1_with_target.parquet.gzip"
os.makedirs(os.path.dirname(labelled_path), exist_ok=True)
df.to_parquet(labelled_path,
              compression='gzip')

## Make torch Dataset

In [1]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

# Put the project's helper directories on the import path (absolute paths, appended in this order).
for package_dir in ("modules/", "fe_modules/", "seq2seq_modules/"):
    sys.path.append(os.path.abspath(package_dir))

# Suppress all warnings and set the OpenBLAS thread-count variable before the
# numeric libraries are imported in the next cell.
warnings.filterwarnings('ignore')
os.environ['OPENBLAS_NUM_THREADS'] = '1'

# Register tqdm's pandas integration (adds the progress_* methods to pandas objects).
tqdm.pandas()

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from transformers import get_constant_schedule, get_cosine_schedule_with_warmup

In [3]:
import importlib

import modules
import fe_modules
import seq2seq_modules

# Re-execute the project packages so edits made since the first import are picked up.
# NOTE(review): importlib.reload() re-runs only the module object it is given;
# submodules that were already imported stay cached — confirm this is sufficient.
for project_package in (modules, fe_modules, seq2seq_modules):
    importlib.reload(project_package)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat, my_reset
from seq2seq_modules.preprocessing import PandasPreprocessor
from seq2seq_modules.models import LSTMModel, StarterBERTModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import AGE_METRIC

In [4]:
# Root directory of the input data.
# NOTE(review): neither constant is referenced by the cells visible in this section
# (cross_validation below receives its own random_state) — confirm they are still needed.
LOCAL_DATA_PATH = './data/'
SPLIT_SEED = 42

In [5]:
# Column groups used to build the model input. The order matters: the preprocessor
# receives `cat_features + continous_features`, and the feature-index lists built
# further down assume exactly that column order.

# Key identifying the user each event row belongs to.
ids = ["user_id"]

# Integer-encoded categorical columns; a vocabulary size is computed for each below.
cat_features = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "url_host",
    "cpe_type_cd",
    "cpe_model_os_type",
    "part_of_day",
    "domain"
]

# Numeric columns; their positions follow the categorical ones.
continous_features = [
    "request_cnt",
    "price",
    "timestamp",
    "relative_date"
]

In [6]:
# Load the labelled event table and pass it through the project's memory reducer.
df = pd.read_parquet("seq2seq_data/version_1_with_target.parquet.gzip").pipe(pandas_reduce_mem_usage)

Memory usage of dataframe is 8239.68 MB


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:02<00:00,  6.04it/s]

Memory usage after optimization is: 8239.68 MB
Decreased by 0.0%





In [7]:
# Configure the project's sequence builder: rows are grouped by user, ordered on the
# timestamp column, and capped/padded to `max_len` steps.
# NOTE(review): grouping/ordering/padding semantics are inferred from the argument
# names — confirm against seq2seq_modules.preprocessing.
preprocessor = PandasPreprocessor(
    features=[*cat_features, *continous_features],
    agg_column="user_id",
    time_column="timestamp",
    target_column="age",
    padding_side="right",
    max_len=1024,
)

In [8]:
# Turn the event table into three objects: features, attention masks and targets.
# They are wrapped in a TensorDataset in the next cell, so all three are tensors.
input_features, attention_masks, targets = preprocessor.transform(df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 210730009/210730009 [00:29<00:00, 7115122.74it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 269998/269998 [00:06<00:00, 40299.60it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 269998/269998 [00:07<00:00, 36859.46it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 269998/269998 [00:00<00:00, 614256.15it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████

In [9]:
# Bundle the tensors so indexing yields one (features, attention_mask, target) triple.
dataset = TensorDataset(input_features, attention_masks, targets)

In [10]:
# Preview the model columns in the exact order they were handed to the preprocessor.
df.loc[:, cat_features + continous_features].head(n=5)

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,part_of_day,domain,request_cnt,price,timestamp,relative_date
2272,60,732,27,211,5790,2,0,3,761,1,74259.0,163261440.0,10584.0
2273,60,732,27,211,65865,2,0,0,549,3,74259.0,162950400.0,41256.0
2274,60,732,27,211,111474,2,0,0,761,3,74259.0,162941760.0,42120.0
2275,14,311,27,211,111474,2,0,1,761,2,74259.0,163114560.0,24624.0
2276,60,732,27,211,125409,2,0,0,549,1,74259.0,163028160.0,33480.0


## Feed to the model

In [11]:
# Feature positions follow the column order `cat_features + continous_features`
# that was passed to the preprocessor: categorical columns first, continuous after.
cat_feature_indexes = list(range(len(cat_features)))
cont_feature_indexes = list(
    range(len(cat_features), len(cat_features) + len(continous_features))
)

# Vocabulary size per categorical position = largest code in the column + 1.
# The maximum is converted to a Python int *before* adding 1: the columns use narrow
# unsigned dtypes, and under NumPy 2 promotion rules a NumPy scalar plus a Python
# int stays in the scalar's dtype, so e.g. a uint8 maximum of 255 would wrap to 0.
# It also hands plain ints (not NumPy scalars) to the model constructor.
vocab_sizes = {
    position: int(df[column].max()) + 1
    for position, column in enumerate(cat_features)
}

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 67.56it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 190650.18it/s]


In [12]:
# Show the vocabulary size computed for each categorical feature position.
vocab_sizes

{0: 81, 1: 985, 2: 37, 3: 599, 4: 199683, 5: 4, 6: 2, 7: 4, 8: 869}

In [24]:
# Sequence classifier over the categorical + continuous event features.
# NOTE(review): output_dim=7 presumably equals the number of age buckets — confirm
# against seq2seq_modules.utils.age_bucket.
model = LSTMModel(
        cat_feature_indexes=cat_feature_indexes,
        vocab_sizes=vocab_sizes,
        cont_feature_indexes=cont_feature_indexes,
        encoder_hidden_dim=16,
        hidden_dim=128,
        output_dim=7,
)

# weights_init_uniform_rule(model)

# Class-weighted cross entropy: "balanced" weights are computed from the per-user
# targets, one weight per age class.
# `classes` is passed as a sorted ndarray: compute_class_weight documents this
# parameter as an ndarray, and newer scikit-learn releases validate the type and
# reject a plain Python list.
# NOTE(review): the weights come back as a NumPy float array, so the tensor dtype is
# whatever torch.tensor infers from it — confirm it matches the model's dtype.
loss = nn.CrossEntropyLoss(
    weight=torch.tensor(
        compute_class_weight(
            class_weight="balanced",
            classes=np.sort(df["age"].unique()),
            y=targets.numpy()
        )
    )
)

# Metric object handed to the training loop.
metric = AGE_METRIC

In [14]:
# Sanity check on the first sample: multiplying the features by the attention mask
# (broadcast over the feature axis) must leave every element unchanged.
# Assuming the mask is 1 on real steps and 0 on padding, this means all padded
# positions are already zero.
# `.all()` replaces the comparison against a hard-coded 1024 * 13 element count, so
# the check stays valid if max_len or the feature lists change.
first_features, first_mask = dataset[0][:2]
(first_features == first_features * first_mask.unsqueeze(1)).all()

tensor(True)

In [15]:
# Peek at the full target tensor and at the target stored for sample index 2.
targets, dataset[2][2]

(tensor([2, 3, 1,  ..., 2, 3, 3]), tensor(1))

In [None]:
# Launch the project's cross-validation loop; all settings are passed by keyword.
# NOTE(review): `project_name` looks like a throwaway placeholder — replace it with a
# descriptive experiment name before sharing results.
cross_validation(
    # experiment bookkeeping
    project_name="ХУЙ",
    device='cuda',
    # data and fold layout (folds are stratified on the per-user targets)
    dataset=dataset,
    strat_array=targets.numpy(),
    n_folds=4,
    start_fold=0,
    shuffle=True,
    random_state=69,
    # model, objective and metric
    model=model,
    loss_function=loss,
    metric_func=AGE_METRIC,
    # optimisation (optimizer class and scheduler factory are passed uninstantiated)
    optimizer=torch.optim.AdamW,
    get_scheduler=get_cosine_schedule_with_warmup,
    lr=1e-2,
    weight_decay=1e-4,
    epochs=30,
    batch_size=32,
    dataloader_shuffle=False,
)

FOLD 0
--------------------------------


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6329/6329 [02:12<00:00, 47.68it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2110/2110 [00:14<00:00, 146.14it/s]


EPOCH: 0
{'Age F1': 0.17784297242223743, 'loss': 1.8053496335098163}
{'Age F1': 0.18173120293833064, 'loss': 1.757360693259116}


 22%|████████████████████████████████████████▎                                                                                                                                             | 1401/6329 [00:28<01:39, 49.43it/s]