# seq2seq

In [1]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

sys.path.append(os.path.abspath("modules/"))
sys.path.append(os.path.abspath("fe_modules/"))
sys.path.append(os.path.abspath("seq2seq_modules/"))

os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')
tqdm.pandas()

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from transformers import get_constant_schedule

In [5]:
import importlib

import modules
import fe_modules
import seq2seq_modules

importlib.reload(modules)
importlib.reload(fe_modules)
importlib.reload(seq2seq_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat
from seq2seq_modules.models import LSTMModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import AGE_METRIC

## Read and process

In [6]:
LOCAL_DATA_PATH = './data/'
SPLIT_SEED = 42

In [7]:
ids = ["user_id"]

cat_features = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "url_host",
    "cpe_type_cd",
    "cpe_model_os_type",
    "part_of_day",
    "request_cnt",
    "domain"
]

continous_features = [
    "price",
    "timestamp",
    "relative_date"
]

In [8]:
df = pandas_reduce_mem_usage(
    pd.read_parquet("seq2seq_data/stages/stage_4.parquet.gzip")
)
df.head()

Memory usage of dataframe is 21327.81 MB


100%|███████████████████████████████████████████| 25/25 [00:09<00:00,  2.59it/s]

Memory usage after optimization is: 20073.23 MB
Decreased by 5.9%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,day_of_week,holiday,capital_marker,timezone,geo_lat,geo_lon,population,domain,timestamp,relative_date
0,21,409,1,589,5788,2,1,20368.0,2,1,...,2,5,2.0,3,45.040161,38.975964,744933,761,165525120.0,3888.0
1,21,409,1,589,12900,2,1,20368.0,2,1,...,6,5,2.0,3,45.040161,38.975964,744933,549,165559680.0,432.0
2,21,409,1,589,17626,2,1,20368.0,0,1,...,6,1,2.0,3,45.040161,38.975964,744933,712,165499200.0,6264.0
3,21,409,1,589,59366,2,1,20368.0,0,1,...,0,5,2.0,3,45.040161,38.975964,744933,712,165265920.0,29592.0
4,21,409,1,589,59366,2,1,20368.0,0,1,...,0,5,2.0,3,45.040161,38.975964,744933,712,165386880.0,17496.0


In [10]:
target = pandas_reduce_mem_usage(
    pd.read_parquet(
        f'{LOCAL_DATA_PATH}public_train.pqt', columns=["user_id", "age"]
    )
)
target

Memory usage of dataframe is 6.18 MB


100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 615.81it/s]

Memory usage after optimization is: 4.12 MB
Decreased by 33.3%





Unnamed: 0,user_id,age
350459,350459,31.0
188276,188276,35.0
99002,99002,41.0
155506,155506,33.0
213873,213873,54.0
...,...,...
225374,225374,49.0
25776,25776,22.0
148131,148131,28.0
205570,205570,28.0


In [11]:
df = pandas_reduce_mem_usage(df.merge(target, how="left", on="user_id").dropna(subset=["age"]))
df.head()

Memory usage of dataframe is 13920.19 MB


100%|███████████████████████████████████████████| 26/26 [00:06<00:00,  3.81it/s]

Memory usage after optimization is: 13920.19 MB
Decreased by 0.0%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,holiday,capital_marker,timezone,geo_lat,geo_lon,population,domain,timestamp,relative_date,age
2272,60,732,27,211,5790,2,0,74259.0,3,1,...,5,0.0,3,59.939133,30.315901,4848742,761,163261440.0,10584.0,35.0
2273,60,732,27,211,65865,2,0,74259.0,0,3,...,5,0.0,3,59.939133,30.315901,4848742,549,162950400.0,41256.0,35.0
2274,60,732,27,211,111474,2,0,74259.0,0,3,...,5,0.0,3,59.939133,30.315901,4848742,761,162941760.0,42120.0,35.0
2275,14,311,27,211,111474,2,0,74259.0,1,2,...,5,2.0,2,54.70747,20.507324,431491,761,163114560.0,24624.0,35.0
2276,60,732,27,211,125409,2,0,74259.0,0,1,...,5,0.0,3,59.939133,30.315901,4848742,549,163028160.0,33480.0,35.0


In [12]:
df["age"] = df["age"].progress_apply(age_bucket)
df = pandas_reduce_mem_usage(df)
df.head()

100%|████████████████████████| 214652540/214652540 [02:14<00:00, 1600131.52it/s]


Memory usage of dataframe is 14739.02 MB


100%|███████████████████████████████████████████| 26/26 [00:04<00:00,  6.31it/s]

Memory usage after optimization is: 13306.06 MB
Decreased by 9.7%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,holiday,capital_marker,timezone,geo_lat,geo_lon,population,domain,timestamp,relative_date,age
2272,60,732,27,211,5790,2,0,74259.0,3,1,...,5,0.0,3,59.939133,30.315901,4848742,761,163261440.0,10584.0,2
2273,60,732,27,211,65865,2,0,74259.0,0,3,...,5,0.0,3,59.939133,30.315901,4848742,549,162950400.0,41256.0,2
2274,60,732,27,211,111474,2,0,74259.0,0,3,...,5,0.0,3,59.939133,30.315901,4848742,761,162941760.0,42120.0,2
2275,14,311,27,211,111474,2,0,74259.0,1,2,...,5,2.0,2,54.70747,20.507324,431491,761,163114560.0,24624.0,2
2276,60,732,27,211,125409,2,0,74259.0,0,1,...,5,0.0,3,59.939133,30.315901,4848742,549,163028160.0,33480.0,2


In [13]:
df.to_parquet("seq2seq_data/version_3_age.parquet.gzip",
              compression='gzip')

## Make torch Dataset

In [1]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

sys.path.append(os.path.abspath("modules/"))
sys.path.append(os.path.abspath("fe_modules/"))
sys.path.append(os.path.abspath("seq2seq_modules/"))

os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')
tqdm.pandas()

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import get_constant_schedule, get_cosine_schedule_with_warmup

In [3]:
import importlib

import modules
import fe_modules
import seq2seq_modules

importlib.reload(modules)
importlib.reload(fe_modules)
importlib.reload(seq2seq_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat
from seq2seq_modules.models import LSTMModel, StarterBERTModel, AttentionPoolingBERTModel
from seq2seq_modules.data import TargetDataset
from seq2seq_modules.weight_initialization import weights_init_uniform_rule, weights_init_xavier
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import AGE_METRIC
from seq2seq_modules.utils import fix_random_state

In [4]:
LOCAL_DATA_PATH = './data/'
SEED = 42
fix_random_state(SEED)

In [5]:
ids = ["user_id"]

cat_features = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "url_host",
    "cpe_type_cd",
    "cpe_model_os_type",
    "part_of_day",
    "domain"
]

continous_features = [
    "request_cnt",
    "price",
    "timestamp",
    "relative_date"
]

In [6]:
df = pandas_reduce_mem_usage(
    pd.read_parquet("seq2seq_data/version_3_age.parquet.gzip")
)

Memory usage of dataframe is 13306.06 MB


100%|███████████████████████████████████████████| 26/26 [00:06<00:00,  4.04it/s]

Memory usage after optimization is: 13306.06 MB
Decreased by 0.0%





In [7]:
dataset = TargetDataset(
         df,
         agg_column="user_id", 
         time_column="timestamp",
         target_column="age",
         cat_features=cat_features,
         cont_features=continous_features,
         max_len=1024,
         padding_side="left",
)
dataset

  0%|          | 0/214652540 [00:00<?, ?it/s]

<seq2seq_modules.data.TargetDataset at 0x7f7015847f40>

## Feed to the model

In [8]:
cat_feature_indexes = []
cont_feature_indexes = []
vocab_sizes = {}

for i in tqdm(range(len(cat_features))):
    cat_feature_indexes.append(i)
    vocab_sizes[i] = df[cat_features[i]].max() + 1

for i in tqdm(range(len(continous_features))):
    cont_feature_indexes.append(i)

100%|█████████████████████████████████████████████| 9/9 [00:00<00:00, 62.60it/s]
100%|█████████████████████████████████████████| 4/4 [00:00<00:00, 188508.04it/s]


In [9]:
vocab_sizes

{0: 81, 1: 985, 2: 37, 3: 599, 4: 199683, 5: 4, 6: 2, 7: 4, 8: 869}

In [10]:
targets = torch.cat([el[3].unsqueeze(0) for el in tqdm(dataset)], dim=0)
targets

100%|████████████████████████████████| 269998/269998 [00:14<00:00, 19139.22it/s]


tensor([2, 3, 1,  ..., 2, 3, 3])

In [11]:
# model = AttentionPoolingBERTModel(
#         cat_feature_indexes=cat_feature_indexes,
#         vocab_sizes=vocab_sizes,
#         cont_feature_indexes=cont_feature_indexes,
#         encoder_hidden_dim=16,
#         hidden_dim=256,
#         dim_feedforward=512,
#         output_dim=7,
#         pe_type="trainable",
#         use_mask=False,
#         max_len=1024,
#         use_key_padding_mask=True,
# )

model = StarterBERTModel(
        cat_feature_indexes=cat_feature_indexes,
        vocab_sizes=vocab_sizes,
        cont_feature_indexes=cont_feature_indexes,
        encoder_hidden_dim=16,
        hidden_dim=256,
        dim_feedforward=512,
        output_dim=7,
        pe_type="trainable",
        use_mask=False,
        max_len=1024,
        use_key_padding_mask=False,
        starter="randn",
        shared=False
)

# model = LSTMModel(
#         cat_feature_indexes=cat_feature_indexes,
#         vocab_sizes=vocab_sizes,
#         cont_feature_indexes=cont_feature_indexes,
#         encoder_hidden_dim=16,
#         hidden_dim=256,
#         output_dim=7,
# )


weights_init_xavier(model)

loss = nn.CrossEntropyLoss(
    weight=torch.tensor(
        compute_class_weight(
            class_weight="balanced",
            classes=sorted(df["age"].unique()),
            y=targets.numpy()
        )
    )
)

metric = AGE_METRIC

In [12]:
weight=torch.tensor(
    compute_class_weight(
        class_weight="balanced",
        classes=sorted(df["age"].unique()),
        y=targets.numpy()
    )
)
weight

tensor([35.8135,  1.1817,  0.4420,  0.4978,  0.9088,  1.6358,  7.0091],
       dtype=torch.float64)

In [13]:
cross_validation(
        project_name="ХУЙ", 
        model=model, 
        dataset=dataset, 
        loss_function=loss, 
        metric_func=AGE_METRIC, 
        optimizer=torch.optim.AdamW, 
        get_scheduler=get_cosine_schedule_with_warmup, 
        strat_array=targets.numpy(), 
        device='cuda', 
        random_state=69, 
        shuffle=True, 
        dataloader_shuffle=False, 
        n_folds=5, 
        epochs=10, 
        lr=1e-3, 
        weight_decay=1e-4,
        start_fold=0, 
        batch_size=64,
)

FOLD 0
--------------------------------


  0%|          | 0/3375 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.00 GiB (GPU 0; 23.65 GiB total capacity; 16.93 GiB already allocated; 152.81 MiB free; 17.19 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF