# seq2seq

In [12]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

sys.path.append(os.path.abspath("../"))
os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')
tqdm.pandas()

In [13]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from transformers import get_constant_schedule

In [14]:
import importlib

import modules
import fe_modules
import seq2seq_modules

importlib.reload(modules)
importlib.reload(fe_modules)
importlib.reload(seq2seq_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat
from seq2seq_modules.models import LSTMModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import AGE_METRIC
from fe_modules.encoders import CatBoostEncoderWrapper

In [15]:
def my_reset(*varnames):
    """
    varnames are what you want to keep
    """
    globals_ = globals()
    to_save = {v: globals_[v] for v in varnames}
    to_save['my_reset'] = my_reset  # lets keep this function by default
    del globals_
    get_ipython().magic("reset")
    globals().update(to_save)

## Read and process

In [23]:
DATA_PATH = '../data/'
SEQ2SEQ_DATA_PATH = '../seq2seq_data/'

SPLIT_SEED = 42

In [8]:
df = pandas_reduce_mem_usage(
    pd.read_parquet(f"{SEQ2SEQ_DATA_PATH}/stages/stage_2.parquet.gzip")
)
df.head()

Memory usage of dataframe is 26346.12 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 26346.12 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,timezone,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain
0,21,409,1,589,5788,2,1,20368.0,2,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,761
1,21,409,1,589,12900,2,1,20368.0,2,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,549
2,21,409,1,589,17626,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712
3,21,409,1,589,59366,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712
4,21,409,1,589,59366,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712


In [9]:
target = pandas_reduce_mem_usage(
    pd.read_parquet(
        f'{DATA_PATH}public_train.pqt', columns=["user_id", "age"]
    )
)
target

Memory usage of dataframe is 6.18 MB


  0%|          | 0/2 [00:00<?, ?it/s]

Memory usage after optimization is: 5.15 MB
Decreased by 16.7%


Unnamed: 0,user_id,age
350459,350459,31.0
188276,188276,35.0
99002,99002,41.0
155506,155506,33.0
213873,213873,54.0
...,...,...
225374,225374,49.0
25776,25776,22.0
148131,148131,28.0
205570,205570,28.0


In [10]:
df = pandas_reduce_mem_usage(df.merge(target, how="left", on="user_id").dropna(subset=["age"]))
df.head()

Memory usage of dataframe is 18833.19 MB


  0%|          | 0/31 [00:00<?, ?it/s]

Memory usage after optimization is: 18833.19 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain,age
2272,60,732,27,211,5790,2,0,74259.0,3,1,...,59.939133,30.315901,4848742,635.758972,0.024454,3115.404541,1787.926147,6555.964355,761,35.0
2273,60,732,27,211,65865,2,0,74259.0,0,3,...,59.939133,30.315901,4848742,635.758972,0.024454,3115.404541,1787.926147,6555.964355,549,35.0
2274,60,732,27,211,111474,2,0,74259.0,0,3,...,59.939133,30.315901,4848742,635.758972,0.024454,3115.404541,1787.926147,6555.964355,761,35.0
2275,14,311,27,211,111474,2,0,74259.0,1,2,...,54.70747,20.507324,431491,1092.055054,828.233154,3870.873535,2490.750244,7378.328613,761,35.0
2276,60,732,27,211,125409,2,0,74259.0,0,1,...,59.939133,30.315901,4848742,635.758972,0.024454,3115.404541,1787.926147,6555.964355,549,35.0


In [11]:
my_reset("df")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [16]:
ids = ["user_id"]

cat_features = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "url_host",
    "cpe_type_cd",
    "cpe_model_os_type",
    "part_of_day",
    "domain",
    "capital_marker"
]

continous_features = [
    "request_cnt",
    "price",
    "timestamp",
    "relative_timestamp",
    "geo_lat",
    "geo_lon",
    "population",
    "timezone",
    "dist_to_Moscow",
    "dist_to_SaintP",
    "dist_to_Novosibirsk",
    "dist_to_Ekaterinburg",
    "dist_to_Vladivostok",
]

In [17]:
cbe = CatBoostEncoderWrapper(cat_features=cat_features, sort_col="timestamp")

In [18]:
df = cbe.fit_transform(df, target_col="age")

In [19]:
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain,age
283148079,36.770563,38.35634,39.446444,38.262665,32.685074,37.097842,39.726215,15202.0,36.459209,1,...,53.454575,56.043877,66259,1214.075073,1719.235229,1749.992432,475.361786,5469.489746,35.600496,49.0
55415156,36.398467,35.6786,39.985874,41.581594,36.300617,37.097842,39.726215,12919.0,36.459209,1,...,47.411919,40.10421,169039,944.039307,1533.198975,3055.624268,1740.776978,6779.533691,38.174047,38.0
237055563,36.272424,35.128224,32.931036,32.078329,37.829524,37.097842,32.931036,51521.0,36.459209,1,...,61.254108,73.396156,306703,2143.172852,2322.031982,889.938171,881.775757,4290.968262,35.600496,39.0
237055441,36.272424,35.128224,32.931036,32.078329,34.18327,37.097842,32.931036,51521.0,36.459209,1,...,61.254108,73.396156,306703,2143.172852,2322.031982,889.938171,881.775757,4290.968262,38.174047,39.0
237055408,36.272424,35.128224,32.931036,32.078329,37.592145,37.097842,32.931036,51521.0,36.459209,1,...,61.254108,73.396156,306703,2143.172852,2322.031982,889.938171,881.775757,4290.968262,38.174047,39.0


In [20]:
cbe.save("../model_zoo", "age_cbe")

In [21]:
df["age"] = df["age"].progress_apply(age_bucket)
df = pandas_reduce_mem_usage(df)
df.head()

100%|████████████████████████| 214652540/214652540 [02:12<00:00, 1619275.24it/s]


Memory usage of dataframe is 31320.42 MB


  0%|          | 0/31 [00:00<?, ?it/s]

Memory usage after optimization is: 29887.46 MB
Decreased by 4.6%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain,age
283148079,36.770563,38.35634,39.446444,38.262665,32.685074,37.097842,39.726215,15202.0,36.459209,1,...,53.454575,56.043877,66259,1214.075073,1719.235229,1749.992432,475.361786,5469.489746,35.600496,3
55415156,36.398467,35.6786,39.985874,41.581594,36.300617,37.097842,39.726215,12919.0,36.459209,1,...,47.411919,40.10421,169039,944.039307,1533.198975,3055.624268,1740.776978,6779.533691,38.174047,2
237055563,36.272424,35.128224,32.931036,32.078329,37.829524,37.097842,32.931036,51521.0,36.459209,1,...,61.254108,73.396156,306703,2143.172852,2322.031982,889.938171,881.775757,4290.968262,35.600496,2
237055441,36.272424,35.128224,32.931036,32.078329,34.18327,37.097842,32.931036,51521.0,36.459209,1,...,61.254108,73.396156,306703,2143.172852,2322.031982,889.938171,881.775757,4290.968262,38.174047,2
237055408,36.272424,35.128224,32.931036,32.078329,37.592145,37.097842,32.931036,51521.0,36.459209,1,...,61.254108,73.396156,306703,2143.172852,2322.031982,889.938171,881.775757,4290.968262,38.174047,2


In [24]:
df.to_parquet(f"{SEQ2SEQ_DATA_PATH}/version_5_age.parquet.gzip",
              compression='gzip')

## Make torch Dataset

In [1]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

sys.path.append(os.path.abspath("../"))
os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')
tqdm.pandas()

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import get_constant_schedule, get_cosine_schedule_with_warmup

In [3]:
import importlib

import modules
import fe_modules
import seq2seq_modules

importlib.reload(modules)
importlib.reload(fe_modules)
importlib.reload(seq2seq_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat
from seq2seq_modules.models import LSTMModel, StarterBERTModel, AttentionPoolingBERTModel
from seq2seq_modules.data import TargetDataset
from seq2seq_modules.weight_initialization import weights_init_uniform_rule, weights_init_xavier
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.trainer import CVTrainer
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import AGE_METRIC
from seq2seq_modules.utils import fix_random_state

In [4]:
LOCAL_DATA_PATH = '../seq2seq_data/'
SEED = 42
fix_random_state(SEED)

In [5]:
ids = ["user_id"]

cat_features = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "url_host",
    "cpe_type_cd",
    "cpe_model_os_type",
    "part_of_day",
    "domain",
    "capital_marker"
]

continous_features = [
    "request_cnt",
    "price",
    "timestamp",
    "relative_timestamp",
    "geo_lat",
    "geo_lon",
    "population",
    "timezone",
    "dist_to_Moscow",
    "dist_to_SaintP",
    "dist_to_Novosibirsk",
    "dist_to_Ekaterinburg",
    "dist_to_Vladivostok",
]

In [6]:
df = pandas_reduce_mem_usage(
    pd.read_parquet(f"{LOCAL_DATA_PATH}/version_5_age.parquet.gzip")
)
df.head()

Memory usage of dataframe is 29887.46 MB


  0%|          | 0/31 [00:00<?, ?it/s]

Memory usage after optimization is: 29887.46 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain,age
283148079,36.770563,38.35634,39.446444,38.262665,32.685074,37.097842,39.726215,15202.0,36.459209,1,...,53.454575,56.043877,66259,1214.075073,1719.235229,1749.992432,475.361786,5469.489746,35.600496,3
55415156,36.398467,35.6786,39.985874,41.581594,36.300617,37.097842,39.726215,12919.0,36.459209,1,...,47.411919,40.10421,169039,944.039307,1533.198975,3055.624268,1740.776978,6779.533691,38.174047,2
237055563,36.272424,35.128224,32.931036,32.078329,37.829524,37.097842,32.931036,51521.0,36.459209,1,...,61.254108,73.396156,306703,2143.172852,2322.031982,889.938171,881.775757,4290.968262,35.600496,2
237055441,36.272424,35.128224,32.931036,32.078329,34.18327,37.097842,32.931036,51521.0,36.459209,1,...,61.254108,73.396156,306703,2143.172852,2322.031982,889.938171,881.775757,4290.968262,38.174047,2
237055408,36.272424,35.128224,32.931036,32.078329,37.592145,37.097842,32.931036,51521.0,36.459209,1,...,61.254108,73.396156,306703,2143.172852,2322.031982,889.938171,881.775757,4290.968262,38.174047,2


In [7]:
dataset = TargetDataset(
         df,
         agg_column="user_id", 
         time_column="timestamp",
         target_column="age",
         cat_features=[],
         cont_features=continous_features + cat_features,
         max_len=1024,
         padding_side="left",
)
user_ids = dataset.get_agg_col()
dataset

  0%|          | 0/214652540 [00:00<?, ?it/s]

<seq2seq_modules.data.TargetDataset at 0x7f722222e9b0>

## Feed to the model

In [8]:
cat_feature_indexes = []
cont_feature_indexes = []
vocab_sizes = {}

# for i in tqdm(range(len(cat_features))):
#     cat_feature_indexes.append(i)
#     vocab_sizes[i] = int(df[cat_features[i]].max() + 1)

for i in tqdm(range(len(continous_features + cat_features))):
    cont_feature_indexes.append(i)

100%|███████████████████████████████████████| 23/23 [00:00<00:00, 373910.82it/s]


In [9]:
vocab_sizes

{}

In [10]:
targets = torch.cat([el[3].unsqueeze(0) for el in tqdm(dataset)], dim=0)
targets

100%|████████████████████████████████| 269999/269999 [00:18<00:00, 14726.39it/s]


tensor([1, 2, 0,  ..., 2, 2, 4])

In [14]:
# model = AttentionPoolingBERTModel(
#         cat_feature_indexes=cat_feature_indexes,
#         vocab_sizes=vocab_sizes,
#         cont_feature_indexes=cont_feature_indexes,
#         encoder_hidden_dim=16,
#         hidden_dim=256,
#         dim_feedforward=512,
#         output_dim=7,
#         pe_type="trainable",
#         use_mask=False,
#         max_len=1024,
#         use_key_padding_mask=True,
# )

# model = StarterBERTModel(
#         cat_feature_indexes=cat_feature_indexes,
#         vocab_sizes=vocab_sizes,
#         cont_feature_indexes=cont_feature_indexes,
#         encoder_hidden_dim=16,
#         hidden_dim=256,
#         dim_feedforward=512,
#         output_dim=7,
#         pe_type="trainable",
#         use_mask=False,
#         max_len=1024,
#         use_key_padding_mask=False,
#         starter="randn",
#         shared=False
# )

model = LSTMModel(
        cat_feature_indexes=cat_feature_indexes,
        vocab_sizes=vocab_sizes,
        cont_feature_indexes=cont_feature_indexes,
        encoder_hidden_dim=16,
        hidden_dim=256,
        output_dim=6,
)


weights_init_xavier(model)

loss = nn.CrossEntropyLoss(
    weight=torch.tensor(
        compute_class_weight(
            class_weight="balanced",
            classes=sorted(df["age"].unique()),
            y=targets.numpy()
        )
    )
)

metric = AGE_METRIC

In [15]:
trainer = CVTrainer(
            model_name="age_lstm_with_cbe",
            model=model,
            n_folds=6,
)

In [16]:
train_fold_metrics, eval_fold_metrics, embeddings_df, logits_df = trainer.fit_transform(
                          dataset=dataset,
                          loss_function=nn.CrossEntropyLoss,
                          metric_func=metric,
                          optimizer=torch.optim.AdamW,
                          get_scheduler=get_cosine_schedule_with_warmup,
                          strat_array=targets,
                          target_name="age",
                          user_ids=user_ids,
                          shuffle=True,
                          epochs=5,
                          lr=1e-3,
                          weight_decay=1e-2,
                          num_warmup_steps=0,
                          batch_size=256,
                          device= "cuda"
)

FOLD 0
--------------------------------


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 0
{'Age F1': 0.19550520122332984, 'loss': 1.6820641836223396}
{'Age F1': 0.270838909825709, 'loss': 1.6177623903423053}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 1
{'Age F1': 0.25519800302473916, 'loss': 1.5945557906693346}
{'Age F1': 0.3007482527578546, 'loss': 1.531996437877229}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 2
{'Age F1': 0.2948791319153757, 'loss': 1.5324655520558876}
{'Age F1': 0.286520681279035, 'loss': 1.514471494939052}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 3
{'Age F1': 0.3177486348440692, 'loss': 1.495329026224726}
{'Age F1': 0.31363097416502184, 'loss': 1.4800834404616325}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 4
{'Age F1': 0.32511093762292814, 'loss': 1.478245035065502}
{'Age F1': 0.3132614085565049, 'loss': 1.4763455317684517}
FOLD 1
--------------------------------


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 0
{'Age F1': 0.20077863125178572, 'loss': 1.6789538607038945}
{'Age F1': 0.22260666021671072, 'loss': 1.6442611737043649}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 1
{'Age F1': 0.25013269537601873, 'loss': 1.6049641978820848}
{'Age F1': 0.2145433322007549, 'loss': 1.561910490280661}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 2
{'Age F1': 0.2920352020572649, 'loss': 1.5386654157086688}
{'Age F1': 0.29745520188921615, 'loss': 1.497457585110135}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 3
{'Age F1': 0.31454486716881835, 'loss': 1.4960464586062128}
{'Age F1': 0.3108049760807496, 'loss': 1.4790399645181425}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 4
{'Age F1': 0.32571593178570635, 'loss': 1.4770532047645848}
{'Age F1': 0.31146179664882306, 'loss': 1.4746862628195863}
FOLD 2
--------------------------------


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 0
{'Age F1': 0.1970730699941883, 'loss': 1.679396516393835}
{'Age F1': 0.1558564257097195, 'loss': 1.742041177917825}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 1
{'Age F1': 0.2496011087988346, 'loss': 1.6112873893509927}
{'Age F1': 0.270163956255371, 'loss': 1.5832204797267295}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 2
{'Age F1': 0.2882323931436959, 'loss': 1.5489179531824384}
{'Age F1': 0.26433276438994174, 'loss': 1.530226748661912}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 3
{'Age F1': 0.30761846433785617, 'loss': 1.5049876104396183}
{'Age F1': 0.313724494681408, 'loss': 1.492988880150245}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 4
{'Age F1': 0.3209301576012789, 'loss': 1.482153323140999}
{'Age F1': 0.31460752130872277, 'loss': 1.488161792619517}
FOLD 3
--------------------------------


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

EPOCH: 0
{'Age F1': 0.19320531363229626, 'loss': 1.6814985164328489}
{'Age F1': 0.27879382476660736, 'loss': 1.5952913426309372}


  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

In [17]:
CV_FEATURES = "../cv_data/"
MODEL_ZOO = "../model_zoo/"

In [18]:
trainer.save_model(model_name=trainer.model_name)

In [19]:
embeddings_df.to_parquet(f"{CV_FEATURES}/age_cv_embeddings.parquet.gzip",
                          compression='gzip')
logits_df.to_parquet(f"{CV_FEATURES}/age_cv_logits.parquet.gzip",
                          compression='gzip')