# seq2seq

In [22]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

# Make the parent directory importable so the project packages imported further
# down (modules, fe_modules, seq2seq_modules) can be found.
_project_root = os.path.abspath("../")
if _project_root not in sys.path:  # re-running this cell must not add duplicates
    sys.path.append(_project_root)

# Restrict OpenBLAS to a single thread for this process.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Enable tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

In [23]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from transformers import get_constant_schedule

In [24]:
import importlib

import modules
import fe_modules
import seq2seq_modules

# Reload the three project packages, in the same order they were imported.
# NOTE(review): importlib.reload re-executes only the module object it is given;
# confirm that this is enough to refresh the submodules imported below.
for _package in (modules, fe_modules, seq2seq_modules):
    importlib.reload(_package)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat
from seq2seq_modules.models import LSTMModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import AGE_METRIC
from fe_modules.encoders import CatBoostEncoderWrapper
from fe_modules.time_lags import generate_time_lags

In [25]:
def my_reset(*varnames, force=False):
    """
    Clear the interactive namespace while keeping a chosen set of names.

    Parameters
    ----------
    *varnames : str
        Names of global variables that must survive the reset.
    force : bool, keyword-only, default False
        When True, ``%reset -f`` is issued, so no confirmation prompt is shown.
        The default keeps the interactive confirmation of plain ``%reset``.

    Raises
    ------
    KeyError
        If one of ``varnames`` is not defined. Raised before anything is cleared.

    Notes
    -----
    ``my_reset`` itself is always kept.
    """
    namespace = globals()
    missing = [name for name in varnames if name not in namespace]
    if missing:
        # Fail early: nothing has been reset yet, so no data is lost.
        raise KeyError(f"cannot keep undefined variable(s): {missing}")

    to_save = {name: namespace[name] for name in varnames}
    to_save['my_reset'] = my_reset  # keep this helper available after the reset

    # run_line_magic is the supported replacement for the deprecated .magic() call.
    get_ipython().run_line_magic("reset", "-f" if force else "")
    globals().update(to_save)

## Read and process

In [26]:
# Data folders, relative to the notebook directory.
LOCAL_DATA_PATH = "../data/"
SEQ2SEQ_DATA_PATH = "../seq2seq_data/"

# Seed constant (not referenced by the cells visible in this notebook).
SPLIT_SEED = 42

In [6]:
# Audience-oriented site-category columns that are removed right after loading.
_audience_columns = [
    "Сайт для взрослых людей",
    "Сайт для молодежи",
    "Сайт для пенсионеров",
    "Сайты для женщин",
    "Сайты для мужчин",
]

# Read the stage-2 frame, pass it through pandas_reduce_mem_usage, then drop
# the columns listed above. `df` is rebound at each step so that no extra
# reference to the large intermediate frame is kept alive.
df = pandas_reduce_mem_usage(
    pd.read_parquet(f"{SEQ2SEQ_DATA_PATH}/stages/stage_2.parquet.gzip")
)
df = df.drop(columns=_audience_columns)
df.head()

Memory usage of dataframe is 45164.78 MB


  0%|          | 0/45 [00:00<?, ?it/s]

Memory usage after optimization is: 45164.78 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,Блог,Интернет-магазин,Интернет-форум,Информационный сайт,Корпоративный сайт,Поисковая система,Порно-сайт,Почтовый сервис,Сайт-сервис,Социальная сеть
0,21,409,1,589,5788,2,1,20368.0,2,1,...,0.02206,0.010498,0.021562,0.127385,0.121726,0.079564,0.008757,0.008911,0.584294,0.015242
1,21,409,1,589,12900,2,1,20368.0,2,1,...,0.024201,0.734256,0.02749,0.03459,0.055687,0.025109,0.016231,0.016883,0.040856,0.024697
2,21,409,1,589,17626,2,1,20368.0,0,1,...,0.08717,0.082425,0.087657,0.096718,0.113347,0.138035,0.067645,0.076701,0.168411,0.081891
3,21,409,1,589,59366,2,1,20368.0,0,1,...,0.028243,0.056895,0.048744,0.062024,0.464972,0.193379,0.031416,0.018421,0.047987,0.04792
4,21,409,1,589,59366,2,1,20368.0,0,1,...,0.028243,0.056895,0.048744,0.062024,0.464972,0.193379,0.031416,0.018421,0.047987,0.04792


In [7]:
# Load the two label columns; rows with a missing value in either are dropped.
target = pandas_reduce_mem_usage(pd.read_parquet(
             f'{LOCAL_DATA_PATH}public_train.pqt', columns=["user_id", "is_male"]
         ).dropna())

# Remove the literal "NA" placeholder and cast the label to int32 in one chained
# expression. `.assign` returns a new frame, so nothing is written into a
# filtered slice of `target` (the pattern that triggers SettingWithCopyWarning).
target = (
    target
    .loc[lambda frame: frame["is_male"] != "NA"]
    .assign(is_male=lambda frame: frame["is_male"].astype(np.int32))
)
target = pandas_reduce_mem_usage(target)

target.head()

Memory usage of dataframe is 6.18 MB


  0%|          | 0/2 [00:00<?, ?it/s]

Memory usage after optimization is: 5.15 MB
Decreased by 16.7%
Memory usage of dataframe is 4.03 MB


  0%|          | 0/2 [00:00<?, ?it/s]

Memory usage after optimization is: 3.28 MB
Decreased by 18.8%


Unnamed: 0,user_id,is_male
350459,350459,1
188276,188276,1
99002,99002,0
155506,155506,0
213873,213873,0


In [8]:
# Free everything except the two frames and the names that cells below still
# reference: `np` and `pandas_reduce_mem_usage` (merge cell) and
# `SEQ2SEQ_DATA_PATH` (path of the stage-3 export). Without them a top-to-bottom
# run stops with a NameError in the merge cell.
my_reset("df", "target", "np", "pandas_reduce_mem_usage", "SEQ2SEQ_DATA_PATH")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [15]:
# Attach the label to every row of `df`; rows whose user_id has no label come
# out of the left join with a missing is_male and are filtered out right away.
df = df.merge(target, on="user_id", how="left")
df = df[df["is_male"].notna()]

# Cast the label column back to int32 and run the memory reduction helper.
df["is_male"] = df["is_male"].astype(np.int32)
df = pandas_reduce_mem_usage(df)
df.head()

Memory usage of dataframe is 25716.93 MB


  0%|          | 0/41 [00:00<?, ?it/s]

Memory usage after optimization is: 25114.19 MB
Decreased by 2.3%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,Интернет-магазин,Интернет-форум,Информационный сайт,Корпоративный сайт,Поисковая система,Порно-сайт,Почтовый сервис,Сайт-сервис,Социальная сеть,is_male
2272,60,732,27,211,5790,2,0,74259.0,3,1,...,0.016404,0.025437,0.026872,0.042031,0.027271,0.013868,0.012945,0.788617,0.021409,1
2273,60,732,27,211,65865,2,0,74259.0,0,3,...,0.081871,0.068195,0.093617,0.091981,0.209684,0.077605,0.076367,0.123756,0.073332,1
2274,60,732,27,211,111474,2,0,74259.0,0,3,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
2275,14,311,27,211,111474,2,0,74259.0,1,2,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
2276,60,732,27,211,125409,2,0,74259.0,0,1,...,0.190411,0.065105,0.131912,0.076863,0.072595,0.035816,0.098164,0.086286,0.130622,1


## CatBoostEncoder

In [16]:
# Identifier column(s).
ids = ["user_id"]

# Categorical columns, in the order used for the categorical feature indexes.
cat_features = (
    ["region_name", "city_name"]
    + ["cpe_manufacturer_name", "cpe_model_name"]
    + ["url_host"]
    + ["cpe_type_cd", "cpe_model_os_type"]
    + ["part_of_day", "domain", "capital_marker"]
)

# Continuous columns, in the order used for the continuous feature indexes.
continous_features = [
    # request, device price and time
    "request_cnt", "price", "timestamp", "relative_timestamp",
    # geography
    "geo_lat", "geo_lon", "population", "timezone",
    # distances to reference cities
    *(f"dist_to_{city}" for city in
      ("Moscow", "SaintP", "Novosibirsk", "Ekaterinburg", "Vladivostok")),
    # site-category columns (column names are Russian in the data)
    "Блог", "Интернет-магазин", "Интернет-форум", "Информационный сайт",
    "Корпоративный сайт", "Поисковая система", "Порно-сайт",
    "Почтовый сервис", "Сайт-сервис", "Социальная сеть",
]

In [17]:
# cbe = CatBoostEncoderWrapper(cat_features=cat_features, sort_col="timestamp", n_folds=10)

In [18]:
# df = cbe.fit_transform(df, target_col="is_male")

In [19]:
# Preview the frame; the encoder cells just above are commented out, so `df`
# has not been transformed since the merge.
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,Интернет-магазин,Интернет-форум,Информационный сайт,Корпоративный сайт,Поисковая система,Порно-сайт,Почтовый сервис,Сайт-сервис,Социальная сеть,is_male
2272,60,732,27,211,5790,2,0,74259.0,3,1,...,0.016404,0.025437,0.026872,0.042031,0.027271,0.013868,0.012945,0.788617,0.021409,1
2273,60,732,27,211,65865,2,0,74259.0,0,3,...,0.081871,0.068195,0.093617,0.091981,0.209684,0.077605,0.076367,0.123756,0.073332,1
2274,60,732,27,211,111474,2,0,74259.0,0,3,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
2275,14,311,27,211,111474,2,0,74259.0,1,2,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
2276,60,732,27,211,125409,2,0,74259.0,0,1,...,0.190411,0.065105,0.131912,0.076863,0.072595,0.035816,0.098164,0.086286,0.130622,1


In [20]:
# cbe.save("../model_zoo", "is_male_cbe")

In [21]:
# Keep `df` and also SEQ2SEQ_DATA_PATH: the export cell right below builds its
# output path from that constant and would otherwise hit a NameError.
my_reset("df", "SEQ2SEQ_DATA_PATH")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [27]:
# Write the labelled frame to the stage-3 parquet file (gzip-compressed).
_stage_3_path = f"{SEQ2SEQ_DATA_PATH}/stages/stage_3_is_male.parquet.gzip"
df.to_parquet(_stage_3_path, compression="gzip")

## Make torch Dataset

In [1]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

# Make the parent directory importable so the project packages imported further
# down (modules, fe_modules, seq2seq_modules) can be found.
_project_root = os.path.abspath("../")
if _project_root not in sys.path:  # re-running this cell must not add duplicates
    sys.path.append(_project_root)

# Restrict OpenBLAS to a single thread for this process.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Enable tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import get_constant_schedule, get_cosine_schedule_with_warmup

In [3]:
import importlib

import modules
import fe_modules
import seq2seq_modules

# Reload the three project packages, in the same order they were imported.
# NOTE(review): importlib.reload re-executes only the module object it is given;
# confirm that this is enough to refresh the submodules imported below.
for _package in (modules, fe_modules, seq2seq_modules):
    importlib.reload(_package)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat
from seq2seq_modules.data import TargetDataset
from seq2seq_modules.models import LSTMModel, StarterBERTModel, AttnLSTMModel, AttentionPoolingBERTModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule, weights_init_xavier
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.trainer import CVTrainer
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import GENDER_METRIC
from seq2seq_modules.utils import fix_random_state

In [4]:
def my_reset(*varnames, force=False):
    """
    Clear the interactive namespace while keeping a chosen set of names.

    Parameters
    ----------
    *varnames : str
        Names of global variables that must survive the reset.
    force : bool, keyword-only, default False
        When True, ``%reset -f`` is issued, so no confirmation prompt is shown.
        The default keeps the interactive confirmation of plain ``%reset``.

    Raises
    ------
    KeyError
        If one of ``varnames`` is not defined. Raised before anything is cleared.

    Notes
    -----
    ``my_reset`` itself is always kept.
    """
    namespace = globals()
    missing = [name for name in varnames if name not in namespace]
    if missing:
        # Fail early: nothing has been reset yet, so no data is lost.
        raise KeyError(f"cannot keep undefined variable(s): {missing}")

    to_save = {name: namespace[name] for name in varnames}
    to_save['my_reset'] = my_reset  # keep this helper available after the reset

    # run_line_magic is the supported replacement for the deprecated .magic() call.
    get_ipython().run_line_magic("reset", "-f" if force else "")
    globals().update(to_save)

In [5]:
# Folder holding the staged seq2seq data, relative to the notebook directory.
LOCAL_DATA_PATH = "../seq2seq_data/"

# Seed handed to the project helper fix_random_state.
SEED = 42
fix_random_state(SEED)

In [10]:
# Identifier column(s).
ids = ["user_id"]

# Categorical columns, in the order used for the categorical feature indexes.
cat_features = (
    ["region_name", "city_name"]
    + ["cpe_manufacturer_name", "cpe_model_name"]
    + ["url_host"]
    + ["cpe_type_cd", "cpe_model_os_type"]
    + ["part_of_day", "domain", "capital_marker"]
)

# Continuous columns, in the order used for the continuous feature indexes.
continous_features = [
    # request, device price and time
    "request_cnt", "price", "timestamp", "relative_timestamp",
    # geography
    "geo_lat", "geo_lon", "population", "timezone",
    # distances to reference cities
    *(f"dist_to_{city}" for city in
      ("Moscow", "SaintP", "Novosibirsk", "Ekaterinburg", "Vladivostok")),
    # site-category columns (column names are Russian in the data)
    "Блог", "Интернет-магазин", "Интернет-форум", "Информационный сайт",
    "Корпоративный сайт", "Поисковая система", "Порно-сайт",
    "Почтовый сервис", "Сайт-сервис", "Социальная сеть",
]

In [7]:
# Read the stage-3 parquet file and pass the frame through
# pandas_reduce_mem_usage; `df` is rebound so only one copy stays referenced.
_stage_3_path = f"{LOCAL_DATA_PATH}/stages/stage_3_is_male.parquet.gzip"
df = pd.read_parquet(_stage_3_path)
df = pandas_reduce_mem_usage(df)
df.head()

Memory usage of dataframe is 25114.19 MB


  0%|          | 0/41 [00:00<?, ?it/s]

Memory usage after optimization is: 25114.19 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,Интернет-магазин,Интернет-форум,Информационный сайт,Корпоративный сайт,Поисковая система,Порно-сайт,Почтовый сервис,Сайт-сервис,Социальная сеть,is_male
2272,60,732,27,211,5790,2,0,74259.0,3,1,...,0.016404,0.025437,0.026872,0.042031,0.027271,0.013868,0.012945,0.788617,0.021409,1
2273,60,732,27,211,65865,2,0,74259.0,0,3,...,0.081871,0.068195,0.093617,0.091981,0.209684,0.077605,0.076367,0.123756,0.073332,1
2274,60,732,27,211,111474,2,0,74259.0,0,3,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
2275,14,311,27,211,111474,2,0,74259.0,1,2,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
2276,60,732,27,211,125409,2,0,74259.0,0,1,...,0.190411,0.065105,0.131912,0.076863,0.072595,0.035816,0.098164,0.086286,0.130622,1


In [11]:
# Run a garbage-collection pass before building the dataset; the number shown
# as the cell output is the return value of gc.collect().
gc.collect()

870

In [12]:
# Build the torch dataset from `df`.
# NOTE(review): presumably rows are grouped by `agg_column`, ordered by
# `time_column` and padded/truncated to `max_len` — confirm in
# seq2seq_modules.data.TargetDataset.
dataset = TargetDataset(
    df,
    cat_features=cat_features,
    cont_features=continous_features,
    target_column="is_male",
    agg_column="user_id",
    time_column="timestamp",
    padding_side="left",
    max_len=1024,
)

# Values of the aggregation column as exposed by the dataset; handed to the
# trainer as `user_ids` further down.
user_ids = dataset.get_agg_col()
dataset

  0%|          | 0/210673077 [00:00<?, ?it/s]

<seq2seq_modules.data.TargetDataset at 0x7f91306276a0>

## Feed to the model

In [13]:
# Position of every categorical column and, per position, its vocabulary size,
# taken as the largest value found in `df` plus one.
cat_feature_indexes = []
vocab_sizes = {}
for position, column in enumerate(tqdm(cat_features)):
    cat_feature_indexes.append(position)
    vocab_sizes[position] = int(df[column].max() + 1)

# Position of every continuous column (numbering restarts at 0).
cont_feature_indexes = [
    position for position, _ in enumerate(tqdm(continous_features))
]

100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 36.59it/s]
100%|███████████████████████████████████████| 23/23 [00:00<00:00, 810663.80it/s]


In [14]:
# Gather element 3 of every dataset item into one tensor along a new leading
# axis; it is passed to the trainer as the stratification array.
targets = torch.stack([sample[3] for sample in tqdm(dataset)], dim=0)
targets

100%|████████████████████████████████| 264326/264326 [00:20<00:00, 12844.15it/s]


tensor([0, 0, 0,  ..., 0, 1, 0])

In [15]:
# Alternative architectures, kept commented out for reference.
#
# model = AttentionPoolingBERTModel(
#         cat_feature_indexes=cat_feature_indexes,
#         vocab_sizes=vocab_sizes,
#         cont_feature_indexes=cont_feature_indexes,
#         encoder_hidden_dim=16,
#         hidden_dim=256,
#         dim_feedforward=512,
#         output_dim=7,
#         pe_type="trainable",
#         use_mask=False,
#         max_len=1024,
#         use_key_padding_mask=True,
# )

# model = StarterBERTModel(
#         cat_feature_indexes=cat_feature_indexes,
#         vocab_sizes=vocab_sizes,
#         cont_feature_indexes=cont_feature_indexes,
#         encoder_hidden_dim=16,
#         hidden_dim=256,
#         dim_feedforward=512,
#         output_dim=2,
#         pe_type="trainable",
#         use_mask=False,
#         max_len=1024,
#         use_key_padding_mask=False,
#         starter="randn",
#         shared=False
# )

# Active model: a 3-layer LSTM with two output units.
lstm_kwargs = dict(
    # feature layout computed in the previous cells
    cat_feature_indexes=cat_feature_indexes,
    cont_feature_indexes=cont_feature_indexes,
    vocab_sizes=vocab_sizes,
    # network size
    encoder_hidden_dim=128,
    hidden_dim=256,
    num_layers=3,
    dropout=0.2,
    output_dim=2,
)
model = LSTMModel(**lstm_kwargs)

# Initialise the parameters with the project's Xavier helper.
weights_init_xavier(model)

# Metric object passed to the trainer as `metric_func`.
metric = GENDER_METRIC

In [16]:
# Wrap `model` in the project's CVTrainer with 6 folds. `model_name` is reused
# further down as the file name under MODEL_ZOO.
# NOTE(review): the name says "attn_rnn" while `model` is an LSTMModel — confirm
# that the label is still the intended one.
trainer = CVTrainer(
    model=model,
    n_folds=6,
    model_name="is_male_attn_rnn_text_features_6_folds",
)

In [17]:
# Use the GPU when torch can see one; otherwise fall back to the CPU so the
# cell does not fail outright on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Run the cross-validated training. The call returns the per-fold train and
# eval metrics plus the embeddings and logits frames that are saved below.
train_fold_metrics, eval_fold_metrics, embeddings_df, logits_df = trainer.fit_transform(
                          dataset=dataset,
                          loss_function=nn.CrossEntropyLoss,
                          metric_func=metric,
                          optimizer=torch.optim.AdamW,
                          get_scheduler=get_cosine_schedule_with_warmup,
                          strat_array=targets,
                          target_name="is_male",
                          user_ids=user_ids,
                          shuffle=True,
                          epochs=10,
                          lr=1e-3,
                          weight_decay=1e-2,
                          num_warmup_steps=0,
                          batch_size=256,
                          random_state=69,  # NOTE(review): differs from SEED (42) — confirm intent
                          device=device,
)

FOLD 0
--------------------------------


  0%|          | 0/861 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

EPOCH: 0
{'Gender GINI': 0.341041885845627, 'loss': 0.6436580294127004}
{'Gender GINI': 0.5698997856694976, 'loss': 0.5629423375032109}


  0%|          | 0/861 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

EPOCH: 1
{'Gender GINI': 0.6405406942996863, 'loss': 0.5197133050991705}
{'Gender GINI': 0.6899293356608984, 'loss': 0.49142748239940415}


  0%|          | 0/861 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

EPOCH: 2
{'Gender GINI': 0.7032836578778106, 'loss': 0.47846577544136487}
{'Gender GINI': 0.7013654099114266, 'loss': 0.4830894497113078}


  0%|          | 0/861 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

EPOCH: 3
{'Gender GINI': 0.7310066244521263, 'loss': 0.45806115161932487}
{'Gender GINI': 0.7095552211706033, 'loss': 0.4772160448441858}


  0%|          | 0/861 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

EPOCH: 4
{'Gender GINI': 0.756028157661579, 'loss': 0.4382069750198308}
{'Gender GINI': 0.703874399615134, 'loss': 0.48342964207293143}


  0%|          | 0/861 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

EPOCH: 5
{'Gender GINI': 0.7786964998640933, 'loss': 0.4189379297956428}
{'Gender GINI': 0.7029514052299732, 'loss': 0.48851722700239253}


  0%|          | 0/861 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

EPOCH: 6
{'Gender GINI': 0.7948128011767577, 'loss': 0.4042036598313632}
{'Gender GINI': 0.6980030954643559, 'loss': 0.49546727454569495}


  0%|          | 0/861 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

EPOCH: 7
{'Gender GINI': 0.8064978610832512, 'loss': 0.39313067337200186}
{'Gender GINI': 0.6925289381729594, 'loss': 0.5038989505365383}


  0%|          | 0/861 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

EPOCH: 8
{'Gender GINI': 0.8166605441820587, 'loss': 0.38310465020602874}
{'Gender GINI': 0.6886746180971512, 'loss': 0.5103732664547944}


  0%|          | 0/861 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [16]:
# Output folders, relative to the notebook directory:
# cross-validation features and saved models.
CV_FEATURES = '../cv_data/'
MODEL_ZOO = '../model_zoo/'

In [17]:
# Save the trained model under MODEL_ZOO, using the trainer's own model name
# as the file name (MODEL_ZOO already ends with a slash).
trainer.save_model(model_name=f"{MODEL_ZOO}{trainer.model_name}")

In [18]:
# Write the embeddings and logits frames returned by trainer.fit_transform.
# NOTE(review): the file names say "20_folds" while the trainer in this
# notebook is built with n_folds=6 — confirm which one is intended.
for _frame, _path in (
    (embeddings_df, f"{CV_FEATURES}/is_male_cv_embeddings_20_folds.parquet.gzip"),
    (logits_df, f"{CV_FEATURES}/is_male_cv_logits_20_folds.parquet.gzip"),
):
    _frame.to_parquet(_path, compression='gzip')

In [15]:
# Scratch cell: evaluates the literal 1 and does not touch any notebook state.
1