# seq2seq: gender (`is_male`) classification from per-user event sequences

In [1]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

# Put the project's local package directories on the import path.
for _local_dir in ("modules/", "fe_modules/", "seq2seq_modules/"):
    sys.path.append(os.path.abspath(_local_dir))

# Pin OpenBLAS to a single thread. This runs before numpy/implicit are
# imported in the next cell (presumably to avoid thread oversubscription).
os.environ["OPENBLAS_NUM_THREADS"] = "1"
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Register tqdm's progress_apply helpers on pandas objects.
tqdm.pandas()

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from transformers import get_constant_schedule

In [3]:
import importlib

import modules
import fe_modules
import seq2seq_modules

# Re-import the local packages so edits made on disk are picked up without
# restarting the kernel.
# NOTE(review): importlib.reload() on a package re-executes only its
# __init__; already-imported submodules (e.g. modules.memory_utils) are not
# reloaded unless __init__ does so itself — confirm, or use %autoreload 2.
importlib.reload(modules)
importlib.reload(fe_modules)
importlib.reload(seq2seq_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat, my_reset, get_suitable_for_parquet
from seq2seq_modules.preprocessing import PandasPreprocessor
from seq2seq_modules.models import LSTMModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import AGE_METRIC

## Read and process

In [4]:
# Directory holding the raw competition files (used to locate public_train.pqt).
LOCAL_DATA_PATH = './data/'
# NOTE(review): SPLIT_SEED is not referenced by any cell visible in this session.
SPLIT_SEED = 42

In [5]:
# Column that identifies the user each event row belongs to.
ids = ["user_id"]

# Integer-encoded categorical columns of the event table.
# Every entry ends with a comma on purpose: adjacent string literals without
# a comma are silently concatenated by Python (the original list contained
# the bogus name "domaincapital_marker" because of that).
cat_features = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "url_host",
    "cpe_type_cd",
    "cpe_model_os_type",
    "part_of_day",
    "domain",
    "capital_marker",
]

# Numeric (continuous) columns of the event table.
# Every entry ends with a comma on purpose: without it Python concatenates
# adjacent string literals (the original list contained the bogus name
# "relative_dategeo_lat" because of that).
continous_features = [
    "request_cnt",
    "price",
    "timestamp",
    "relative_date",
    "geo_lat",
    "geo_lon",
    "population",
    "timezone",
]

In [6]:
# Load the stage-2 event table and pass it through pandas_reduce_mem_usage
# to shrink its memory footprint (see the logged before/after sizes below).
# The read is nested inside the call so no extra reference to the
# un-reduced frame is kept around.
df = pandas_reduce_mem_usage(
    pd.read_parquet("seq2seq_data/stages/stage_2.parquet.gzip")
)
df.head()

Memory usage of dataframe is 17877.72 MB


100%|███████████████████████████████████████████| 19/19 [00:10<00:00,  1.88it/s]

Memory usage after optimization is: 15995.86 MB
Decreased by 10.5%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,capital_marker,timezone,geo_lat,geo_lon,population,domain,timestamp,relative_date
0,21,409,1,589,5788,2,1,20368.0,2,1,45098,2.0,3,45.03125,38.96875,744933,761,165525120.0,3888.0
1,21,409,1,589,12900,2,1,20368.0,2,1,45098,2.0,3,45.03125,38.96875,744933,549,165559680.0,432.0
2,21,409,1,589,17626,2,1,20368.0,0,1,45098,2.0,3,45.03125,38.96875,744933,712,165499200.0,6264.0
3,21,409,1,589,59366,2,1,20368.0,0,1,45098,2.0,3,45.03125,38.96875,744933,712,165265920.0,29592.0
4,21,409,1,589,59366,2,1,20368.0,0,1,45098,2.0,3,45.03125,38.96875,744933,712,165386880.0,17496.0


In [7]:
# Load the gender labels: one (user_id, is_male) pair per labelled user.
# Rows with missing values are dropped right after reading.
target = pd.read_parquet(
    f'{LOCAL_DATA_PATH}public_train.pqt', columns=["user_id", "is_male"]
).dropna()
# Labels may also carry the literal string "NA"; remove those rows too.
# .copy() makes the filtered frame independent, so the cast below is a plain
# column assignment rather than a write to a slice (the resulting
# SettingWithCopyWarning was only hidden by the global warnings filter).
target = target[target["is_male"] != "NA"].copy()
target["is_male"] = target["is_male"].astype(np.int32)
target = pandas_reduce_mem_usage(target)

target.head()

Memory usage of dataframe is 5.04 MB


100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 1992.54it/s]

Memory usage after optimization is: 3.28 MB
Decreased by 35.0%





Unnamed: 0,user_id,is_male
350459,350459,1
188276,188276,1
99002,99002,0
155506,155506,0
213873,213873,0


In [8]:
# Attach the gender label to every event row and keep only labelled users.
# An inner join replaces the original left join + dropna(subset=["is_male"]):
# `target` has no missing labels at this point, so both select the same rows
# in the same order, but the inner join never materialises a NaN-padded
# (float) is_male column across the whole event table.
# NOTE: the result has a fresh 0..n-1 index instead of the gapped index that
# dropna used to leave behind.
df = df.merge(target, how="inner", on="user_id")
df["is_male"] = df["is_male"].astype(np.int32)
df = pandas_reduce_mem_usage(df)
df.head()

KeyboardInterrupt: 

In [11]:
# Inspect the per-column dtypes of the labelled event table.
df.dtypes

region_name                uint8
city_name                 uint16
cpe_manufacturer_name      uint8
cpe_model_name            uint16
url_host                  uint32
cpe_type_cd                uint8
cpe_model_os_type          uint8
price                    float32
part_of_day                uint8
request_cnt                 int8
user_id                    int32
capital_marker           float16
timezone                    int8
geo_lat                  float16
geo_lon                  float16
population                 int32
domain                    uint16
timestamp                float32
relative_date            float32
is_male                     int8
dtype: object

In [15]:
# Persist the labelled event table as gzip-compressed parquet; the second
# half of this notebook reloads it from this path.
# get_suitable_for_parquet is applied first — presumably it converts dtypes
# parquet cannot store (e.g. float16); verify against modules.memory_utils.
get_suitable_for_parquet(df).to_parquet("seq2seq_data/version_2_is_male.parquet.gzip",
              compression='gzip')

100%|███████████████████████████████████████████| 20/20 [00:13<00:00,  1.48it/s]


## Make torch Dataset

In [1]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

# Put the project's local package directories on the import path.
for _local_dir in ("modules/", "fe_modules/", "seq2seq_modules/"):
    sys.path.append(os.path.abspath(_local_dir))

# Pin OpenBLAS to a single thread. This runs before numpy/implicit are
# imported in the next cell (presumably to avoid thread oversubscription).
os.environ["OPENBLAS_NUM_THREADS"] = "1"
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Register tqdm's progress_apply helpers on pandas objects.
tqdm.pandas()

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import get_constant_schedule, get_cosine_schedule_with_warmup

In [3]:
import importlib

import modules
import fe_modules
import seq2seq_modules

# Re-import the local packages so edits made on disk are picked up without
# restarting the kernel.
# NOTE(review): importlib.reload() on a package re-executes only its
# __init__; already-imported submodules (e.g. seq2seq_modules.models) are not
# reloaded unless __init__ does so itself — confirm, or use %autoreload 2.
importlib.reload(modules)
importlib.reload(fe_modules)
importlib.reload(seq2seq_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat, my_reset
from seq2seq_modules.preprocessing import PandasPreprocessor
from seq2seq_modules.models import LSTMModel, StarterBERTModel, AttentionPoolingBERTModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule, weights_init_xavier
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import GENDER_METRIC

In [4]:
# Directory holding the raw competition files.
LOCAL_DATA_PATH = './data/'
# NOTE(review): SPLIT_SEED is not referenced by the cells below; the
# cross_validation call passes random_state=69 instead.
SPLIT_SEED = 42

In [5]:
# Column that identifies the user each event row belongs to.
ids = ["user_id"]

# Integer-encoded categorical columns.
# Order matters: the "Feed to the model" section derives feature indexes from
# list positions (categoricals occupy 0..len(cat_features)-1).
cat_features = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "url_host",
    "cpe_type_cd",
    "cpe_model_os_type",
    "part_of_day",
    "domain",
    "capital_marker",
]

# Numeric columns; their indexes follow directly after the categorical ones.
# NOTE(review): this assumes PandasPreprocessor emits features in exactly the
# order of `cat_features + continous_features` — TODO confirm.
continous_features = [
    "request_cnt",
    "price",
    "timestamp",
    "relative_date",
    "geo_lat",
    "geo_lon",
    "population",
    "timezone",
]

In [6]:
# Reload the labelled event table written at the end of "Read and process"
# and shrink its memory footprint again after the parquet round trip.
df = pandas_reduce_mem_usage(
    pd.read_parquet("seq2seq_data/version_2_is_male.parquet.gzip")
)

Memory usage of dataframe is 11652.98 MB


100%|███████████████████████████████████████████| 20/20 [00:06<00:00,  2.92it/s]

Memory usage after optimization is: 10447.50 MB
Decreased by 10.3%





In [7]:
# Force a full garbage-collection pass after loading; the displayed value is
# the number of unreachable objects the collector found.
gc.collect()

20

In [8]:
# Configure the sequence builder. The feature list is categoricals followed
# by continuous columns, matching the index lists built later.
# NOTE(review): presumably rows are grouped per `agg_column`, ordered by
# `time_column`, and cut/padded (on the left) to `max_len` events — confirm
# against seq2seq_modules.preprocessing.PandasPreprocessor.
preprocessor = PandasPreprocessor(
         agg_column="user_id", 
         time_column="timestamp",
         target_column="is_male",
         features=cat_features + continous_features,
         max_len=1024,
         padding_side="left"
)

In [9]:
# Run the preprocessor over the whole event table. It returns three objects
# that the next cell wraps in a TensorDataset.
input_features, attention_masks, targets = preprocessor.transform(df)

100%|████████████████████████| 210673077/210673077 [00:28<00:00, 7368708.97it/s]
100%|████████████████████████████████| 264325/264325 [00:08<00:00, 30799.06it/s]
100%|████████████████████████████████| 264325/264325 [00:08<00:00, 31672.21it/s]
100%|███████████████████████████████| 264325/264325 [00:00<00:00, 442396.17it/s]
100%|███████████████████████████████| 264325/264325 [00:01<00:00, 211492.55it/s]
100%|███████████████████████████████| 264325/264325 [00:00<00:00, 525799.31it/s]


In [10]:
# Bundle the three tensors so that indexing the dataset yields one
# (features, attention mask, target) triple per sample.
dataset = TensorDataset(input_features, attention_masks, targets)

In [11]:
# Preview the model input columns in the order they were passed as `features`.
df[cat_features + continous_features].head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,part_of_day,domain,capital_marker,request_cnt,price,timestamp,relative_date,geo_lat,geo_lon,population,timezone
2272,60,732,27,211,5790,2,0,3,761,0.0,1,74259.0,163261440.0,10584.0,59.9375,30.3125,4848742,3
2273,60,732,27,211,65865,2,0,0,549,0.0,3,74259.0,162950400.0,41256.0,59.9375,30.3125,4848742,3
2274,60,732,27,211,111474,2,0,0,761,0.0,3,74259.0,162941760.0,42120.0,59.9375,30.3125,4848742,3
2275,14,311,27,211,111474,2,0,1,761,2.0,2,74259.0,163114560.0,24624.0,54.71875,20.5,431491,2
2276,60,732,27,211,125409,2,0,0,549,0.0,1,74259.0,163028160.0,33480.0,59.9375,30.3125,4848742,3


## Feed to the model

In [17]:
# Positional bookkeeping for the model: which feature slots are categorical,
# which are continuous, and how many distinct ids each categorical slot needs.
cat_feature_indexes = []
cont_feature_indexes = []
vocab_sizes = {}

# Categorical features occupy the leading positions. The vocabulary size is
# the largest encoded id in the column plus one.
for position, column in enumerate(tqdm(cat_features)):
    cat_feature_indexes.append(position)
    vocab_sizes[position] = int(df[column].max() + 1)

# Continuous features are numbered right after the categorical block.
first_cont_index = len(cat_features)
for offset, _ in enumerate(tqdm(continous_features)):
    cont_feature_indexes.append(first_cont_index + offset)

100%|███████████████████████████████████████████| 10/10 [00:01<00:00,  5.41it/s]
100%|█████████████████████████████████████████| 8/8 [00:00<00:00, 307838.83it/s]


In [18]:
# Inspect the vocabulary size computed for each categorical feature index.
vocab_sizes

{0: 81, 1: 985, 2: 37, 3: 599, 4: 199683, 5: 4, 6: 2, 7: 4, 8: 869, 9: 3}

In [19]:
# Inspect the positions assigned to the categorical features.
cat_feature_indexes

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [20]:
# Inspect the positions assigned to the continuous features.
cont_feature_indexes

[10, 11, 12, 13, 14, 15, 16, 17]

In [21]:
# model = AttentionPoolingBERTModel(
#         cat_feature_indexes=cat_feature_indexes,
#         vocab_sizes=vocab_sizes,
#         cont_feature_indexes=cont_feature_indexes,
#         encoder_hidden_dim=16,
#         hidden_dim=256,
#         dim_feedforward=512,
#         output_dim=7,
#         pe_type="trainable",
#         use_mask=False,
#         max_len=1024,
#         use_key_padding_mask=True,
# )

# model = StarterBERTModel(
#         cat_feature_indexes=cat_feature_indexes,
#         vocab_sizes=vocab_sizes,
#         cont_feature_indexes=cont_feature_indexes,
#         encoder_hidden_dim=16,
#         hidden_dim=256,
#         dim_feedforward=512,
#         output_dim=1,
#         pe_type="trainable",
#         use_mask=False,
#         max_len=1024,
#         use_key_padding_mask=False,
#         starter="randn",
#         shared=True
# )

# Active model: the project's LSTM classifier. output_dim=2 gives one logit
# per gender class, which is what the CrossEntropyLoss defined below expects.
model = LSTMModel(
        cat_feature_indexes=cat_feature_indexes,
        vocab_sizes=vocab_sizes,
        cont_feature_indexes=cont_feature_indexes,
        encoder_hidden_dim=16,
        hidden_dim=256,
        output_dim=2,
)


# Apply the project's Xavier initialisation helper to the freshly built model.
weights_init_xavier(model)

# Class-balanced cross-entropy.
# Both `classes` and `y` come from the per-sequence `targets`, so the label
# set always matches the labels being weighted and the multi-GB event table
# is not scanned again. np.unique returns a sorted ndarray, which is the type
# scikit-learn documents (and, in recent versions, validates) for `classes`.
target_labels = targets.numpy()
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(target_labels),
    y=target_labels,
)

# NOTE(review): the weights keep numpy's float64 dtype, as before. If the
# model emits float32 logits, confirm the training loop casts the loss module.
loss = nn.CrossEntropyLoss(weight=torch.tensor(class_weights))

# Evaluation metric handed to the training loop.
metric = GENDER_METRIC

In [22]:
# Sanity check: the label values in the per-sequence targets should match
# the label values in the event table.
targets.unique(), sorted(df["is_male"].unique())

(tensor([0, 1]), [0, 1])

In [23]:
# Launch 5-fold training via the project's cross_validation loop, with folds
# stratified on the gender target.
# NOTE(review): `project_name` is a placeholder profanity — replace it with a
# descriptive name before sharing this notebook or its experiment logs.
# NOTE(review): random_state=69 is hard-coded while SPLIT_SEED (42) is defined
# above and never used — pick one source of truth.
# NOTE(review): a single `model` instance is passed for all folds; confirm
# that cross_validation re-initialises or copies it per fold, otherwise
# weights leak between folds.
# NOTE(review): dataloader_shuffle=False — presumably disables batch
# shuffling inside the loop; confirm this is intended for training.
cross_validation(
        project_name="ХУЙ", 
        model=model, 
        dataset=dataset, 
        loss_function=loss, 
        metric_func=metric, 
        optimizer=torch.optim.AdamW, 
        get_scheduler=get_cosine_schedule_with_warmup, 
        strat_array=targets.numpy(), 
        device='cuda', 
        random_state=69, 
        shuffle=True, 
        dataloader_shuffle=False, 
        n_folds=5, 
        epochs=20, 
        lr=1e-3, 
        weight_decay=1e-4,
        start_fold=0, 
        batch_size=32,
)

FOLD 0
--------------------------------


100%|███████████████████████████████████████| 6609/6609 [06:11<00:00, 17.77it/s]
100%|███████████████████████████████████████| 1653/1653 [00:37<00:00, 44.60it/s]


EPOCH: 0
{'Gender GINI': 0.37190715383095174, 'loss': 0.6336282183798879}
{'Gender GINI': 0.5534749987459824, 'loss': 0.571661541199898}


100%|███████████████████████████████████████| 6609/6609 [06:12<00:00, 17.72it/s]
100%|███████████████████████████████████████| 1653/1653 [00:37<00:00, 43.80it/s]


EPOCH: 1
{'Gender GINI': 0.6095164467200975, 'loss': 0.5375130577941287}
{'Gender GINI': 0.647095167422788, 'loss': 0.5245620326219187}


100%|███████████████████████████████████████| 6609/6609 [06:20<00:00, 17.36it/s]
100%|███████████████████████████████████████| 1653/1653 [00:37<00:00, 44.44it/s]


EPOCH: 2
{'Gender GINI': 0.6750481722121238, 'loss': 0.49765217672404055}
{'Gender GINI': 0.69095183536766, 'loss': 0.497561290063874}


100%|███████████████████████████████████████| 6609/6609 [06:11<00:00, 17.80it/s]
100%|███████████████████████████████████████| 1653/1653 [00:37<00:00, 44.65it/s]


EPOCH: 3
{'Gender GINI': 0.709173280806815, 'loss': 0.4739603243900369}
{'Gender GINI': 0.7025362261836814, 'loss': 0.4825444808629083}


100%|███████████████████████████████████████| 6609/6609 [06:22<00:00, 17.30it/s]
100%|███████████████████████████████████████| 1653/1653 [00:37<00:00, 43.72it/s]


EPOCH: 4
{'Gender GINI': 0.7294813114381848, 'loss': 0.4589852727808074}
{'Gender GINI': 0.7074526321718062, 'loss': 0.4858053710865994}


100%|███████████████████████████████████████| 6609/6609 [06:21<00:00, 17.35it/s]
100%|███████████████████████████████████████| 1653/1653 [00:38<00:00, 43.26it/s]


EPOCH: 5
{'Gender GINI': 0.7426985681421625, 'loss': 0.44865111160247734}
{'Gender GINI': 0.7127541782799516, 'loss': 0.4762871996296752}


100%|███████████████████████████████████████| 6609/6609 [06:19<00:00, 17.42it/s]
100%|███████████████████████████████████████| 1653/1653 [00:36<00:00, 44.69it/s]


EPOCH: 6
{'Gender GINI': 0.7533702460164444, 'loss': 0.440293341694653}
{'Gender GINI': 0.713153974501425, 'loss': 0.47850431098652657}


100%|███████████████████████████████████████| 6609/6609 [06:12<00:00, 17.72it/s]
100%|███████████████████████████████████████| 1653/1653 [00:37<00:00, 44.52it/s]


EPOCH: 7
{'Gender GINI': 0.7629992819044389, 'loss': 0.4321631045733098}
{'Gender GINI': 0.7157956816526003, 'loss': 0.4750208519024249}


 90%|███████████████████████████████████▎   | 5978/6609 [05:40<00:35, 17.58it/s]


KeyboardInterrupt: 