# seq2seq

In [1]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

# Put the parent directory on the import path; the project packages imported
# further down are presumably located there (confirm against the repo layout).
sys.path.append(os.path.abspath(os.pardir))
# Set before numpy/scipy/implicit are imported in the next cell.
# NOTE(review): presumably meant to stop OpenBLAS from oversubscribing threads - confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"
# Silences every warning for the whole session, including deprecation and
# pandas copy warnings.
warnings.filterwarnings("ignore")
# Registers the tqdm progress_apply helpers on pandas objects.
tqdm.pandas()

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from transformers import get_constant_schedule

In [3]:
import importlib

import modules
import fe_modules
import seq2seq_modules

importlib.reload(modules)
importlib.reload(fe_modules)
importlib.reload(seq2seq_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat
from seq2seq_modules.preprocessing import TargetPandasPreprocessor
from seq2seq_modules.models import LSTMModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import AGE_METRIC

ModuleNotFoundError: No module named 'modules'

## Read and process

In [None]:
# Folder holding the raw competition files. The trailing slash matters:
# paths below are built by plain string concatenation.
LOCAL_DATA_PATH = "./data/"
# Seed constant; it is not referenced by any of the visible cells.
SPLIT_SEED = 42

In [7]:
# Column(s) identifying the entity each event row belongs to.
ids = ["user_id"]

# Categorical columns of the event table.
# A comma was missing after "domain"; Python's implicit string concatenation
# silently turned the last two entries into the bogus "domaincapital_marker".
cat_features = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "url_host",
    "cpe_type_cd",
    "cpe_model_os_type",
    "part_of_day",
    "domain",
    "capital_marker",
]

# Continuous columns of the event table (the identifier keeps its historical
# spelling because other cells refer to it by this name).
# A comma was missing after "relative_timestamp", which fused it with the next
# entry. The coordinate columns use the "_x" suffix: the loaded frame carries
# suffixed geo columns (geo_lat_y / geo_lon_y are visible in its preview), and
# the training section of this notebook uses geo_lat_x / geo_lon_x.
continous_features = [
    "request_cnt",
    "price",
    "timestamp",
    "relative_timestamp",
    "geo_lat_x",
    "geo_lon_x",
    "population",
    "timezone",
    "dist_to_Moscow",
    "dist_to_SaintP",
    "dist_to_Novosibirsk",
    "dist_to_Ekaterinburg",
    "dist_to_Vladivostok",
]

In [8]:
# Load the stage-6 event table and pass it straight through the project's
# memory reducer. The read stays nested inside the call, so no second name
# holds on to the freshly loaded frame (about 28 GB according to the log
# printed by the reducer).
# NOTE(review): the path is relative to the working directory and does not use
# LOCAL_DATA_PATH - confirm this is intentional.
df = pandas_reduce_mem_usage(
    pd.read_parquet("seq2seq_data/stages/stage_6.parquet.gzip")
)
# Preview the first rows.
df.head()

Memory usage of dataframe is 28855.27 MB


  0%|          | 0/32 [00:00<?, ?it/s]

Memory usage after optimization is: 28855.27 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,geo_lat_y,geo_lon_y,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain,timestamp,relative_timestamp
0,21,409,1,589,5788,2,1,20368.0,2,1,...,45.040161,38.975964,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,761,165528352.0,388.799988
1,21,409,1,589,12900,2,1,20368.0,2,1,...,45.040161,38.975964,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,549,165562912.0,43.200001
2,21,409,1,589,17626,2,1,20368.0,0,1,...,45.040161,38.975964,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712,165504608.0,626.23999
3,21,409,1,589,59366,2,1,20368.0,0,1,...,45.040161,38.975964,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712,165271328.0,2959.040039
4,21,409,1,589,59366,2,1,20368.0,0,1,...,45.040161,38.975964,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712,165392288.0,1749.439941


In [9]:
# Build the per-user gender label table in one chain:
#   1. read only the two needed columns,
#   2. drop rows with a missing value in either column,
#   3. drop rows whose label is the literal string "NA",
#   4. cast the remaining labels to int32.
target = (
    pd.read_parquet(
        f"{LOCAL_DATA_PATH}public_train.pqt",
        columns=["user_id", "is_male"],
    )
    .dropna()
    .loc[lambda labels: labels["is_male"] != "NA"]
    .assign(is_male=lambda labels: labels["is_male"].astype(np.int32))
)
# Downcast dtypes with the project helper.
target = pandas_reduce_mem_usage(target)

target.head()

Memory usage of dataframe is 5.04 MB


  0%|          | 0/2 [00:00<?, ?it/s]

Memory usage after optimization is: 3.28 MB
Decreased by 35.0%


Unnamed: 0,user_id,is_male
350459,350459,1
188276,188276,1
99002,99002,0
155506,155506,0
213873,213873,0


In [10]:
# Attach the gender label to every event row. A left join followed by
# dropna(subset=["is_male"]) keeps only the rows that received a label.
# NOTE(review): this cell rebinds `df`, so it cannot be re-run without first
# reloading the frame (a second merge would meet an existing `is_male` column).
df = df.merge(target, how="left", on="user_id").dropna(subset=["is_male"])
# Cast the label back to int32.
# NOTE(review): the left join presumably upcasts it to float to hold NaN for
# unlabelled rows - confirm.
df["is_male"] = df["is_male"].astype(np.int32)
# Downcast dtypes with the project helper, then preview.
df = pandas_reduce_mem_usage(df)
df.head()

Memory usage of dataframe is 19287.70 MB


  0%|          | 0/33 [00:00<?, ?it/s]

Memory usage after optimization is: 18684.96 MB
Decreased by 3.1%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,geo_lon_y,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain,timestamp,relative_timestamp,is_male
2272,60,732,27,211,5790,2,0,74259.0,3,1,...,30.315901,635.758972,0.024454,3115.404541,1787.926147,6555.964355,761,163262528.0,1058.23999,1
2273,60,732,27,211,65865,2,0,74259.0,0,3,...,30.315901,635.758972,0.024454,3115.404541,1787.926147,6555.964355,549,162955808.0,4125.439941,1
2274,60,732,27,211,111474,2,0,74259.0,0,3,...,30.315901,635.758972,0.024454,3115.404541,1787.926147,6555.964355,761,162947168.0,4211.839844,1
2275,14,311,27,211,111474,2,0,74259.0,1,2,...,20.507324,1092.055054,828.233154,3870.873535,2490.750244,7378.328613,761,163122112.0,2462.399902,1
2276,60,732,27,211,125409,2,0,74259.0,0,1,...,30.315901,635.758972,0.024454,3115.404541,1787.926147,6555.964355,549,163033568.0,3347.840088,1


In [11]:
# Persist the labelled event table as gzip-compressed parquet.
# NOTE(review): the training section reads this file through its own
# LOCAL_DATA_PATH ('../seq2seq_data/'); make sure both resolve to the same
# directory from the notebook's working directory.
df.to_parquet(
    "seq2seq_data/version_4_is_male.parquet.gzip",
    compression="gzip",
)

## Make torch Dataset

In [1]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

# Put the parent directory on the import path; the project packages imported
# further down are presumably located there (confirm against the repo layout).
sys.path.append(os.path.abspath(os.pardir))
# Set before numpy/scipy/implicit are imported in a later cell.
# NOTE(review): presumably meant to stop OpenBLAS from oversubscribing threads - confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"
# Silences every warning for the whole session, including deprecation and
# pandas copy warnings.
warnings.filterwarnings("ignore")
# Registers the tqdm progress_apply helpers on pandas objects.
tqdm.pandas()

In [2]:
# Debug check: display the import search path to confirm that the parent
# directory appended during setup is present.
# NOTE(review): the rendered output contains absolute local paths; consider
# clearing it before sharing the notebook.
sys.path

['/home/leffff/PycharmProjects/mts-ml-cup/sandbox',
 '/usr/lib/python310.zip',
 '/usr/lib/python3.10',
 '/usr/lib/python3.10/lib-dynload',
 '',
 '/home/leffff/PycharmProjects/mts-ml-cup/venv/lib/python3.10/site-packages',
 '/home/leffff/PycharmProjects/mts-ml-cup']

In [3]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import get_constant_schedule, get_cosine_schedule_with_warmup

In [4]:
import importlib

import modules
import fe_modules
import seq2seq_modules

importlib.reload(modules)
importlib.reload(fe_modules)
importlib.reload(seq2seq_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat
from seq2seq_modules.preprocessing import TargetPandasPreprocessor
from seq2seq_modules.data import TargetDataset
from seq2seq_modules.models import LSTMModel, StarterBERTModel, AttentionPoolingBERTModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule, weights_init_xavier
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import GENDER_METRIC
from seq2seq_modules.utils import fix_random_state

In [5]:
def my_reset(*varnames):
    """
    Reset the interactive namespace while keeping selected globals.

    varnames are the names of the global variables you want to keep;
    ``my_reset`` itself is always kept.

    The reset is a plain ``%reset`` (no ``-f`` flag), so IPython may still ask
    for confirmation before clearing anything.

    Raises:
        KeyError: if one of ``varnames`` is not currently defined. This is
            raised before anything is reset, so no state is lost.
    """
    namespace = globals()
    to_save = {name: namespace[name] for name in varnames}
    to_save['my_reset'] = my_reset  # lets keep this function by default
    # `InteractiveShell.magic` is deprecated in favour of `run_line_magic`.
    # An empty argument line reproduces the original bare "reset".
    get_ipython().run_line_magic("reset", "")
    # Look the namespace up again after the reset and restore the kept values.
    globals().update(to_save)

In [6]:
# Folder holding the prepared seq2seq data, relative to the working directory.
LOCAL_DATA_PATH = "../seq2seq_data/"
# Single seed for the session: passed to the project's fix_random_state here
# and to cross_validation(random_state=...) further down.
SEED = 42
fix_random_state(SEED)

In [7]:
# Column(s) identifying the entity each event row belongs to.
ids = ["user_id"]

# Categorical columns, in the order they are handed to the dataset.
cat_features = [
    "region_name", "city_name",
    "cpe_manufacturer_name", "cpe_model_name",
    "url_host",
    "cpe_type_cd", "cpe_model_os_type",
    "part_of_day",
    "domain",
    "capital_marker",
]

# Continuous columns, in the order they are handed to the dataset. The five
# trailing entries are the dist_to_<city> columns, generated from city names.
continous_features = [
    "request_cnt",
    "price",
    "timestamp", "relative_timestamp",
    "geo_lat_x", "geo_lon_x",
    "population",
    "timezone",
    *(
        f"dist_to_{city}"
        for city in ("Moscow", "SaintP", "Novosibirsk", "Ekaterinburg", "Vladivostok")
    ),
]

In [8]:
# Load the labelled event table written by the preparation section and pass it
# through the project's memory reducer. The read stays nested inside the call,
# so no second name holds on to the freshly loaded frame.
# The path is built with os.path.join: LOCAL_DATA_PATH already ends with "/",
# so the previous f-string produced a doubled separator.
df = pandas_reduce_mem_usage(
    pd.read_parquet(
        os.path.join(LOCAL_DATA_PATH, "version_4_is_male.parquet.gzip")
    )
)
# Preview the first rows.
df.head()

Memory usage of dataframe is 18684.96 MB


  0%|          | 0/33 [00:00<?, ?it/s]

Memory usage after optimization is: 18684.96 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,geo_lon_y,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain,timestamp,relative_timestamp,is_male
2272,60,732,27,211,5790,2,0,74259.0,3,1,...,30.315901,635.758972,0.024454,3115.404541,1787.926147,6555.964355,761,163262528.0,1058.23999,1
2273,60,732,27,211,65865,2,0,74259.0,0,3,...,30.315901,635.758972,0.024454,3115.404541,1787.926147,6555.964355,549,162955808.0,4125.439941,1
2274,60,732,27,211,111474,2,0,74259.0,0,3,...,30.315901,635.758972,0.024454,3115.404541,1787.926147,6555.964355,761,162947168.0,4211.839844,1
2275,14,311,27,211,111474,2,0,74259.0,1,2,...,20.507324,1092.055054,828.233154,3870.873535,2490.750244,7378.328613,761,163122112.0,2462.399902,1
2276,60,732,27,211,125409,2,0,74259.0,0,1,...,30.315901,635.758972,0.024454,3115.404541,1787.926147,6555.964355,549,163033568.0,3347.840088,1


In [9]:
# Force a full garbage-collection pass after the load; the cell output is the
# number of unreachable objects the collector found.
gc.collect()

21

In [None]:
# Wrap the event table in the project's TargetDataset.
# NOTE(review): presumably groups rows by `agg_column`, orders them by
# `time_column` and left-pads/truncates each sequence to `max_len` - confirm
# against seq2seq_modules.data.
dataset = TargetDataset(
    df,
    # Which columns play which role.
    agg_column="user_id",
    time_column="timestamp",
    target_column="is_male",
    # Model inputs.
    cat_features=cat_features,
    cont_features=continous_features,
    # Sequence shaping.
    max_len=1024,
    padding_side="left",
)
dataset

In [None]:
df.head()

## Feed to the model

In [None]:
# Positions of the categorical and continuous features, each list counted
# from zero.
# NOTE(review): both lists start at 0, which presumably matches a dataset that
# yields categorical and continuous features as separate tensors - confirm.
cat_feature_indexes = list(range(len(cat_features)))
cont_feature_indexes = list(range(len(continous_features)))

# Vocabulary size per categorical feature position: largest code + 1
# (assumes the codes are non-negative integers - TODO confirm).
# The maximum is converted to a Python int BEFORE adding 1. The columns have
# been through the memory reducer and may be stored as narrow integers; under
# NumPy 2 promotion rules `np.int8(127) + 1` wraps instead of widening, and
# warnings are silenced in this notebook, so the overflow would go unnoticed.
# The progress bar is kept here because each max() scans a full column.
vocab_sizes = {
    position: int(df[column].max()) + 1
    for position, column in enumerate(tqdm(cat_features))
}

In [None]:
# Collect the target of every dataset item (element 3 of each item) into one
# tensor with a new leading dimension indexing the items.
per_item_targets = [item[3] for item in tqdm(dataset)]
targets = torch.stack(per_item_targets, dim=0)
targets

In [None]:
# Alternative architectures, kept for quick switching.
# NOTE(review): the AttentionPoolingBERTModel variant uses output_dim=7 while
# the others use 2 - adjust before enabling it for this target.
# model = AttentionPoolingBERTModel(
#         cat_feature_indexes=cat_feature_indexes,
#         vocab_sizes=vocab_sizes,
#         cont_feature_indexes=cont_feature_indexes,
#         encoder_hidden_dim=16,
#         hidden_dim=256,
#         dim_feedforward=512,
#         output_dim=7,
#         pe_type="trainable",
#         use_mask=False,
#         max_len=1024,
#         use_key_padding_mask=True,
# )

# model = StarterBERTModel(
#         cat_feature_indexes=cat_feature_indexes,
#         vocab_sizes=vocab_sizes,
#         cont_feature_indexes=cont_feature_indexes,
#         encoder_hidden_dim=16,
#         hidden_dim=256,
#         dim_feedforward=512,
#         output_dim=2,
#         pe_type="trainable",
#         use_mask=False,
#         max_len=1024,
#         use_key_padding_mask=False,
#         starter="randn",
#         shared=False
# )

# Active model: LSTM with two outputs, one per value of is_male.
model = LSTMModel(
        cat_feature_indexes=cat_feature_indexes,
        vocab_sizes=vocab_sizes,
        cont_feature_indexes=cont_feature_indexes,
        encoder_hidden_dim=16,
        hidden_dim=256,
        output_dim=2,
)

weights_init_xavier(model)

# Class-balanced cross-entropy.
# - `classes` is passed as a sorted ndarray: scikit-learn documents this
#   parameter as an ndarray, and recent releases validate the type.
# - The weights are converted to float32 explicitly: compute_class_weight
#   returns float64, and CrossEntropyLoss raises a dtype mismatch when the
#   weight tensor is double but the logits are float.
loss = nn.CrossEntropyLoss(
    weight=torch.tensor(
        compute_class_weight(
            class_weight="balanced",
            classes=np.sort(df["is_male"].unique()),
            y=targets.flatten().numpy()
        ),
        dtype=torch.float32,
    )
)

metric = GENDER_METRIC

In [None]:
# Run the project's cross-validation loop and keep the per-fold train and
# evaluation scores it returns. Keyword arguments are grouped by concern;
# their values are unchanged.
fold_train_scores, fold_eval_scores = cross_validation(
    # What is trained and how it is scored.
    project_name="gender_LSTMModel_v1",
    model=model,
    dataset=dataset,
    loss_function=loss,
    metric_func=metric,
    # Optimisation.
    optimizer=torch.optim.AdamW,
    get_scheduler=get_cosine_schedule_with_warmup,
    lr=1e-3,
    weight_decay=1e-4,
    epochs=10,
    batch_size=64,
    # Fold construction (the target array is passed as `strat_array`).
    n_folds=5,
    start_fold=0,
    strat_array=targets.numpy(),
    shuffle=True,
    random_state=SEED,
    dataloader_shuffle=False,
    # Hardware. NOTE(review): hard-coded, so the cell needs a CUDA device.
    device="cuda",
)

In [None]:
fold_train_scores

In [None]:
fold_eval_scores