# seq2seq

In [28]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

sys.path.append(os.path.abspath("modules/"))
sys.path.append(os.path.abspath("fe_modules/"))
sys.path.append(os.path.abspath("seq2seq_modules/"))

os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')
tqdm.pandas()

In [29]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
import joblib
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from transformers import get_constant_schedule

In [30]:
import importlib

import modules
import fe_modules
import seq2seq_modules

importlib.reload(modules)
importlib.reload(fe_modules)
importlib.reload(seq2seq_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat
from seq2seq_modules.data import TestDataset
from seq2seq_modules.models import LSTMModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import AGE_METRIC
from seq2seq_modules.utils import fix_random_state

In [31]:
def my_reset(*varnames):
    """
    varnames are what you want to keep
    """
    globals_ = globals()
    to_save = {v: globals_[v] for v in varnames}
    to_save['my_reset'] = my_reset  # lets keep this function by default
    del globals_
    get_ipython().magic("reset")
    globals().update(to_save)

## Read and process

In [32]:
LOCAL_DATA_PATH = './data/'
SEED = 42
fix_random_state(SEED)

In [33]:
ids = ["user_id"]

cat_features = [
    "region_name",
    "city_name",
    "cpe_manufacturer_name",
    "cpe_model_name",
    "url_host",
    "cpe_type_cd",
    "cpe_model_os_type",
    "part_of_day",
    "domain",
    "capital_marker"
]

continous_features = [
    "request_cnt",
    "price",
    "timestamp",
    "relative_timestamp",
    "geo_lat",
    "geo_lon",
    "population",
    "timezone",
    "dist_to_Moscow",
    "dist_to_SaintP",
    "dist_to_Novosibirsk",
    "dist_to_Ekaterinburg",
    "dist_to_Vladivostok",
]

In [7]:
df = pandas_reduce_mem_usage(
    pd.read_parquet("seq2seq_data/stages/stage_2.parquet.gzip")
)
df.head()

Memory usage of dataframe is 26346.12 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 26346.12 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,timezone,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain
0,21,409,1,589,5788,2,1,20368.0,2,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,761
1,21,409,1,589,12900,2,1,20368.0,2,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,549
2,21,409,1,589,17626,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712
3,21,409,1,589,59366,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712
4,21,409,1,589,59366,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712


In [8]:
needed_ids = pd.read_parquet(
             f'{LOCAL_DATA_PATH}submit_2.pqt'
         )

needed_ids

Unnamed: 0,user_id
221301,221301
31271,31271
211594,211594
253119,253119
192578,192578
...,...
145307,415284
145308,415285
145309,415286
145312,415306


In [9]:
df = needed_ids.merge(df, how="left", on="user_id")
df.head()

Unnamed: 0,user_id,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,...,timezone,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain
0,221301,44,894,1,593,59366,2,1,6240.0,0,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,712
1,221301,44,894,1,593,160869,2,1,6240.0,2,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,549
2,221301,44,894,1,593,197762,2,1,20196.0,1,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,761
3,221301,44,894,1,593,5790,2,1,6240.0,3,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,761
4,221301,44,894,1,593,68348,2,1,20196.0,2,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,549


In [10]:
LOCAL_DATA_PATH = './data/'
SPLIT_SEED = 42

In [11]:
MODEL_ZOO = "model_zoo/"

age_cbe = joblib.load(f"{MODEL_ZOO}/age_cbe.joblib")
is_male_cbe = joblib.load(f"{MODEL_ZOO}/is_male_cbe.joblib")

In [13]:
my_reset("df", "is_male_cbe", "age_cbe")

Once deleted, variables cannot be recovered. Proceed (y/[n])? н
Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [18]:
is_male_df = is_male_cbe.transform(df)
is_male_df.head()

Memory usage of dataframe is 16167.49 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 15733.47 MB
Decreased by 2.7%
Memory usage of dataframe is 16167.49 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 15733.47 MB
Decreased by 2.7%
Memory usage of dataframe is 16167.49 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 15733.47 MB
Decreased by 2.7%
Memory usage of dataframe is 16167.49 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 15733.47 MB
Decreased by 2.7%
Memory usage of dataframe is 16167.49 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 15733.47 MB
Decreased by 2.7%


Unnamed: 0,user_id,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,...,timezone,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain
0,221301,0.500012,0.524168,0.49747,0.673992,0.52232,0.520257,0.49747,6240.0,0.510338,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,0.527697
1,221301,0.500012,0.524168,0.49747,0.673992,0.516654,0.520257,0.49747,6240.0,0.525272,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,0.516202
2,221301,0.500012,0.524168,0.49747,0.673992,0.544248,0.520257,0.49747,20196.0,0.513347,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,0.520483
3,221301,0.500012,0.524168,0.49747,0.673992,0.507646,0.520257,0.49747,6240.0,0.558631,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,0.520483
4,221301,0.500012,0.524168,0.49747,0.673992,0.42631,0.520257,0.49747,20196.0,0.525272,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,0.516202


In [26]:
is_male_dataset = TestDataset(
         is_male_df,
         agg_column="user_id", 
         time_column="timestamp",
         cat_features=[],
         cont_features=continous_features + cat_features,
         max_len=1024,
         padding_side="left",
)
user_ids = is_male_dataset.get_agg_col()

  0%|          | 0/113777491 [00:00<?, ?it/s]

In [34]:
age_df = age_cbe.transform(df)
age_df.head()

Memory usage of dataframe is 16167.49 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 15733.47 MB
Decreased by 2.7%
Memory usage of dataframe is 16167.49 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 15733.47 MB
Decreased by 2.7%
Memory usage of dataframe is 16167.49 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 15733.47 MB
Decreased by 2.7%
Memory usage of dataframe is 16167.49 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 15733.47 MB
Decreased by 2.7%
Memory usage of dataframe is 16167.49 MB


  0%|          | 0/30 [00:00<?, ?it/s]

Memory usage after optimization is: 15733.47 MB
Decreased by 2.7%


Unnamed: 0,user_id,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,...,timezone,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain
0,221301,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,6240.0,37.102139,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,37.102139
1,221301,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,6240.0,37.102139,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,37.102139
2,221301,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,20196.0,37.102139,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,37.102139
3,221301,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,6240.0,37.102139,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,37.102139
4,221301,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,20196.0,37.102139,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,37.102139


In [35]:
age_dataset = TestDataset(
         age_df,
         agg_column="user_id", 
         time_column="timestamp",
         cat_features=[],
         cont_features=continous_features + cat_features,
         max_len=1024,
         padding_side="left",
)
user_ids = is_male_dataset.get_agg_col()

  0%|          | 0/113777491 [00:00<?, ?it/s]

In [36]:
is_male_dataset, age_dataset

(<seq2seq_modules.data.TestDataset at 0x7fcb5b7d7ac0>,
 <seq2seq_modules.data.TestDataset at 0x7fcb5b7d6e60>)

In [37]:
df.head()

Unnamed: 0,user_id,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,...,timezone,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain
0,221301,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,6240.0,37.102139,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,37.102139
1,221301,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,6240.0,37.102139,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,37.102139
2,221301,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,20196.0,37.102139,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,37.102139
3,221301,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,6240.0,37.102139,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,37.102139
4,221301,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,37.102139,20196.0,37.102139,...,5,54.734943,55.957848,1062300,1168.546509,1636.681519,1720.041016,373.426453,5426.129395,37.102139


## Feed to the model

In [38]:
cat_feature_indexes = []
cont_feature_indexes = []
vocab_sizes = {}

# for i in tqdm(range(len(cat_features))):
#     cat_feature_indexes.append(i)
#     vocab_sizes[i] = int(df[cat_features[i]].max() + 1)

for i in tqdm(range(len(continous_features + cat_features))):
    cont_feature_indexes.append(i)

100%|███████████████████████████████████████| 23/23 [00:00<00:00, 333802.74it/s]


In [39]:
import joblib

MODEL_ZOO = "model_zoo/"

age_embedding_extractor = joblib.load(f"{MODEL_ZOO}/age_lstm_with_cbe.joblib")
is_male_embedding_extractor = joblib.load(f"{MODEL_ZOO}/is_male_baseline.joblib")

In [40]:
age_embeddings_df, age_logits_df = age_embedding_extractor.transform(age_dataset, user_ids)

  0%|          | 0/4523 [00:00<?, ?it/s]

  0%|          | 0/4523 [00:00<?, ?it/s]

  0%|          | 0/4523 [00:00<?, ?it/s]

  0%|          | 0/4523 [00:00<?, ?it/s]

  0%|          | 0/4523 [00:00<?, ?it/s]

  0%|          | 0/4523 [00:00<?, ?it/s]

In [41]:
is_male_embeddings_df, is_male_logits_df = is_male_embedding_extractor.transform(is_male_dataset, user_ids)

  0%|          | 0/4523 [00:00<?, ?it/s]

  0%|          | 0/4523 [00:00<?, ?it/s]

  0%|          | 0/4523 [00:00<?, ?it/s]

  0%|          | 0/4523 [00:00<?, ?it/s]

  0%|          | 0/4523 [00:00<?, ?it/s]

In [44]:
is_male_embeddings_df.to_parquet(f"test_is_male_embeddings.parquet.gzip",
                                  compression='gzip')
age_embeddings_df.to_parquet(f"test_age_embeddings.parquet.gzip",
                                  compression='gzip')

In [48]:
is_male_logits_df.to_parquet(f"test_is_male_logits.parquet.gzip",
                                  compression='gzip')
age_logits_df.to_parquet(f"test_age_logits.parquet.gzip",
                                  compression='gzip')

In [42]:
is_male_logits_df

In [43]:
age_logits_df

In [22]:
def merge_predictions(is_male, age):
    is_male_proba = is_male["target_feature_1"].to_numpy()
    age_class = age[[feature for feature in age.columns if "feature" in feature]].to_numpy().argmax(axis=1)
    
    final_df = pd.DataFrame({"user_id": age["user_id"], "age": age_class, "is_male": is_male_proba})
    return final_df

In [23]:
final_df = merge_predictions(is_male_logits_df, age_logits_df)
final_df

Unnamed: 0,user_id,age,is_male
0,6,2,0.321732
1,7,3,0.637350
2,9,1,0.276358
3,10,2,0.271077
4,11,6,0.723262
...,...,...,...
144719,415306,6,0.553725
144720,415310,6,0.506669
144721,415314,0,0.501669
144722,415315,6,0.500138


In [24]:
final_df.to_csv("submission_1.csv", index=False)