In [1]:
import os
import gc
import math
import json
from datetime import datetime
import shutil
from pathlib import Path
import optuna
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from tqdm import tqdm
import torch
from sklearn.model_selection import StratifiedGroupKFold
from torch.utils.data import DataLoader
from typing import Dict, List, Tuple, NamedTuple
from transformers import AutoTokenizer, AutoConfig, BertTokenizerFast
from transformers import BertConfig, RobertaConfig, EncoderDecoderConfig, EncoderDecoderModel
import scml
import mylib

In [2]:
class ModelConf(NamedTuple):
    directory: Path
    model_max_length: int
    stride: int
    batch_size: int
    gradient_checkpointing: bool


class Conf(NamedTuple):
    model_name: str = "roberta-base"
    epochs: int = 3
    lr: Tuple[float, float] = (1e-4, 1e-4)
    multi_sample_dropout_size: Tuple[int, int] = (8, 8)
    multi_sample_dropout_increment: Tuple[float, float] = (0, 0)
    swa_start_epoch: Tuple[int, int] = (-1, -1)
    swa_anneal_epochs: Tuple[int, int] = (3, 3)
    pretrained_dir: Path = Path("pretrained")
    zoo: Dict[str, ModelConf] = {
        "deberta-v3-base": ModelConf( # sentencepiece tokenizer
            directory=pretrained_dir / "microsoft" / "deberta-v3-base",
            model_max_length=32,
            stride=0,
            batch_size=2,
            gradient_checkpointing=False
        ),
        "roberta-base": ModelConf( # bbpe tokenizer
            directory=pretrained_dir / "roberta-base",
            model_max_length=32,
            stride=0,
            batch_size=1,
            gradient_checkpointing=False
        )
    }
    sample_frac: float = 1e-4
    gpus: List[int] = [0]
    patience: int = 1
    n_trials: int = 1
    n_folds: int = 3
    seed: int = 31
        
        
conf = Conf()
mc = conf.zoo[conf.model_name]
print(conf)

Conf(model_name='roberta-base', epochs=3, lr=(0.0001, 0.0001), multi_sample_dropout_size=(8, 8), multi_sample_dropout_increment=(0, 0), swa_start_epoch=(-1, -1), swa_anneal_epochs=(3, 3), pretrained_dir=WindowsPath('pretrained'), zoo={'deberta-v3-base': ModelConf(directory=WindowsPath('pretrained/microsoft/deberta-v3-base'), model_max_length=32, stride=0, batch_size=2, gradient_checkpointing=False), 'roberta-base': ModelConf(directory=WindowsPath('pretrained/roberta-base'), model_max_length=32, stride=0, batch_size=1, gradient_checkpointing=False)}, sample_frac=0.0001, gpus=[0], patience=1, n_trials=1, n_folds=3, seed=31)


In [3]:
tim = scml.Timer()
tim.start()
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything(conf.seed)
pl.seed_everything(conf.seed)

Global seed set to 31


31

In [4]:
device: torch.device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
    for i in range(torch.cuda.device_count()):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Mem Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')
else:
    print("cpu")

device=0, NVIDIA GeForce GTX 1060 6GB
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [5]:
%%time
df = pd.read_parquet("input/sequences.parquet")
if conf.sample_frac<1:
    df = df.sample(frac=conf.sample_frac)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1290 entries, 2326609 to 1752488
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   session  1290 non-null   int32 
 1   seq      1290 non-null   object
 2   length   1290 non-null   int16 
dtypes: int16(1), int32(1), object(1)
memory usage: 27.7+ KB
Wall time: 14.5 s


In [6]:
#tokenizer = AutoTokenizer.from_pretrained(str(mc.directory), model_max_length=mc.model_max_length)
tokenizer = BertTokenizerFast(
    vocab_file="input/vocab.txt", 
    unk_token="<unk>",
    sep_token="<s>",
    pad_token="<pad>",
    cls_token="<cls>",
    mask_token="<mask>",
    bos_token="<s>",
    eos_token="</s>",
    additional_special_tokens=["click_token", "cart_token", "order_token"],
    model_max_length=mc.model_max_length,
    padding_side="right",
)
tokenizer.add_special_tokens({
    "additional_special_tokens": ["<click>", "<cart>", "<order>"],
})
unk_token = tokenizer.unk_token
unk_id = tokenizer.unk_token_id
pad_token = tokenizer.pad_token
pad_id = tokenizer.pad_token_id
sep_token = tokenizer.sep_token
sep_id = tokenizer.sep_token_id
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")
print(f"{unk_token}={unk_id}\n{pad_token}={pad_id}\n{sep_token}={sep_id}")

PreTrainedTokenizerFast(name_or_path='', vocab_size=1855612, model_max_len=32, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<s>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': '<mask>', 'additional_special_tokens': ['<click>', '<cart>', '<order>']})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']
<unk>=3
<pad>=1
<s>=0


In [7]:
sids, s1, s2 = [], [], []
# encoder or decoder must have min 2 tokens 
minlen = 2
for t in tqdm(df.itertuples()):
    length = int(getattr(t, "length"))
    if length<2*minlen:
        continue
    sid = int(getattr(t, "session"))
    seq = getattr(t, "seq").split()
    al, bl = [], []
    length = mc.model_max_length-2  # minus 2 for CLS, EOS tokens
    i = 0
    j = i+length
    while j+length<=len(seq):
        al.append(seq[i:j])
        bl.append(seq[j:j+length])
        i += length
        j += length
    if i<len(seq) and len(seq)-i>=2*minlen:
        j = i+((len(seq)-i)//2)
        if j%2==1:
            j+=1
        al.append(seq[i:j])
        bl.append(seq[j:])
    for i in range(len(al)):
        a, b = al[i], bl[i]
        if len(a)%2==1:
            raise ValueError("a must have even length")
        if len(b)%2==1:
            raise ValueError(f"b must have even length. b={b}")
        if len(a)<minlen:
            raise ValueError("length of a must not be less than minlen")
        if len(b)<minlen:
            raise ValueError("length of b must not be less than minlen")
        s1.append(" ".join(a))
        s2.append(" ".join(b))
        sids.append(sid)
print(f"len(s1)={len(s1):,}")

1290it [00:00, 115515.96it/s]

len(s1)=1,925





In [8]:
del df
gc.collect()

30

In [9]:
%%time
x = tokenizer(
    s1,
    truncation=True, 
    padding="max_length",
    stride=mc.stride,
    add_special_tokens=True,
    return_overflowing_tokens=False,
    return_offsets_mapping=False,
    return_special_tokens_mask=False,
    return_token_type_ids=False,
)
print(f"{repr(x.keys())}")
#overflow_to_sample_mapping = x["overflow_to_sample_mapping"]
#print(f"len(overflow_to_sample_mapping)={len(overflow_to_sample_mapping):,}")
#offset_mapping = x["offset_mapping"]
#print(f"len(offset_mapping)={len(offset_mapping):,}")
#print(x["input_ids"][0])

dict_keys(['input_ids', 'attention_mask'])
Wall time: 174 ms


In [10]:
%%time
y = tokenizer(
    s2,
    truncation=True, 
    padding="max_length",
    stride=mc.stride,
    add_special_tokens=True,
    return_overflowing_tokens=False,
    return_offsets_mapping=False,
    return_special_tokens_mask=False,
    return_token_type_ids=False,
)
print(f"{repr(y.keys())}")
#overflow_to_sample_mapping = x["overflow_to_sample_mapping"]
#print(f"len(overflow_to_sample_mapping)={len(overflow_to_sample_mapping):,}")
#offset_mapping = x["offset_mapping"]
#print(f"len(offset_mapping)={len(offset_mapping):,}")
#print(x["input_ids"][0])

dict_keys(['input_ids', 'attention_mask'])
Wall time: 155 ms


In [11]:
%%time
input_ids = np.array(x["input_ids"], dtype=np.uint32)
n_pad = (input_ids == tokenizer.pad_token_id).sum()
n_unk = (input_ids == tokenizer.unk_token_id).sum()
n_sep = (input_ids == tokenizer.sep_token_id).sum()
d = input_ids.shape[0] * input_ids.shape[1]
print(f"UNK {n_unk/d*100:.2f}%\t{n_unk:,} out of {d:,} tokens")
print(f"PAD {n_pad/d*100:.2f}%\t{n_pad:,} out of {d:,} tokens")
print(f"SEP {n_sep/d*100:.2f}%\t{n_sep:,} out of {d:,} tokens")
print(f"input_ids.shape={input_ids.shape}")

UNK 0.00%	0 out of 61,600 tokens
PAD 43.18%	26,600 out of 61,600 tokens
SEP 3.12%	1,925 out of 61,600 tokens
input_ids.shape=(1925, 32)
Wall time: 4.14 ms


In [12]:
ds = mylib.OttoDataset(
    x, 
    labels=y["input_ids"],
    session_ids=sids,
)
shape = (len(ds), ds.seqlen())
print(f"ds.shape={shape}\n{ds[30]}\n{ds[0]}")

ds.shape=(1925, 32)
{'input_ids': tensor([      5,       6, 1285437,       6,  570963,       6,   30425,       6,
          33745,       6,  468517,       6,    5610,       6,  570963,       0,
              1,       1,       1,       1,       1,       1,       1,       1,
              1,       1,       1,       1,       1,       1,       1,       1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor([     5,      6,  42610,      6,  76693,      6,  42610,      6,   3855,
             6,   3855,      6, 668804,      0,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1])}
{'input_ids': tensor([     5,      6,  23702,      7,  12656,      6,  95742,      6, 382832,
             7, 382832,      6,  29147,      6, 382832,      6,  12732,      6,
        652875,      6,  78014,      6

In [13]:
job_ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = Path("models") / conf.model_name / job_ts
job_dir.mkdir(parents=True, exist_ok=True)
print(f"job_dir={job_dir}")
tokenizer.save_pretrained(str(job_dir))

job_dir=models\roberta-base\20221231_054243


('models\\roberta-base\\20221231_054243\\tokenizer_config.json',
 'models\\roberta-base\\20221231_054243\\special_tokens_map.json',
 'models\\roberta-base\\20221231_054243\\vocab.txt',
 'models\\roberta-base\\20221231_054243\\added_tokens.json',
 'models\\roberta-base\\20221231_054243\\tokenizer.json')

# Train final model on best Hps

In [14]:
%%time
splitter = StratifiedGroupKFold(n_splits=3)
dummy = np.zeros(len(ds))
for ti, vi in splitter.split(dummy, y=ds.stratification(), groups=ds.session_ids):
    tra_ds = torch.utils.data.Subset(ds, ti)
    val_ds = torch.utils.data.Subset(ds, vi)
    break
print(f"len(tra_ds)={len(tra_ds)}, len(val_ds)={len(val_ds)}")

len(tra_ds)=1284, len(val_ds)=641
Wall time: 265 ms


In [15]:
#best = df.iloc[0].to_dict()
best = {
    "lr": conf.lr[0],
}
print(f"best={best}")

best={'lr': 0.0001}


In [16]:
model = mylib.OttoLightningModel(
    lr=best["lr"],
    decoder_start_token_id=tokenizer.cls_token_id,
    pad_token_id=tokenizer.pad_token_id,
    vocab_size=len(tokenizer),
    hidden_size=64,
)
print(model.model.config)

EncoderDecoderConfig {
  "decoder": {
    "_name_or_path": "",
    "add_cross_attention": true,
    "architectures": null,
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "classifier_dropout": null,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 64,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,
    "intermediate_size": 256,
    "is_decoder": true,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-1

In [17]:
print(model.model)

EncoderDecoderModel(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 64, padding_idx=1)
      (position_embeddings): Embedding(512, 64, padding_idx=1)
      (token_type_embeddings): Embedding(2, 64)
      (LayerNorm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=64, out_features=64, bias=True)
              (key): Linear(in_features=64, out_features=64, bias=True)
              (value): Linear(in_features=64, out_features=64, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=64, out_features=64, bias=True)
              (LayerNorm): LayerNorm((64,), ep

In [18]:
gc.collect()
torch.cuda.empty_cache()
trainer = mylib.Trainer(
    default_root_dir=job_dir,
    gpus=conf.gpus,
    max_epochs=conf.epochs,  
    callbacks=mylib.training_callbacks(patience=conf.patience),
    deterministic=False,
)
trainer.fit(
    model,
    train_dataloaders=DataLoader(
        tra_ds,
        batch_size=mc.batch_size,
        shuffle=True,
        num_workers=0,
    ),
    val_dataloaders=DataLoader(
        val_ds,
        batch_size=mc.batch_size,
        shuffle=False,
        num_workers=0,
    ),
)

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: models\roberta-base\20221231_054243\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                | Params
----------------------------------------------
0 | model | EncoderDecoderModel | 5.4 M 
----------------------------------------------
5.4 M     Trainable params
0         Non-trainable params
5.4 M     Total params
21.651    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



RuntimeError: CUDA error: device-side assert triggered

In [None]:
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")
#print(f"Saved {str(job_dir)}")