In [1]:
import os
import gc
import math
import json
from datetime import datetime
import shutil
from pathlib import Path
import optuna
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from tqdm import tqdm
import torch
from sklearn.model_selection import GroupKFold
#from sklearn.model_selection import StratifiedGroupKFold
from torch.utils.data import DataLoader
from transformers import BatchEncoding, BertTokenizerFast
from typing import Dict, List, Tuple, NamedTuple
import scml
import mylib

In [2]:
class ModelConf(NamedTuple):
    """Immutable bundle of per-model settings.

    NOTE(review): no cell in the visible part of this notebook instantiates
    or reads this class, so the field descriptions below are inferred from the
    field names only -- confirm against the code that consumes them.
    """
    # Filesystem location associated with the model (presumably where its files live).
    directory: Path
    # Presumably the maximum token sequence length fed to the model -- TODO confirm.
    model_max_length: int
    # Presumably the overlap step used when windowing long sequences -- TODO confirm.
    stride: int
    # Presumably the number of samples per batch -- TODO confirm.
    batch_size: int
    # Presumably toggles gradient checkpointing in the model -- TODO confirm.
    gradient_checkpointing: bool


class Conf(NamedTuple):
    """Run configuration for this notebook; every field has a default.

    Fields marked "not read here" are not referenced by any cell visible in
    this notebook; their meaning is inferred from the name and should be
    confirmed before relying on it.
    """
    # Fraction of the loaded rows to keep; values below 1 truncate x, y and sids to a prefix.
    sample_frac: float = 5e-3
    # Passed to the trainer as max_epochs.
    epochs: int = 1
    # Only element 0 is read here (as the model's learning rate).
    # NOTE(review): the pair looks like search bounds -- confirm.
    lr: Tuple[float, float] = (5e-4, 5e-4)
    # Not read here.
    multi_sample_dropout_size: Tuple[int, int] = (8, 8)
    # Not read here.
    multi_sample_dropout_increment: Tuple[float, float] = (0, 0)
    # Not read here.
    swa_start_epoch: Tuple[int, int] = (-1, -1)
    # Not read here.
    swa_anneal_epochs: Tuple[int, int] = (3, 3)
    # Not read here.
    pretrained_dir: Path = Path("pretrained")
    # Passed to mylib.OttoLightningModel as hidden_size.
    hidden_size: int = 256
    # Not read here.
    model_max_length: int = 32
    # Not read here.
    stride: int = 0
    # Batch size of both the training and the validation DataLoader.
    batch_size: int = 4
    # Sub-directory of "models" under which the timestamped job directory is created.
    model_name: str = "roberta_to_roberta"
    # Not read here.
    gradient_checkpointing: bool = False
    # Passed to the trainer as gpus.
    # NOTE(review): this default is one list object shared by every Conf() that
    # does not override it -- never mutate it in place.
    gpus: List[int] = [0]
    # Passed to mylib.training_callbacks as patience.
    patience: int = 0
    # Not read here.
    n_trials: int = 1
    # Not read here (the split cell hard-codes its own number of folds).
    n_folds: int = 3
    # Seed handed to scml.seed_everything and pl.seed_everything.
    seed: int = 31
    
        
# Instantiate the configuration with all defaults and echo it so the run settings
# are recorded in the notebook output.
conf = Conf()
print(conf)

Conf(sample_frac=0.005, epochs=1, lr=(0.0005, 0.0005), multi_sample_dropout_size=(8, 8), multi_sample_dropout_increment=(0, 0), swa_start_epoch=(-1, -1), swa_anneal_epochs=(3, 3), pretrained_dir=WindowsPath('pretrained'), hidden_size=256, model_max_length=32, stride=0, batch_size=4, model_name='roberta_to_roberta', gradient_checkpointing=False, gpus=[0], patience=0, n_trials=1, n_folds=3, seed=31)


In [3]:
# Wall-clock timer for the whole run; it is stopped and reported in the last cell.
tim = scml.Timer()
tim.start()

# Quantile grid; no later cell shown in this notebook reads it.
percentiles = [
    0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99,
]

# Process-wide switches for the tokenizers library and the CUDA runtime.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

# pandas options, applied in one call (set_option accepts name/value pairs).
pd.set_option(
    "use_inf_as_na", True,
    "max_info_columns", 9999,
    "display.max_columns", 9999,
    "display.max_rows", 9999,
    "max_colwidth", 9999,
)

# Register the progress_* methods on pandas objects.
tqdm.pandas()

# Seed through both helpers; the second call stays last so its return value
# remains the cell's displayed output.
scml.seed_everything(conf.seed)
pl.seed_everything(conf.seed)

Global seed set to 31


31

In [4]:
# Choose the compute device; when CUDA is present, list every visible GPU with
# its currently allocated and reserved memory in GiB.
has_cuda = torch.cuda.is_available()
device: torch.device = torch.device("cuda" if has_cuda else "cpu")
if not has_cuda:
    print("cpu")
else:
    for gpu_index in range(torch.cuda.device_count()):
        print(f"device={gpu_index}, {torch.cuda.get_device_name(gpu_index)}")
        allocated_gib = round(torch.cuda.memory_allocated(gpu_index) / 1024**3, 1)
        print('Mem Allocated:', allocated_gib, 'GB')
        reserved_gib = round(torch.cuda.memory_reserved(gpu_index) / 1024**3, 1)
        print('Mem Cached:   ', reserved_gib, 'GB')

device=0, NVIDIA GeForce GTX 1060 6GB
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [5]:
# Load the tokenizer stored in the local "tokenizer" directory and print its
# summary (vocabulary size, maximum length, special tokens).
tokenizer = BertTokenizerFast.from_pretrained("tokenizer")
print(tokenizer)

PreTrainedTokenizerFast(name_or_path='tokenizer', vocab_size=500009, model_max_len=32, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<s>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': '<mask>', 'additional_special_tokens': ['<click>', '<cart>', '<order>']})


In [6]:
%%time
def _read_json(path: str):
    """Return the parsed content of the JSON file at ``path``.

    The file is decoded explicitly as UTF-8 (the encoding JSON is exchanged
    in) instead of the platform default, so the result does not depend on the
    locale of the machine running the notebook.
    """
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


# x / y: mappings that contain at least an "input_ids" list; sids: one entry per row.
x = _read_json("input/x.json")
y = _read_json("input/y.json")
sids = _read_json("input/sids.json")
print(f"len(x)={len(x['input_ids'])}\nlen(y)={len(y['input_ids'])}\nlen(sids)={len(sids)}")

# The three inputs must be row-aligned before they are combined into one dataset.
assert len(sids) == len(y["input_ids"]), "sids and y['input_ids'] differ in length"
assert len(sids) == len(x["input_ids"]), "sids and x['input_ids'] differ in length"

len(x)=18751938
len(y)=18751938
len(sids)=18751938
Wall time: 4min 32s


In [7]:
# Optional down-sampling: keep only a leading prefix of every array.
# The "+ 1" guarantees that at least one row survives for any non-negative fraction.
# NOTE: the cut-off is derived from the current len(sids), which this cell also
# shortens, so re-running the cell truncates the data again.
if conf.sample_frac < 1:
    n_keep = int(conf.sample_frac * len(sids)) + 1
    for key in x:
        x[key] = x[key][:n_keep]
        y[key] = y[key][:n_keep]
    sids = sids[:n_keep]
    print(f"len(x)={len(x['input_ids'])}\nlen(y)={len(y['input_ids'])}\nlen(sids)={len(sids)}")

len(x)=93760
len(y)=93760
len(sids)=93760


In [8]:
# Wrap the encoder inputs, the target token ids and the session ids in the
# project dataset, then show its dimensions and two example items.
encoder_inputs = BatchEncoding(x)
ds = mylib.OttoDataset(encoder_inputs, labels=y["input_ids"], session_ids=sids)
shape = (len(ds), ds.seqlen())
print(f"ds.shape={shape}\n{ds[30]}\n{ds[0]}")

ds.shape=(93760, 32)
{'input_ids': tensor([     6,  43167,      6,  39054,      6,      3,      6,  97507,      6,
        311033,      6, 253136,      6, 261299,      7, 261299,      6, 302088,
             6, 261299,      6, 438206,      6, 288066,      7, 288066,      6,
        461904,      7, 461904,      6, 351457]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor([     6,  30814,      6,  23538,      6,   4587,      6,  84785,      6,
          5659,      6,  17615,      6,      3,      6,  13244,      6,  20058,
             6, 137657,      6, 368419,      6,   4699,      6,  23710,      6,
         18206,      6,  91388,      6,  39746])}
{'input_ids': tensor([     6, 237334,      6,      3,      6,   3970,      6,  18117,      6,
        494151,      6,      3,      7,  34949,      7,   1437,      8,  18476,
             8,   1437,      6,  89932,      6,  34949,      6,  3494

In [9]:
# Each run writes into models/<model_name>/<timestamp>, created on demand.
job_ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = Path("models", conf.model_name, job_ts)
job_dir.mkdir(parents=True, exist_ok=True)
print(f"job_dir={job_dir}")

job_dir=models\roberta_to_roberta\20230107_093622


# Train final model on best hyperparameters

In [10]:
%%time
# Hold out a single validation fold: only the first of the 50 group-wise splits
# is used, the remaining folds are never generated.
# NOTE: the fold count is fixed here; conf.n_folds is not consulted.
splitter = GroupKFold(n_splits=50)
dummy = np.zeros(len(ds))
ti, vi = next(iter(splitter.split(dummy, y=ds.stratification(), groups=ds.groups())))
tra_ds = torch.utils.data.Subset(ds, ti)
val_ds = torch.utils.data.Subset(ds, vi)
print(f"len(tra_ds)={len(tra_ds)}, len(val_ds)={len(val_ds)}")

len(tra_ds)=91884, len(val_ds)=1876
Wall time: 77.4 ms


In [11]:
# Hyperparameters for the final fit. They used to be taken from the first row of
# a results frame (`df.iloc[0].to_dict()`); that lookup is disabled, so the
# learning rate now comes straight from the first entry of conf.lr.
best = dict(lr=conf.lr[0])
print(f"best={best}")

best={'lr': 0.0005}


In [12]:
# Collect the constructor arguments first so they can be read at a glance:
# the decoder starts from the tokenizer's CLS token, padding uses its PAD token,
# and the vocabulary size is len(tokenizer).
model_kwargs = dict(
    lr=best["lr"],
    decoder_start_token_id=tokenizer.cls_token_id,
    pad_token_id=tokenizer.pad_token_id,
    vocab_size=len(tokenizer),
    hidden_size=conf.hidden_size,
)
model = mylib.OttoLightningModel(**model_kwargs)
print(model.model.config)

EncoderDecoderConfig {
  "decoder": {
    "_name_or_path": "",
    "add_cross_attention": true,
    "architectures": null,
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "classifier_dropout": null,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 256,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,
    "intermediate_size": 256,
    "is_decoder": true,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-

In [13]:
# Dump the module tree of the wrapped network to inspect layer types and sizes.
print(model.model)

EncoderDecoderModel(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(500009, 256, padding_idx=1)
      (position_embeddings): Embedding(512, 256, padding_idx=1)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerN

In [14]:
# Free unreachable Python objects and unused cached CUDA memory before training.
gc.collect()
torch.cuda.empty_cache()

# Only request the configured GPUs when CUDA is actually available; otherwise
# pass None so the run falls back to the CPU instead of failing at start-up.
# NOTE(review): the saved output of this cell ends in a CUDA OutOfMemoryError on
# a 6 GB GPU. That is a model-size / memory-budget problem and is not addressed
# here.
requested_gpus = conf.gpus if torch.cuda.is_available() else None

trainer = mylib.Trainer(
    default_root_dir=job_dir,
    gpus=requested_gpus,
    max_epochs=conf.epochs,
    callbacks=mylib.training_callbacks(patience=conf.patience),
    deterministic=False,
)

# Training batches are reshuffled; validation keeps a fixed order.
# num_workers=0 loads the data in the main process.
train_loader = DataLoader(
    tra_ds,
    batch_size=conf.batch_size,
    shuffle=True,
    num_workers=0,
)
val_loader = DataLoader(
    val_ds,
    batch_size=conf.batch_size,
    shuffle=False,
    num_workers=0,
)
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: models\roberta_to_roberta\20230107_093622\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                | Params
----------------------------------------------
0 | model | EncoderDecoderModel | 269 M 
----------------------------------------------
269 M     Trainable params
0         Non-trainable params
269 M     Total params
1,078.255 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 490.00 MiB (GPU 0; 6.00 GiB total capacity; 4.99 GiB already allocated; 0 bytes free; 5.02 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Stop the run timer started in the setup cell and report the elapsed time and
# the job directory.
tim.stop()
print(f"Total time taken {tim.elapsed!s}")
print(f"Saved {job_dir!s}")