In [1]:
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass, field

In [2]:
import torch
from transformers import (
    IntervalStrategy,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM,
)
from datasets import load_dataset, GenerateMode

In [3]:
from lm.train import train
from lm.trainer import MyTrainer
from lm.metrics import compute_metrics
from lm.show import ShowExample
from vae.model import Because
from common.config import config
from common import (
    LM_MODEL_PATH,  # the files of the model and its checkpoints
    RUNS_DIR,  # the tensorboard logs
    CACHE  # cache for models and datasets
)

In [34]:
@dataclass
class MyTrainingArguments(TrainingArguments):
    """Project-specific defaults layered on top of ``TrainingArguments``.

    Only the fields declared below are overridden; every other argument
    keeps the value inherited from ``TrainingArguments``.
    """

    # Where the model and its checkpoints are written.
    output_dir: str = field(default=LM_MODEL_PATH)
    overwrite_output_dir: bool = field(default=True)

    # Log every 100 steps and evaluate on a step-based schedule.
    logging_steps: int = field(default=100)
    evaluation_strategy: str = field(default=IntervalStrategy.STEPS)

    # Batch size per device, for both training and evaluation.
    per_device_train_batch_size: int = field(default=2)
    per_device_eval_batch_size: int = field(default=2)

    # Upper bound on the number of checkpoints kept.
    save_total_limit: int = field(default=5)

In [35]:
# Instantiate the training arguments with the defaults declared in MyTrainingArguments.
training_args = MyTrainingArguments()
# Dump every resolved argument so the effective configuration is visible in the notebook.
print(training_args)

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


MyTrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=100,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=/lm_models/runs/Jan

In [6]:
# Name of the dataset configuration; it also labels the TensorBoard run directory.
data_config_name = "MLM"
output_dir_path = Path(training_args.output_dir)
# One log directory per run, suffixed with the current timestamp.
# ':' is replaced with '-' (presumably to keep the directory name portable).
training_args.logging_dir = f"{RUNS_DIR}/lm-{data_config_name}-{datetime.now().isoformat().replace(':','-')}"
if not output_dir_path.exists():
    # parents=True: a missing parent directory no longer raises FileNotFoundError.
    # exist_ok=True: no FileExistsError if the directory appears between the
    # exists() check and this call (e.g. another process creating it).
    output_dir_path.mkdir(parents=True, exist_ok=True)
    print(f"Created {output_dir_path}.")


In [7]:
# Set to True to force the dataset to be rebuilt instead of reusing the cached copy.
no_cache = False
# The flag was previously dead (the line consuming it was commented out); it is
# now wired to load_dataset. With no_cache=False the cached dataset is reused.
download_mode = GenerateMode.FORCE_REDOWNLOAD if no_cache else GenerateMode.REUSE_DATASET_IF_EXISTS
train_dataset, eval_dataset, test_dataset = load_dataset(
    path='./lm/loader.py',  # project-local dataset loading script
    name=data_config_name,  # same configuration name that labels the run's logging_dir
    data_dir="/data/json/emboj_abstracts",
    split=["train", "validation", "test"],  # one returned dataset per requested split, in this order
    download_mode=download_mode,
    cache_dir=CACHE,
)

Using custom data configuration MLM-f3c59cec1b660d97
Reusing dataset bio_lang (/cache/bio_lang/MLM-f3c59cec1b660d97/0.0.1/8876baa5b426637ea3ce476c0910651f36d58aeb7fd728c75f23664fd5071f57)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
# Use the tokenizer exposed by the project configuration object (common.config.config).
tokenizer = config.tokenizer

In [9]:
# Language-modeling collator built on the project tokenizer, with mlm enabled.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

In [10]:
# Report how many examples the training and evaluation splits contain.
print(
    f"\nTraining with {len(train_dataset)} examples.",
    f"Evaluating on {len(eval_dataset)} examples.",
    sep="\n",
)


Training with 12873 examples.
Evaluating on 3626 examples.


In [26]:
# Load the pretrained checkpoint named in the project config.
pretrained_name = config.from_pretrained
print(f"loading model {pretrained_name}")
seq2seq = AutoModelForMaskedLM.from_pretrained(pretrained_name)

# Hyperparameters of the Because wrapper.
# TODO: make these part of BecauseConfig
because_hparams = dict(
    max_nodes=4,
    num_features=50,
    num_entities=3,
    num_interactions=6,
    sampling_iterations=100,
)
# Wrap the pretrained network in the project's Because model.
model = Because(pretrained=seq2seq, **because_hparams)

loading model facebook/bart-base


loading configuration file https://huggingface.co/facebook/bart-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f5310d276a6d1648d00c32fadc8bf7b4607e0fbd5b404fc4a0045960aa2bdfdb.da0f3c0e2dc1c2fecc46738a1ebf4806f2fc36aae3d5c1947f21e063e7cab34b
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointin

In [27]:
# Display the configuration of the loaded pretrained model.
seq2seq.config

BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 1024,
  "model_type": "bart",
  "no_repeat_ng

In [36]:
# Keep every dataset column: pos_mask and special_tokens_mask are needed in the collator.
training_args.remove_unused_columns = False

In [37]:
# Show the final training arguments, including the overrides applied after construction.
print("\nTraining arguments:", training_args, sep="\n")


Training arguments:
MyTrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=100,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_di

In [38]:
# Echo the arguments that determine the length of the run and the per-device batch sizes.
for arg_name in (
    "num_train_epochs",
    "per_device_eval_batch_size",
    "per_device_train_batch_size",
):
    print(f"{arg_name}={getattr(training_args, arg_name)}")

num_train_epochs=10.0
per_device_eval_batch_size=2
per_device_train_batch_size=2


In [39]:
# Check whether PyTorch can see a CUDA device before training starts.
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")

CUDA available: True


In [40]:
# Assemble the project trainer from the model, arguments, datasets and collator.
# compute_metrics is currently not passed (it was commented out in this notebook).
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    # ShowExample is a project callback constructed with the tokenizer.
    callbacks=[ShowExample(tokenizer)],
)

In [41]:
# Start training.
# NOTE(review): the output recorded with this cell ends in
# "RuntimeError: CUDA out of memory" on GPU 0, before any step was logged.
# The memory footprint needs to be reduced before this notebook can run to
# completion (candidates to check: batch size, sequence length,
# sampling_iterations of the Because model — TODO investigate).
trainer.train()

***** Running training *****
  Num examples = 12873
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16100


Step,Training Loss,Validation Loss


RuntimeError: CUDA out of memory. Tried to allocate 1.52 GiB (GPU 0; 31.71 GiB total capacity; 28.51 GiB already allocated; 9.00 MiB free; 29.81 GiB reserved in total by PyTorch)