In [1]:
# Load IPython's autoreload extension; it is switched on by the `%autoreload 2` cell that follows.
%load_ext autoreload

In [2]:
# Autoreload mode 2: modules are reloaded before each cell is executed, so edits
# to imported project code are picked up without restarting the kernel.
%autoreload 2

In [3]:
from pathlib import Path

In `config`:

    config = Config(
        max_length = 64,  # in tokens
        from_pretrained = "facebook/bart-base",
        model_type = "GVAE"
    )

In [4]:
from smtag.config import config
# Sanity-check that the imported configuration matches what this notebook assumes.
# The first two checks are asserted so that a mismatch stops the run instead of
# being evaluated and silently discarded. The last comparison is deliberately left
# as the cell's final expression so that its result is displayed.
assert config.max_length == 64
assert config.from_pretrained == 'facebook/bart-base'
config.model_type == 'GVAE'

True

In [5]:
# Display the installed `transformers` version so the environment is recorded with the outputs.
from transformers import __version__
__version__

'4.15.0'

## Extracting examples for LM

In [4]:
from smtag.extract import ExtractorXML

In [None]:
# List the contents of the XML directory that is used as `corpus` below.
! dir /data/xml/mini

In [None]:
# Delete the directory that is used as the extraction destination (`text_examples`) below.
# `-f` makes this a no-op when the directory does not exist.
! rm -fr /data/text/mini

In [6]:
# Settings for the extraction step; every name here is passed to ExtractorXML in the next cell.
corpus = "/data/xml/mini"  # first positional argument of ExtractorXML (presumably the source directory)
text_examples = "/data/text/mini"  # `destination_dir` of the extractor; later the first argument of PreparatorLM
xpath = ".//article-meta/title-group/article-title"  # XPath to the article-title element
sentence_level = False  # forwarded as-is to ExtractorXML
keep_xml = False  # forwarded as-is to ExtractorXML
inclusion_probability = 1.0  # forwarded as-is to ExtractorXML

In [None]:
# Build the extractor from the settings defined in the previous cell.
# Only the corpus is positional; everything else is passed by keyword.
extractor_lm = ExtractorXML(
    corpus,
    xpath=xpath,
    destination_dir=text_examples,
    sentence_level=sentence_level,
    keep_xml=keep_xml,
    inclusion_probability=inclusion_probability,
)

In [None]:
# Run the extraction with the extractor configured above
# (presumably writes the examples under `text_examples` — confirm in smtag.extract).
extractor_lm.extract_from_corpus()

#### Same via CLI (here on a different corpus and with two XPath expressions):

```bash
python -m smtag.cli.prepro.extract /data/xml/emboj_all /data/text/emboj_twin --xpath ".//article-meta/title-group/article-title" ".//abstract"
```

## Preparing tokenized dataset for LM

In [7]:
from smtag.dataprep import PreparatorLM

In [7]:
# Directory for the tokenized examples; passed to PreparatorLM and to train_lm below.
tokenized_examples = "/data/json/mini"

In [9]:
! rm -fr /data/json/mini

In [10]:
# Preparator that reads the extracted text examples and targets the tokenized
# examples directory, using the application-wide maximum length from `config`.
prep_lm = PreparatorLM(text_examples, tokenized_examples, max_length=config.max_length)

/data/json/mini created


In [11]:
# Run the preparation; the recorded output shows the train, eval and test subsets being processed.
prep_lm.run()

Preparing: train


100%|██████████| 5/5 [00:00<00:00, 80.33it/s]


Preparing: eval


100%|██████████| 2/2 [00:00<00:00, 87.56it/s]


Preparing: test


100%|██████████| 2/2 [00:00<00:00, 94.96it/s]


Same via CLI:
    
```bash
python -m smtag.cli.lm.dataprep /data/text/mini /data/json/mini
```

## Train LM

In [13]:
from smtag.train.train_lm import (
    train as train_lm,
    TrainingArgumentsLM
)

In [14]:
# Inputs for the `train_lm(...)` call made further down.
no_cache = True
loader_path = "./smtag/loader/loader_lm.py"
data_config_name = "SEQ2SEQ"
tokenizer = config.tokenizer  # tokenizer has to be the same application-wide
# Take the model type from the shared config, like `from_pretrained` below, so this
# cell cannot drift from the application-wide setting. The config check near the top
# of the notebook evaluates `config.model_type == 'GVAE'` to True, so this is the same
# value that used to be hard-coded here ("VAE" was the alternative noted in this cell).
model_type = config.model_type
from_pretrained = config.from_pretrained

In [15]:
# TODO(review): it is unclear why the mask token is overridden here — possibly it is
# only needed when a character-level tokenizer is configured; confirm or remove.
# NOTE(review): `tokenizer` was obtained from `config.tokenizer` in the previous cell,
# so this assignment presumably changes the shared tokenizer object as well.
tokenizer.mask_token = '<mask>'
# tokenizer.unk_token = '<unk>'

Model architecture parameters for SEQ2SEQ VAE: 

```python
model_config = VAEConfigLM(
    freeze_pretrained=None,  # 'encoder' # 'both' # 'decoder' # None
    hidden_features=256,
    z_dim=96,
    gamma=1E-1,  # weight of lm loss as compared to z_loss
    sampling_iterations=200,
    seq_length=config.max_length,
    residuals=False,
    latent_var_loss="kl"  # "kl" or "mmd" or None
)
```

In [16]:
# Training arguments for a short single-epoch run with batches of 32.
# NOTE(review): the `_tokcl` suffix looks like a leftover (these are LM arguments);
# the name is kept because the training cell below refers to it.
training_args_tokcl = TrainingArgumentsLM(
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    logging_steps=100,
)
training_args_tokcl  # final expression: display the resolved arguments

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


TrainingArgumentsLM(output_dir='/lm_models', overwrite_output_dir=True, do_train=False, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=True, per_device_train_batch_size=32, per_device_eval_batch_size=32, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, log_level=-1, log_level_replica=-1, log_on_each_node=True, logging_dir='/lm_models/runs/Jul28_10-37-21_43c552cd90fe', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=100, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=5, save_on_each_node=False, no_cuda=False, seed=42, bf1

In [17]:
# Launch LM training with the arguments assembled in the cells above.
# All arguments are positional, so their order must stay exactly as written.
# NOTE(review): the recorded output of this cell ends in
# "RuntimeError: mat1 and mat2 shapes cannot be multiplied"; the cause is not
# visible from this notebook and should be investigated in the model code.
train_lm(
    training_args_tokcl,
    loader_path, data_config_name, tokenized_examples,
    no_cache,
    tokenizer,
    model_type, from_pretrained,
)



tokenizer vocab size: 50265

Loading datasets found in /data/json/mini.
using ./smtag/loader/loader_lm.py as dataset loader.
Downloading and preparing dataset bio_lang/SEQ2SEQ (download: Unknown size, generated: 2.38 KiB, post-processed: Unknown size, total: 2.38 KiB) to /cache/bio_lang/SEQ2SEQ-data_dir=%2Fdata%2Fjson%2Fmini/0.0.1/1563470eccc4c19fc5835c4884631442fce5f6de931fa8774e29f1959bd0050f...


0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset bio_lang downloaded and prepared to /cache/bio_lang/SEQ2SEQ-data_dir=%2Fdata%2Fjson%2Fmini/0.0.1/1563470eccc4c19fc5835c4884631442fce5f6de931fa8774e29f1959bd0050f. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]


Training with 5 examples.
Evaluating on 2 examples.


loading configuration file https://huggingface.co/facebook/bart-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f5310d276a6d1648d00c32fadc8bf7b4607e0fbd5b404fc4a0045960aa2bdfdb.a243ed957122436adb0b8d8e9d20f896f45c174b6324d625ca0a20a84f72a910
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id":


Training arguments:
TrainingArgumentsLM(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=100,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_di

  0%|          | 0/1 [00:00<?, ?it/s]

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x125 and 25x25)

#### With CLI:


```bash
python -m smtag.cli.lm.train smtag/loader/loader_lm.py SEQ2SEQ --data_dir /data/json/oapmc_title --per_device_train_batch_size=128 --per_device_eval_batch_size=128 --logging_steps=100 --num_train_epochs=1 --no_cache

python -m smtag.cli.lm.train smtag/loader/loader_lm.py SEQ2SEQ --data_dir /data/json/emboj_abstracts --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --logging_steps=100 --num_train_epochs=1 --no_cache

python -m smtag.cli.lm.train smtag/loader/loader_lm.py SEQ2SEQ --data_dir /data/json/mini --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --logging_steps=100 --num_train_epochs=1
```