In [1]:
from pathlib import Path

In [2]:
# Load the shared smtag configuration object and display it; later cells read
# config.max_length and config.tokenizer from it.
from smtag.config import config
config

Config(vocab_siz=54000, max_length=512, truncation=True, min_char_length=120, split_ratio={'train': 0.7, 'eval': 0.2, 'test': 0.1, 'max_eval': 10000, 'max_test': 10000}, celery_batch_size=1000, from_pretrained='facebook/bart-base', model_type='GraphRepresentation', tokenizer=PreTrainedTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), nlp=<spacy.lang.en.English object at 0x7f202df55a00>)

In [3]:
# Record the installed transformers version alongside the saved outputs.
from transformers import __version__
__version__

'4.15.0'

## Extracting examples for VAE LM

In [6]:
from smtag.extract import ExtractorXML

In [7]:
# --- Settings for the XML -> text extraction step ---

# Input corpus and the directory that receives the extracted text.
corpus = "/data/xml/mini"
text_examples = "/data/text/mini_sentences"

# XPath selecting every `abstract` element below the context node.
xpath = ".//abstract"

# Options forwarded as keyword arguments to ExtractorXML.
inclusion_probability = 1.0  # presumably keeps every candidate example -- TODO confirm
keep_xml = False  # presumably strips XML markup from the examples -- TODO confirm
sentence_level = True  # presumably yields one example per sentence -- TODO confirm

In [8]:
# Build the extractor: `corpus` is the only positional argument, everything
# else is passed by keyword (what to select first, then how to emit it).
extractor_lm = ExtractorXML(
    corpus,
    xpath=xpath,
    inclusion_probability=inclusion_probability,
    sentence_level=sentence_level,
    keep_xml=keep_xml,
    destination_dir=text_examples,
)

/data/text/mini_sentences created


In [9]:
# Run the extraction; the displayed return value maps each written split file
# (train/eval/test) to a count (74/29/4 in the saved output).
extractor_lm.extract_from_corpus()

100%|██████████| 1/1 [00:00<00:00,  4.02it/s]
100%|██████████| 1/1 [00:00<00:00, 17.01it/s]
100%|██████████| 1/1 [00:00<00:00, 56.06it/s]


{PosixPath('/data/text/mini_sentences/train.txt'): 74,
 PosixPath('/data/text/mini_sentences/eval.txt'): 29,
 PosixPath('/data/text/mini_sentences/test.txt'): 4}

Same with CLI:
    
```bash
python -m smtag.cli.prepro.extract \
/data/xml/mini \
/data/text/mini_sentences \
--xpath ".//abstract" \
--sentence_level
```

## Preparing tokenized dataset for VAE LM

In [10]:
from smtag.dataprep import PreparatorLM

In [11]:
# Destination directory for the tokenized dataset; also passed to train_lm later.
tokenized_examples = "/data/json/mini_sentences"

In [12]:
# Preparator reading from `text_examples` and writing to `tokenized_examples`,
# using the shared max_length from the application config.
prep_lm = PreparatorLM(text_examples, tokenized_examples, max_length=config.max_length)

/data/json/mini_sentences created


In [13]:
# Process each split (train, eval, test); prints a length verification and
# token-length statistics per split.
prep_lm.run()

Preparing: train


100%|██████████| 74/74 [00:00<00:00, 2097.11it/s]



Length verification: OK!

average input_ids length = 43 (min=26, max=106) tokens
longest example: <s>Our findings are as follows: (i) the synthetic sites function when separated by several different prokaryotic or eukaryotic DNA fragments providing bulk intron sequence, (ii) intron size need not be greater than 29 bp, (iii) an AG dinucleotide 11 bp upstream from the invariant AG of an acceptor splice site renders the latter non-functional, and (iv) sequence changes distant from splice sites can affect the efficiency of their utilisation.ImagesFig.</s>
shortest example: <s>Both in vitro kinase activity and in vivo antibiotics resistance studies using structure-guided mutants further support the novel activation mechanism.</s>
Preparing: eval


100%|██████████| 29/29 [00:00<00:00, 1196.07it/s]



Length verification: OK!

average input_ids length = 40 (min=26, max=64) tokens
longest example: <s>In this paper, we show that the FUS/TLS (fused in sarcoma/translocated in liposarcoma) protein, associated with familial forms of Amyotrophic Lateral Sclerosis (ALS), contributes to the biogenesis of a specific subset of microRNAs.</s>
shortest example: <s>Campisi and colleagues now show that this also requires additional signalling via the stress-activated p38MAP kinase pathway.</s>
Preparing: test


100%|██████████| 4/4 [00:00<00:00, 186.85it/s]


Length verification: OK!

average input_ids length = 44 (min=38, max=55) tokens
longest example: <s>DNA double-strand breaks (DSBs) can be processed by the Mre11-Rad50-Nbs1 (MRN) complex, which is essential to promote ataxia telangiectasia-mutated (ATM) activation.</s>
shortest example: <s>These results suggest that MRN-dependent generation of ssDNA oligos, which constitute a unique signal of ongoing DSB repair not encountered in normal DNA metabolism, stimulates ATM activity.</s>





Same with CLI:

```bash
python -m smtag.cli.lm.dataprep /data/text/mini_sentences /data/json/mini_sentences
```


## Train VAE LM

In [14]:
from smtag.train.train_lm import (
    train as train_lm,
    TrainingArgumentsLM
)

In [15]:
# Inputs for the train_lm call below.
no_cache = False  # presumably False allows reuse of a previously prepared dataset cache -- TODO confirm
path = "./smtag/loader/loader_lm.py"  # dataset loading script (reported in the training log as the dataset loader)
data_config_name = "SEQ2SEQ"  # dataset configuration name (the log shows "bio_lang/SEQ2SEQ")
tokenizer = config.tokenizer  # tokenizer has to be the same application-wide

In [18]:
# Minimal demo run: a single epoch, logging every 5 steps, batches of 4 per
# device. Every other field keeps its TrainingArgumentsLM default (see repr).
training_args_vae = TrainingArgumentsLM(
    num_train_epochs=1,
    logging_steps=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)
training_args_vae

TrainingArgumentsLM(output_dir='/lm_models', overwrite_output_dir=True, do_train=False, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=True, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, log_level=-1, log_level_replica=-1, log_on_each_node=True, logging_dir='/lm_models/runs/Feb05_17-38-30_d3fc75aa7886', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=5, save_on_each_node=False, no_cuda=False, seed=42, bf16=Fa

In [20]:
# Launch training with the inputs defined above (all passed positionally).
# NOTE(review): the saved output of this cell ends with
# "AttributeError: 'dict' object has no attribute 'detach'", raised after the
# checkpoint was saved and once prediction on the 4 test examples had started.
# Presumably the failure is in the test/prediction path -- TODO investigate
# before relying on test results from this call.
train_lm(
    no_cache,
    path,
    tokenized_examples,
    data_config_name,
    training_args_vae,
    tokenizer
)



tokenizer vocab size: 50265

Loading datasets found in /data/json/mini_sentences.
using ./smtag/loader/loader_lm.py as dataset loader.
Downloading and preparing dataset bio_lang/SEQ2SEQ to /cache/bio_lang/SEQ2SEQ-bf6d38cd1d68b0e5/0.0.1/fb8223194f3b4d2c2f0c8c81326915fa02e07d4d90a3852fab1bd897761142c8...


0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset bio_lang downloaded and prepared to /cache/bio_lang/SEQ2SEQ-bf6d38cd1d68b0e5/0.0.1/fb8223194f3b4d2c2f0c8c81326915fa02e07d4d90a3852fab1bd897761142c8. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]


Training with 74 examples.
Evaluating on 29 examples.

Training arguments:
TrainingArgumentsLM(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=5,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
lo

***** Running training *****
  Num examples = 74
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5


CUDA available: True
Available devices  4
Current cuda device  0




Step,Training Loss,Validation Loss,Loss,Supp Data Lm Loss,Supp Data Adj Distro Loss,Supp Data L Adj Sparse,Supp Data L Dag,Supp Data L Node Sparse,Runtime,Samples Per Second,Steps Per Second
5,176.0198,1214.44458,118.381729,7.204056,111.222267,0.328185,923.880249,0.399622,0.6686,43.373,2.991


INFO:***** Running Evaluation *****
INFO:  Num examples = 29
INFO:  Batch size = 16




[32;1m<s>[0m[31;1mThe[micro][0m[31;1m-[RNA][0m[31;1m.[abundance][0m[31;1m was[has][0m[32;1m been[0m[31;1m a[shown][0m[32;1m to[0m[31;1m be[depend][0m[32;1m on[0m[32;1m the[0m[31;1m former[amount][0m[32;1m of[0m[32;1m the[0m[31;1m former[micro][0m[31;1m-[processor][0m[31;1m of[components][0m[31;1m of[or][0m[31;1m the[,][0m[31;1m the[in][0m[31;1m the[some][0m[31;1m of[cases][0m[32;1m,[0m[31;1m the[on][0m[31;1m the[specific][0m[31;1m software[auxiliary][0m[31;1m software[co][0m[32;1m-[0m[31;1mf[fact][0m[31;1m,[ors][0m[32;1m.[0m[31;1mThe[</s>][0m


◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦
◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦
◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦
◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦
◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦
◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦
◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦
◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦◦



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /lm_models
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
INFO:***** Running Prediction *****
INFO:  Num examples = 4
INFO:  Batch size = 16


Testing on 4.


AttributeError: 'dict' object has no attribute 'detach'

Same with CLI:

```bash
python -m smtag.cli.lm.train \
./smtag/loader/loader_lm.py \
SEQ2SEQ \
--data_dir /data/json/mini_sentences \
--num_train_epochs=1 \
--logging_steps=5 \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \
--no_cache
```