In [1]:
from pathlib import Path

In [2]:
# Load the shared configuration object and display it so the settings used
# for this run are recorded in the cell output.
from smtag.config import config
config

Config(vocab_siz=54000, max_length=64, truncation=True, min_char_length=120, split_ratio={'train': 0.7, 'eval': 0.2, 'test': 0.1, 'max_eval': 10000, 'max_test': 10000}, celery_batch_size=1000, from_pretrained='facebook/bart-base', model_type='GraphRepresentation', tokenizer=PreTrainedTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), nlp=<spacy.lang.en.English object at 0x7f7fc185de20>)

In [3]:
# Record the installed transformers version alongside the results.
from transformers import __version__
__version__

'4.15.0'

## Extracting examples for VAE LM

In [4]:
from smtag.extract import ExtractorXML

In [4]:
# Inputs for the extraction step; each value is passed to ExtractorXML below.
corpus = "/data/xml/mini"  # corpus location, given as the first positional argument
text_examples = "/data/text/mini_sentences"  # passed as destination_dir
xpath = ".//abstract"  # XPath expression handed to the extractor
sentence_level = True  # NOTE(review): presumably splits the selected text into sentences — confirm
keep_xml = False  # NOTE(review): presumably drops XML markup from the extracted text — confirm
inclusion_probability = 1.0  # NOTE(review): presumably the probability of keeping each example — confirm

In [6]:
!rm -fr /data/text/mini_sentences

In [7]:
# Gather the keyword options first, then build the extractor for the corpus.
extraction_options = dict(
    destination_dir=text_examples,
    sentence_level=sentence_level,
    xpath=xpath,
    keep_xml=keep_xml,
    inclusion_probability=inclusion_probability,
)
extractor_lm = ExtractorXML(corpus, **extraction_options)

/data/text/mini_sentences created


In [8]:
# Run the extraction over the corpus. The returned mapping (displayed below)
# pairs each split file (train/eval/test) with an integer count.
extractor_lm.extract_from_corpus()

100%|██████████| 1/1 [00:00<00:00,  3.48it/s]
100%|██████████| 1/1 [00:00<00:00, 25.11it/s]
100%|██████████| 1/1 [00:00<00:00, 29.59it/s]


{PosixPath('/data/text/mini_sentences/train.txt'): 82,
 PosixPath('/data/text/mini_sentences/eval.txt'): 12,
 PosixPath('/data/text/mini_sentences/test.txt'): 13}

Same with CLI:
    
```bash
python -m smtag.cli.prepro.extract \
/data/xml/mini \
/data/text/mini_sentences \
--xpath ".//abstract" \
--sentence_level
```

## Preparing tokenized dataset for VAE LM

In [9]:
from smtag.dataprep import PreparatorLM

In [5]:
# Directory for the tokenized dataset: passed to PreparatorLM as its second
# argument and later to train_lm.
tokenized_examples = "/data/json/mini_sentences"

In [11]:
!rm -fr /data/json/mini_sentences

In [12]:
# Build the preparator that reads the extracted text and writes the tokenized
# dataset; the maximum length comes from the shared config.
prep_lm = PreparatorLM(text_examples, tokenized_examples, max_length=config.max_length)

/data/json/mini_sentences created


In [13]:
# Prepare each split (train, eval, test); the run prints a length check and
# token-length statistics per split, as shown in the output below.
prep_lm.run()

Preparing: train


100%|██████████| 82/82 [00:00<00:00, 1522.95it/s]



Length verification: OK!

average input_ids length = 41 (min=26, max=64) tokens
longest example: <s>To establish the mechanism for adaptive immunity provided by the Streptococcus thermophilus CRISPR4-Cas (CRISPR-associated) system (St-CRISPR4-Cas), we isolated an effector complex (St-Cascade) containing 61-nucleotide CRISPR</s>
shortest example: <s>Campisi and colleagues now show that this also requires additional signalling via the stress-activated p38MAP kinase pathway.</s>
Preparing: eval


100%|██████████| 12/12 [00:00<00:00, 416.88it/s]



Length verification: OK!

average input_ids length = 43 (min=29, max=64) tokens
longest example: <s>Our findings are as follows: (i) the synthetic sites function when separated by several different prokaryotic or eukaryotic DNA fragments providing bulk intron sequence, (ii) intron size need not be greater than 29 bp, (iii) an AG dinucleotide 11 bp upstream</s>
shortest example: <s>The structure allows identification of a putative Chi-recognition site in an inactivated helicase domain of the AddB subunit.</s>
Preparing: test


100%|██████████| 13/13 [00:00<00:00, 585.92it/s]


Length verification: OK!

average input_ids length = 45 (min=31, max=62) tokens
longest example: <s>Unlike proteasomes from eukaryotic cells which are composed of 10-20 different subunits, the Thermoplasma proteasome is made of only two types of subunit, alpha and beta, which have molecular weights of 25.8 and 22.3 kDa, respectively.</s>
shortest example: <s>In this communication we present a three-dimensional stoichiometric model of the archaebacterial proteasome deduced from electron microscopic investigations.</s>





Same with CLI:

```bash
python -m smtag.cli.lm.dataprep /data/text/mini_sentences /data/json/mini_sentences
```


## Train VAE LM

In [6]:
from smtag.train.train_lm import (
    train as train_lm,
    TrainingArgumentsLM
)

In [7]:
# Inputs for the training call below. The model settings are read from the
# shared `config` (the same object that provides the tokenizer) instead of
# being re-typed here, so the model and tokenizer cannot drift apart.
no_cache = False
loader_path = "./smtag/loader/loader_lm.py"
data_config_name = "SEQ2SEQ"
tokenizer = config.tokenizer  # tokenizer has to be the same application-wide
model_type = config.model_type  # 'GraphRepresentation' in the config displayed above
from_pretrained = config.from_pretrained  # 'facebook/bart-base' in the config displayed above

In [8]:
# Short demonstration run: one epoch, frequent logging, small per-device
# batches. Every other field keeps its TrainingArgumentsLM default.
training_args = TrainingArgumentsLM(
    num_train_epochs=1,
    logging_steps=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)
# Display the resolved arguments, defaults included.
training_args

TrainingArgumentsLM(output_dir='/lm_models', overwrite_output_dir=True, do_train=False, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=True, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, log_level=-1, log_level_replica=-1, log_on_each_node=True, logging_dir='/lm_models/runs/Feb10_17-14-18_ff6b0b617c42', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=5, save_on_each_node=False, no_cuda=False, seed=42, bf16=Fa

In [9]:
# Launch training on the tokenized dataset with the arguments defined above.
# NOTE(review): the output recorded below shows this call raising a
# RuntimeError inside torch's parallel_apply on a host with 4 CUDA devices
# (lm_head in smtag/models/vae.py receives a 0-D tensor). Presumably this is
# related to multi-GPU replication — TODO confirm and fix before relying on
# this cell.
train_lm(
    training_args,
    loader_path,
    data_config_name,
    tokenized_examples,
    no_cache,
    tokenizer,
    model_type,
    from_pretrained
)



tokenizer vocab size: 50265

Loading datasets found in /data/json/mini_sentences.
using ./smtag/loader/loader_lm.py as dataset loader.


  0%|          | 0/3 [00:00<?, ?it/s]


Training with 74 examples.
Evaluating on 29 examples.

Training arguments:
TrainingArgumentsLM(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=5,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
lo

***** Running training *****
  Num examples = 74
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5


CUDA available: True
Available devices  4
Current cuda device  0


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 744, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/app/smtag/models/vae.py", line 343, in forward
    logits = self.lm_head(outputs[0])
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 744, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 94, in forward
    return F.linear(input, self.weight, self.bias)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py", line 1667, in linear
    output = input.matmul(weight.t())
RuntimeError: both arguments to matmul need to be at least 1D, but they are 0D and 2D


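Same with CLI (note that this example uses `--logging_steps=100` and passes `--no_cache`, whereas the cells above use `logging_steps=5` and `no_cache = False`):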

```bash
python -m smtag.cli.lm.train \
smtag/loader/loader_lm.py \
SEQ2SEQ \
--data_dir /data/json/mini_sentences \
--num_train_epochs=1 \
--logging_steps=100 \
--per_device_train_batch_size=4 \
--per_device_eval_batch_size=4 \
--no_cache \
--model_type="GraphRepresentation" \
--from_pretrained="facebook/bart-base"
```