In [1]:
from pathlib import Path

In [2]:
# Load the shared smtag configuration object and echo it as the cell output,
# so the settings in effect for this run are recorded in the notebook.
from smtag.config import config
config

Config(vocab_siz=54000, max_length=128, truncation=True, min_char_length=120, split_ratio={'train': 0.7, 'eval': 0.2, 'test': 0.1, 'max_eval': 10000, 'max_test': 10000}, celery_batch_size=1000, from_pretrained='facebook/bart-base', model_type='GraphRepresentation', tokenizer=PreTrainedTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), nlp=<spacy.lang.en.English object at 0x7f0befcdd940>)

In [3]:
# Echo the installed transformers version so the run can be reproduced later.
from transformers import __version__
__version__

'4.15.0'

## Extracting examples for TOKCL

In [4]:
from smtag.extract import ExtractorXML

In [5]:
# Parameters for the XML extraction step; all six are handed to ExtractorXML.
corpus = "/data/xml/191012"  # passed to ExtractorXML as its first positional argument
xml_examples = "/data/text/sd_test"  # passed as destination_dir; the split files are written here
xpath = ".//sd-panel"  # NOTE(review): presumably selects the XML elements that become examples — confirm
sentence_level = False  # NOTE(review): presumably turns off splitting of examples into sentences — confirm
keep_xml = True  # NOTE(review): presumably keeps the XML markup in the extracted text — confirm
inclusion_probability = 1.0  # NOTE(review): presumably the probability of keeping an example (1.0 = all) — confirm

In [6]:
# Start from a clean slate: remove whatever a previous extraction run left
# behind. The path comes from `xml_examples` rather than being typed out a
# second time, so the directory deleted here is always the one that
# ExtractorXML is told to write to.
import shutil

shutil.rmtree(xml_examples, ignore_errors=True)  # like `rm -fr`: silent if the directory is absent

In [7]:
# Build the extractor for `corpus`; its output goes to `xml_examples`.
# Every option is taken from the parameter cell rather than hard-coded here.
extractor_tokcl = ExtractorXML(
    corpus,
    xpath=xpath,
    keep_xml=keep_xml,
    sentence_level=sentence_level,
    inclusion_probability=inclusion_probability,
    destination_dir=xml_examples,
)

/data/text/sd_test created


In [8]:
# Run the extraction over the corpus. The value echoed as the cell output maps
# each split file that was written (train / eval / test) to a count —
# presumably the number of examples saved in that file; confirm against ExtractorXML.
extractor_tokcl.extract_from_corpus()

100%|██████████| 1/1 [00:24<00:00, 24.03s/it]
100%|██████████| 1/1 [00:07<00:00,  7.44s/it]
100%|██████████| 1/1 [00:03<00:00,  3.50s/it]


{PosixPath('/data/text/sd_test/train.txt'): 15859,
 PosixPath('/data/text/sd_test/eval.txt'): 4869,
 PosixPath('/data/text/sd_test/test.txt'): 2399}

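Same via CLI (note: the command below passes `--sentence_level`, whereas the cell above used `sentence_level = False`; if the flag enables sentence-level extraction, drop it to reproduce the notebook run exactly):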

```bash
python -m smtag.cli.prepro.extract /data/xml/191012/ /data/text/sd_test --xpath ".//sd-panel" --sentence_level --keep_xml --inclusion_probability 1.0
```

## Preparing tokenized dataset for VAE TOKCL

In [9]:
from smtag.dataprep import PreparatorTOKCL

In [10]:
from smtag.xml2labels import SourceDataCodes as sd

In [11]:
# Remove the tokenized dataset left over from a previous run so that it is rebuilt from scratch.
!rm -fr /data/json/sd_test

In [12]:
# Destination directory for the tokenized, labeled examples.
tokenized_examples = "/data/json/sd_test"

# Label code maps handed to PreparatorTOKCL, in this order.
# Plain assignment on purpose: annotating the name with itself
# (`code_maps: code_maps = ...`) would record the list as its own type hint.
code_maps = [
    sd.ENTITY_TYPES,
    sd.GENEPROD_ROLES,
    sd.BORING,
    sd.PANELIZATION,
]

# Reuse the maximum length from the shared config instead of a second literal.
max_length = config.max_length

In [13]:
# Preparator for the token-classification dataset: it is given the extracted
# examples directory, the output directory, the label code maps and the
# maximum example length.
prep_tokcl = PreparatorTOKCL(xml_examples, tokenized_examples, code_maps, max_length=max_length)

/data/json/sd_test created


In [14]:
# Run the preparation. Per the recorded output it processes the train, eval and
# test splits in turn and prints, for each, a length verification plus the
# average / min / max input_ids length and the longest and shortest example.
prep_tokcl.run()

Preparing: train


  8%|▊         | 1292/15859 [00:02<00:31, 464.70it/s]



  9%|▉         | 1451/15859 [00:03<00:28, 503.42it/s]



 13%|█▎        | 2041/15859 [00:04<00:25, 543.32it/s]



 15%|█▍        | 2317/15859 [00:04<00:27, 488.07it/s]



 36%|███▌      | 5644/15859 [00:11<00:21, 482.80it/s]



 39%|███▊      | 6127/15859 [00:12<00:21, 458.89it/s]



 49%|████▉     | 7785/15859 [00:16<00:14, 543.75it/s]



 59%|█████▉    | 9350/15859 [00:19<00:13, 486.81it/s]



 95%|█████████▌| 15108/15859 [00:31<00:01, 434.32it/s]



100%|██████████| 15859/15859 [00:32<00:00, 480.76it/s]



Length verification: OK!

average input_ids length = 91 (min=24, max=128) tokens
longest example: <s>Knockdown of components of the BRM complex reduces Su(H) recruitment both in Notch-OFF (B) and Notch-ON (C) conditions. Fold enrichment of Su(H) occupancy at the indicated positions detected by ChIP, relative to input, in Kc167 cells treated with brm, Snr1 or GFP RNAi as a control. Notch-ON conditions (C) were induced by 30 minutes of EGTA treatment. Mean +/- SEM, n = 3 (B); Mean, n = 2 (C); * p<0.05 with one-tailed student's</s>
shortest example: <s>E Ten-fold serial dilutions and subsequent colony formation of the indicated mutant cells at the indicated temperatures. </s>
Preparing: eval


  9%|▉         | 433/4869 [00:00<00:08, 513.92it/s]



 55%|█████▍    | 2677/4869 [00:05<00:04, 526.82it/s]



100%|██████████| 4869/4869 [00:09<00:00, 487.89it/s]



Length verification: OK!

average input_ids length = 88 (min=26, max=128) tokens
longest example: <s>(F) Bar graphs showing the mean velocity and the mean run length of processive DDB events for the three Lis1 conditions (as in A, B and C). Error bars are the s.e.m. Mean velocities were 0.37 ± 0.02 µm/s (A), 0.38 ± 0.03 µm/s (B), and 0.35 ± 0.03 µm/s (C). Mean run lengths were 3.1 ± 0.4 µm (A), 3.2 ± 0.5 µm (B), and 2.9 ±</s>
shortest example: <s>Number of soft agar colonies (n=3 independent experiments formed by Rat2 cells stably expressing the indicated constructs </s>
Preparing: test


100%|██████████| 2399/2399 [00:05<00:00, 466.32it/s]



Length verification: OK!

average input_ids length = 90 (min=26, max=128) tokens
longest example: <s>C and D, Motoneuron defects induced in zebrafish embryos after expression of the indicated ALS-linked PDI mutants and wild-type controls (PDIA1WT and PDIA1R300H: 80 pg mRNA/embryo; ERp57WT, ERp57D217N, ERp57D217N and ERp57Q481K: 30 pgtba mRNA/embryo). The most frequent global phenotypes induced by PDI injection are shown in lateral views of embryos at 24 hpf (left column). Black arrows indicate the presence of curly tail and/or shorter axis</s>
shortest example: <s>B. Indicated stable RPE clones in densely cultured conditions were analyzed by Western blotting for the indicated proteins. </s>


Same via CLI:
    
```bash
python -m smtag.cli.tokcl.dataprep /data/text/sd_test /data/json/sd_test
```

## Train VAE model for TOKCL

In [15]:
from smtag.train.train_tokcl import (
    train as train_tokcl,
    TrainingArgumentsTOKCL
)

In [16]:
# Training arguments for the token-classification run. Only the values that
# this notebook sets explicitly are listed; everything else keeps the
# TrainingArgumentsTOKCL default. The object is echoed so the full set of
# arguments is visible in the cell output.
training_args_tokcl = TrainingArgumentsTOKCL(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    logging_steps=100,
    masking_probability=0.0,
    replacement_probability=0.0,
)
training_args_tokcl

TrainingArgumentsTOKCL(output_dir='/tokcl_models', overwrite_output_dir=True, do_train=False, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=True, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=0.0001, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, log_level=-1, log_level_replica=-1, log_on_each_node=True, logging_dir='/tokcl_models/runs/Feb12_23-19-41_ff6b0b617c42', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=100, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=5, save_on_each_node=False, no_cuda=False, se

In [17]:
# Inputs for train_tokcl().
no_cache = True  # NOTE(review): presumably forces the dataset to be rebuilt instead of reusing a cached copy — confirm
loader_path = "./smtag/loader/loader_tokcl.py"  # dataset loading script
data_config_name = "NER"  # dataset configuration name given to the loader

# Tokenizer, model type and pretrained checkpoint all come from the shared
# config so that they cannot drift apart: the tokenizer has to match the
# checkpoint the model is initialized from.
tokenizer = config.tokenizer
model_type = config.model_type  # 'GraphRepresentation' in the config echoed at the top of the notebook
from_pretrained = config.from_pretrained  # 'facebook/bart-base' in the config echoed at the top of the notebook

In [15]:
# Launch training with the arguments, loader, dataset and model settings
# defined in the cells above.
# NOTE(review): the output recorded for this cell ends with
# `KeyError: 'special_tokens_mask'`, raised while the dataset was being
# prepared ("0 examples"), so no training actually took place in the saved run.
# The cell's execution count (15) is also out of sequence with the cells
# before it (16, 17). Re-run from a fresh kernel and check that the JSON
# examples provide the fields the loader expects before relying on this notebook.
train_tokcl(
    training_args_tokcl,
    loader_path,
    data_config_name,
    tokenized_examples,
    no_cache,
    tokenizer,
    model_type,
    from_pretrained
)



tokenizer vocab size: 50265

Loading and tokenizing datasets found in /data/json/sd_test.
using ./smtag/loader/loader_tokcl.py as dataset loader.
Downloading and preparing dataset source_data_nlp/NER to /cache/source_data_nlp/NER-e90bf469e54b0531/0.0.1/c48c223ec09c8e1fd9e2312185ea3720f8c653a7c58323168790637ebb931f03...


0 examples [00:00, ? examples/s]

KeyError: 'special_tokens_mask'

Same via CLI (note: this command trains for 20 epochs, whereas the cell above used `num_train_epochs=5`):
    
```bash
python -m smtag.cli.tokcl.train \
./smtag/loader/loader_tokcl.py NER \
--data_dir /data/json/sd_test \
--num_train_epochs=20 --logging_steps=100 \
--per_device_train_batch_size=16 \
--per_device_eval_batch_size=16 \
--replacement_probability=.0 \
--masking_probability=.0 \
--model_type="GraphRepresentation" \
--from_pretrained="facebook/bart-base"
```