In [6]:
from pathlib import Path

In [7]:
from smtag.config import config
# Display the active smtag configuration; its repr (shown below) lists settings
# such as max_length and from_pretrained, which later cells read.
config

Config(max_length=512, truncation=True, min_char_length=120, celery_batch_size=1000, from_pretrained='', model_type='Autoencoder', nlp=<spacy.lang.en.English object at 0x7fb4de00df40>)

In [8]:
# Record the installed transformers version. The package is imported as a module
# and the attribute read from it, so no bare `__version__` name is left behind in
# the notebook namespace (same pattern as the torch version check).
import transformers
transformers.__version__

'4.15.0'

In [3]:
import torch
# Print the installed PyTorch version so the run environment is on record.
print(torch.__version__)

1.11.0a0+bfe5ad2


In [9]:
# Inspect the mask token currently set on the tokenizer held by the config.
config.tokenizer.mask_token

'\ue003'

## Extracting examples for LM

In [67]:
from smtag.extract import ExtractorXML

In [37]:
# List the corpus directory; the output below shows the eval/test/train sub-folders.
! dir /data/xml/emboj_all

eval  test  train


In [38]:
# Remove any previous extraction output so the run starts from an empty destination.
# NOTE(review): this literal path duplicates `text_examples` (defined in the next
# cell); keep the two in sync.
! rm -fr /data/text/emboj_abstract_test

In [2]:
# Settings for extracting language-model examples; all six values are passed to
# ExtractorXML in the next cell.
corpus = "/data/xml/emboj_all"  # source directory (listed above with eval/test/train sub-folders)
text_examples = "/data/text/emboj_abstract_test"  # destination directory for the extracted text files
xpath = ".//abstract"  # XPath expression selecting <abstract> elements
sentence_level = False  # presumably keeps each selected element whole instead of splitting into sentences — TODO confirm
keep_xml = False  # presumably emits plain text without XML markup — TODO confirm
inclusion_probability = 1.0  # presumably the probability of keeping each example (1.0 = keep all) — TODO confirm

In [40]:
# Build the extractor from the settings defined above: XML is read from `corpus`
# and text examples go to `text_examples`. Per the cell output, construction
# reports the destination directory as created.
extractor_lm = ExtractorXML(
    corpus,
    xpath=xpath,
    sentence_level=sentence_level,
    keep_xml=keep_xml,
    inclusion_probability=inclusion_probability,
    destination_dir=text_examples,
)

/data/text/emboj_abstract_test created


In [41]:
# Run the extraction. The displayed result maps each output file
# (train/eval/test .txt) to a count — presumably the number of examples
# written; TODO confirm.
extractor_lm.extract_from_corpus()

100%|██████████| 12/12 [00:31<00:00,  2.60s/it]
100%|██████████| 4/4 [00:10<00:00,  2.54s/it]
100%|██████████| 4/4 [00:10<00:00,  2.66s/it]


{PosixPath('/data/text/emboj_abstract_test/train.txt'): 11016,
 PosixPath('/data/text/emboj_abstract_test/eval.txt'): 3661,
 PosixPath('/data/text/emboj_abstract_test/test.txt'): 3685}

The same extraction can be run from the command line (shown here on the `mini` corpus):

```bash
python -m smtag.cli.prepro.extract /data/xml/mini/ /data/text/mini --xpath ".//abstract"
```

## Preparing tokenized dataset for LM

In [6]:
from smtag.dataprep import PreparatorLM

In [3]:
# Destination directory for the tokenized examples; passed to PreparatorLM below
# and later to train_lm.
tokenized_examples = "/data/json/emboj_abstract_test"

In [8]:
# Remove any previous tokenized output so the preparation starts clean.
# NOTE(review): this literal path duplicates `tokenized_examples`; keep the two in sync.
! rm -fr /data/json/emboj_abstract_test

In [9]:
# Preparator that reads the extracted text from `text_examples` and writes to
# `tokenized_examples`, with the length limit taken from the shared config.
# Per the cell output, construction reports the destination directory as created.
prep_lm = PreparatorLM(text_examples, tokenized_examples, max_length=config.max_length)

/data/json/emboj_abstract_test created


In [10]:
# Process every split. Per the output below, each split gets a progress bar, a
# length verification, and input_ids length statistics with the longest and
# shortest example.
prep_lm.run()

Preparing: train


100%|██████████| 11016/11016 [00:27<00:00, 393.88it/s]



Length verification: OK!

average input_ids length = 497 (min=128, max=512) tokens
longest example: [CLS]Overexpression of the erbB-2 gene contributes to aggressive behavior of various human adenocarcinomas, including breast cancer, through an unknown molecular mechanism. The erbB-2-encoded protein is a member of the ErbB family of growth factor receptors, but no direct ligand of ErbB-2 has been reported. We show that in various cells ErbB-2 can form heterodimers with both EGF receptor (ErbB-1) and NDF receptors (ErbB-3 and ErbB-4), suggesting that it may affect the action of heterologous ligands without the[SEP]
shortest example: [CLS]A newly identified chromosome end-protection factor removes polysumoylated Rap1 from telomeres to prevent chromosomal fusions.[SEP]
Preparing: eval


100%|██████████| 3661/3661 [00:07<00:00, 477.07it/s]



Length verification: OK!

average input_ids length = 498 (min=125, max=512) tokens
longest example: [CLS]Programmed necrosis induced by DNA alkylating agents, such as MNNG, is a caspase-independent mode of cell death mediated by apoptosis-inducing factor (AIF). After poly(ADP-ribose) polymerase 1, calpain, and Bax activation, AIF moves from the mitochondria to the nucleus where it induces chromatinolysis and cell death. The mechanisms underlying the nuclear action of AIF are, however, largely unknown. We show here that, through its C-terminal proline-rich binding domain (PBD, residues 543-559), AIF associate[SEP]
shortest example: [CLS]The DNA damage-inducible protein MCPIP1 teams up with a deubiquitinating enzyme to cleave linear ubiquitin chains off NEMO.[SEP]
Preparing: test


100%|██████████| 3685/3685 [00:07<00:00, 483.64it/s]



Length verification: OK!

average input_ids length = 497 (min=129, max=512) tokens
longest example: [CLS]We have previously identified a protein, consisting of seven WD-repeats, forming a putative β-propeller, and an FYVE domain, ProF, which is highly expressed in 3T3-L1 cells, a cell line that can be differentiated into adipocytes. We recently found ProF to interact with the kinases Akt and protein kinase Cζ. Here we demonstrate that ProF is a positive regulator of adipogenesis. Knockdown of ProF by RNA interference leads to decreased adipocyte differentiation. This is shown by reduced lipid accumulation, d[SEP]
shortest example: [CLS]PIAS3- and SENP2-controlled SUMOylation protects the p53 co-activator hnRNP-K from Mdm2-mediated ubiquitination upon UV damage.[SEP]


The same preparation can be run from the command line (shown here on the `mini` dataset):

```bash
python -m smtag.cli.lm.dataprep /data/text/mini /data/json/mini
```

## Train LM

In [10]:
from smtag.train.train_lm import (
    train as train_lm,
    TrainingArgumentsLM
)

In [11]:
# Inputs for the language-model training run; every name below is passed
# positionally to train_lm in the training cell.
no_cache = True  # forwarded as train_lm's no_cache argument
# Dataset loading script; the training log reports it as the dataset loader.
# NOTE(review): the earlier comment referred to "./lm/lm.py", which does not match this path.
loader_path = "./smtag/loader/loader_lm.py"
data_config_name = "DET"  # dataset configuration name (the log shows the dataset as bio_lang/DET)
tokenizer = config.tokenizer  # tokenizer has to be the same application-wide
# NOTE(review): duplicates the model_type value shown in the config repr; consider
# reading config.model_type so the two cannot drift apart.
model_type = "Autoencoder"
from_pretrained = config.from_pretrained  # empty string in the config repr — presumably means training from scratch; TODO confirm

In [26]:
# Override the mask token on the shared tokenizer. `tokenizer` is the same object
# as config.tokenizer, so the change is visible wherever that object is used.
# NOTE(review): the mask token displayed earlier was the single character '\ue003'
# and the training log reports a vocab size of 1114112. Whether the multi-character
# '<mask>' maps to a valid id for this tokenizer/model should be confirmed — an
# out-of-range id is one possible cause of the CUDA error in the training cell.
tokenizer.mask_token = '<mask>'
# Disabled experiment, kept for reference:
# tokenizer.unk_token = '<unk>'

In [12]:
# Training arguments for the LM run; fields not set here keep the defaults
# visible in the repr displayed below.
# NOTE(review): the `_tokcl` suffix looks like a carry-over from a token
# classification notebook; the object is a TrainingArgumentsLM and is passed to
# train_lm. Rename together with the training cell if desired.
training_args_tokcl = TrainingArgumentsLM(
    num_train_epochs = 30,
    logging_steps = 100,
    per_device_train_batch_size=4, # 32 noted as the alternative value — presumably reduced to fit GPU memory; TODO confirm
    per_device_eval_batch_size=3, # 32 noted as the alternative value — presumably reduced to fit GPU memory; TODO confirm
)
# Display the resolved arguments.
training_args_tokcl

TrainingArgumentsLM(output_dir='/lm_models', overwrite_output_dir=True, do_train=False, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=True, per_device_train_batch_size=4, per_device_eval_batch_size=3, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=30, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, log_level=-1, log_level_replica=-1, log_on_each_node=True, logging_dir='/lm_models/runs/Feb15_06-03-46_1d98755ab36c', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=100, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=5, save_on_each_node=False, no_cuda=False, seed=42, bf16

In [13]:
# Launch LM training with the inputs assembled in the cells above. All arguments
# are positional, so their order must match train_lm's signature.
# NOTE(review): the saved output of this cell ends with
# "RuntimeError: CUDA error: an illegal memory access was encountered" after
# reporting _n_gpu=4. Re-running with CUDA_LAUNCH_BLOCKING=1 (or on a single
# device / CPU) should help locate the failing operation; the cell should not be
# left in a failing state in the published notebook.
train_lm(
    training_args_tokcl,
    loader_path,
    data_config_name,
    tokenized_examples,
    no_cache,
    tokenizer,
    model_type,
    from_pretrained
)



tokenizer vocab size: 1114112

Loading datasets found in /data/json/emboj_abstract_test.
using ./smtag/loader/loader_lm.py as dataset loader.
Downloading and preparing dataset bio_lang/DET (download: Unknown size, generated: 43.68 MiB, post-processed: Unknown size, total: 43.68 MiB) to /cache/bio_lang/DET-df7c34febfa51e88/0.0.1/ff9dbe678aba55520bef244a55868bd4acf85eb086ce21a8454da23294256c2e...


0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset bio_lang downloaded and prepared to /cache/bio_lang/DET-df7c34febfa51e88/0.0.1/ff9dbe678aba55520bef244a55868bd4acf85eb086ce21a8454da23294256c2e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]


Training with 11016 examples.
Evaluating on 3661 examples.

Training arguments:
TrainingArgumentsLM(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=100,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level

***** Running training *****
  Num examples = 11016
  Num Epochs = 30
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 20670


CUDA available: True
Available devices  4
Current cuda device  0




RuntimeError: CUDA error: an illegal memory access was encountered