In [19]:
from pathlib import Path

In [20]:
from smtag.config import config
# Display the active smtag configuration; the recorded output shows, among
# others, max_length=512, the train/eval/test split ratios and the tokenizer.
config

Config(vocab_siz=54000, max_length=512, truncation=True, min_char_length=120, split_ratio={'train': 0.7, 'eval': 0.2, 'test': 0.1, 'max_eval': 10000, 'max_test': 10000}, celery_batch_size=1000, from_pretrained='roberta-base', model_type='Autoencoder', tokenizer=PreTrainedTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), nlp=<spacy.lang.en.English object at 0x7fd5da5b7550>)

In [21]:
from transformers import __version__
# Show the installed transformers version ('4.15.0' when this notebook was last run).
__version__

'4.15.0'

## Extracting examples for TOKCL

In [22]:
from smtag.extract import ExtractorXML

In [23]:
# Extraction options (handed to ExtractorXML in the cell that follows).
xpath = ".//sd-panel"  # XPath expression matching every <sd-panel> descendant element
keep_xml = True
sentence_level = False
inclusion_probability = 1.0

# Source XML corpus and destination directory for the extracted examples.
corpus = "/data/xml/191012"
xml_examples = "/data/text/sd_test"

In [24]:
# Optional cleanup, disabled by default: uncomment to delete the extraction
# output directory (same path as `xml_examples`) before extracting again.
# NOTE(review): presumably needed because the extractor expects to create the
# directory itself — confirm against ExtractorXML.
# !rm -fr /data/text/sd_test

In [25]:
# Instantiate the XML extractor from the settings defined above.
# Constructing it already reports "<destination_dir> created" in the cell
# output, so the destination directory is set up at this point.
extractor_tokcl = ExtractorXML(
    corpus,
    xpath=xpath,
    sentence_level=sentence_level,
    keep_xml=keep_xml,
    inclusion_probability=inclusion_probability,
    destination_dir=xml_examples,
)

/data/text/sd_test created


In [26]:
# Run the extraction over the corpus. The displayed return value maps each
# written split file (train.txt / eval.txt / test.txt) to a count — presumably
# the number of examples written; the train count matches the
# "Training with 15859 examples" message printed during training.
extractor_tokcl.extract_from_corpus()

100%|██████████| 1/1 [00:24<00:00, 24.07s/it]
100%|██████████| 1/1 [00:06<00:00,  6.96s/it]
100%|██████████| 1/1 [00:03<00:00,  3.93s/it]


{PosixPath('/data/text/sd_test/train.txt'): 15859,
 PosixPath('/data/text/sd_test/eval.txt'): 4869,
 PosixPath('/data/text/sd_test/test.txt'): 2399}

same via CLI:

```bash
python -m smtag.cli.prepro.extract /data/xml/191012/ /data/text/sd_test --xpath ".//sd-panel" --keep_xml --inclusion_probability 1.0
```

## Preparing tokenized dataset for TOKCL

In [27]:
from smtag.dataprep import PreparatorTOKCL

In [28]:
from smtag.xml2labels import SourceDataCodes as sd

In [30]:
# Optional cleanup, disabled by default: uncomment to delete the tokenized
# dataset directory (same path as `tokenized_examples`) before preparing it again.
# !rm -fr /data/json/sd_test

In [31]:
# Destination directory for the tokenized, labelled examples.
tokenized_examples = "/data/json/sd_test"

# Code maps handed to PreparatorTOKCL in the next cell.
# The original line read `code_maps: code_maps = [...]`, i.e. the variable was
# annotated with itself; that annotation evaluated to the list object and was
# meaningless, so it is dropped here.
code_maps = [
    sd.ENTITY_TYPES,
    sd.GENEPROD_ROLES,
    sd.BORING,
    sd.PANELIZATION,
]

# Reuse the maximum sequence length from the shared smtag config
# (512 in the configuration displayed at the top of this notebook).
max_length = config.max_length

In [32]:
# Build the dataset preparator: reads the extracted examples from
# `xml_examples` and targets `tokenized_examples`, using the code maps and
# maximum length defined above. The cell output reports the destination
# directory as "created" at construction time.
prep_tokcl = PreparatorTOKCL(
    xml_examples,
    tokenized_examples,
    code_maps,
    max_length=max_length
)

/data/json/sd_test created


In [33]:
# Tokenize and label every split. For each split the recorded output shows a
# progress bar, a "Length verification" check, the average/min/max number of
# input_ids, and the longest example.
prep_tokcl.run()

Preparing: train


  8%|▊         | 1279/15859 [00:02<00:29, 487.27it/s]



  9%|▉         | 1446/15859 [00:02<00:27, 517.73it/s]



 13%|█▎        | 2045/15859 [00:03<00:24, 554.82it/s]



 15%|█▍        | 2329/15859 [00:04<00:27, 490.56it/s]



 36%|███▌      | 5650/15859 [00:11<00:20, 498.65it/s]



 39%|███▊      | 6144/15859 [00:12<00:20, 479.15it/s]



 49%|████▉     | 7785/15859 [00:15<00:14, 566.53it/s]



 59%|█████▉    | 9326/15859 [00:18<00:13, 493.17it/s]



 95%|█████████▌| 15118/15859 [00:30<00:01, 387.40it/s]



100%|██████████| 15859/15859 [00:32<00:00, 493.44it/s]



Length verification: OK!

average input_ids length = 104 (min=24, max=512) tokens
longest example: <s>The timeline of GFP-lentivirus and CSP or CSP-TTK21 (one injection, 500 μg/mouse) injection is shown. (Left) The total number of spines was significantly decreased in TAU VEH compared to WT VEH mice. TAU MOL hippocampi showed a significant increase in the total number of spines compared to TAU VEH (One-way ANOVA p=0.0011, post hoc Holm Sidak's multiple comparisons test F(2,243) = 7.03; WT VEH vs. TAU VEH, p < 0.0009; TAU MOL vs. TAU VEH, p=0.0455). (Middle) Based on spine type, the number of head spines (Mushrooms, thins) was significantly lower in both TAU VEH and TAU MOL than in WT VEH controls (One-way ANOVA p=0.0018, post hoc Holm Sidak's multiple comparisons test F(2,243) = 6.467; WT VEH vs. TAU VEH, p=0.0013; WT VEH vs. TAU MOL, p=0.0272. Stubby spine density was significantly increased in TAU MOL compared to TAU VEH mice (One-way ANOVA F(2,243) = 4.311, p = 0.0145, post hoc Hol

 10%|▉         | 475/4869 [00:00<00:08, 506.97it/s]



 13%|█▎        | 618/4869 [00:01<00:10, 415.14it/s]



 55%|█████▍    | 2674/4869 [00:05<00:03, 566.74it/s]



100%|██████████| 4869/4869 [00:09<00:00, 519.03it/s]



Length verification: OK!

average input_ids length = 100 (min=26, max=512) tokens
longest example: <s>C Glucose tolerance test (GTT) was performed on 6-h-fasted mice during 120 min after injection of 1 g glucose/kg of body weight. Curves on the left show blood glucose level monitored after injection of glucose. Histograms on the right show area under curve (AUC). Fasting blood glucose: P (−/− versus +/+; CTRL; t = 0 min) = 0.045; P (−/− versus +/+; CTRL; t = 15 min) = 0.013; P (−/− versus +/+; CTRL; t = 30 min) = 0.016; P (−/− versus +/+; CTRL; t = 60 min) = 0.030; P (−/− versus +/+; CTRL; t = 90 min) = 0.017; P (−/− versus +/+; HFD; t = 60 min) = 0.041; P (−/− versus +/+; HFD; t = 90 min) = 0.043; P (−/− versus +/+; HFD; t = 120 min) = 0.034; P (HFD versus CTRL; +/+; t = 15 min) = 0.0076; P (HFD versus CTRL; +/+; t = 30 min) = 0.0067; P (HFD versus CTRL; +/+; t = 60 min) = 0.00058; P (HFD versus CTRL; +/+; t = 90 min) = 0.0010; P (HFD versus CTRL; +/+; t = 120 min) = 0.023; P (HFD ve

100%|██████████| 2399/2399 [00:04<00:00, 487.08it/s]



Length verification: OK!

average input_ids length = 103 (min=26, max=512) tokens
longest example: <s>Heat-map representing the correlations (expressed as the R2) between the abundances of 73 known proteasome-interacting proteins (PIPs) and the abundances of 8 reference proteins or protein complexes, PA28γ, β2i (representing the iP20S), PA28αβ, ncP20S (median of α1-α7, β3, β4, β6, and β7 profiles), 19S (median of Rpt1-6, Rpn1-3, 5-14 profiles), PI31, β5 (representing the sP20S), and PA200. For protein complexes, the median PAI of their subunits in each of the 24 AP-MS experiments was used: α1-α7, β3, β4, β6, and β7 subunits for the ncP20S, Rpt1-6, Rpn1-3, 5-14 for the 19S RP, and PA28α and PA28β subunits for the PA28αβ RP. The R2 values were hierarchically clustered. Three distinct clusters of composition detailed hereafter could be obtained. Cluster 1 (from top to bottom): Rpt3, Rpn13, α2, Rpn7, USP14, hHR23B, α1, β6, β3, α4, α7, Rpn6, Rpn3, Rpt4, Rpn10, Rpn5, Rpt5, Rpn1, Rpn11, Rpt1

same via CLI:
    
```bash
python -m smtag.cli.tokcl.dataprep /data/text/sd_test /data/json/sd_test
```

## Train model for TOKCL

In [34]:
from smtag.train.train_tokcl import (
    train as train_tokcl,
    TrainingArgumentsTOKCL
)

In [35]:
# Training hyper-parameters; every argument not listed here keeps the default
# of TrainingArgumentsTOKCL (see the repr printed below the cell).
training_args_tokcl = TrainingArgumentsTOKCL(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    logging_steps=100,  # also used to initialize `eval_steps` (see cell output)
    masking_probability=0.0,
    replacement_probability=0.0,
)
# Display the fully resolved arguments.
training_args_tokcl

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


TrainingArgumentsTOKCL(output_dir='/tokcl_models', overwrite_output_dir=True, do_train=False, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=True, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=0.0001, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, log_level=-1, log_level_replica=-1, log_on_each_node=True, logging_dir='/tokcl_models/runs/Feb10_09-23-26_b8af305fb798', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=100, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=5, save_on_each_node=False, no_cuda=False, se

In [37]:
# Dataset loading: loader script, dataset configuration name and cache switch.
loader_path = "./smtag/loader/loader_tokcl.py"
data_config_name = "NER"
no_cache = False

# Tokenizer taken from the shared smtag config.
tokenizer = config.tokenizer

# Starting checkpoint: "EMBO/bio-lm" is the specialized model from
# huggingface.co/embo; "roberta-base" is the general language-model alternative.
from_pretrained = "EMBO/bio-lm"

In [38]:
# Fine-tune and evaluate the token-classification model. Per the recorded
# output, this loads the datasets found in `tokenized_examples` through the
# loader script, evaluates on the eval split every `eval_steps` steps, saves
# checkpoints under /tokcl_models, and ends with a prediction run on the test
# split that prints a per-entity precision/recall/F1 report.
train_tokcl(
    training_args_tokcl,
    loader_path,
    data_config_name,
    tokenized_examples,
    no_cache,
    tokenizer,
    from_pretrained
)

Using custom data configuration NER-e90bf469e54b0531
Reusing dataset source_data_nlp (/cache/source_data_nlp/NER-e90bf469e54b0531/0.0.1/e65d5b9e503369f63bb0dc6ae8e7b78be209fbfa242d23c1ac7ac97b09bef36a)


tokenizer vocab size: 50265

Loading and tokenizing datasets found in /data/json/sd_test.
using ./smtag/loader/loader_tokcl.py as dataset loader.


  0%|          | 0/3 [00:00<?, ?it/s]


Training with 15859 examples.
Evaluating on 4869 examples.

Training on 15 features:
O, I-SMALL_MOLECULE, B-SMALL_MOLECULE, I-GENEPROD, B-GENEPROD, I-SUBCELLULAR, B-SUBCELLULAR, I-CELL, B-CELL, I-TISSUE, B-TISSUE, I-ORGANISM, B-ORGANISM, I-EXP_ASSAY, B-EXP_ASSAY


loading configuration file https://huggingface.co/EMBO/bio-lm/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/09fed88b4a07fe6baced126e3cdb14f2764c1bc57f62d1026a75b3ffdb3ec5f8.c781727f43e25ac5b298f775b2dd4f32f53c9890a2367bbd99ffdbd856251b85
Model config RobertaConfig {
  "_name_or_path": "EMBO/bio-lm",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "labe


Training arguments:
TrainingArgumentsTOKCL(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=100,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0001,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
loggin

Step,Training Loss,Validation Loss
100,0.335,0.181989
200,0.1496,0.173905
300,0.1191,0.181071
400,0.1045,0.172787
500,0.0972,0.17253
600,0.0743,0.189217
700,0.0721,0.183583
800,0.0607,0.192558
900,0.0545,0.193504
1000,0.0515,0.197006


***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0mD[0m,[0m E[0m End[0mogenous[0m[1m[4m[38;5;4m SL[0m[1m[4m[38;5;3mX[0m[1m[4m[38;5;3m4[0m and[0m[1m[4m[38;5;4m ER[0m[1m[4m[38;5;3mCC[0m[1m[4m[38;5;3m1[0m were[0m[1m[4m[38;5;14m immun[0m[1m[4m[38;5;13mop[0m[1m[4m[38;5;13mrec[0m[1m[4m[38;5;13mip[0m[1m[4m[38;5;13mitated[0m from[0m extracts[0m of[0m WT[0m or[0m[1m[4m[38;5;4m ER[0m[1m[4m[38;5;3mCC[0m[1m[4m[38;5;3m1[0m KO[0m[1m[4m[38;5;8m ME[0m[1m[4m[38;5;7mFs[0m ([0mD[0m),[0m or[0m WT[0m or[0m[1m[4m[38;5;4m SL[0m[1m[4m[38;5;3mX[0m[1m[4m[38;5;3m4[0m KO[0m[1m[4m[38;5;8m ME[0m[1m[4m[38;5;7mFs[0m ([0mE[0m)[0m and[0m subjected[0m to[0m[1m[4m[38;5;14m immun[0m[1m[4m[38;5;13mob[0m[1m[4m[38;5;13mlot[0m[1m[4m[38;5;13mting[0m with[0m the[0m indicated[0m antibodies[0m.[0m [0m</s>[0m




***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0m([0mC[0m)[0m[1m[4m[38;5;8m PC[0m[1m[4m[38;5;7m9[0m[1m[4m[38;5;7mER[0m cells[0m were[0m treated[0m overnight[0m with[0m the[0m indicated[0m T[0mK[0mIs[0m,[0m or[0m with[0m 3[0mX[0mm[0mAbs[0m,[0m and[0m whole[0m cell[0m extracts[0m were[0m prepared[0m.[0m Cle[0mared[0m extracts[0m were[0m[1m[4m[38;5;14m elect[0m[1m[4m[38;5;13mroph[0m[1m[4m[38;5;13mores[0m[1m[4m[38;5;13med[0m,[0m and[0m resolved[0m proteins[0m were[0m transferred[0m onto[0m filters[0m.[0m Fil[0mters[0m were[0m[1m[4m[38;5;14m immun[0m[1m[4m[38;5;13mobl[0m[1m[4m[38;5;13motted[0m for[0m the[0m indicated[0m proteins[0m or[0m for[0m their[0m phosph[0mory[0mlated[0m forms[0m.[0m Bl[0mots[0m are[0m representative[0m of[0m two[0m independent[0m experiments[0m.[0m [0m</s>[0m




***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0m[1m[4m[38;5;14mWestern[0m[1m[4m[38;5;13m bl[0m[1m[4m[38;5;13mots[0m showing[0m[1m[4m[38;5;14m co[0m[1m[4m[38;5;13m-[0m[1m[4m[38;5;13mim[0m[1m[4m[38;5;13mmun[0m[1m[4m[38;5;13mop[0m[1m[4m[38;5;13mrec[0m[1m[4m[38;5;13mip[0m[1m[4m[38;5;13mitation[0m of[0m[1m[4m[38;5;4m G[0m[1m[4m[38;5;3mAPD[0m[1m[4m[38;5;3mH[0m from[0m the[0m total[0m l[0mys[0mate[0m ([0mleft[0m)[0m and[0m isolated[0m[1m[4m[38;5;6m mitochond[0m[1m[4m[38;5;5mria[0m ([0mright[0m)[0m of[0m[1m[4m[38;5;10m brain[0m tissues[0m of[0m wild[0m-[0mtype[0m ([0mWT[0m)[0m and[0m trans[0mgenic[0m R[0m[1m[4m[38;5;11m6[0m/[0m[1m[4m[38;5;11m2[0m ([0mHD[0m)[0m[1m[4m[38;5;12m mice[0m with[0m anti[0m-[0m[1m[4m[38;5;4mG[0m[1m[4m[38;5;3mAPD[0m[1m[4m[38;5;3mH[0m antibody[0m.[0m The[0m presence[0m of[0m[1m[4m[38;5;4m hunting[0m[1m[4m[38;5;3mtin[0m was[0m examined[0m with[0m anti[0m-[0m[1m[4m

***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0m([0mD[0m)[0m[1m[4m[38;5;8m PC[0m[1m[4m[38;5;7m9[0m[1m[4m[38;5;7mER[0m cells[0m were[0m treated[0m for[0m the[0m indicated[0m time[0m intervals[0m with[0m[1m[4m[38;5;2m o[0m[1m[4m[38;5;1msim[0m[1m[4m[38;5;1mert[0m[1m[4m[38;5;1min[0m[1m[4m[38;5;1mib[0m ([0m500[0m n[0mM[0m),[0m 3[0mX[0mm[0mAbs[0m ([0m20[0m μg[0m/[0mmL[0m),[0m or[0m the[0m combination[0m of[0m both[0m treatments[0m.[0m Whole[0m cell[0m extracts[0m were[0m analyzed[0m for[0m[1m[4m[38;5;4m c[0m[1m[4m[38;5;3masp[0m[1m[4m[38;5;3mase[0m[1m[4m[38;5;3m 9[0m and[0m cle[0mavage[0m products[0m.[0m[1m[4m[38;5;4m G[0m[1m[4m[38;5;3mAPD[0m[1m[4m[38;5;3mH[0m levels[0m were[0m used[0m for[0m comparison[0m of[0m gel[0m loading[0m.[0m [0m</s>[0m




***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0mF[0m,[0m G[0m[1m[4m[38;5;14m PLA[0m for[0m[1m[4m[38;5;4m Park[0m[1m[4m[38;5;3min[0m and[0m �[0m�[0m[1m[4m[38;5;4mT[0m[1m[4m[38;5;3mau[0m,[0m revealing[0m[1m[4m[38;5;14m interactions[0m via[0m the[0m projection[0m domain[0m of[0m[1m[4m[38;5;4m t[0m[1m[4m[38;5;3mau[0m.[0m Data[0m were[0m analysed[0m with[0m a[0m Mann[0m-[0mWhit[0mney[0m test[0m ([0mU[0m =[0m 185[0m,[0m p[0m =[0m 0[0m.[0m92[0m24[0m,[0m n[0m =[0m 21[0m,[0m 18[0m cells[0m/[0mgroup[0m for[0m h[0m[1m[38;5;4mT[0m[1m[4m[38;5;3mau[0m and[0m �[0m�[0m[1m[4m[38;5;4mT[0m[1m[4m[38;5;3mau[0m,[0m respectively[0m).[0m Data[0m information[0m:[0m Scale[0m bars[0m =[0m 10[0m μ[0mm[0m.[0m Data[0m are[0m given[0m as[0m mean[0m and[0m SEM[0m,[0m **[0m =[0m p[0m <[0m 0[0m.[0m01[0m,[0m ****[0m =[0m p[0m <[0m 0[0m.[0m0001[0m.[0m [0m</s>[0m




Saving model checkpoint to /tokcl_models/checkpoint-500
Configuration saved in /tokcl_models/checkpoint-500/config.json
Model weights saved in /tokcl_models/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0mB[0m.[0m Percentage[0m of[0m[1m[4m[38;5;4m IF[0m[1m[4m[38;5;3mN[0m[1m[4m[38;5;3mγ[0m+[0m/[0m[1m[4m[38;5;4mIL[0m[1m[4m[38;5;3m-[0m[1m[4m[38;5;3m10[0m+[0m live[0m[1m[4m[38;5;4m T[0m[1m[4m[38;5;3mCR[0m[1m[4m[38;5;3mβ[0m+[0m[1m[38;5;4m CD[0m[1m[38;5;3m4[0m+[0m cells[0m from[0m[1m[4m[38;5;12m L[0m[1m[4m[38;5;11m.[0m[1m[4m[38;5;11m don[0m[1m[4m[38;5;11mov[0m[1m[4m[38;5;11mani[0m infected[0m WT[0m ([0mblue[0m)[0m or[0m[1m[4m[38;5;4m mi[0m[1m[4m[38;5;3mR[0m[1m[4m[38;5;3m-[0m[1m[4m[38;5;3m132[0m-[0m/-[0m ([0mred[0m)[0m[1m[4m[38;5;12m mice[0m,[0m determined[0m by[0m[1m[4m[38;5;14m intr[0m[1m[4m[38;5;13mace[0m[1m[4m[38;5;13mllular[0m[1m[4m[38;5;13m cytok[0m[1m[4m[38;5;13mine[0m[1m[4m[38;5;13m st[0m[1m[4m[38;5;13maining[0m.[0m Data[0m representative[0m of[0m 3[0m independent[0m experiments[0m with[0m 3[0m-[0m5[0m[1m[4m[38;5;12m mice[0m per

***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0m([0mE[0m)[0m[1m[4m[38;5;14m Immun[0m[1m[4m[38;5;13moh[0m[1m[4m[38;5;13mist[0m[1m[4m[38;5;13mochemical[0m[1m[4m[38;5;13m st[0m[1m[4m[38;5;13maining[0m for[0m cle[0maved[0m[1m[4m[38;5;4m c[0m[1m[4m[38;5;3masp[0m[1m[4m[38;5;3mase[0m[1m[4m[38;5;3m 3[0m performed[0m on[0m par[0maff[0min[0m-[0membed[0mded[0m sections[0m derived[0m from[0m[1m[38;5;12m xen[0m[1m[38;5;11mog[0m[1m[38;5;11mraft[0m[1m[38;5;11ms[0m of[0m either[0m[1m[4m[38;5;8m PC[0m[1m[4m[38;5;7m9[0mER[0m or[0m[1m[4m[38;5;8m H[0m[1m[4m[38;5;7m1975[0m cells[0m.[0m Two[0m weeks[0m after[0m[1m[4m[38;5;10m tumor[0m inoc[0mulation[0m,[0m[1m[4m[38;5;12m mice[0m were[0m randomized[0m ([0m3[0m-[0m4[0m[1m[4m[38;5;12m mice[0m/[0mgroup[0m)[0m and[0m treated[0m for[0m 12[0m days[0m either[0m with[0m vehicle[0m,[0m 3[0mX[0mm[0mAbs[0m ([0m[1m[4m[38;5;2mCT[0m[1m[4m[38;5;1mX[0m,[0m[1m[4m[38;5;2m 

***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0m([0mB[0m)[0m[1m[4m[38;5;8m Mel[0m[1m[4m[38;5;7mJ[0m[1m[4m[38;5;7mus[0m[1m[4m[38;5;7mo[0m cells[0m were[0m trans[0mfect[0med[0m with[0m two[0m different[0m si[0mRN[0mAs[0m against[0m[1m[4m[38;5;4m NR[0m[1m[4m[38;5;3mAS[0m ([0msi[0m#[0m1[0m,[0m si[0m#[0m2[0m),[0m their[0m combination[0m ([0msi[0mPool[0m)[0m or[0m with[0m a[0m non[0m-[0mtarget[0med[0m si[0mRNA[0m sequence[0m ([0mNT[0msi[0mRNA[0m).[0m A[0m double[0m si[0mRNA[0m trans[0mfect[0mion[0m ([0m48[0m and[0m 72[0m hrs[0m)[0m was[0m performed[0m to[0m have[0m an[0m optimal[0m knock[0mdown[0m.[0m Cells[0m were[0m treated[0m with[0m[1m[4m[38;5;2m 2[0m[1m[4m[38;5;1mD[0m[1m[4m[38;5;1mG[0m ([0m11[0mm[0mM[0m)[0m for[0m 4[0m hrs[0m.[0m Cell[0m l[0mys[0mates[0m were[0m[1m[4m[38;5;14m western[0m[1m[4m[38;5;13m bl[0m[1m[4m[38;5;13motted[0m for[0m[1m[4m[38;5;4m NR[0m[1m[4m[38;5;3mAS[0m,[0m phos

***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0m([0mA[0m)[0m[1m[4m[38;5;10m Muscle[0m [0m[1m[4m[38;5;14mmass[0m in[0m[1m[4m[38;5;12m A[0m[1m[4m[38;5;11mAV[0m-[0msh[0mRNA[0m injected[0m[1m[4m[38;5;12m mice[0m ([0mn[0m=[0m8[0m).[0m The[0m[1m[4m[38;5;10m TA[0m[1m[4m[38;5;14mweights[0m were[0m normalized[0m by[0m the[0m total[0m[1m[38;5;14m body[0m[1m[38;5;14mweight[0m ([0mmg[0m/[0mg[0m).[0m BW[0m:[0m[1m[38;5;14m Body[0m[1m[38;5;13m weight[0m.[0m In[0m A[0m,[0m C[0m,[0m E[0m,[0m F[0m sc[0matters[0m blot[0m,[0m bars[0m represent[0m mean[0m ±[0m SEM[0m.[0m Statistical[0m analysis[0m was[0m performed[0m using[0m a[0m one[0m-[0mtailed[0m Mann[0m-[0mWhit[0mney[0m U[0m-[0mtest[0m *[0m P[0m<[0m0[0m.[0m05[0m,[0m **[0m P[0m<[0m0[0m.[0m01[0m and[0m ***[0m P[0m<[0m0[0m.[0m001[0m compared[0m to[0m WT[0m.[0m �[0m�[0m P[0m<[0m0[0m.[0m05[0m,[0m �[0m�[0m†[0m P[0m<[0m0[0m.[0m01[0m and[0m �[0m�[0m†[0m†

***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0mD[0m.[0m Cor[0mrelation[0m analysis[0m of[0m M[0m2[0m like[0m[1m[38;5;10m tumor[0m accumulation[0m ([0m[1m[4m[38;5;4mM[0m[1m[4m[38;5;3mRC[0m[1m[4m[38;5;3m1[0m positive[0m)[0m with[0m[1m[4m[38;5;10m vessel[0m dys[0mmor[0mphia[0m depending[0m on[0m the[0m[1m[4m[38;5;10m tumor[0m grade[0m.[0m Statistical[0m analysis[0m:[0m A[0m.[0mC[0m.[0mD[0m.[0m one[0m-[0mway[0m AN[0mOVA[0m followed[0m by[0m multiple[0m comparisons[0m Tu[0mkey[0m's[0m test[0m;[0m [0m</s>[0m




Saving model checkpoint to /tokcl_models/checkpoint-1000
Configuration saved in /tokcl_models/checkpoint-1000/config.json
Model weights saved in /tokcl_models/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0m([0mC[0m)[0m[1m[4m[38;5;8m PC[0m[1m[4m[38;5;7m9[0mER[0m cells[0m were[0m treated[0m overnight[0m with[0m the[0m indicated[0m T[0mK[0mIs[0m,[0m or[0m with[0m 3[0mX[0mm[0mAbs[0m,[0m and[0m whole[0m cell[0m extracts[0m were[0m prepared[0m.[0m Cle[0mared[0m extracts[0m were[0m elect[0mroph[0mores[0med[0m,[0m and[0m resolved[0m proteins[0m were[0m transferred[0m onto[0m filters[0m.[0m Fil[0mters[0m were[0m[1m[4m[38;5;14m immun[0m[1m[4m[38;5;13mobl[0m[1m[4m[38;5;13motted[0m for[0m the[0m indicated[0m proteins[0m or[0m for[0m their[0m phosph[0mory[0mlated[0m forms[0m.[0m Bl[0mots[0m are[0m representative[0m of[0m two[0m independent[0m experiments[0m.[0m [0m</s>[0m




***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[0m([0mB[0m)[0m[1m[4m[38;5;8m Mel[0m[1m[4m[38;5;7mJ[0m[1m[4m[38;5;7mus[0m[1m[4m[38;5;7mo[0m cells[0m were[0m trans[0mfect[0med[0m with[0m two[0m different[0m si[0mRN[0mAs[0m against[0m[1m[4m[38;5;4m NR[0m[1m[4m[38;5;3mAS[0m ([0msi[0m#[0m1[0m,[0m si[0m#[0m2[0m),[0m their[0m combination[0m ([0msi[0mPool[0m)[0m or[0m with[0m a[0m non[0m-[0mtarget[0med[0m si[0mRNA[0m sequence[0m ([0mNT[0msi[0mRNA[0m).[0m A[0m double[0m si[0mRNA[0m trans[0mfect[0mion[0m ([0m48[0m and[0m 72[0m hrs[0m)[0m was[0m performed[0m to[0m have[0m an[0m optimal[0m knock[0mdown[0m.[0m Cells[0m were[0m treated[0m with[0m[1m[4m[38;5;2m 2[0m[1m[4m[38;5;1mD[0m[1m[4m[38;5;1mG[0m ([0m11[0mm[0mM[0m)[0m for[0m 4[0m hrs[0m.[0m Cell[0m l[0mys[0mates[0m were[0m[1m[4m[38;5;14m western[0m[1m[4m[38;5;13m bl[0m[1m[4m[38;5;13motted[0m for[0m[1m[4m[38;5;4m NR[0m[1m[4m[38;5;3mAS[0m,[0m phos



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /tokcl_models
Configuration saved in /tokcl_models/config.json
Model weights saved in /tokcl_models/pytorch_model.bin
***** Running Prediction *****
  Num examples = 2399
  Batch size = 64


Testing on 2399.



                                                                                
                precision    recall  f1-score   support

          CELL       0.82      0.83      0.82      2148
     EXP_ASSAY       0.69      0.67      0.68      3328
      GENEPROD       0.86      0.90      0.88      9966
      ORGANISM       0.73      0.87      0.80      1742
SMALL_MOLECULE       0.78      0.78      0.78      2012
   SUBCELLULAR       0.68      0.74      0.71      1319
        TISSUE       0.73      0.69      0.71      1261

     micro avg       0.79      0.82      0.81     21776
     macro avg       0.75      0.78      0.77     21776
  weighted avg       0.79      0.82      0.81     21776

{'test_loss': 0.2077668160200119, 'test_accuracy_score': 0.947730776370695, 'test_precision': 0.7920674137474631, 'test_recall': 0.8244397501836884, 'test_f1': 0.8079294361189865, 'test_runtime': 20.3747, 'test_samples_per_second': 117.744, 'test_steps_per_second': 1.865}


same via CLI:
    
```bash
python -m smtag.cli.tokcl.train \
./smtag/loader/loader_tokcl.py NER \
--data_dir /data/json/sd_test \
--num_train_epochs=5 \
--logging_steps=100 \
--per_device_train_batch_size=16 \
--per_device_eval_batch_size=16 \
--replacement_probability=.0 \
--masking_probability=.0
```