In [1]:
from pathlib import Path

In [2]:
from smtag.config import config
config

Config(max_length=512, truncation=True, min_char_length=120, celery_batch_size=1000, from_pretrained='roberta-base', model_type='Autoencoder', nlp=<spacy.lang.en.English object at 0x7fa10a4ae130>)

In [None]:
from transformers import __version__
__version__

In [None]:
# List only the variable names defined in .env: printing the full file would
# store any secret values it contains in the saved notebook output.
! cut -d= -f1 .env

## Extracting examples for TOKCL

In [None]:
from smtag.extract import ExtractorXML

#### Dataset with individual panels

In [3]:
xml_examples = "/data/text/sd_test"

In [None]:
# Remove any previous extraction output. The path is taken from `xml_examples`
# (defined in the cell above) so the directory is only spelled out once.
! rm -fr "{xml_examples}"

In [None]:
# Panel-level extractor: selects `.//sd-panel` elements from the XML corpus
# under /data/xml/191012 and uses `xml_examples` as its destination directory.
extractor_tokcl = ExtractorXML(
    "/data/xml/191012",
    xpath=".//sd-panel",
    sentence_level=False,
    keep_xml=True,
    inclusion_probability=1.0,
    destination_dir=xml_examples,
)

In [None]:
extractor_tokcl.extract_from_corpus()

same via CLI:

```bash
python -m smtag.cli.prepro.extract /data/xml/191012/ /data/text/sd_test --xpath ".//sd-panel" --keep_xml --inclusion_probability 1.0
```

#### Dataset with full figures (used for panelization training)

In [4]:
xml_figure_examples = "/data/text/sd_test_figs"

In [None]:
# Remove any previous figure-level extraction output. The path is taken from
# `xml_figure_examples` (defined in the cell above) instead of being repeated.
! rm -fr "{xml_figure_examples}"

In [None]:
# Figure-level extractor: same corpus and options as the panel-level one, but
# selecting whole `.//caption` elements and writing to `xml_figure_examples`.
extractor_tokcl_2 = ExtractorXML(
    "/data/xml/191012",
    xpath=".//caption",
    sentence_level=False,
    keep_xml=True,
    inclusion_probability=1.0,
    destination_dir=xml_figure_examples,
)

In [None]:
extractor_tokcl_2.extract_from_corpus()

## Preparing tokenized dataset for TOKCL

In [None]:
from smtag.dataprep import PreparatorTOKCL

In [6]:
from smtag.xml2labels import SourceDataCodes as sd

#### Tokenize panels

In [None]:
# Remove any previously tokenized panel dataset. The path is written out
# because `tokenized_examples` is only assigned in the next cell.
! rm -rf /data/json/sd_test

In [7]:
# Destination of the tokenized panel-level dataset.
tokenized_examples = "/data/json/sd_test"

# Code maps from SourceDataCodes handed to PreparatorTOKCL for the panel-level
# dataset. The former `code_maps: code_maps = [...]` annotated the variable
# with itself, which carries no type information, so the annotation is dropped.
code_maps = [
    sd.ENTITY_TYPES,
    sd.GENEPROD_ROLES,
    sd.SMALL_MOL_ROLES,
    sd.BORING,
    sd.PANELIZATION,
]

In [None]:
# Preparator for the panel-level dataset. Presumably reads the extracted
# examples from `xml_examples` and writes to `tokenized_examples` - confirm
# against PreparatorTOKCL. `max_length` comes from the shared smtag config.
prep_tokcl = PreparatorTOKCL(
    xml_examples, tokenized_examples, code_maps,
    max_length=config.max_length,
)

In [None]:
prep_tokcl.run()

same via CLI:
    
```bash
python -m smtag.cli.tokcl.dataprep /data/text/sd_test /data/json/sd_test
```

#### Tokenize figures

In [8]:
# Destination of the tokenized figure-level dataset.
tokenized_figures = "/data/json/sd_test_figs"

# Only the panelization codes are used for full figures.
# NOTE: this rebinds `code_maps`, which the panel-level section also assigns;
# re-run the matching cell before constructing either preparator.
# The former `code_maps: code_maps = [...]` self-annotation was meaningless
# and is dropped.
code_maps = [
    sd.PANELIZATION,
]

In [None]:
# Remove any previously tokenized figure dataset. The path is taken from
# `tokenized_figures` (defined in the cell above) instead of being repeated.
! rm -fr "{tokenized_figures}"

In [None]:
# Preparator for the figure-level dataset. Uses whatever `code_maps` is
# currently bound to, so the figure-level `code_maps` cell must have been run
# last. `max_length` comes from the shared smtag config.
prep_tokcl_2 = PreparatorTOKCL(
    xml_figure_examples, tokenized_figures, code_maps,
    max_length=config.max_length,
)

In [None]:
prep_tokcl_2.run()

## Train model for TOKCL

In [9]:
from smtag.config import config
from smtag.train.train_tokcl import (
    train as train_tokcl,
    TrainingArgumentsTOKCL
)

In [10]:
# Base training arguments shared by every TOKCL run below; the task-specific
# cells patch attributes on this same object before each call to train_tokcl.
training_args = TrainingArgumentsTOKCL(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=50,
)

In [11]:
# Tokenized datasets (same paths as in the tokenization section).
tokenized_examples = "/data/json/sd_test"      # panel-level examples
tokenized_figures = "/data/json/sd_test_figs"  # figure-level examples

# Dataset loading.
loader_path = "./smtag/loader/loader_tokcl.py"
no_cache = False

# Model: tokenizer from the shared smtag config; starting checkpoint from
# huggingface.co/embo. Use "roberta-base" instead for the general LM.
tokenizer = config.tokenizer
model_type = "Autoencoder"
from_pretrained = "EMBO/bio-lm"

In [12]:
# Clear everything under /runs.
# NOTE(review): the training_args repr in this notebook shows logging_dir under
# /tokcl_models/runs - confirm that /runs is still where logs are written.
! rm -rf /runs/*

In [13]:
# Empty the model output directory (training_args.output_dir is
# '/tokcl_models'); each training run below creates its own sub-directory there.
! rm -rf /tokcl_models/*

### Train NER

In [14]:
# Do not overwrite an existing output directory; this applies to every run
# below because they all share this `training_args` object.
training_args.overwrite_output_dir = False

In [15]:
# Settings for the NER run. `training_args` is shared and mutated in place, so
# these values stay in effect until a later cell changes them.
training_args.num_train_epochs = 1
training_args.prediction_loss_only = True
training_args.masking_probability = 0.5      # presumably the fraction of labelled tokens replaced by <mask> - confirm
training_args.replacement_probability = 0.0
training_args

TrainingArgumentsTOKCL(output_dir='/tokcl_models', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=True, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=0.0001, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, log_level=-1, log_level_replica=-1, log_on_each_node=True, logging_dir='/tokcl_models/runs/Feb16_22-41-22_e6255381d0ac', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=50, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=5, save_on_each_node=False, no_cuda=False, se

In [16]:
# Train the "NER" task on the panel-level dataset.
train_tokcl(
    training_args,
    loader_path,
    "NER",               # task name; the log below reports "Created /tokcl_models/NER."
    tokenized_examples,  # panel-level data
    no_cache,
    tokenizer,
    model_type,
    from_pretrained,
)



Created /tokcl_models/NER.
tokenizer vocab size: 50265

Loading and tokenizing datasets found in /data/json/sd_test.
using ./smtag/loader/loader_tokcl.py as dataset loader.


  0%|          | 0/3 [00:00<?, ?it/s]


Training with 15859 examples.
Evaluating on 4869 examples.

Training on 15 features:
O, I-SMALL_MOLECULE, B-SMALL_MOLECULE, I-GENEPROD, B-GENEPROD, I-SUBCELLULAR, B-SUBCELLULAR, I-CELL, B-CELL, I-TISSUE, B-TISSUE, I-ORGANISM, B-ORGANISM, I-EXP_ASSAY, B-EXP_ASSAY


Some weights of the model checkpoint at EMBO/bio-lm were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at EMBO/bio-lm and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-


Training arguments for model type Autoencoder:
RobertaConfig {
  "_name_or_path": "EMBO/bio-lm",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,

***** Running training *****
  Num examples = 15859
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 248


CUDA available: True




Step,Training Loss,Validation Loss
50,0.5076,0.189321
100,0.16,0.146227
150,0.1352,0.134621
200,0.1224,0.129407


***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>A, B, C. Cell lysates of[1m[4m[38;5;8m<mask>[0m[1m[4m[38;5;7mK[0m[1m[4m[38;5;7m293[0m cells co-transfected with[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m-[1m[4m[38;5;4mF[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m200[0m (A) and either empty vector control,[1m[4m[38;5;4m My[0m[1m[4m[38;5;3mc[0m-[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m<mask>[0m[1m[38;5;3mS[0m or[1m[4m[38;5;4m My[0m[1m[4m[38;5;3m<mask>[0m-[1m[4m[38;5;4mC[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3morf[0m[1m[4m[38;5;3m<mask>[0m[1m[38;5;3mL[0m were subjected to[1m[4m[38;5;14m immun[0m[1m[4m[38;5;13m<mask>[0m[1m[4m[38;5;13m<mask>[0m[1m[4m[38;5;13m<mask>[0m[1m[4m[38;5;13mitation[0m with anti-[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m (A and B) antibodies. Immune pellets were probed for[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m-[1m[4m[38;5;4mC[0m[1m[4

***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3mO[0m[1m[4m[38;5;3m<mask>[0mKO_[1m[38;5;4mFH[0m[1m[38;5;3m<mask>[0m[1m[4m[38;5;3mGO[0m[1m[38;5;7m<mask>[0m and[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m<mask>[0mKO_[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m cell extracts (n=3 experimental replicates) were[1m[4m[38;5;14m immun[0m[1m[4m[38;5;13m<mask>[0m[1m[4m[38;5;13mrec[0m[1m[4m[38;5;13m<mask>[0m[1m[4m[38;5;13mitated[0m using an anti-[1m[4m[38;5;4mTER[0m[1m[4m[38;5;3m<mask>[0m antibody or IgG as mock IP.[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m and[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3mOTA[0m[1m[4m[38;5;3mIR[0m (as a negative control) abundance was assessed by[1m[4m[38;5;14m RT[0m[1m[4m[38;5;13m<mask>[0m[1m[4m[38;5;13mq[0m[1m[4m[38;5;13mPC[0m[1m[4m[38;5;13mR[0m.[1m[4m[38;5;4m H[0m[1m[4m[38;5;3mPR[0m[1m[4m[38;5;3mT[0m[1m[4m[38;5;3m<mask>[0m was 

***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>A[1m[4m[38;5;8m<mask>[0m[1m[4m[38;5;7mTS[0m[1m[4m[38;5;7m-[0m[1m[4m[38;5;7m2[0m[1m[4m[38;5;7m<mask>[0m[1m[4m[38;5;7m<mask>[0m cells were incubated with[1m[4m[38;5;8m<mask>[0m[1m[4m[38;5;7m21[0m.221-[1m[4m[38;5;4mC[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m4[0m or[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3mw[0m[1m[4m[38;5;3m7[0m (221-[1m[4m[38;5;4mC[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m4[0m or[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m<mask>[0m) cells for 5 min at 370C. The cells were lysed, and[1m[4m[38;5;14m immun[0m[1m[4m[38;5;13mop[0m[1m[4m[38;5;13mrec[0m[1m[4m[38;5;13m<mask>[0m[1m[4m[38;5;13m<mask>[0m (IP) of[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3mP[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m<mask>[0m were[1m[4m[38;5;14m immun[0m[1m[4m[38;5;13mobl[0m[1m[4m[38;5;13motted[0m (IB) with anti-p[1m[38;5;3mTy[0m (top panel), anti-[1m[4m[38;5;4m<mask>

***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>A Control animals (+/w; UAS-[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0mIR/+), n= 19, and those with[1m[4m[38;5;10m<mask>[0m[1m[4m[38;5;9miv[0m[1m[4m[38;5;9m<mask>[0m[1m[4m[38;5;9m gland[0m-specific knockdown of[1m[4m[38;5;4m sec[0m[1m[4m[38;5;3m5[0m ([1m[4m[38;5;4mf[0m[1m[4m[38;5;3m<mask>[0m-[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m<mask>[0m/w; UAS-[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0mIR/+), n=20, were analyzed by[1m[4m[38;5;14m hist[0m[1m[4m[38;5;13m<mask>[0m for the presence of[1m[4m[38;5;10m<mask>[0m[1m[4m[38;5;9miv[0m[1m[4m[38;5;9m<mask>[0m[1m[4m[38;5;9m gland[0m material (red dotted circle) 24 hours after puparium formation.B Quantification of data from (A). Data are represented as means. Statistical significance was determined using a Chi-square test.C Control animals (+/w; UAS-[1m[38;5;4msec[0m[1m[38;5;3m15[0mIR/+), n= 20, and those with[1m[4m[38;5;10m



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /tokcl_models/NER
Configuration saved in /tokcl_models/NER/config.json
Model weights saved in /tokcl_models/NER/pytorch_model.bin
***** Running Prediction *****
  Num examples = 2399
  Batch size = 64


Testing on 2399.



                                                                                
                precision    recall  f1-score   support

          CELL       0.75      0.78      0.76      2148
     EXP_ASSAY       0.75      0.73      0.74      3328
      GENEPROD       0.85      0.89      0.87      9966
      ORGANISM       0.75      0.81      0.78      1742
SMALL_MOLECULE       0.75      0.74      0.75      2012
   SUBCELLULAR       0.67      0.66      0.67      1319
        TISSUE       0.66      0.68      0.67      1261

     micro avg       0.79      0.81      0.80     21776
     macro avg       0.74      0.76      0.75     21776
  weighted avg       0.79      0.81      0.80     21776

{'test_loss': 0.12268844991922379, 'test_accuracy_score': 0.9619240175306908, 'test_precision': 0.788031594448659, 'test_recall': 0.8109386480529023, 'test_f1': 0.7993210365508656, 'test_runtime': 20.5154, 'test_samples_per_second': 116.937, 'test_steps_per_second': 1.852}


### Train GENEPROD ROLES

In [17]:
# Settings for the GENEPROD_ROLES run; compared with NER only the masking
# probability changes (0.5 -> 1.0).
training_args.num_train_epochs = 1.0
training_args.prediction_loss_only = True
training_args.masking_probability = 1.0
training_args.replacement_probability = 0.0
training_args

TrainingArgumentsTOKCL(output_dir='/tokcl_models', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=True, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=0.0001, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, log_level=-1, log_level_replica=-1, log_on_each_node=True, logging_dir='/tokcl_models/runs/Feb16_22-41-22_e6255381d0ac', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=50, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=5, save_on_each_node=False, no_cuda=False, 

In [18]:
training_args.output_dir

'/tokcl_models'

In [19]:
# Train the "GENEPROD_ROLES" task on the panel-level dataset.
train_tokcl(
    training_args,
    loader_path,
    "GENEPROD_ROLES",    # task name; output goes to /tokcl_models/GENEPROD_ROLES
    tokenized_examples,  # panel-level data
    no_cache,
    tokenizer,
    model_type,
    from_pretrained,
)



Created /tokcl_models/GENEPROD_ROLES.
tokenizer vocab size: 50265

Loading and tokenizing datasets found in /data/json/sd_test.
using ./smtag/loader/loader_tokcl.py as dataset loader.


  0%|          | 0/3 [00:00<?, ?it/s]


Training with 15859 examples.
Evaluating on 4869 examples.

Training on 5 features:
O, I-CONTROLLED_VAR, B-CONTROLLED_VAR, I-MEASURED_VAR, B-MEASURED_VAR


loading configuration file https://huggingface.co/EMBO/bio-lm/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/09fed88b4a07fe6baced126e3cdb14f2764c1bc57f62d1026a75b3ffdb3ec5f8.c781727f43e25ac5b298f775b2dd4f32f53c9890a2367bbd99ffdbd856251b85
Model config RobertaConfig {
  "_name_or_path": "EMBO/bio-lm",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_he


Training arguments for model type Autoencoder:
RobertaConfig {
  "_name_or_path": "EMBO/bio-lm",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

TrainingArgumentsTOKCL(
_n_gp

Step,Training Loss,Validation Loss
50,0.1711,0.072223
100,0.0667,0.068704
150,0.0613,0.065169
200,0.056,0.064815


***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>A, B, C. Cell lysates of HEK293 cells co-transfected with<mask><mask>-[1m[38;5;2m<mask>[0m[1m[38;5;1m<mask>[0m[1m[38;5;1m<mask>[0m (A) and either empty vector control,<mask><mask>-[1m[4m[38;5;2m<mask>[0m[1m[4m[38;5;1m<mask>[0m[1m[4m[38;5;1m<mask>[0m[1m[4m[38;5;1m<mask>[0mS or<mask><mask>-[1m[38;5;2m<mask>[0m[1m[38;5;1m<mask>[0m[1m[38;5;1m<mask>[0m[1m[38;5;1m<mask>[0mL were subjected to immunoprecipitation with anti-[1m[38;5;4m<mask>[0m[1m[38;5;3m<mask>[0m (A and B) antibodies. Immune pellets were probed for<mask><mask>-[1m[38;5;4m<mask>[0m[1m[38;5;3m<mask>[0m[1m[38;5;3m<mask>[0m[1m[38;5;3m<mask>[0m (A and B),<mask><mask>-[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m<mask>[0m (A) on immunoblots. </s>




***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>[1m[4m[38;5;2m<mask>[0m[1m[4m[38;5;1m<mask>[0m[1m[4m[38;5;1m<mask>[0mKO_FH<mask><mask><mask> and[1m[4m[38;5;2m<mask>[0m[1m[4m[38;5;1m<mask>[0m[1m[4m[38;5;1m<mask>[0mKO_<mask><mask> cell extracts (n=3 experimental replicates) were immunoprecipitated using an anti-[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m antibody or IgG as mock IP.[1m[4m[38;5;4m<mask>[0m<mask> and<mask><mask><mask> (as a negative control) abundance was assessed by RT-qPCR.<mask><mask><mask><mask> was used for normalization and enrichment in[1m[38;5;2m<mask>[0m[1m[4m[38;5;3m<mask>[0m-RIP as compared to IgG RIP was plotted. </s>




***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>A YTS-2DL1 cells were incubated with 721.221-[1m[4m[38;5;2m<mask>[0m[1m[4m[38;5;1m<mask>[0m[1m[4m[38;5;1m<mask>[0m or[1m[4m[38;5;2m<mask>[0m[1m[4m[38;5;1m<mask>[0m[1m[4m[38;5;1m<mask>[0m (221-[1m[4m[38;5;2m<mask>[0m[1m[4m[38;5;1m<mask>[0m[1m[4m[38;5;1m<mask>[0m or[1m[4m[38;5;2m<mask>[0m[1m[4m[38;5;1m<mask>[0m[1m[4m[38;5;1m<mask>[0m) cells for 5 min at 370C. The cells were lysed, and immunoprecipitates (IP) of[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m<mask>[0m were immunoblotted (IB) with anti-p<mask> (top panel), anti-[1m[38;5;4m<mask>[0m[1m[38;5;3m<mask>[0m[1m[38;5;3m<mask>[0m[1m[38;5;3m<mask>[0m (middle panel) or anti-[1m[4m[38;5;4m<mask>[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m<mask>[0m[1m[4m[38;5;3m<mask>[0m (bottom panel) antibodies. </s>




***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>A Control animals (+/w; UAS-[1m[4m[38;5;2m<mask>[0m<mask>IR/+), n= 19, and those with salivary gland-specific knockdown of[1m[4m[38;5;2m<mask>[0m[1m[4m[38;5;1m<mask>[0m (<mask><mask>-[1m[38;5;2m<mask>[0m[1m[38;5;1m<mask>[0m[1m[38;5;1m<mask>[0m/w; UAS-[1m[4m[38;5;2m<mask>[0m[1m[4m[38;5;1m<mask>[0mIR/+), n=20, were analyzed by histology for the presence of salivary gland material (red dotted circle) 24 hours after puparium formation.B Quantification of data from (A). Data are represented as means. Statistical significance was determined using a Chi-square test.C Control animals (+/w; UAS-sec15IR/+), n= 20, and those with salivary gland-specific expression[1m[4m[38;5;2m<mask>[0m[1m[4m[38;5;1m<mask>[0m (<mask><mask>-[1m[38;5;2m<mask>[0m[1m[38;5;1m<mask>[0m[1m[38;5;1m<mask>[0m/w; UAS-[1m[4m[38;5;2m<mask>[0m[1m[4m[38;5;1m<mask>[0mIR/+), n= 20, were analyzed by histology for the presence of salivary gland material (red dotted circle) 



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /tokcl_models/GENEPROD_ROLES
Configuration saved in /tokcl_models/GENEPROD_ROLES/config.json
Model weights saved in /tokcl_models/GENEPROD_ROLES/pytorch_model.bin
***** Running Prediction *****
  Num examples = 2399
  Batch size = 64


Testing on 2399.



                                                                                
                precision    recall  f1-score   support

CONTROLLED_VAR       0.74      0.81      0.77      2906
  MEASURED_VAR       0.80      0.81      0.80      4151

     micro avg       0.77      0.81      0.79      7057
     macro avg       0.77      0.81      0.79      7057
  weighted avg       0.77      0.81      0.79      7057

{'test_loss': 0.05420839041471481, 'test_accuracy_score': 0.9786475863015247, 'test_precision': 0.7709459459459459, 'test_recall': 0.8084171744367294, 'test_f1': 0.789237047796915, 'test_runtime': 19.7826, 'test_samples_per_second': 121.268, 'test_steps_per_second': 1.921}


### Train SMALL MOL ROLES

In [20]:
# Settings for the SMALL_MOL_ROLES run; the values are the same as for
# GENEPROD_ROLES and are restated so this cell can be run on its own.
training_args.num_train_epochs = 1.0
training_args.prediction_loss_only = True
training_args.masking_probability = 1.0
training_args.replacement_probability = 0.0
training_args

TrainingArgumentsTOKCL(output_dir='/tokcl_models', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=True, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=0.0001, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, log_level=-1, log_level_replica=-1, log_on_each_node=True, logging_dir='/tokcl_models/runs/Feb16_22-41-22_e6255381d0ac', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=50, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=5, save_on_each_node=False, no_cuda=False, 

In [21]:
# Train the "SMALL_MOL_ROLES" task on the panel-level dataset.
train_tokcl(
    training_args,
    loader_path,
    "SMALL_MOL_ROLES",   # task name; output goes to /tokcl_models/SMALL_MOL_ROLES
    tokenized_examples,  # panel-level data
    no_cache,
    tokenizer,
    model_type,
    from_pretrained,
)



Created /tokcl_models/SMALL_MOL_ROLES.
tokenizer vocab size: 50265

Loading and tokenizing datasets found in /data/json/sd_test.
using ./smtag/loader/loader_tokcl.py as dataset loader.


  0%|          | 0/3 [00:00<?, ?it/s]


Training with 15859 examples.
Evaluating on 4869 examples.

Training on 5 features:
O, I-CONTROLLED_VAR, B-CONTROLLED_VAR, I-MEASURED_VAR, B-MEASURED_VAR


loading configuration file https://huggingface.co/EMBO/bio-lm/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/09fed88b4a07fe6baced126e3cdb14f2764c1bc57f62d1026a75b3ffdb3ec5f8.c781727f43e25ac5b298f775b2dd4f32f53c9890a2367bbd99ffdbd856251b85
Model config RobertaConfig {
  "_name_or_path": "EMBO/bio-lm",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_he


Training arguments for model type Autoencoder:
RobertaConfig {
  "_name_or_path": "EMBO/bio-lm",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

TrainingArgumentsTOKCL(
_n_gp

Step,Training Loss,Validation Loss
50,0.0969,0.018253
100,0.0175,0.014983
150,0.0129,0.012849
200,0.0132,0.012811


***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>A, B, C. Cell lysates of HEK293 cells co-transfected with FLAG-FIP200 (A) and either empty vector control, Myc-C9orf72S or Myc-C9orf72L were subjected to immunoprecipitation with anti-Myc (A and B) antibodies. Immune pellets were probed for Myc-C9orf72 (A and B), FLAG-FIP200 (A) on immunoblots. </s>




***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>AGO2KO_FH AGO2 and AGO2KO_GFP cell extracts (n=3 experimental replicates) were immunoprecipitated using an anti-TERT antibody or IgG as mock IP. TERC and HOTAIR (as a negative control) abundance was assessed by RT-qPCR. HPRT1 was used for normalization and enrichment in TERT-RIP as compared to IgG RIP was plotted. </s>




***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>A YTS-2DL1 cells were incubated with 721.221-Cw4 or Cw7 (221-Cw4 or Cw7) cells for 5 min at 370C. The cells were lysed, and immunoprecipitates (IP) of SHP-1 were immunoblotted (IB) with anti-pTy (top panel), anti-β-actin (middle panel) or anti-SHP-1 (bottom panel) antibodies. </s>




***** Running Evaluation *****
  Num examples = 4869
  Batch size = 64




<s>A Control animals (+/w; UAS-sec5IR/+), n= 19, and those with salivary gland-specific knockdown of sec5 (fkh-GAL4/w; UAS-sec5IR/+), n=20, were analyzed by histology for the presence of salivary gland material (red dotted circle) 24 hours after puparium formation.B Quantification of data from (A). Data are represented as means. Statistical significance was determined using a Chi-square test.C Control animals (+/w; UAS-sec15IR/+), n= 20, and those with salivary gland-specific expression sec15 (fkh-GAL4/w; UAS-sec15IR/+), n= 20, were analyzed by histology for the presence of salivary gland material (red dotted circle) 24 hours after puparium formation.D Quantification of data from (C). Data are represented as means. Statistical significance was determined using a Chi-square test.E Control animals (+/w; UAS-sec3IR/+), n= 20, and those with salivary gland-specific expression sec3 (fkh-GAL4/w; UAS-sec3IR/+), n= 20, were analyzed by histology for the presence of salivary gland material (r



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /tokcl_models/SMALL_MOL_ROLES
Configuration saved in /tokcl_models/SMALL_MOL_ROLES/config.json
Model weights saved in /tokcl_models/SMALL_MOL_ROLES/pytorch_model.bin
***** Running Prediction *****
  Num examples = 2399
  Batch size = 64


Testing on 2399.



                                                                                
                precision    recall  f1-score   support

CONTROLLED_VAR       0.75      0.92      0.83      1098
  MEASURED_VAR       0.74      0.79      0.76       313

     micro avg       0.75      0.89      0.81      1411
     macro avg       0.74      0.85      0.79      1411
  weighted avg       0.75      0.89      0.81      1411

{'test_loss': 0.0130100566893816, 'test_accuracy_score': 0.9945922822990064, 'test_precision': 0.7476190476190476, 'test_recall': 0.890148830616584, 'test_f1': 0.8126819799417665, 'test_runtime': 19.3847, 'test_samples_per_second': 123.757, 'test_steps_per_second': 1.96}


### Train PANELIZATION

In [22]:
# Settings for the PANELIZATION run: more epochs, no masking, and more frequent
# logging. `logging_steps = 20` stays in effect for any later run that reuses
# this shared `training_args` object.
training_args.num_train_epochs = 2.4
training_args.prediction_loss_only = True
training_args.masking_probability = 0.0
training_args.replacement_probability = 0.0
training_args.logging_steps = 20
training_args

TrainingArgumentsTOKCL(output_dir='/tokcl_models', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=True, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=0.0001, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.4, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, log_level=-1, log_level_replica=-1, log_on_each_node=True, logging_dir='/tokcl_models/runs/Feb16_22-41-22_e6255381d0ac', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=20, logging_nan_inf_filter=True, save_strategy=<IntervalStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=5, save_on_each_node=False, no_cuda=False, 

In [23]:
# Train the "PANELIZATION" task. Unlike the other tasks, this one is trained on
# the figure-level dataset.
train_tokcl(
    training_args,
    loader_path,
    "PANELIZATION",     # task name; output goes to /tokcl_models/PANELIZATION
    tokenized_figures,  # Use Figure-level data here!
    no_cache,
    tokenizer,
    model_type,
    from_pretrained,
)



Created /tokcl_models/PANELIZATION.
tokenizer vocab size: 50265

Loading and tokenizing datasets found in /data/json/sd_test_figs.
using ./smtag/loader/loader_tokcl.py as dataset loader.


  0%|          | 0/3 [00:00<?, ?it/s]


Training with 4445 examples.
Evaluating on 1296 examples.

Training on 2 features:
O, B-PANEL_START


loading configuration file https://huggingface.co/EMBO/bio-lm/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/09fed88b4a07fe6baced126e3cdb14f2764c1bc57f62d1026a75b3ffdb3ec5f8.c781727f43e25ac5b298f775b2dd4f32f53c9890a2367bbd99ffdbd856251b85
Model config RobertaConfig {
  "_name_or_path": "EMBO/bio-lm",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file http


Training arguments for model type Autoencoder:
RobertaConfig {
  "_name_or_path": "EMBO/bio-lm",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

TrainingArgumentsTOKCL(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unuse

Step,Training Loss,Validation Loss
50,0.0074,0.004355
100,0.0033,0.003745
150,0.003,0.003175


***** Running Evaluation *****
  Num examples = 1296
  Batch size = 64




<s>[1m[4m[38;5;1m([0ma,b) BJAB lymphoma cells stably expressing mCherry-GFP-LC3 were serially cultured at log phase followed by fluorescence-activated cell sorting for cells with high and low autophagic flux using the ratio of mCherry/GFP (a). The high and low 20% were sorted (a),[1m[4m[38;5;1m ([0ma,b) BJAB lymphoma cells stably expressing mCherry-GFP-LC3 were serially cultured at log phase followed by fluorescence-activated cell sorting for cells with high and low autophagic flux using the ratio of mCherry/GFP (a). The high and low 20% were sorted (a), re-plated and treated with lysosomal protease inhibitors pepstatin and E-64d for 1��h; lysates were then immunoblotted for the indicated proteins (b).[1m[38;5;1m ([0mc) Densitometry of LC3-II and p62 western blots (normalized to actin and hour 0, mean��±��s.e.m., n��=��3 blots from 2 independent experiments, *P��=��0.051, **P��=��0.0091). (d); autophagic LC3 puncta were assessed by quantitative microscopy (d); autophagic L

***** Running Evaluation *****
  Num examples = 1296
  Batch size = 64




<s>[1m[4m[38;5;1m([0mA-B) mRNA for indicated genes in BAT (n=4)[1m[4m[38;5;1m ([0mc) mRNA for the indicated genes in epididymal white adipose tissue (eWAT) (n=4)[1m[4m[38;5;1m ([0mD) immunoblots for indicated proteins in BAT from 10‐month (mo)‐old chow diet (RD)‐fed control (Con) and knock out (KO) mice. Arrows depict protein isoforms.[1m[4m[38;5;1m ([0mE) immunoblots for indicated proteins in eWAT from 10‐month (mo)‐old chow diet (RD)‐fed control (Con) and knock out (KO) mice. Arrows depict protein isoforms.[1m[4m[38;5;1m ([0mF) Electron micrographs ( × 10,000 magnification) of BAT depicting mitochondria from 4‐mo‐old Con and KO mice. m, mitochondria; LD, lipid droplet; n, nucleus.[1m[4m[38;5;1m ([0mG) mRNA levels (n=4) in BAT from 10‐mo‐old Con and KO mice[1m[4m[38;5;1m ([0mH) hematoxylin and eosin (HE) and (I) Sirius Red stains in BAT from 10‐mo‐old Con and KO mice (n=3-4). Average adipocyte and LD size, and LD number in BAT are shown. Scale bar, 50 μm.

***** Running Evaluation *****
  Num examples = 1296
  Batch size = 64




<s>[1m[4m[38;5;1m([0ma) Primary cortical neurons were transduced with lentivirus expressing sh-RNA for Nrf2 or rat NDP52 (rNDP52), or a scrambled sh-RNA at DIV 1, and were maintained until DIV 6. The levels of tau phosphorylated at Ser262/Ser356 and Ser396/Ser404 were analysed by immunoblotting using 12E8- and PHF1-specific antibodies, respectively. Total tau was detected with a polyclonal tau-specific antibody (Tau). The relative molecular masses (kDa) are indicated to the left of each blot. (b) Bar graph of the relative optical density of phosphorylated tau normalized to actin. Data shown are mean±s.e. of three independent experiments and were analysed using Student's t-test. (*P0.05; ***P0.001) (c,d) Primary cortical neurons were transduced with a control lentivirus (FIGB) or with one expressing humanNDP52 (hNDP52) at DIV 1. To induce autophagy, trehalose (150��mM) was added at DIV 5 and the neurons incubated for 24��h (DIV 6). Primary cortical neurons were fixed with 4% paraf



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /tokcl_models/PANELIZATION
Configuration saved in /tokcl_models/PANELIZATION/config.json
Model weights saved in /tokcl_models/PANELIZATION/pytorch_model.bin
***** Running Prediction *****
  Num examples = 652
  Batch size = 64


Testing on 652.



                                                                                
              precision    recall  f1-score   support

 PANEL_START       0.92      0.97      0.94      1860

   micro avg       0.92      0.97      0.94      1860
   macro avg       0.92      0.97      0.94      1860
weighted avg       0.92      0.97      0.94      1860

{'test_loss': 0.002895773621276021, 'test_accuracy_score': 0.9990617418539087, 'test_precision': 0.9226434426229508, 'test_recall': 0.9682795698924731, 'test_f1': 0.9449108079748164, 'test_runtime': 11.4343, 'test_samples_per_second': 57.022, 'test_steps_per_second': 0.962}


In [None]:
# List the contents of the NER model directory, which the SmartTagger cell
# further down loads from (ner_source="/tokcl_models/NER").
! ls /tokcl_models/NER

### Alternative via CLI:

Useful for testing and debugging from within a `tmux` session or a `docker-compose exec nlp bash` shell.

```bash
python -m smtag.cli.tokcl.train \
./smtag/loader/loader_tokcl.py \
PANELIZATION \
--data_dir /data/json/sd_test_figs \
--num_train_epochs=1 \
--logging_steps=50 \
--per_device_train_batch_size=16 \
--per_device_eval_batch_size=16 \
--replacement_probability=0 \
--masking_probability=0 \
--model_type=Autoencoder \
--from_pretrained="EMBO/bio-lm"
```

## Try it!

In [2]:
from smtag.pipeline import SmartTagger

In [3]:
# Where each component of the tagging pipeline is loaded from: the tokenizer by
# name, and the four fine-tuned token-classification models from the local
# /tokcl_models output directory.
model_sources = {
    "tokenizer_source": "roberta-base",
    "panelizer_source": "/tokcl_models/PANELIZATION",
    "ner_source": "/tokcl_models/NER",
    "geneprod_roles_source": "/tokcl_models/GENEPROD_ROLES",
    "small_mol_roles_source": "/tokcl_models/SMALL_MOL_ROLES",
}
smarttagger = SmartTagger(**model_sources)

In [4]:
# Run the full pipeline on a short example sentence and print the result.
example_text = "This creb1-/- mutant mouse has a strange brain after aspirin treatment."
tags = smarttagger(example_text)
print(tags)

{
  "smtag": [
    {
      "panel_group": [
        [
          {
            "text": "creb1",
            "type": "geneprod",
            "role": "intervention"
          },
          {
            "text": "mouse",
            "type": "organism"
          },
          {
            "text": "brain",
            "type": "tissue"
          },
          {
            "text": "aspirin",
            "type": "molecule",
            "role": "intervention"
          }
        ]
      ]
    }
  ]
}


With CLI:

    python -m smtag.cli.inference.tag --local_model_dir /tokcl_models "This creb1-/- mutant mouse has a strange brain after aspirin treatment."