### NER tags on the text (T5-base)

In [2]:
from datasets import load_dataset, load_from_disk

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from smtag.train.train_seq2seq import FromIob2seq2seq

In [3]:
# Configure the IOB -> seq2seq converter for the three entity labels. The prompt
# prefix and the "[END]" marker are visible in the input/target printed below.
build_dataset = FromIob2seq2seq(
    labels=["GENEPROD", "EXP_ASSAY", "SMALL_MOL"],
    generate_end="[END]",
    prompt_init="Copy the input and label the GENEPROD, SMALL_MOL and EXP_ASSAY entities: ",
    task_type="copy_tag",
)
# Calling the configured converter yields the dataset, which is indexed by split
# name (e.g. dataset["test"]) in the following cells.
dataset = build_dataset()

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading and preparing dataset source_data_nlp/NER to /root/.cache/huggingface/datasets/EMBO___source_data_nlp/NER/1.0.0/536e86a707c3e5578223ffb2658d82399fe009a240cfd0ac1c59ad6f4d543fae...


Downloading:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset source_data_nlp downloaded and prepared to /root/.cache/huggingface/datasets/EMBO___source_data_nlp/NER/1.0.0/536e86a707c3e5578223ffb2658d82399fe009a240cfd0ac1c59ad6f4d543fae. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/67 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

In [4]:
# Directory (relative to the working directory) of the fine-tuned checkpoint.
checkpoint = "seq2seq_models/checkpoint-40000"

# Sequence-generation settings; they are unpacked into from_pretrained() when the
# model is loaded. "typical_p" and "exponential_decay_length_penalty" were left
# disabled (commented out) in the original cell and are therefore not set here.
seq2seq_config = dict(
    # output length bounds
    max_length=512,
    min_length=0,
    # decoding strategy
    do_sample=True,
    early_stopping=True,
    num_beams=1,
    num_beam_groups=1,
    diversity_penalty=0.0,
    # sampling distribution
    temperature=1.0,
    top_k=20,
    top_p=0.90,
    # penalties and n-gram constraints
    repetition_penalty=1.0,
    length_penalty=50.0,
    no_repeat_ngram_size=0,
    encoder_no_repeat_ngram_size=0,
    bad_words_ids=None,
    # number of sequences returned per input
    num_return_sequences=5,
    chunk_size_feed_forward=0,
    # shape of the generate() return value
    output_scores=False,
    return_dict_in_generate=False,
    # forced tokens / numerical safety
    forced_bos_token_id=None,
    forced_eos_token_id=None,
    remove_invalid_values=False,
)
# Load the seq2seq model from the checkpoint directory; every entry of
# seq2seq_config is forwarded as a keyword argument.
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    **seq2seq_config,
)
# The tokenizer is read from the same checkpoint directory.
# NOTE(review): is_split_into_words is normally an argument of the tokenizer
# call rather than of from_pretrained — confirm it has any effect here.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    add_prefix_space=True,
    is_split_into_words=False,
)

In [5]:
# Index into dataset["test"] of the example inspected in the following cells.
example_number = 21

In [6]:
# Show the model input of the selected test example (prompt prefix + text).
print(dataset["test"][example_number]["input"])

Copy the input and label the GENEPROD, SMALL_MOL and EXP_ASSAY entities:   A . STAT3 and PDK4 stratified subgroups were generated by median splits in MSKCC PCa GSE21032 data set . Pearson correlation between STAT3 and PDK4 is shown . Kaplan - Meier plot shows stratified subgroups . P - values were estimated by Log - rank test and adjusted with Benjamini - Hochberg method . Hi = high , lo = low . 

END_INPUT




In [7]:
# Show the reference output of the selected test example (text with entity tags).
print(dataset["test"][example_number]["target"])

A . <GENEPROD> STAT3 </GENEPROD> and <GENEPROD> PDK4 </GENEPROD> stratified subgroups were generated by median splits in MSKCC PCa GSE21032 data set . Pearson correlation between <GENEPROD> STAT3 </GENEPROD> and <GENEPROD> PDK4 </GENEPROD> is shown . <EXP_ASSAY> Kaplan - Meier plot </EXP_ASSAY> shows stratified subgroups . P - values were estimated by Log - rank test and adjusted with Benjamini - Hochberg method . Hi = high , lo = low .  [END]


In [8]:
# Tokenize the selected test input and let the model generate its candidate
# sequences; `output` is indexed per returned sequence below and in the next cell.
prompt = dataset["test"][example_number]["input"]
token_inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(
    **token_inputs,
    return_dict_in_generate=False,
    output_scores=False,
    output_attentions=False,
)

# Decode and show the first returned sequence.
first_candidate = tokenizer.decode(
    output[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
print(first_candidate)

A. STAT3 and PDK4 stratified subgroups were generated by median splits in MSKCC PCa GSE21032 data set. Pearson correlation between STAT3 and PDK4 is shown. Kaplan - Meier plot shows stratified subgroups. P - values were estimated by Log - rank test and adjusted with Benjamini - Hochberg method. Hi = high, lo = low. [END]


In [9]:
# Decode and show the second returned sequence from the previous generate() call.
# NOTE(review): in the recorded output the tags appear without their leading "<"
# (e.g. "GENEPROD>"); presumably "<" is not in the tokenizer vocabulary and the
# resulting unknown token is dropped by skip_special_tokens=True — confirm.
second_candidate = tokenizer.decode(
    output[1],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
print(second_candidate)

A. GENEPROD> STAT3 /GENEPROD> and GENEPROD> PDK4 /GENEPROD> stratified subgroups were generated by median splits in MSKCC PCa GSE21032 data set. Pearson correlation between GENEPROD> STAT3 /GENEPROD> and GENEPROD> PDK4 /GENEPROD> is shown. EXP_ASSAY> Kaplan - Meier /EXP_ASSAY> plot shows stratified subgroups. P - values were estimated by Log - rank test and adjusted with Benjamini - Hochberg method. Hi = high, lo = low. [END]


### Causal hypotheses for GENEPROD (BART-base)

In [10]:
# Sequence-generation settings for the causal-hypothesis model; they are unpacked
# into from_pretrained() in the next cell. "typical_p" and
# "exponential_decay_length_penalty" were left disabled (commented out) in the
# original cell and are therefore not set here.
seq2seq_config = dict(
    # output length bounds
    max_length=512,
    min_length=0,
    # decoding strategy
    do_sample=True,
    early_stopping=True,
    num_beams=1,
    num_beam_groups=1,
    diversity_penalty=0.0,
    # sampling distribution
    temperature=1.0,
    top_k=20,
    top_p=0.90,
    # penalties and n-gram constraints
    repetition_penalty=1.0,
    length_penalty=50.0,
    no_repeat_ngram_size=0,
    encoder_no_repeat_ngram_size=0,
    bad_words_ids=None,
    # number of sequences returned per input
    num_return_sequences=5,
    chunk_size_feed_forward=0,
    # shape of the generate() return value
    output_scores=False,
    return_dict_in_generate=False,
    # forced tokens / numerical safety
    forced_bos_token_id=None,
    forced_eos_token_id=None,
    remove_invalid_values=False,
)

In [11]:
# Directory of the BART-base model fine-tuned for GENEPROD causal hypotheses.
checkpoint = "/app/test_seq2seq_metrics/bart-base-causal-geneprod"

# Model and tokenizer are both read from that directory; the generation settings
# are forwarded to the model loader as keyword arguments.
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    **seq2seq_config,
)
# NOTE(review): is_split_into_words is normally an argument of the tokenizer
# call rather than of from_pretrained — confirm it has any effect here.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    add_prefix_space=True,
    is_split_into_words=False,
)

In [14]:
import pandas as pd

# Read the cleaned GENEPROD seq2seq table from disk.
data_file = "/app/data/seq2seq/GENEPROD_seq2seq_cleaned.csv"
ds = pd.read_csv(data_file)

# Keep only the rows whose "subset" column marks them as test set, then display them.
is_test_row = ds["subset"].eq("testset")
ds_test = ds.loc[is_test_row]
ds_test

Unnamed: 0.1,Unnamed: 0,subset,input_text,output_text
23,23,testset,Find causal hypotheses for geneprod: D. Analys...,ste11 was tested for its influence on hog1
24,24,testset,Find causal hypotheses for geneprod: D. Phos-t...,pbs2 was tested for its influence on hog1
25,25,testset,Find causal hypotheses for geneprod: E. Compar...,pbs2 or ste11 was tested for its influence on ...
26,26,testset,Find causal hypotheses for geneprod: C. Immuno...,hog1 was tested for its influence on p38
27,27,testset,Find causal hypotheses for geneprod: D-G. Phos...,hog1 or pbs2 was tested for its influence on hog1
...,...,...,...,...
11458,11458,testset,"Find causal hypotheses for geneprod: F, G. Rea...",hmena was tested for its influence on gas6
11459,11459,testset,Find causal hypotheses for geneprod: A. Immuno...,hmena was tested for its influence on axl
11460,11460,testset,Find causal hypotheses for geneprod: B. Real t...,hmena was tested for its influence on axl
11461,11461,testset,Find causal hypotheses for geneprod: C. Real t...,hmena was tested for its influence on axl


In [24]:
# Qualitative check of the causal-hypothesis model on one random example.
#
# The example is drawn from `ds_test` (rows with subset == "testset") rather than
# from the full table `ds`, so the model is never inspected on rows from the other
# subsets. Every sequence returned by generate() is printed; the previous
# hard-coded indices 0..3 skipped the last of the configured return sequences and
# would fail if fewer than four were returned.
example = ds_test.sample()
input_text = example["input_text"].values[0]
expected_text = example["output_text"].values[0]

token_inputs = tokenizer(input_text, return_tensors="pt")
output = model.generate(
    **token_inputs,
    return_dict_in_generate=False,
    output_scores=False,
    output_attentions=False,
)

separator = 30 * "*"
print(separator)
print(f"INPUT TEXT: {input_text}")
for index, sequence in enumerate(output):
    generated_text = tokenizer.decode(
        sequence,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print(separator)
    print(f"GENERATED TEXT {index}: {generated_text}")
print(separator)
print(f"EXPECTED TEXT: {expected_text}")

******************************
INPUT TEXT: Find causal hypotheses for geneprod: Panel A: viral and donor specific T-cell responses were measured by IFN-γELISpot at different time points: before the beginning of treatment (PRE), prior to each infusion and 2 months after the fourth infusion. Infusions are indicated by arrows. Patient's peripheral blood mononuclear cells (PBMC) were challenged with irradiated autologous lymphoblastoid cell lines (EBV line host), with cytomegalovirus glycin extract (CMV antigen)(Gehrz et al, 1987) and with the following irradiated cells harvested from the donor: untreated mesoangioblasts (MABdonor), MAB activated by 48 hour-exposure to 500 IU/mL IFN-γ (γMABdonor), myotubes differentiated from MAB (myotubesdonor) and PBMC (PBMCdonor). Polyclonal stimulation (phytohemagglutinin, PHA) was used as positive control. Donor PBMC were challenged with autologous targets as negative controls. Results are expressed as number of specific cells/105PBMC and calculated a

### Causal hypotheses for GENEPROD (T5-base)

### Checking pipeline

In [1]:
from smtag.pipeline import SmartTagger
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [2]:
# Figure legend used as input for the tagging pipeline.
# The literal is written as adjacent string pieces inside parentheses so that it
# is valid Python: the original cell had a single double-quoted string broken
# across two physical lines. The line break before "Representative" is preserved
# as an explicit "\n"; all other characters are unchanged.
text = (
    "Figure 2A. HEK293T cells were transfected with MYC-FOXP3 and FLAG-USP44 encoding expression constructs using Polyethylenimine. "
    "48hrs post-transfection, cells were harvested, lysed, and anti-FLAG or anti-MYC antibody coated beads were used to immunoprecipitate the given labeled protein along with its binding partner. "
    "Co-IP' ed proteins were subjected to SDS PAGE followed by immunoblot analysis. "
    "Antibodies recognizing FLAG or MYC tags were used to probe for USP44 and FOXP3, respectively. "
    "B. Endogenous co-IP of USP44 and FOXP3 in murine iTregs. "
    "iTregs were generated as in Fig. 1 from naïve CD4+T cells FACS isolated from pooled suspensions of the lymph node and spleen cells of wild type C57BL/6 mice (n = 2-3 / experiment). "
    "iTregs were lysed and key proteins were immunoprecipitated using either anti-USP44 (right panel) or anti-FOXP3 (left panel) antibody. "
    "Proteins pulled-down in this experiment were then resolved and analyzed by immunoblot using anti-FOXP3 or anti-USP44 antibodies. "
    "C. Endogenous co-IP of USP44 and FOXP3 in murine nTregs. "
    "nTregs (CD4+CD25high) isolated by FACS were activated by anti-CD3 and anti-CD28 (1 and 4 ug/ml, respectively) overnight in the presence of IL-2 (100 U/ml). "
    "The cells were lysed and proteins were immunoprecipitated using either anti-Foxp3 (left panel) or anti-Usp44 (right panel). "
    "Proteins pulled down in this experiment were then resolved and identified with the indicated antibodies. "
    "D . Naïve murine CD4+T cells were isolated by FACS from lymph node and spleen cell suspension of USP44fl/fl CD4Cre+ mice and that of their wild type littermates (USP44fl/fl CD4Cre-mice; n = 2-3 / group / experiment) . "
    "iTreg cells were generated from these mice as described for Fig. 1 before incubation on a microscope slide pre-coated with poly-L lysine for 1h. "
    "Adhered cells were then fixed by PFA for 0.5 followed by blocking with 1% BSA for 1h, then incubation with the specified antibodies. \n"
    "Representative confocal microscopy images (40X) were visualized for endogenous USP44 (red) and FOXP3 Baxter et al (). "
    "DAPI was used to visualize cell nuclei (blue); scale bar 50μm."
)

In [3]:
# Instantiate the tagging pipeline with its default arguments and run it on the
# figure legend. The result is kept as a JSON string; it is parsed with
# json.loads in a later cell.
smtag = SmartTagger()
json_ = smtag(text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [4]:
# Echo the legend text that was passed to the tagger (the cell output is its repr).
text

"Figure 2A. HEK293T cells were transfected with MYC-FOXP3 and FLAG-USP44 encoding expression constructs using Polyethylenimine. 48hrs post-transfection, cells were harvested, lysed, and anti-FLAG or anti-MYC antibody coated beads were used to immunoprecipitate the given labeled protein along with its binding partner. Co-IP' ed proteins were subjected to SDS PAGE followed by immunoblot analysis. Antibodies recognizing FLAG or MYC tags were used to probe for USP44 and FOXP3, respectively. B. Endogenous co-IP of USP44 and FOXP3 in murine iTregs. iTregs were generated as in Fig. 1 from naïve CD4+T cells FACS isolated from pooled suspensions of the lymph node and spleen cells of wild type C57BL/6 mice (n = 2-3 / experiment). iTregs were lysed and key proteins were immunoprecipitated using either anti-USP44 (right panel) or anti-FOXP3 (left panel) antibody. Proteins pulled-down in this experiment were then resolved and analyzed by immunoblot using anti-FOXP3 or anti-USP44 antibodies. C. Endo

In [5]:
# Parse the JSON string returned by the tagger and display its "smtag" entry.
# NOTE(review): the recorded output contains a '##imine' entity text, which looks
# like a word-piece fragment leaking from tokenization — confirm in the pipeline.
json.loads(json_)["smtag"]

[{'panel_group': [[],
   [{'text': 'hek293t', 'entity_type': 'cell'},
    {'text': 'myc', 'entity_type': 'geneprod', 'role': 'intervention'},
    {'text': 'foxp3', 'entity_type': 'geneprod', 'role': 'intervention'},
    {'text': 'flag', 'entity_type': 'geneprod', 'role': 'intervention'},
    {'text': 'usp44', 'entity_type': 'geneprod', 'role': 'intervention'},
    {'text': '##imine', 'entity_type': 'molecule', 'role': 'assayed'},
    {'text': 'flag', 'entity_type': 'geneprod', 'role': 'assayed'},
    {'text': 'myc', 'entity_type': 'geneprod', 'role': 'assayed'},
    {'text': 'immunoprecipitate', 'category': 'assay'},
    {'text': 'co - ip', 'category': 'assay'},
    {'text': 'sds page', 'category': 'assay'},
    {'text': 'immunoblot', 'category': 'assay'},
    {'text': 'flag', 'entity_type': 'geneprod', 'role': 'assayed'},
    {'text': 'myc', 'entity_type': 'geneprod', 'role': 'assayed'},
    {'text': 'usp44', 'entity_type': 'geneprod', 'role': 'assayed'},
    {'text': 'foxp3', 'entity