In [1]:
import numpy as np
import pandas as pd
import logging
import os
import glob
import regex as re
import torch
import argparse
import random
import itertools
import random
import ast
import sys
import ast

from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import simpletransformers
from simpletransformers.t5 import T5Model, T5Args
from datasets import load_dataset, get_dataset_split_names

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Redirect the torch / Hugging Face cache locations to scratch storage.
# NOTE(review): this cell runs after the libraries were imported in the first
# cell, and the dataset log further down reports a cache under
# ~/.cache/huggingface, so these overrides may be applied too late to take
# effect -- confirm, and consider setting them before the imports.
os.environ.update({
    'TORCH_HOME': '/scratch/wadhwa.s/cache/',
    'HF_HOME': '/scratch/wadhwa.s/cache',
    'TRANSFORMERS_CACHE': '/scratch/wadhwa.s/cache',
})

In [3]:
# Load the ADE drug/effect relation corpus and flatten its train split into a
# DataFrame with one row per annotated (text, drug, effect) record.
dataset = load_dataset('ade_corpus_v2', 'Ade_corpus_v2_drug_ade_relation')

# Each DataFrame column is copied from the dataset field paired with it here;
# note that the dataset's "indexes" field is stored under the column "index".
ade_dict = {
    column: dataset['train'][field]
    for column, field in (
        ("text", "text"),
        ("drug", "drug"),
        ("effect", "effect"),
        ("index", "indexes"),
    )
}
df = pd.DataFrame(ade_dict)
df.shape

Found cached dataset ade_corpus_v2 (/home/wadhwa.s/.cache/huggingface/datasets/ade_corpus_v2/Ade_corpus_v2_drug_ade_relation/1.0.0/940d61334dbfac6b01ac5d00286a2122608b8dc79706ee7e9206a1edb172c559)
100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00, 414.13it/s]


(6821, 4)

In [4]:
# Path of the text file holding the ADE sentences; it is read line by line in the next cell.
input_docs = "ade_gpt3.txt"

In [5]:
# Read the sentence file once: `lines` keeps the raw lines (with their line
# endings), `unique_ade` holds the same lines with surrounding whitespace removed.
# Despite its name, `unique_ade` is not de-duplicated here.
with open(input_docs) as docs_file:
    lines = list(docs_file)
unique_ade = list(map(str.strip, lines))

In [6]:
# Draw the 7 few-shot example sentences from a dedicated, seeded generator so
# that a fresh "Restart & Run All" rebuilds exactly the same prompt. A private
# Random instance is used so the global `random` state is left untouched.
FEW_SHOT_SEED = 42
few_shot = random.Random(FEW_SHOT_SEED).sample(unique_ade, 7)

In [7]:
# Assemble the few-shot prompt: every sampled sentence is followed by the
# string form of its [drug, effect] pairs looked up in `df`, then a blank line.
prompt_chunks = []
for sentence in few_shot:
    matching_rows = df.loc[df['text'] == sentence]
    pairs = [
        [drug, effect]
        for drug, effect in zip(matching_rows["drug"], matching_rows["effect"])
    ]
    prompt_chunks.append(sentence + "\n" + str(pairs) + "\n\n")
prompt = "".join(prompt_chunks)
print(prompt)

A 22-year-old black man developed fever, chills, fatigue, night sweats, tender lymphadenopathy, and a generalized, pruritic, macular eruption 3 weeks after starting minocycline therapy for acne.
[['minocycline', 'chills'], ['minocycline', 'fatigue'], ['minocycline', 'fever'], ['minocycline', 'generalized, pruritic, macular eruption'], ['minocycline', 'night sweats']]

Neurological improvement and rehabilitation potential following toxic myelopathy due to intrathecal injection of doxorubicin.
[['doxorubicin', 'toxic myelopathy']]

The cause is presumed to be secondary to hypercoagulability due to asparaginase-induced antithrombin III deficiency.
[['asparaginase', 'antithrombin III deficiency']]

Two patients with imminent gangrene of the extremities caused by ergot-induced arteriospasm underwent mechanical dilatation of the arteries when conventional measures such as anticoagulation, vasodilation, and sympathetic blockade produced no improvement.
[['ergot', 'arteriospasm'], ['ergot', 'g

In [8]:
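# Disabled: would persist the freshly sampled few-shot prompt to disk.
# NOTE: this writes "few_7_shot_prompt.txt", whereas the next cell reads
# "few_7_shot_prompt_ade.txt".
# with open("few_7_shot_prompt.txt", "w") as text_file:
#     text_file.write(prompt)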

In [9]:
# Replace the in-memory prompt with the saved few-shot prompt from disk and
# display it.
with open("few_7_shot_prompt_ade.txt") as saved_prompt_file:
    prompt = saved_prompt_file.read()
print(prompt)

Gentamicin-associated acute renal failure.
[['Gentamicin', 'acute renal failure']]</s>

Ampicillin-associated seizures.
[['Ampicillin', 'seizures']]</s>

Approximately 15 min after the first administration of nebulised morphine the patient became markedly bradypneic (respiratory rate: 4-5 bpm), hypotensive (BP 70/40 mmHg), and responded only partially to command.
[['morphine', 'bradypneic'], ['morphine', 'hypotensive']]</s>

Disseminated eruptive giant mollusca contagiosa in an adult psoriasis patient during efalizumab therapy.
[['efalizumab', 'Disseminated eruptive giant mollusca contagiosa']]</s>

Three patients are reported without a history of angina pectoris who had clinical and electrocardiographic evidence of myocardial ischemia during and immediately after BCNU infusion.
[['BCNU', 'myocardial ischemia']]</s>

Vogt-Koyanagi-Harada disease occurring during interferon alpha therapy for chronic hepatitis C.
[['interferon alpha', 'Vogt-Koyanagi-Harada disease']]</s>

Complications o

In [10]:
# Build a one-off query (few-shot prompt + a single test sentence) under its
# own name. The shared `prompt` must stay untouched: later cells prepend it to
# every input sentence, and re-assigning it here would both leak this sentence
# into all of those inputs and append it again each time the cell is re-run.
single_query_prompt = prompt + "Temporary neurologic abnormalities were observed in one out of 23 patients undergoing chemotherapy with high-dose methotrexate (HD-MTX) for osteogenic sarcoma."

In [11]:
# Display the current few-shot prompt text.
print (prompt)

Gentamicin-associated acute renal failure.
[['Gentamicin', 'acute renal failure']]</s>

Ampicillin-associated seizures.
[['Ampicillin', 'seizures']]</s>

Approximately 15 min after the first administration of nebulised morphine the patient became markedly bradypneic (respiratory rate: 4-5 bpm), hypotensive (BP 70/40 mmHg), and responded only partially to command.
[['morphine', 'bradypneic'], ['morphine', 'hypotensive']]</s>

Disseminated eruptive giant mollusca contagiosa in an adult psoriasis patient during efalizumab therapy.
[['efalizumab', 'Disseminated eruptive giant mollusca contagiosa']]</s>

Three patients are reported without a history of angina pectoris who had clinical and electrocardiographic evidence of myocardial ischemia during and immediately after BCNU infusion.
[['BCNU', 'myocardial ischemia']]</s>

Vogt-Koyanagi-Harada disease occurring during interferon alpha therapy for chronic hepatitis C.
[['interferon alpha', 'Vogt-Koyanagi-Harada disease']]</s>

Complications o

In [12]:
# One model input per sentence: the few-shot prompt followed by the sentence.
# The raw lines still carry their line ending, so strip them (as the dev-set
# cell does); otherwise the last "\n"-separated piece of each input is empty.
flan_ade = [prompt + s.strip() for s in lines]

In [13]:
# Inspect the first assembled model input.
print (flan_ade[0])

Gentamicin-associated acute renal failure.
[['Gentamicin', 'acute renal failure']]</s>

Ampicillin-associated seizures.
[['Ampicillin', 'seizures']]</s>

Approximately 15 min after the first administration of nebulised morphine the patient became markedly bradypneic (respiratory rate: 4-5 bpm), hypotensive (BP 70/40 mmHg), and responded only partially to command.
[['morphine', 'bradypneic'], ['morphine', 'hypotensive']]</s>

Disseminated eruptive giant mollusca contagiosa in an adult psoriasis patient during efalizumab therapy.
[['efalizumab', 'Disseminated eruptive giant mollusca contagiosa']]</s>

Three patients are reported without a history of angina pectoris who had clinical and electrocardiographic evidence of myocardial ischemia during and immediately after BCNU infusion.
[['BCNU', 'myocardial ischemia']]</s>

Vogt-Koyanagi-Harada disease occurring during interferon alpha therapy for chronic hepatitis C.
[['interferon alpha', 'Vogt-Koyanagi-Harada disease']]</s>

Complications o

In [14]:
# Path of the text file holding the dev-set sentences; it is read line by line in the next cell.
dev_docs = "ade_dev.txt"

In [15]:
# Read the dev file once: `lines_dev` keeps the raw lines (with their line
# endings), `unique_ade_dev` holds the same lines with surrounding whitespace
# removed. Despite its name, `unique_ade_dev` is not de-duplicated here.
with open(dev_docs) as dev_file:
    lines_dev = list(dev_file)
unique_ade_dev = list(map(str.strip, lines_dev))

In [16]:
# Rebuild `flan_ade` for the dev set: the few-shot prompt followed by each dev
# sentence with its surrounding whitespace removed. This replaces the list
# built earlier from the other sentence file.
def _with_prompt(raw_sentence):
    """Return the few-shot prompt immediately followed by the stripped sentence."""
    return prompt + raw_sentence.strip()

flan_ade = list(map(_with_prompt, lines_dev))

In [17]:
# Display the few-shot prompt that was prepended to every dev sentence.
print (prompt)

Gentamicin-associated acute renal failure.
[['Gentamicin', 'acute renal failure']]</s>

Ampicillin-associated seizures.
[['Ampicillin', 'seizures']]</s>

Approximately 15 min after the first administration of nebulised morphine the patient became markedly bradypneic (respiratory rate: 4-5 bpm), hypotensive (BP 70/40 mmHg), and responded only partially to command.
[['morphine', 'bradypneic'], ['morphine', 'hypotensive']]</s>

Disseminated eruptive giant mollusca contagiosa in an adult psoriasis patient during efalizumab therapy.
[['efalizumab', 'Disseminated eruptive giant mollusca contagiosa']]</s>

Three patients are reported without a history of angina pectoris who had clinical and electrocardiographic evidence of myocardial ischemia during and immediately after BCNU infusion.
[['BCNU', 'myocardial ischemia']]</s>

Vogt-Koyanagi-Harada disease occurring during interferon alpha therapy for chronic hepatitis C.
[['interferon alpha', 'Vogt-Koyanagi-Harada disease']]</s>

Complications o

In [6]:
# Arguments handed to the simpletransformers T5 wrapper.
# Options that were previously left commented out (num_train_epochs,
# preprocess_inputs, special_tokens_list, num_beams, learning_rate) remain unset.
model_args = dict(
    cache_dir="/scratch/wadhwa.s/cache/",
    device_map="auto",
    use_multiprocessing=False,
    use_multiprocessed_decoding=False,
    no_save=True,
    overwrite_output_dir=True,
    max_seq_length=512,
    max_length=100,
)

In [7]:
# Instantiate the simpletransformers T5Model wrapper for the
# "google/flan-t5-xxl" checkpoint, configured with `model_args`.
model = T5Model("t5", "google/flan-t5-xxl", args=model_args)

In [12]:
# Restrict inference to the first 20 assembled inputs.
to_predict = flan_ade[:20]

In [23]:
# For every model input: show the query sentence, the gold [drug, effect]
# pairs recorded for it in `df`, and the model's generated answer.
for full_input in to_predict:
    # The query sentence is the last "\n"-separated piece of the input.
    query_sentence = full_input.split("\n")[-1]
    print("INPUT: ", query_sentence)

    gold_rows = df.loc[df['text'] == query_sentence]
    gold_pairs = [
        [drug, effect]
        for drug, effect in zip(gold_rows["drug"], gold_rows["effect"])
    ]
    print("TRUE: ", gold_pairs)

    # One input per predict call; the first (only) element is the generation.
    print("GENERATED: ", model.predict([full_input])[0])
    print("\n--------------\n")


# for s in to_predict:
#     print (s.split("\n")[-1])
#     print ("\n--------------\n")

INPUT:  These case reports provide evidence that 5-aminosalicylic acid may induce acute pancreatitis after long term treatment.
TRUE:  [['5-aminosalicylic acid', 'acute pancreatitis']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.09s/it]


GENERATED:  [['5-aminosalicylic acid', 'acute pancreatitis']]

--------------

INPUT:  A search of the United States Food and Drug Administration's Adverse Event Reporting System identified nine cases of oxcarbazepine-associated angioedema in pediatric patients aged 16 years and younger.
TRUE:  [['oxcarbazepine', 'angioedema']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.91s/it]


GENERATED:  [['oxcarbazepine', 'angioedema']]

--------------

INPUT:  Human insulin-induced lipoatrophy.
TRUE:  [['Human insulin', 'lipoatrophy']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.44s/it]


GENERATED:  [['Human insulin-induced lipoatrophy']]

--------------

INPUT:  In all the patients, pellagra symptoms appeared during isoniazid therapy.
TRUE:  [['isoniazid', 'pellagra']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.59s/it]


GENERATED:  [['isoniazid', 'pellagra']]

--------------

INPUT:  We present a case of a 20-year-old woman who ingested 900 mg of glyburide causing refractory hypoglycemia resistant to treatment with intravenous dextrose, glucagon, and diazoxide.
TRUE:  [['glyburide', 'hypoglycemia']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.85s/it]


GENERATED:  [['glyburide','refractory hypoglycemia']]

--------------

INPUT:  We present two children with acute lymphocytic leukemia who developed leukoencephalopathy following administration of a combination of intravenous ara = C and methotrexate during the consolidation phase of chemotherapy.
TRUE:  [['ara = C', 'leukoencephalopathy'], ['methotrexate', 'leukoencephalopathy']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.67s/it]


GENERATED:  [['leukoencephalopathy', 'ara = C']]

--------------

INPUT:  A 62-year-old Caucasian man with atrial fibrillation who was taking warfarin reported an episode of hematochezia; his international normalized ratio (INR) was 1.74.
TRUE:  [['warfarin', 'hematochezia']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.66s/it]


GENERATED:  [['Warfarin', 'hematochezia']]

--------------

INPUT:  Choanal atresia and athelia: methimazole teratogenicity or a new syndrome?
TRUE:  [['methimazole', 'athelia'], ['methimazole', 'Choanal atresia'], ['methimazole', 'teratogenicity']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.40s/it]


GENERATED:  [['Choanal atresia and athelia','methimazole teratogenicity']]

--------------

INPUT:  DATA SYNTHESIS: Genetic deficiencies in DPD, the rate-limiting enzyme responsible for 5-FU catabolism, may occur in 3% or more of patients with cancer putting them at increased risk for unusually severe adverse reactions (e.g., diarrhea, stomatitis, mucositis, myelosuppression, neurotoxicity) to standard doses of 5-FU.
TRUE:  [['5-FU', 'diarrhea'], ['5-FU', 'diarrhea'], ['5-FU', 'mucositis'], ['5-FU', 'mucositis'], ['5-FU', 'myelosuppression'], ['5-FU', 'myelosuppression'], ['5-FU', 'neurotoxicity'], ['5-FU', 'neurotoxicity'], ['5-FU', 'stomatitis'], ['5-FU', 'stomatitis']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.91s/it]


GENERATED:  [['5-FU', 'DPD', 'genetic deficiencies']]

--------------

INPUT:  We describe a patient who developed acute cerebellar syndrome after prophylactic intrathecal methotrexate administration and recovered spontaneously.
TRUE:  [['methotrexate', 'acute cerebellar syndrome']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.72s/it]


GENERATED:  [['methotrexate', 'cerebellar syndrome']]

--------------

INPUT:  Four Chinese female patients who suffered from manic-depressive disorder and underlying autoimmune thyroiditis developed transient episodes of thyrotoxicosis during maintenance lithium therapy.
TRUE:  [['lithium', 'thyrotoxicosis']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.89s/it]


GENERATED:  [['Lithium', 'thyrotoxicosis']]

--------------

INPUT:  Such anagen effluvium with lichenoid eruption following INH therapy has not been observed previously.
TRUE:  [['INH', 'anagen effluvium'], ['INH', 'lichenoid eruption']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.74s/it]


GENERATED:  [['INH', 'anagen effluvium']]

--------------

TRUE:  [['quinine', 'thrombocytopenia'], ['quinine', 'thrombocytopenia'], ['quinine', 'thrombocytopenia']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.69s/it]


GENERATED:  [['quinine', 'thrombocytopenia']]

--------------

INPUT:  Acute pancreatitis after long-term 5-aminosalicylic acid therapy.
TRUE:  [['5-aminosalicylic acid', 'Acute pancreatitis']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.03s/it]


GENERATED:  [['5-aminosalicylic acid', 'acute pancreatitis']]

--------------

INPUT:  We report a patient in whom the anti-depressant trazodone hydrochloride (Molipaxin, Roussel), a serotonin antagonist, provoked generalized pustular psoriasis (GPP).
TRUE:  [['Molipaxin', 'generalized pustular psoriasis'], ['trazodone hydrochloride', 'generalized pustular psoriasis'], ['Molipaxin', 'GPP'], ['trazodone hydrochloride', 'GPP']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.34s/it]


GENERATED:  [['trazodone hydrochloride', 'generalized pustular psoriasis']]

--------------

INPUT:  Progressive pulmonary fibrosis complicating cyclophosphamide therapy.
TRUE:  [['cyclophosphamide', 'Progressive pulmonary fibrosis']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.09s/it]


GENERATED:  [['cyclophosphamide', 'progressive pulmonary fibrosis']]

--------------

INPUT:  Valproate-induced hyperammonemia as a cause of altered mental status.
TRUE:  [['Valproate', 'altered mental status'], ['Valproate', 'hyperammonemia']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.72s/it]


GENERATED:  [['hyperammonemia', 'altered mental status']]

--------------

INPUT:  Anisocoria from transdermal scopolamine.
TRUE:  [['scopolamine', 'Anisocoria']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.81s/it]


GENERATED:  [['Scopolamine', 'Anisocoria']]

--------------

INPUT:  Captopril is known to be associated with dermatologic, hematologic, and pulmonary adverse effects.
TRUE:  [['Captopril', 'dermatologic, hematologic, and pulmonary adverse effects']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.56s/it]


GENERATED:  [['Captopril', 'dermatologic']]

--------------

INPUT:  Schneiderian first-rank symptoms associated with fluvoxamine treatment: a case report.
TRUE:  [['fluvoxamine', 'Schneiderian first-rank symptoms']]


Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.01s/it]

GENERATED:  [['fluvoxamine', 'Schneiderian first-rank symptoms']]

--------------






In [15]:
# Run the model on one input at a time and print the returned list as-is.
for single_input in to_predict:
    generated = model.predict([single_input])
    print(generated)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.75s/it]


["[['5-aminosalicylic acid', 'acute pancreatitis']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.85s/it]


["[['oxcarbazepine', 'angioedema']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.49s/it]


["[['Human insulin-induced lipoatrophy']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.56s/it]


["[['isoniazid', 'pellagra']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.85s/it]


["[['glyburide','refractory hypoglycemia']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.66s/it]


["[['leukoencephalopathy', 'ara = C']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.64s/it]


["[['Warfarin', 'hematochezia']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.44s/it]


["[['Choanal atresia and athelia','methimazole teratogenicity']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.85s/it]


["[['5-FU', 'DPD', 'genetic deficiencies']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.74s/it]


["[['methotrexate', 'cerebellar syndrome']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.88s/it]


["[['Lithium', 'thyrotoxicosis']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.74s/it]


["[['INH', 'anagen effluvium']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.68s/it]


["[['quinine', 'thrombocytopenia']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.04s/it]


["[['5-aminosalicylic acid', 'acute pancreatitis']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.38s/it]


["[['trazodone hydrochloride', 'generalized pustular psoriasis']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.07s/it]


["[['cyclophosphamide', 'progressive pulmonary fibrosis']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.69s/it]


["[['hyperammonemia', 'altered mental status']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.80s/it]


["[['Scopolamine', 'Anisocoria']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:01<00:00,  1.57s/it]


["[['Captopril', 'dermatologic']]"]


Generating outputs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.02s/it]

["[['fluvoxamine', 'Schneiderian first-rank symptoms']]"]





In [51]:
# Predict for all selected inputs in a single call.
# NOTE(review): the saved output shows the progress bar stopped at 0/3, so this
# call may not have completed in the recorded run -- confirm before relying on `preds`.
preds = model.predict(to_predict)

Generating outputs:   0%|                                 | 0/3 [00:04<?, ?it/s]


In [43]:
# Show the first element of the batched predictions.
preds[0]

"[['oxcarbazepine', 'angioedema']]"

In [44]:
# Print the dev sentence at index 1.
# NOTE(review): the prediction displayed above is preds[0] but matches this
# index-1 sentence -- confirm the indices line up with the slice that was predicted.
print (lines_dev[1])

A search of the United States Food and Drug Administration's Adverse Event Reporting System identified nine cases of oxcarbazepine-associated angioedema in pediatric patients aged 16 years and younger.

