In [16]:
import spacy
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import os
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
import terms

[nltk_data] Downloading package punkt to /Users/bchu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [52]:

def listFilePaths(start :int, end :int, end2 :int):
    '''
    List the files of the "DrugBank" and "MedLine" folders found in the
    current working directory and return a slice of each sorted listing.

    @params:
        start: first index of both slices
        end:   end index (exclusive) of the DrugBank slice
        end2:  end index (exclusive) of the MedLine slice

    @returns:
        a 4-tuple
        (DrugBank file paths, MedLine file paths,
         DrugBank file names, MedLine file names)
    '''
    base = os.getcwd() + "/"

    def _listing(folder):
        # Sorted file names of one dataset folder plus the matching paths.
        root = base + folder
        file_names = sorted(os.listdir(root))
        return file_names, [root + '/' + file_name for file_name in file_names]

    drugbank_names, drugbank_paths = _listing("DrugBank")
    medline_names, medline_paths = _listing("MedLine")

    return (
        drugbank_paths[start:end],
        medline_paths[start:end2],
        drugbank_names[start:end],
        medline_names[start:end2],
    )


def parse_sentence(xml: list) -> dict:
    '''
    Collect the distinct named entities found in the lines of one XML file.
    Links between entities are not handled.

    A line counts as an entity line when it contains both "type" and "text"
    and does not contain "sentence". The first double-quoted value on such a
    line is taken as the entity kind and the second one as its name
    (lower-cased); attribute names themselves are not inspected.

    @params:
        xml: the lines of the current file

    @returns:
        a dictionary of nodes in format {node name : node}; only the first
        occurrence of each name creates a node
    '''

    nodes = dict()

    for line in xml:
        # The original test `"type" and "text" in line` only checked "text",
        # because the non-empty literal "type" is always truthy.
        is_entity_line = "type" in line and "text" in line and "sentence" not in line
        if not is_entity_line:
            continue

        # First quoted value -> kind
        kind_start = line.find("\"") + 1
        kind_end = line.find("\"", kind_start)
        kind = line[kind_start:kind_end]

        # Second quoted value -> name
        name_start = line.find("\"", kind_end + 1) + 1
        name_end = line.find("\"", name_start)
        name = line[name_start:name_end].lower()

        if name not in nodes:
            # NOTE(review): TermNode is not defined or imported in the cells
            # visible here (only `import terms`) - presumably it comes from
            # the project-local terms module; confirm.
            nodes[name] = TermNode(name = name, kind = kind)

    return nodes

def parse_names(xml: list) -> list:
    '''
    Collect the distinct entity names found in the lines of one XML file.
    Links between entities are not handled.

    A line counts as an entity line when it contains both "type" and "text"
    and does not contain "sentence". The name is the second double-quoted
    value on the line, lower-cased; attribute names are not inspected.

    @params:
        xml: the lines of the current file

    @returns:
        a list of names, without duplicates, in order of first appearance
    '''

    names = list()
    seen = set()  # constant-time duplicate check; `names` keeps the order

    for line in xml:
        # The original test `"type" and "text" in line` only checked "text",
        # because the non-empty literal "type" is always truthy.
        is_entity_line = "type" in line and "text" in line and "sentence" not in line
        if not is_entity_line:
            continue

        # Skip over the first quoted value (the kind) ...
        first_start = line.find("\"") + 1
        first_end = line.find("\"", first_start)

        # ... and take the second quoted value as the name.
        name_start = line.find("\"", first_end + 1) + 1
        name_end = line.find("\"", name_start)
        name = line[name_start:name_end].lower()

        if name not in seen:
            seen.add(name)
            names.append(name)

    return names

def parse_effects2str(xml :list):
    '''
    Extract the sentence texts from the lines of one XML file and join them
    into a single space-separated string.

    A line is used when it contains both "<sentence id=" and "text=". The
    second double-quoted value on such a line is taken as the sentence text.

    @params:
        xml: the lines of the current file

    @returns:
        all sentence texts joined by single spaces ("" when none is found)
    '''
    effects = list()
    for line in xml:
        # The original test `"<sentence id=" and "text=" in line` only checked
        # "text=", so every line carrying a text= attribute was collected.
        if "<sentence id=" in line and "text=" in line:
            # Skip over the first quoted value ...
            first_start = line.find("\"") + 1
            first_end = line.find("\"", first_start)

            # ... and take the second quoted value as the sentence text.
            text_start = line.find("\"", first_end + 1) + 1
            text_end = line.find("\"", text_start)
            effects.append(line[text_start:text_end])

    # Joined in place so this function does not depend on a helper that is
    # defined in a later notebook cell.
    return " ".join(effects)


def parse_file2list(filename, kind="list"):
    '''
    Read a file into a list of lines or into one long string.

    @params:
        filename: path of the file to read
        kind: "list" (default) returns the lines as read by readlines();
              "string" (case-insensitive) returns the lines joined by single
              spaces; any other value behaves like "list"

    @returns:
        file as a list or string
    '''
    # Context manager so the handle is closed even if reading fails.
    with open(filename, "r") as handle:
        lines = handle.readlines()

    if kind.lower() == "string":
        # Joined in place so this function does not depend on a helper that
        # is defined in a later notebook cell.
        return " ".join(lines)
    return lines



def parse_sen2corpus(xml :list):
    '''Placeholder (not implemented yet): hands `xml` back unchanged.'''
    unchanged = xml
    return unchanged


########################## PROCESSING ##########################

def files2corpus(file_names :list):
    '''
    Build one corpus string from several files.

    Each file is read with parse_file2list, reduced to its sentence text by
    parse_effects2str and terminated with a newline; the per-file strings
    are then combined with list2str.
    '''
    per_file = [
        parse_effects2str(parse_file2list(path)) + "\n"
        for path in file_names
    ]
    return list2str(per_file)

def files2list(file_names :list):
    '''Read every file with parse_file2list and return all lines as one flat list, in file order.'''
    return [
        line
        for path in file_names
        for line in parse_file2list(path, "list")
    ]

def segment_and_tokenize(corpus):
    '''
    Split a corpus string into tokenized sentences with spaCy.

    Setup required on the command line before first use:
        pip install -U pip setuptools wheel
        pip install -U spacy
        python -m spacy download en_core_web_sm

    @params:
        corpus: a string containing the entire corpus

    @returns:
        a list of token lists, one per sentence with more than one token;
        infrequent words are replaced by remove_infrequent_words and every
        sentence is wrapped in '<START>' ... '<END>'
    '''
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(corpus)

    sentences = []
    for span in doc.sents:
        words = [token.text for token in span]
        # single-token sentences are dropped
        if len(words) > 1:
            sentences.append(words)

    sentences = remove_infrequent_words(sentences)
    return [['<START>', *words, '<END>'] for words in sentences]

def remove_infrequent_words(sents):
    '''
    Replace every word that occurs fewer than two times across all
    sentences with the placeholder token '<UNKOWN>'.

    @params:
        sents: list of sentences, each a list of words

    @returns:
        a new list of sentences of the same shape
    '''
    threshold = 2

    # First pass: count every word over the whole input.
    tally = {}
    for sentence in sents:
        for word in sentence:
            tally[word] = tally.get(word, 0) + 1

    # Second pass: keep frequent words, mask the rest.
    return [
        [word if tally[word] >= threshold else '<UNKOWN>' for word in sentence]
        for sentence in sents
    ]


##### filter words
def filter_stop(corpus):
    '''
    Tokenize `corpus` with nltk's word_tokenize and drop every token that is
    in the hand-written stop list below (common words plus XML markup
    fragments).

    @params:
        corpus: the text to filter, as one string

    @returns:
        the list of remaining tokens, in their original order
    '''
    # A comma was missing between "drug" and "document": Python concatenated
    # the two adjacent literals into "drugdocument", so neither word was
    # actually filtered. The unused spaCy model load was removed as well.
    all_stopwords = {
        "in", "to", "an", "a", "the", "sentence", "text=", "brand", "<?xml version=\"1.0\" encoding=\"UTF-8\"?>", "\'\'", "``", "version=", "xml", "e1=", "group", "drug",
        "document", "id=", "</", ">", "<", "/>", "\'", "\"", ",", ".", "type=", "entity", "/", "charOffset=", "/sentence", "/document", "and", "ddi", "encoding=",
    }
    text_tokens = word_tokenize(corpus)
    return [word for word in text_tokens if word not in all_stopwords]

##########################  CHECKING  ##########################

## TODO: randomly select 20% of files as test dataset,
## 60% as training data, 20% as pre-training.
## Not implemented yet: the code below takes a fixed slice of each sorted
## directory listing instead.

# segment_and_tokenize(file_list2str)
# Slices [0:5] of the DrugBank listing and [0:1] of the MedLine listing, as
# (DrugBank paths, MedLine paths, DrugBank file names, MedLine file names).
dirfiles = listFilePaths(0,5,1)
# Earlier inline version of the corpus construction, kept for reference:
# unfiltered_corpus = list()
# for file in dirfiles[0]:
#     unfiltered_corpus.append(parse_effects2str(parse_file2list(file)))   
# unfiltered_corpus = list2str(unfiltered_corpus)
# corpus = list2str(filter_stop(unfiltered_corpus))
# print(corpus)

# Names returned by parse_names for all lines of the selected DrugBank files (dirfiles[0]).
names = parse_names(files2list(dirfiles[0]))

# Text of the same files after filter_stop, re-joined into a single string.
# NOTE: list2str is defined in a later cell of this notebook, so that cell
# must already have been run for this line to work.
corpus = list2str(filter_stop(files2corpus(dirfiles[0])))

# print(corpus)


##########################  TEXT GENERATION  ##########################



# Prompt naming every extracted entity. The original wording
# ("The combined of X and Y effects are: ") had its words out of order; it now
# follows the "The combined effects of ..." phrasing of the other prompts.
prompt = "The combined effects of " + " and ".join(names) + " are: "

# model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased")
# tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
# inputs = tokenizer(corpus + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]

# prompt_length = len(tokenizer.decode(inputs[0]))
# outputs = model.generate(inputs, max_length=10000, do_sample=True, top_p=0.95, top_k=60)
# generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]

# print(generated)


# classifier = pipeline("sentiment-analysis")
# result = classifier(file_list2str)[0]
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

##########################





# Load the pretrained "xlnet-base-cased" checkpoint and its matching tokenizer;
# both names are used by the generation cell further down.
model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased")
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

# Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
# PADDING_TEXT = # """No drug nutritional supplement food or herb interactions have yet been reported.
#  No formal drug/drug interaction studies with Plenaxis were performed Plenaxis Cytochrome P-450 is not known be involved metabolism of Plenaxis Plenaxis Plenaxis is highly bound plasma proteins ( 96 99 % ) 
#  Plenaxis Laboratory Tests Response Plenaxis should be monitored by measuring serum total testosterone concentrations just prior administration on Day 29 every 8 weeks thereafter 
#  Plenaxis testosterone Serum transaminase levels should be obtained before starting treatment with Plenaxis periodically during treatment Plenaxis Periodic measurement of serum PSA levels may also be considered 
#  Formal drug interaction studies have not been conducted with ORENCIA ORENCIA Population pharmacokinetic analyses revealed that MTX NSAIDs corticosteroids TNF blocking agents did not influence abatacept clearance 
#  MTX NSAIDs corticosteroids TNF blocking agents abatacept The majority of patients RA clinical studies received one or more of following concomitant medications with ORENCIA : MTX NSAIDs corticosteroids TNF blocking 
#  agents azathioprine chloroquine gold hydroxychloroquine ,leflunomide sulfasalazine anakinra ORENCIA MTX NSAIDs corticosteroids TNF blocking agents azathioprine chloroquine gold hydroxychloroquine leflunomide sulfasalazine 
#  anakinra Concurrent administration of TNF antagonist ,with ORENCIA has been associated with increased risk of serious infections no significant additional efficacy over use of TNF antagonists alone TNF antagonist ORENCIA TNF 
#  antagonists Concurrent therapy with ORENCIA TNF antagonists ,is not recommended ORENCIA TNF antagonists."""  
# Literal drug-interaction text used as padding: the generation code
# concatenates it in front of the prompt (PADDING_TEXT + prompt).
PADDING_TEXT =  """There is insufficient experience assess, safety efficacy of ORENCIA administered concurrently with anakinra therefore such 
 use is not recommended ORENCIA anakinra Formal drug interaction studies with Abciximab have not been conducted Abciximab Abciximab has been administered patients with ischemic heart disease treated concomitantly with broad range 
 of medications used treatment of angina myocardial ,infarction hypertension Abciximab. These medications have included heparin warfarin beta-adrenergic receptor blockers calcium channel antagonists angiotensin converting enzyme inhibitors 
 intravenous oral nitrates ticlopidine aspirin heparin warfarin beta-adrenergic receptor blockers calcium channel antagonists angiotensin converting enzyme inhibitors nitrates ticlopidine aspirin Heparin other anticoagulants thrombolytics 
 anti platelet agents are associated with increase bleeding Heparin anticoagulants thrombolytics anti platelet agents Patients with HACA titers may have allergic or hypersensitivity reactions when treated with other diagnostic or therapeutic 
 monoclonal antibodies diagnostic monoclonal ,antibodies therapeutic monoclonal antibodies The concomitant intake of alcohol Acamprosate does not affect pharmacokinetics of either alcohol or acamprosate alcohol Acamprosate alcohol acamprosate 
 Pharmacokinetic studies ,indicate that administration of disulfiram or diazepam does not affect pharmacokinetics of acamprosate ,disulfiram diazepam acamprosate Co-administration of naltrexone with Acamprosate produced 25 increase AUC 33 increase 
 Cmax of acamprosate naltrexone Acamprosate acamprosate No adjustment of dosage ,is recommended such patients The pharmacokinetics of naltrexone its major metabolite 6-beta-naltrexol were unaffected following co-administration with Acamprosate naltrexone 
 6-beta-naltrexol Acamprosate Other concomitant therapies : In clinical trials safety profile subjects treated with Acamprosate concomitantly with anxiolytics hypnotics sedatives ( including benzodiazepines ) or non-opioid analgesics was similar that of 
 subjects taking placebo with these concomitant medications Acamprosate anxiolytics hypnotics sedatives benzodiazepines non-opioid analgesics Patients taking Acamprosate concomitantly with antidepressants more commonly reported both weight gain weight loss.
 compared with patients taking either medication alone Acamprosate antidepressants. <eod> </s> <eos>"""

# Fixed prompt; this replaces the prompt built from `names` earlier in this cell.
prompt = "The combined effects of drugs Abarelix and Testosterone "



# General-English padding text and prompt; in the visible cells they are only
# referenced by the commented-out generation code below.
PADDING_TEXT1 = """In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""

prompt1 = "Today the weather is really nice and I am planning on "



# inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
# prompt_length = len(tokenizer.decode(inputs[0]))
# outputs = model.generate(inputs, max_length=70, do_sample=True, top_p=0.95, top_k=60)
# generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]


# inputs = tokenizer(PADDING_TEXT1 + prompt1, add_special_tokens=False, return_tensors="pt")["input_ids"]
# prompt_length = len(tokenizer.decode(inputs[0]))
# outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
# generated = prompt1 + tokenizer.decode(outputs[0])[prompt_length + 1 :]

# print(generated)

# Leftover marker print: its output only shows that the cell ran to the end.
print("hi")

# etree = ET.parse(xml_data) #create an ElementTree object 
# doc_df = pd.DataFrame(list(iter_docs(etree.getroot())))
# print(type(doc_df))


hi


In [None]:
def list2str(sents :list):
    '''Join the strings in `sents` into one string, separated by single spaces.'''
    return " ".join(sents)

    ##### filter words
def filter_stop(corpus):
    '''
    Tokenize `corpus` with nltk's word_tokenize and drop every token that is
    in the hand-written stop list below (common words, XML markup fragments
    and the "-" token).

    @params:
        corpus: the text to filter, as one string

    @returns:
        the list of remaining tokens, in their original order
    '''
    # A comma was missing between "drug" and "document": Python concatenated
    # the two adjacent literals into "drugdocument", so neither word was
    # actually filtered. The unused spaCy model load was removed as well.
    all_stopwords = {
        "in", "to", "an", "a", "the", "sentence", "text=", "brand", "<?xml version=\"1.0\" encoding=\"UTF-8\"?>", "\'\'", "``", "version=", "xml", "e1=", "group", "drug",
        "document", "id=", "</", ">", "<", "/>", "\'", "\"", ",", ".", "type=", "entity", "/", "charOffset=", "/sentence", "/document", "and", "ddi", "encoding=", "-",
    }
    text_tokens = word_tokenize(corpus)
    return [word for word in text_tokens if word not in all_stopwords]

# Corpus as returned by the project-local `terms` module; it is not referenced
# again in the visible cells.
drugbank_corpus = terms.drugbank.get_corpus()
#print(drugbank_corpus)

In [53]:
# Re-assigns PADDING_TEXT and prompt for the generation cell that follows.
# NOTE(review): this appears to repeat the literal from the earlier cell - confirm and keep only one copy.
PADDING_TEXT =  """There is insufficient experience assess, safety efficacy of ORENCIA administered concurrently with anakinra therefore such 
 use is not recommended ORENCIA anakinra Formal drug interaction studies with Abciximab have not been conducted Abciximab Abciximab has been administered patients with ischemic heart disease treated concomitantly with broad range 
 of medications used treatment of angina myocardial ,infarction hypertension Abciximab. These medications have included heparin warfarin beta-adrenergic receptor blockers calcium channel antagonists angiotensin converting enzyme inhibitors 
 intravenous oral nitrates ticlopidine aspirin heparin warfarin beta-adrenergic receptor blockers calcium channel antagonists angiotensin converting enzyme inhibitors nitrates ticlopidine aspirin Heparin other anticoagulants thrombolytics 
 anti platelet agents are associated with increase bleeding Heparin anticoagulants thrombolytics anti platelet agents Patients with HACA titers may have allergic or hypersensitivity reactions when treated with other diagnostic or therapeutic 
 monoclonal antibodies diagnostic monoclonal ,antibodies therapeutic monoclonal antibodies The concomitant intake of alcohol Acamprosate does not affect pharmacokinetics of either alcohol or acamprosate alcohol Acamprosate alcohol acamprosate 
 Pharmacokinetic studies ,indicate that administration of disulfiram or diazepam does not affect pharmacokinetics of acamprosate ,disulfiram diazepam acamprosate Co-administration of naltrexone with Acamprosate produced 25 increase AUC 33 increase 
 Cmax of acamprosate naltrexone Acamprosate acamprosate No adjustment of dosage ,is recommended such patients The pharmacokinetics of naltrexone its major metabolite 6-beta-naltrexol were unaffected following co-administration with Acamprosate naltrexone 
 6-beta-naltrexol Acamprosate Other concomitant therapies : In clinical trials safety profile subjects treated with Acamprosate concomitantly with anxiolytics hypnotics sedatives ( including benzodiazepines ) or non-opioid analgesics was similar that of 
 subjects taking placebo with these concomitant medications Acamprosate anxiolytics hypnotics sedatives benzodiazepines non-opioid analgesics Patients taking Acamprosate concomitantly with antidepressants more commonly reported both weight gain weight loss.
 compared with patients taking either medication alone Acamprosate antidepressants. <eod> </s> <eos>"""

prompt = "The combined effects of drugs Abarelix and Testosterone "

In [54]:
# Tokenize padding + prompt without special tokens and keep only the input ids ("pt" tensors).
inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
# Character length of the decoded input; used below to cut the input text off the decoded output.
prompt_length = len(tokenizer.decode(inputs[0]))
# Sampled generation (top_p=0.95, top_k=60) with max_length=700.
# NOTE(review): max_length presumably counts tokens including the input - confirm against the transformers docs.
outputs = model.generate(inputs, max_length=700, do_sample=True, top_p=0.95, top_k=60)
# Keep the decoded text after the input (skipping one further character) and put the bare prompt in front.
generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]

print(generated)

The combined effects of drugs Abarelix and Testosterone inhibit heart rhythm in patients with ischemic heart disease. However, the rate of ischemic heart disease was not significantly affected by both drugs. The percentage of patients who did not feel ischemic heart disease was less than 10%.<eop> "The study was a very small one. The data were very well characterized. Among the patients who did not feel ischemic heart disease on the study we participated in."


In [12]:
# Nodes from the project-local `terms` module; `.values()` is called on the
# result below, so it is presumably a dict of nodes - TODO confirm.
drugbank_nodes = terms.drugbank.get_nodes()
# One to_string() rendering per node, skipping nodes whose get_sentences() equals an empty list.
drugbank_epochs = [ node.to_string() for node in drugbank_nodes.values() if node.get_sentences() != list() ]
# drugbank_epochs

In [61]:
# Padding text: up to the first 20 node strings joined by spaces, followed by the "<eop>" marker.
PADDING_TEXT = list2str(drugbank_epochs[0:20]) + "<eop>"
# NOTE(review): "gluthethimide" may be a misspelling of "glutethimide" - confirm before changing the prompt.
prompt = "The combined effects of vincristine and gluthethimide are"
# print(PADDING_TEXT)

In [63]:
# Reload the pretrained "xlnet-base-cased" model and tokenizer, rebinding the names set by the earlier load.
model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased")
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
# Tokenize padding + prompt without special tokens and keep only the input ids ("pt" tensors).
inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
# Character length of the decoded input; used below to cut the input text off the decoded output.
prompt_length = len(tokenizer.decode(inputs[0]))
# Sampled generation (top_p=0.95, top_k=60) with max_length=2000.
outputs = model.generate(inputs, max_length=2000, do_sample=True, top_p=0.95, top_k=60)
# Keep the decoded text after the input (skipping one further character) and put the bare prompt in front.
# The prompt in use here has no trailing space, so the continuation is glued directly onto it.
generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]
print(generated)

The combined effects of vincristine and gluthethimide arevery weak, which may cause problems. heparin ("h" in the "L"), ""s" in the "I" in the "R" ("e") in the "E" ("L") in the "C"), and "T" ("T") in the "R" ("T"). ""s" in the "L" ("V") in the "S" ("S") in the "D" ("D") ("D") in the "E" ("E") in the "G") ("G") in the "G" (G") ("G") (G") in the "G") ("G") (G") in the "G" (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G") (G"


In [37]:
# (model, tokenizer) pairs already loaded, keyed by checkpoint name, so that
# repeated train_instance calls do not reload the weights every time.
_PRETRAINED_CACHE = {}


def _load_pretrained(checkpoint="xlnet-base-cased"):
    '''Return the (model, tokenizer) pair for `checkpoint`, loading it on first use only.'''
    if checkpoint not in _PRETRAINED_CACHE:
        _PRETRAINED_CACHE[checkpoint] = (
            AutoModelForCausalLM.from_pretrained(checkpoint),
            AutoTokenizer.from_pretrained(checkpoint),
        )
    return _PRETRAINED_CACHE[checkpoint]


def train_instance(PADDING_TEXT, prompt, mlength):
    '''
    Generate one sampled continuation of `prompt` with the pretrained
    "xlnet-base-cased" model. Despite the name, nothing is trained here: the
    model is only loaded and asked to generate.

    @params:
        PADDING_TEXT: context text placed in front of the prompt
        prompt: the text to continue
        mlength: value passed to generate() as max_length

    @returns:
        `prompt` followed by the decoded text that comes after the input
    '''
    # Previously both objects were re-created from the checkpoint on every
    # call, although callers invoke this function inside a loop.
    model, tokenizer = _load_pretrained()

    inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
    # Character length of the decoded input, used to cut it off the output.
    prompt_length = len(tokenizer.decode(inputs[0]))
    outputs = model.generate(inputs, max_length=mlength, do_sample=True, top_p=0.95, top_k=60)
    # Skips the decoded input plus one further character.
    generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]
    return generated

In [44]:
def train_model():
    '''
    Run five generation rounds over a growing, stop-word-filtered context.

    Each round appends the next entry of drugbank_epochs to the context,
    re-filters the whole context with filter_stop, increases the length limit
    by the context's word count and prints the text from train_instance,
    then the limit. The last generated text is printed once more at the end.
    '''
    question = "The combined effects of drugs are "
    context = ""
    length_limit = 100

    for idx in range(5):
        # the slice gives "" rather than an IndexError past the end of the list
        next_epoch = list2str(drugbank_epochs[idx:idx+1])
        context = list2str(filter_stop(context + next_epoch))
        length_limit += len(context.split())
        generated = train_instance(context, question, length_limit)
        for item in (generated, length_limit, "\n\n"):
            print(item)

    print(generated)




In [45]:
# Run the five generation rounds; the printed texts and length limits follow.
train_model()

The combined effects of drugs are a ------- drugs ------------------------------------------------------------------------
128



The combined effects of drugs are very similar compared with their counterpart steroids, which include the pharmacological effects of drug - - - - corticosteroids TNF blocking agents which enhance the pharmacological effects of steroid - - -
206



The combined effects of drugs are usually measured by the level of testosterone in their serum. The highest level of testosterone in patients who received a RA clinical trial and received the most reaction to the Drugs is 9/14/18/19/20/20/21/18/20/20/22/24/20/20/20/25/20/23/25/25/23/25/25/25/25/26/25/25/25/25/25/25/25
304



The combined effects of drugs are described in EA III (Supplemental Table). The complete list of drug effects is presented in Table III (Supplemental Table).<eop><eod> 2. In order to use this software, you may first want to access Windows 9.2.1 or Windows 10.1.2.2. The installation must includ

In [50]:
def train_model2():
    '''
    Run five generation rounds in which each round's output feeds the next.

    The context of a round is the previous round's generated text (empty at
    the start) followed by the next entry of drugbank_epochs. The length
    limit grows by the context's word count each round. Every round prints
    the generated text and then the limit; the last text is printed once
    more at the end.
    '''
    question = "The combined effects of drugs are "
    length_limit = 130
    generated = ""

    for idx in range(5):
        # the slice gives "" rather than an IndexError past the end of the list
        context = generated + list2str(drugbank_epochs[idx:idx+1])
        length_limit += len(context.split())
        generated = train_instance(context, question, length_limit)
        for item in (generated, length_limit, "\n\n"):
            print(item)

    print(generated)

In [51]:
# Run the chained generation rounds; the printed texts and length limits follow.
train_model2()

The combined effects of drugs are always considered to be different. testosterone Your - Drug - laboratory tests are not necessary. testosterone Your - Drug - drug - laboratory tests are not necessary. testosterone Your - Drug - drug - drug - drugs - laboratory tests are not necessary. testosterone Your - Drug - drug - drugs - drug - drug - drug - drugs - drug - drugs - drugs - drug - drugs - drugs - drug - drugs
161



The combined effects of drugs are always considered to be different.<eop><eod> A page in the United States of Canada or New Zealand that has a page in this country or Canada that has a page in Canada or Canada that has a
296



The combined effects of drugs are always considered to be different. abatacept A - drug - The combined effects of drugs are always considered to be different. abatacept A - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - drug - 