In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
from nltk.tokenize import WordPunctTokenizer as wpt
from nltk.tokenize import WhitespaceTokenizer as wst
from nltk.tokenize import RegexpTokenizer
import re
from collections import Counter
from random import shuffle
#from lxml import etree

In [2]:
def get_paths(rootdir):
    # fetches a list of absolute paths, given a dir with xml files
    # BEHÖVS FÖR OPEN_XMLS
    file_paths = []

    for folder, _, files in os.walk(rootdir):
        for filename in files:
            if filename.endswith('xml'):
                file_paths.append(os.path.abspath(os.path.join(folder, filename)))
    return file_paths

In [3]:
allFiles = get_paths("../DDICorpus/")
#allFiles.extend(get_paths("../DDICorpus/Test")) # ska vi ha båda testmapparna???
#allFiles.extend(get_paths("../DDICorpus/Test/Test for DrugNER task"))

In [4]:
allFiles

['/home/gushansad@GU.GU.SE/lt2316-ml/a1/DDICorpus/Test/Test for DrugNER task/MedLine/21878085.xml',
 '/home/gushansad@GU.GU.SE/lt2316-ml/a1/DDICorpus/Test/Test for DrugNER task/MedLine/21751542.xml',
 '/home/gushansad@GU.GU.SE/lt2316-ml/a1/DDICorpus/Test/Test for DrugNER task/MedLine/21722807.xml',
 '/home/gushansad@GU.GU.SE/lt2316-ml/a1/DDICorpus/Test/Test for DrugNER task/MedLine/21745652.xml',
 '/home/gushansad@GU.GU.SE/lt2316-ml/a1/DDICorpus/Test/Test for DrugNER task/MedLine/21761733.xml',
 '/home/gushansad@GU.GU.SE/lt2316-ml/a1/DDICorpus/Test/Test for DrugNER task/MedLine/21751828.xml',
 '/home/gushansad@GU.GU.SE/lt2316-ml/a1/DDICorpus/Test/Test for DrugNER task/MedLine/21775592.xml',
 '/home/gushansad@GU.GU.SE/lt2316-ml/a1/DDICorpus/Test/Test for DrugNER task/MedLine/22016977.xml',
 '/home/gushansad@GU.GU.SE/lt2316-ml/a1/DDICorpus/Test/Test for DrugNER task/MedLine/21789791.xml',
 '/home/gushansad@GU.GU.SE/lt2316-ml/a1/DDICorpus/Test/Test for DrugNER task/MedLine/21843050.xml',


In [5]:
s = "string. With. Punctuation? don't 'hell0o cat's' micro-organisms 'chat'"

In [6]:
def string_to_span(s):
    # creates a tokenized version and a span version of a string
    # BEHÖVS FÖR OPEN_XMLS
    #s = (re.sub(r"[^A-Za-z\s]",'',s)).lower() # removes all non-alphanumerical characters LOL gör inte det
    tokenizer = RegexpTokenizer("[\w'-]+|[^\w\s]+") # tokenizes words and punctuation except hyphens in compound words and apostrophes
    tokenized = tokenizer.tokenize(s.lower())
    span = list(tokenizer.span_tokenize(s)) # gets the pythonic span i e (start, stop_but_not_including)
    new_span = []
    for tpl in span:
        new_span.append((tpl[0], (tpl[1]-1))) # to get non-pythonic span i e (start,last_char)
    return new_span, tokenized

In [15]:
def open_xmls(fileList):
    
    vocab = []
    data_df_list = [] 
    ner_df_list = []
    ent2id = {
        'drug'   : 0,
        'drug_n' : 1,
        'group'  : 2, 
        'brand'  : 3
    }
    
    
    for file in fileList:
        tree = ET.parse(file)
        root = tree.getroot()
        for sentence in root:
            sent_id = sentence.attrib['id']
            sent_txt= sentence.attrib['text']
            char_ids, tokenized = string_to_span(sent_txt)
            unique_w = list(set(tokenized))
            vocab.extend(unique_w)
            for i, word in enumerate(tokenized): # creating data_df_list
                if 'test' in file.lower():
                    split = 'test'
                else:
                    split = 'train/dev'
                word_tpl = (sent_id, word, char_ids[i][0], char_ids[i][1], split) # one row in data_df 
                data_df_list.append(word_tpl)
                
            for entity in sentence: # creating the ner_df_list
                if entity.tag == 'entity':
                    ent_txt = (entity.attrib['text']).lower()
                    ent_type = (entity.attrib['type']).lower()
                    ent_type = ent2id[ent_type]
                    char_offset = entity.attrib['charOffset']
                    char_span = (re.sub(r"[^0-9]+",' ', char_offset)).split(' ')
                    
                    if len(char_span) > 2:
                        char_pairs = (list(zip(char_span[::2], char_span[1::2])))
                        for pair in char_pairs:
                            entity_tpl = (sent_id, ent_type, pair[0], pair[1])
                            ner_df_list.append(entity_tpl)
                    else:
                        ent_start_id, ent_end_id = char_span
                        ent_txt_one = ent_txt    
                        
                        entity_tpl = (sent_id, ent_type, ent_start_id, ent_end_id) 
                        
                        ner_df_list.append(entity_tpl)
                        
    vocab = list(sorted(set(vocab)))
    return vocab, data_df_list, ner_df_list

In [16]:
vocab, data_df_list, ner_df_list  = open_xmls(allFiles)

In [17]:
# tar hela vocabet från open_xmls
def word2int(vocabList):
    #w2i = {} # dictionary mapping words to integers
    #for i, w in enumerate(sorted((vocabList))):
    #    w2i[w] = i # because we want to be able to get integers by giving word as a key
    #return w2i
    
    return {w:i for i,w in enumerate(sorted(vocabList))} # FASTER


In [18]:
# efter data_df_list är en dataframe och efter vocab har blivit w2i
# tar data_df-kolumnen för tokens och dicten från w2i
def get_token_ids(tokensList, vocab2idDict):
    # fetches token id from vocab2id dict
    #token_ids = []
    #for w in tokensList:
    #    print(w)
    #    token_ids.append(vocab2idDict[w])
    #return token_ids
    return [vocab2idDict[w] for w in tokensList] # SNABBARE

In [19]:
data_df = pd.DataFrame(data_df_list, columns=['sentence_id', 'token', 'char_start_id', 'char_end_id', 'split'])
ner_df = pd.DataFrame(ner_df_list, columns=['sentence_id', 'ner_id', 'char_start_id', 'char_end_id']) # ner_id = entity type
w2i = word2int(vocab)
token_ids = get_token_ids(data_df['token'], w2i)
data_df.insert(1, 'token_id', token_ids)
#data_df['token id'] = token_ids
data_df = data_df.drop(columns=['token'])
test_df = data_df.loc[data_df.split == 'test']
traindev_df = data_df.loc[data_df.split != 'test']
dev_len = len(test_df) # size of dev/val set is decided from how big the test set is
train_len = len(traindev_df) - dev_len
traindev_df.drop(columns=['split'])
train_dev = ['train'] * train_len
dev = ['dev'] * dev_len
train_dev.extend(dev)
shuffle(train_dev)
#traindev_df['split'] = shuffled
pd.options.mode.chained_assignment = None
traindev_df.loc[:, 'split'] = train_dev
data_df = traindev_df.append(test_df)

In [20]:
data_df

Unnamed: 0,sentence_id,token_id,char_start_id,char_end_id,split
45372,DDI-MedLine.d12.s0,3462,0,12,dev
45373,DDI-MedLine.d12.s0,1337,14,16,dev
45374,DDI-MedLine.d12.s0,8865,18,24,train
45375,DDI-MedLine.d12.s0,907,26,32,train
45376,DDI-MedLine.d12.s0,9248,34,42,dev
45377,DDI-MedLine.d12.s0,5574,44,50,train
45378,DDI-MedLine.d12.s0,1337,52,54,dev
45379,DDI-MedLine.d12.s0,9482,56,64,train
45380,DDI-MedLine.d12.s0,3911,66,69,train
45381,DDI-MedLine.d12.s0,7388,71,72,train


23

In [151]:
data_ddata_df.sort_values(by=['split'])

TypeError: sort_values() got an unexpected keyword argument 'ignore_index'

Unnamed: 0,sent id,token,char start id,char end id,split,token id
82683,DDI-DrugBank.d489.s9,allergens,103,111,dev,1063
111747,DDI-DrugBank.d558.s35,cyp3a4,150,155,dev,2911
111748,DDI-DrugBank.d558.s35,",",156,156,dev,68
111753,DDI-DrugBank.d558.s35,cyp3a4,187,192,dev,2911
20982,DDI-MedLine.d85.s1,",",138,138,dev,68
111754,DDI-DrugBank.d558.s35,",",193,193,dev,68
111766,DDI-DrugBank.d558.s35,diuretics,271,279,dev,3379
111774,DDI-DrugBank.d558.s35,",",329,329,dev,68
111778,DDI-DrugBank.d558.s35,of,357,358,dev,6656
111784,DDI-DrugBank.d558.s35,and,394,396,dev,1210


In [107]:
{w:i for w, i in count.items() if int(i) <= 10 and if w not in }

{'retinyl': 6,
 'stimulate': 1,
 'egf-': 1,
 'insulin-induced': 2,
 'proliferation': 7,
 'prostatic': 4,
 'epithelium': 4,
 'proliferates': 1,
 'defined': 3,
 'medium': 7,
 'consisting': 2,
 'basal': 7,
 'rpmi1640': 2,
 'transferrin': 2,
 'microgram': 2,
 'egf': 5,
 'iu': 2,
 'modify': 7,
 'mitogenic': 2,
 'stimulated': 6,
 'glucocorticoids': 9,
 'retinoids': 4,
 'regulate': 2,
 'prostate': 2,
 'modification': 7,
 'designs': 4,
 'joint': 6,
 'logistic': 2,
 'regression': 4,
 'models': 8,
 'interest': 3,
 'lies': 2,
 'enhancing': 8,
 'efficacious': 2,
 'limiting': 2,
 'statistical': 3,
 'work': 3,
 'focused': 2,
 'developing': 7,
 'mathematical': 1,
 'functions': 5,
 'dose-response': 3,
 'curves': 5,
 'relatively': 5,
 'regard': 5,
 'designing': 2,
 'assessing': 1,
 'parametric': 1,
 'probability': 1,
 'typically': 1,
 'nonlinear': 4,
 'weighted': 1,
 'squares': 1,
 'approach': 6,
 'purpose': 3,
 'applicable': 3,
 'across': 4,
 'wide': 4,
 'settings': 1,
 'continuous': 4,
 'discrete': 1

In [None]:
(count['drug'] + count['drug_n'] + count['group'] + count['brand']) /(len(all_words)) # obalans

In [None]:
vocab2idd['the']

In [73]:
0.00021696090698242188


0.00021696090698242188

In [None]:
sus = "Drug interaction studies with SUSTIVA and these imidazole and triazole antifungals have not been conducted."

In [None]:
a = ['cat', 'dog', 'hamster', 'monkey', 'banana']
b = ['cat', 'hamster', 'apple', 'pear', 'banana']

In [None]:
set(a) ^ set(b)

In [None]:
import timeit
init = "a = ['cat', 'dog', 'hamster', 'monkey', 'banana']; b = ['cat', 'hamster', 'apple', 'pear', 'banana']"
print(timeit.timeit('list(set(a) - set(b))', init, number = 100000))
print(timeit.timeit('s = set(b);[x for x in a if x not in s]', init, number = 100000))
print(timeit.timeit('set(a) ^ set(b)', init, number = 100000))
print(timeit.timeit('[item for item in a if item not in b]', init, number = 100000))

In [None]:
import csv
with open('eggs.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['Spam'] * 5 + ['Baked Beans'])
    spamwriter.writerow(['Spam', 'Lovely Spam', 'Wonderful Spam'])

In [2]:
abc = "string With. Punctuation: don't HElL0 c4T5."

In [70]:
s = (re.sub(r"[^0-9]+",' ',nums)).split(' ')

In [71]:
s

['11', '5123', '502', '112', '838', '999']

In [53]:
kates = "I shouldn\'t really but, okay I will. I love 'Rocky' and calcium-rich foods -- bananas for example. 'jajjaj."

In [68]:
k = (re.sub(r"\w+|[^\w\s]+",' ',kates)).lower()

In [61]:
tokenizer = RegexpTokenizer("[0-9]+")
tokenizer.tokenize(nums)

In [58]:
nums = "11-5123;502-112;838-999"

In [56]:
tokenizer = RegexpTokenizer("[\w'\w]+|[^\w\s]+")

In [57]:
tok = tokenizer.tokenize(kates)

In [58]:
tok

['I',
 "shouldn't",
 'really',
 'but',
 ',',
 'okay',
 'I',
 'will',
 '.',
 'I',
 'love',
 "'Rocky'",
 'and',
 'calcium',
 '-',
 'rich',
 'foods',
 '--',
 'bananas',
 'for',
 'example',
 '.',
 "'jajjaj",
 '.']

In [9]:
tok1 = ['85', '92', '103', '111', '146', '153']

In [34]:
toktok = "thiazide diuretics paraply"

In [92]:
idx = int(tok1[1]) - int(tok1[0])

In [35]:
len(toktok)

26

In [36]:
for item in lol:
    print(int(item[1])- int(item[0]))

7
8
7


In [37]:
len_w = [int(tpl[1])-int(tpl[0]) for tpl in lol]

In [65]:
ltoktok = list(toktok)
def get_word_span(charOffsetList, entTxt, sentId, entType):
    
    ners = []
    span_pairs = list(zip(charOffsetList[::2], charOffsetList[1::2]))
    ent_txt = list(entTxt)
    w_len = [int(tpl[1])-int(tpl[0]) for tpl in span_pairs] # len of each word in entity
    
    for i, n in enumerate(w_len):

        ners.append((''.join((ent_txt[:n+1])), span_pairs[i][0], span_pairs[i][1]))
        #print(lol[i])
        #print(ltoktok)
        del ent_txt[:n+2]
    return ners

In [67]:
a = get_word_span(tok1, toktok)

In [69]:
a.extend(get_word_span(tok1, toktok))

In [70]:
a

[('thiazide', '85', '92'),
 ('diuretics', '103', '111'),
 ('paraply', '146', '153'),
 ('thiazide', '85', '92'),
 ('diuretics', '103', '111'),
 ('paraply', '146', '153')]

In [60]:
lol = list(zip(tok1[::2], tok1[1::2]))

In [61]:
lol

[('85', '92'), ('103', '111'), ('146', '153')]

In [81]:
tokenizer.tokenize(kates)

['I',
 "shouldn't",
 'really',
 'but',
 ',',
 'okay',
 'I',
 'will',
 '.',
 'I',
 'love',
 "'Rocky'",
 'and',
 'calcium-rich',
 'foods',
 '--',
 'bananas',
 'for',
 'example',
 '.']

In [None]:
zxc = string_to_span(s)

In [None]:
dfObj = pd.DataFrame(zxc)

In [None]:
dfObj

In [None]:
z = "The concurrent use of Robinul Injection with other anticholinergics or medications with anticholinergic activity, such as phenothiazines, antiparkinson drugs, or tricyclic antidepressants, may intensify the antimuscarinic effects and may result in an increase in anticholinergic side effects."

In [None]:
for i, x in enumerate(z): 
    print(i, x)

In [None]:
# YOU DON'T NEED THIS 
# list(spans_to_relative(wst().span_tokenize(s))) # how long the words are

In [35]:
import string

account_number = input("Enter the account number to which you want to transfer money: ")
value = input("Enter the amount you would like to transfer: ")

final_account_number = account_number.translate(str.maketrans('', '', string.punctuation)
)
final_value = value.translate(str.maketrans('', '', string.punctuation)
)

print(final_account_number)
print(final_value)

Enter the account number to which you want to transfer money: a
Enter the amount you would like to transfer: a
a
a


In [40]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [37]:
stringy = "'string. With. Punctuation? don't 'hell0o cat's' micro-organisms 'chat'"

In [63]:
zup, xup = string_to_span(s)

In [65]:
[x for x in xup if len(regex.findall(x)) > 1 or len(regex.findall(x)) == 0]

['string',
 '.',
 'with',
 '.',
 'punctuation',
 "cat's'",
 'micro-organisms',
 "'chat'"]

In [73]:
regex = re.compile("[@_!#$%^&*()<>?/\|}{~:\']+") 

In [48]:
regex.findall(stringy)

["'", '?', "'", "'", "'", "'", "'", "'"]

In [57]:
wpt().tokenize(stringy)

["'",
 'string',
 '.',
 'With',
 '.',
 'Punctuation',
 '?',
 'don',
 "'",
 't',
 "'",
 'hell0o',
 'cat',
 "'",
 's',
 "'",
 'micro',
 '-',
 'organisms',
 "'",
 'chat',
 "'"]

In [62]:
data_df[data_df['token'] != 'and']

Unnamed: 0,sent id,token,char start id,char end id,split
0,DDI-MedLine.d12.s0,dexamethasone,0,12,train/dev
2,DDI-MedLine.d12.s0,retinyl,18,24,train/dev
3,DDI-MedLine.d12.s0,acetate,26,32,train/dev
4,DDI-MedLine.d12.s0,similarly,34,42,train/dev
5,DDI-MedLine.d12.s0,inhibit,44,50,train/dev
7,DDI-MedLine.d12.s0,stimulate,56,64,train/dev
8,DDI-MedLine.d12.s0,egf-,66,69,train/dev
9,DDI-MedLine.d12.s0,or,71,72,train/dev
10,DDI-MedLine.d12.s0,insulin-induced,74,88,train/dev
11,DDI-MedLine.d12.s0,proliferation,90,102,train/dev


In [74]:
import time
start = time.time()
tokenizer = RegexpTokenizer("[0-9]+")
tokenizer.tokenize(nums)
end = time.time()
print(end - start)

0.00025153160095214844
