In [None]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
from nltk.tokenize import WordPunctTokenizer as wpt
from nltk.tokenize import WhitespaceTokenizer as wst
from nltk.tokenize import RegexpTokenizer
import re
from collections import Counter
from random import shuffle
#from lxml import etree

In [None]:
def get_paths(rootdir):
    """Collect the absolute paths of all XML files below ``rootdir``.

    The directory tree is walked recursively with ``os.walk``. Needed by
    ``open_xmls``, which expects a list of file paths.

    Args:
        rootdir: Directory to search. A non-existent directory yields an
            empty list (``os.walk`` simply produces nothing).

    Returns:
        list[str]: Absolute paths of every file whose name ends in ``.xml``.
    """
    file_paths = []

    for folder, _, files in os.walk(rootdir):
        for filename in files:
            # Test the real extension; a bare 'xml' suffix would also accept
            # names such as "fakexml" that are not XML files.
            if filename.endswith('.xml'):
                file_paths.append(os.path.abspath(os.path.join(folder, filename)))
    return file_paths

In [None]:
# Collect every XML file below the corpus root (get_paths walks sub-directories).
allFiles = get_paths("../DDICorpus/")
#allFiles.extend(get_paths("../DDICorpus/Test")) # should we include both test folders???
#allFiles.extend(get_paths("../DDICorpus/Test/Test for DrugNER task"))

In [None]:
allFiles

In [None]:
s = "string. With. Punctuation? don't 'hell0o cat's' micro-organisms 'chat'"

In [None]:
def string_to_span(s):
    """Tokenize ``s`` and return inclusive character spans plus the tokens.

    Needed by ``open_xmls``.

    The string is split on whitespace, ':' and ';' (a token is a maximal run
    of characters that are none of those). When a token ends in one of the
    characters ``-,.?!:;`` that punctuation is stripped from both ends of the
    token and the span is narrowed by the same amount, so span and token
    always describe the same characters of ``s``. A token consisting only of
    punctuation (e.g. ``--``) is kept unchanged rather than turned into an
    empty token.

    Args:
        s: The sentence text.

    Returns:
        tuple[list[tuple[int, int]], list[str]]: ``(spans, tokens)`` where
        ``spans[i]`` is ``(start, last_char)`` — both ends inclusive — of
        ``tokens[i]`` in the original string, and tokens are lower-cased.
    """
    punctuation = "-,.?!:;"
    spans = []
    tokens = []

    # Tokens and spans are taken from the same match object so the two lists
    # cannot drift apart.
    for match in re.finditer(r"[^\s:;]+", s):
        word = match.group()
        start, stop = match.span()  # pythonic span: stop is exclusive

        if word[-1] in punctuation:
            stripped = word.strip(punctuation)
            if stripped:
                # Narrow the span by exactly the characters that were removed.
                start += len(word) - len(word.lstrip(punctuation))
                stop -= len(word) - len(word.rstrip(punctuation))
                word = stripped

        # Convert to the non-pythonic (start, last_char) form.
        spans.append((start, stop - 1))
        # Lower-case after measuring, so offsets refer to the original text.
        tokens.append(word.lower())

    return spans, tokens

In [None]:
def open_xmls(fileList):
    """Parse corpus XML files into a vocabulary, token rows and entity rows.

    Every child of a file's root element is treated as a sentence carrying
    ``id`` and ``text`` attributes; its ``entity`` children carry ``type``
    and ``charOffset`` attributes.

    Args:
        fileList: Iterable of paths to XML files.

    Returns:
        tuple: ``(vocab, data_df_list, ner_df_list)`` where

        * ``vocab`` is the sorted list of distinct tokens,
        * ``data_df_list`` holds one tuple per token:
          ``(sentence_id, token, char_start, char_end, split)``,
        * ``ner_df_list`` holds one tuple per entity span:
          ``(sentence_id, entity_type_id, char_start, char_end)``.

    Raises:
        KeyError: If an entity has a type that is not in the type mapping.
        ValueError: If a ``charOffset`` does not contain an even, non-zero
            number of integers.
    """
    ent2id = {
        'drug'   : 0,
        'drug_n' : 1,
        'group'  : 2,
        'brand'  : 3
    }

    vocab = set()
    data_df_list = []
    ner_df_list = []

    for file in fileList:
        # The split depends only on the path, so decide it once per file.
        # NOTE: this is a substring test on the whole path, so any directory
        # name containing "test" marks the file as test data.
        split = 'test' if 'test' in file.lower() else 'train/dev'

        root = ET.parse(file).getroot()
        for sentence in root:
            sent_id = sentence.attrib['id']
            sent_txt = sentence.attrib['text']
            char_ids, tokenized = string_to_span(sent_txt)
            vocab.update(tokenized)

            # One row of data_df per token.
            for (char_start, char_end), word in zip(char_ids, tokenized):
                data_df_list.append((sent_id, word, int(char_start), int(char_end), split))

            # One row of ner_df per entity span.
            for entity in sentence:
                if entity.tag != 'entity':
                    continue
                ent_type = ent2id[entity.attrib['type'].lower()]
                char_offset = entity.attrib['charOffset']

                # A discontinuous entity lists several "start-end" pairs, so
                # pull out all integers and pair them up two by two.
                offsets = re.findall(r"[0-9]+", char_offset)
                if not offsets or len(offsets) % 2:
                    raise ValueError(
                        f"malformed charOffset {char_offset!r} in sentence {sent_id}"
                    )
                for ent_start_id, ent_end_id in zip(offsets[::2], offsets[1::2]):
                    ner_df_list.append((sent_id, ent_type, int(ent_start_id), int(ent_end_id)))

    return sorted(vocab), data_df_list, ner_df_list

In [None]:
vocab, data_df_list, ner_df_list  = open_xmls(allFiles)

In [None]:
# Takes the whole vocabulary returned by open_xmls.
def word2int(vocabList):
    """Build a word -> integer id mapping from a vocabulary.

    Ids follow the sorted order of the distinct words, so they are always
    the contiguous range ``0 .. n-1``.

    Args:
        vocabList: Iterable of words; duplicates are allowed.

    Returns:
        dict: Maps each distinct word to its id.
    """
    # Deduplicate first: enumerating a list with repeated words would skip
    # ids, because a later duplicate overwrites the earlier entry.
    return {w: i for i, w in enumerate(sorted(set(vocabList)))}


In [None]:
# Used once data_df_list has become a DataFrame and the vocab has become w2i.
# Takes the token column of data_df and the dict produced by word2int.
def get_token_ids(tokensList, vocab2idDict):
    """Look up the integer id of every token.

    Args:
        tokensList: Iterable of tokens.
        vocab2idDict: Mapping from token to id.

    Returns:
        list: The ids, in the same order as the tokens.

    Raises:
        KeyError: If a token is missing from ``vocab2idDict``.
    """
    token_ids = []
    for token in tokensList:
        token_ids.append(vocab2idDict[token])
    return token_ids

In [None]:
# Build the token table and the entity table from the parsed rows.
data_df = pd.DataFrame(data_df_list, columns=['sentence_id', 'token', 'char_start_id', 'char_end_id', 'split'])
ner_df = pd.DataFrame(ner_df_list, columns=['sentence_id', 'ner_id', 'char_start_id', 'char_end_id']) # ner_id = entity type

# Replace the token strings by their integer ids.
w2i = word2int(vocab)
token_ids = get_token_ids(data_df['token'], w2i)
data_df.insert(1, 'token_id', token_ids)
data_df = data_df.drop(columns=['token'])

test_df = data_df.loc[data_df.split == 'test']
# Take an explicit copy so the split column can be overwritten without
# touching data_df and without silencing pandas' chained-assignment warning.
traindev_df = data_df.loc[data_df.split != 'test'].copy()

# The split is drawn per sentence, not per token row: all tokens of a
# sentence must land in the same split. The dev set gets as many sentences
# as the test set (capped at the number of available sentences).
traindev_sentences = list(traindev_df.sentence_id.unique())
dev_len = min(test_df.sentence_id.nunique(), len(traindev_sentences))
train_len = len(traindev_sentences) - dev_len
train_dev = ['train'] * train_len
dev = ['dev'] * dev_len
train_dev.extend(dev)
# NOTE: no random seed is set here, so the train/dev assignment differs between runs.
shuffle(train_dev)
sentence_split = dict(zip(traindev_sentences, train_dev))
traindev_df['split'] = traindev_df.sentence_id.map(sentence_split)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
data_df = pd.concat([traindev_df, test_df]).reset_index(drop=True)

In [None]:
# Number of rows per sentence_id in data_df, i.e. the token length of every sentence.
sent_dict = Counter(data_df['sentence_id'].tolist())

In [None]:
max_sample_length = max(sent_dict.values())

In [None]:
# Rebuild every sentence as the list of its token ids, in order of first
# appearance in data_df. A single groupby pass replaces one boolean filter over
# the whole frame per sentence.
sentences = (
    data_df
    .groupby('sentence_id', sort=False)['token_id']
    .apply(list)
    .tolist()
)

In [None]:
sent_dict = Counter(list(data_df.sentence_id))

In [None]:
import operator
max(sent_dict.items(), key=operator.itemgetter(1))

In [None]:
max(sent_dict.values())

In [None]:
train_no = (traindev_df.sentence_id.nunique()) - (test_df.sentence_id.nunique())

In [None]:
data_no = data_df.sentence_id.nunique()

In [None]:
test_no = test_df.sentence_id.nunique()

In [None]:
import string
string.punctuation

In [None]:
(count['drug'] + count['drug_n'] + count['group'] + count['brand']) /(len(all_words)) # obalans

In [None]:
# Show the rows ordered by split label; sort_values returns a new frame and
# leaves data_df itself unchanged.
data_df.sort_values(by=['split'])

In [None]:
vocab2idd['the']

In [None]:
0.00021696090698242188


In [None]:
sus = "Drug interaction studies with SUSTIVA and these imidazole and triazole antifungals have not been conducted."

In [None]:
a = ['cat', 'dog', 'hamster', 'monkey', 'banana']
b = ['cat', 'hamster', 'apple', 'pear', 'banana']

In [None]:
set(a) ^ set(b)

In [None]:
import timeit
# Setup statement run before each timed snippet: defines the two sample lists.
init = "a = ['cat', 'dog', 'hamster', 'monkey', 'banana']; b = ['cat', 'hamster', 'apple', 'pear', 'banana']"
# Each line prints the total time in seconds for 100000 executions of one snippet.
# Set difference: elements of a that are not in b (as a list built from a set).
print(timeit.timeit('list(set(a) - set(b))', init, number = 100000))
# Same difference, keeping the order of a; membership is tested against a set.
print(timeit.timeit('s = set(b);[x for x in a if x not in s]', init, number = 100000))
# NOTE: `^` is the symmetric difference, so this snippet computes a different
# result than the other three (it also includes elements only found in b).
print(timeit.timeit('set(a) ^ set(b)', init, number = 100000))
# Same as the second snippet, but membership is tested against the list b.
print(timeit.timeit('[item for item in a if item not in b]', init, number = 100000))

In [None]:
abc = "string With. Punctuation: don't HElL0 c4T5."

In [None]:
s = (re.sub(r"[^0-9]+",' ',nums)).split(' ')

In [None]:
s

In [None]:
kates = "I shouldn\'t really but, okay I will. I love 'Rocky' and calcium-rich foods -- bananas for example. 'jajjaj."

In [None]:
k = (re.sub(r"\w+|[^\w\s]+",' ',kates)).lower()

In [None]:
tokenizer = RegexpTokenizer("[0-9]+")
tokenizer.tokenize(nums)

In [None]:
nums = "11-5123;502-112;838-999"

In [None]:
tokenizer = RegexpTokenizer("[\w'\w]+|[^\w\s]+")

In [None]:
tok = tokenizer.tokenize(kates)

In [None]:
tok

In [None]:
tok1 = ['85', '92', '103', '111', '146', '153']

In [None]:
toktok = "thiazide diuretics paraply"

In [None]:
idx = int(tok1[1]) - int(tok1[0])

In [None]:
len(toktok)

In [None]:
for item in lol:
    print(int(item[1])- int(item[0]))

In [None]:
len_w = [int(tpl[1])-int(tpl[0]) for tpl in lol]

In [None]:
ltoktok = list(toktok)
def get_word_span(charOffsetList, entTxt, sentId=None, entType=None):
    """Split an entity text into its parts and pair each with its offsets.

    ``charOffsetList`` is read as consecutive ``(start, end)`` pairs with
    inclusive ends, so a part is ``end - start + 1`` characters long. The
    parts are assumed to be separated by exactly one character in
    ``entTxt`` (one character is skipped after every part).

    Args:
        charOffsetList: Flat sequence ``[start0, end0, start1, end1, ...]``
            of values accepted by ``int()``.
        entTxt: The full entity text.
        sentId: Unused; optional so the function can be called with only the
            offsets and the text.
        entType: Unused; optional for the same reason.

    Returns:
        list[tuple]: One ``(part_text, start, end)`` tuple per offset pair;
        ``start`` and ``end`` are returned exactly as given in
        ``charOffsetList``.
    """
    span_pairs = list(zip(charOffsetList[::2], charOffsetList[1::2]))

    ners = []
    position = 0  # index in entTxt where the next part begins
    for start, end in span_pairs:
        length = int(end) - int(start) + 1  # offsets are inclusive
        ners.append((entTxt[position:position + length], start, end))
        position += length + 1  # skip the part and the separator after it
    return ners

In [None]:
a = get_word_span(tok1, toktok)

In [None]:
a.extend(get_word_span(tok1, toktok))

In [None]:
a

In [None]:
lol = list(zip(tok1[::2], tok1[1::2]))

In [None]:
lol

In [None]:
tokenizer.tokenize(kates)

In [None]:
zxc = string_to_span(s)

In [None]:
dfObj = pd.DataFrame(zxc)

In [None]:
dfObj

In [None]:
z = "The concurrent use of Robinul Injection with other anticholinergics or medications with anticholinergic activity, such as phenothiazines, antiparkinson drugs, or tricyclic antidepressants, may intensify the antimuscarinic effects and may result in an increase in anticholinergic side effects."

In [None]:
for i, x in enumerate(z): 
    print(i, x)

In [None]:
# YOU DON'T NEED THIS 
# list(spans_to_relative(wst().span_tokenize(s))) # how long the words are

In [None]:
string.punctuation

In [None]:
stringy = "string. With. Punctuation? don't 'hell0o cat's micro-organisms 'chat' 1-methyl-4-phenyl-1,2,3,6-tetrahydropyridine punctu:aaaaation"

In [None]:
s

In [None]:
tokenizer = RegexpTokenizer("[\w,'-]+") # tokenizes words and punctuation except hyphens in compound words and apostrophes
tokenized = tokenizer.tokenize(stringy.lower())

In [None]:
tokenized

In [None]:
list(tokenizer.span_tokenize(stringy.lower()))

In [None]:
punctuation = "-,.?!:;"
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
tok = RegexpTokenizer(r"\s|:|;", gaps=True)
x = tok.tokenize(stringy.lower())
# Keep the stripped tokens instead of discarding the comprehension's result;
# the loop variable is named `word` so it does not reuse the `string` module's name.
x_stripped = [word.strip(punctuation) if word[-1] in punctuation else word for word in x]
list(tok.span_tokenize(stringy))

In [None]:
list(wst().span_tokenize(stringy))

In [None]:
data_df[data_df['token'] != 'and']

In [None]:
import time
# perf_counter is the clock intended for measuring short elapsed intervals;
# time.time() is wall-clock time and may have coarse resolution or be adjusted.
start = time.perf_counter()
max(sent_dict.values())
end = time.perf_counter()
print(end - start)

In [None]:
0.004487276077270508

In [None]:
xcv = "Other compounds that are substrates of CYP3A4 may have decreased plasma concentrations when coadministered with SUSTIVA (efavirenz)."

In [None]:
xcv[112:119]

In [None]:
    tokenizer = RegexpTokenizer("[\w'-]+|[^\w\s]+") # tokenizes words and punctuation except hyphens in compound words and apostrophes
    tokenized = tokenizer.tokenize(xcv.lower())
    span = list(tokenizer.span_tokenize(xcv))

In [None]:
wpt