In [1]:
#export needed libraries
import xml
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re
import os
from tqdm import tnrange
from oger.ctrl.router import Router, PipelineServer
import os
import io
import sys
import gzip

In [2]:
def split_xml(name): #split xml for small chunks to annotate
    context = ET.iterparse(name, events=('end', ))
    folder = str(name).replace('.xml', '').replace('data/', '')
    os.mkdir(folder)
    os.mkdir(folder +'/xml')
    index = 0
    for event, elem in context:
        if elem.tag == 'PubmedArticle':
            index += 1
            filename = format(str(index) + ".xml")
            with open(folder + '/xml/' + filename, 'wb') as f:
                f.write(ET.tostring(elem))            
    return folder

In [3]:
def get_article_info(fname):
    file = ET.parse(str(fname)).getroot()
    year = []
    for row in file.iter('Year'): #year completed
        year.append(row.text)
    pmid = []
    for row in file.iter('PMID'): #pubmed id
        pmid.append(row.text)
    jt = []
    for row in file.iter('Title'): #journal title
        jt.append(row.text)
    at = []
    for row in file.iter('ArticleTitle'): #article title    
        at.append(row.text)
    
    text_list = []
    for row in file.iter('Abstract'): #abstract info
        for text_fragment in row.itertext():
            text_list.append(text_fragment)
    if len(text_list) > 0:
        labstr = ''.join(text_list).strip()
    else:
        labstr = 'nan'
      
    authors = []
    lname = [] #LastName
    fname = [] #ForeName
    for row in file.iter('LastName'): #last name
        lname.append(row.text)
    for row in file.iter('ForeName'): #first name
        fname.append(row.text)
    
    num = min(len(lname), len(fname))
    for i in range(num):
        author = str(lname[i]) + ' ' + str(fname[i])
        authors.append(author)
    lauth = ', '.join(authors)
    #get mails
    maill = []
    uni = []
    for row in file.iter('Author'): 
        auth = []
        for r in row.iter('LastName'):
            auth.append(r.text)
        for r in row.iter('ForeName'):
            auth.append(r.text)

        with_mails = []
        for r in row.iter('Affiliation'):
            with_mails.append(r.text)
            uni.append(str(' '.join(auth) + ' (' + r.text + ')'))
            
        mail_list = []
        if len(with_mails) > 0:
            for i in range(len(with_mails)):
                look = re.findall('@', str(with_mails[i]))
                if len(look) > 0:
                    #print(with_mails[i])
                    look2 = with_mails[i]
                    mail = re.findall('\S{,100}@\S{,100}', look2)[0]
                    mail_list.append(mail)   
        if len(mail_list) > 0:
            au = str(mail_list[0]) + str(' (' + ' '.join(auth) + ')')
        else:
            au = 'nan'
        if au != 'nan':
            maill.append(au)        
    if len(maill) > 0:
        maill = ', '.join(maill)  
    else:
        maill = 'nan'
    uni = ', '.join(uni)
    new_row = {'Year': [year[1]], 'PMID': [pmid[0]], 'JTitle': [jt[0]], 'ATitle': [at[0]],
               'Abstract': [labstr], 'Authors': [lauth], 'EMails': [maill], 'Affilation': [uni]}
    nr = pd.DataFrame(new_row) 
    return nr

In [5]:
folder = split_xml('data/pubmed20n1017.xml') #path to needed xml file
files = os.listdir(path = str(folder) + str('/xml')) #list of files with needed format

In [6]:
#here we get dataframe for future processing
df = pd.DataFrame()
for i in tnrange(len(files)):
    el = files[i]    
    fname = str(folder) + '/xml/' + str(el)
    if str(type(fname)) == '<class \'str\'>':
        row = get_article_info(fname)
        df = df.append(row)
        df.index = np.arange(len(df))

HBox(children=(IntProgress(value=0, max=29999), HTML(value='')))




In [7]:
#getting file to pubtator format for annotation

def to_pubtator_format(df):
    num = 1
    num2 = 0
    if not os.path.exists(folder +'/annotate'): # создаем ее, если не существует
        os.makedirs(folder +'/annotate')
    for i in range(len(df)):
        if num > num2:
            with open(folder + '/annotate/test_' + str(num) + '.PubTator', 'w+') as f:
                f.close()
            num2 += 1

        s1 = str('\n' + str(df.loc[i, 'PMID']) + '|t|' + str(df.loc[i, 'ATitle']).replace('\n', ''))
        s2 = str('\n' + str(df.loc[i, 'PMID']) + '|a|' + str(df.loc[i, 'Abstract']).replace('\n', '') + '\n\n')

        with open(folder + '/annotate/test_' + str(num) + '.PubTator', 'a') as f:
            f.write(s1)
            f.write(s2)
            f.close()

        if i % 1000 == 0:
            num += 1
    return('Convering to PubTator format is done')

In [8]:
def pubtator_annotator(folder):
    input_path = folder + '/annotate/' 
    output_path = folder + '/annotate_res/'           
    if not os.path.exists(output_path): 
        os.makedirs(output_path)

    fls = os.listdir(input_path) 
    fls = [row for row in fls if '.PubTator' in row] 
    print(fls)
    terms_list = 'needed_info/medchem_terms_norm_v2.txt' # path to file with drug discovery terms
    conf = Router(termlist_path=terms_list) # load terms
    pl = PipelineServer(conf) # create annotator
    
    # Going in cycle through files
    for k,fl in enumerate(fls):
        annotations = []
        print('started %d file. Total files: %d'%(k+1,len(fls)))

        doc = pl.load_one(input_path+fl, 'pubtator') # loading the file to annotator
        pl.process(doc) # annotation
        for row in doc: # going through each article
            for entity in row.iter_entities(): # through annotation from article
                annotations.append([row.id_, entity.info[3], entity.info[1]]) # [id article, id term, name term]

        #write the result into file (separator - "\t")
        with io.open(output_path+fl,'w',encoding = 'utf-8') as f:
            for row in annotations:
                f.write('\t'.join(row)+'\n')
                
    return('Checked')

In [9]:
def get_keys(folder):  
    fls = os.listdir(path = folder + '/annotate_res')
    fls = [row for row in fls if '.PubTator' in row]
    
    for i in range(len(fls)):
        file = fls[i]
        with open(folder +'/annotate_res/'+file, 'r') as f:
            lines = f.read()
        lines = lines.split('\n')
    
    keys = pd.DataFrame()
    for i in range(len(lines)):
        elem = lines[i]
        elem = elem.split('\t')
        #print(elem)
        if len(elem) == 3:
            pid = elem[0]
            nw = elem[2]
            rw = {'PMID': [pid], 'Key': [nw]}
            ow = pd.DataFrame(rw)
            keys = keys.append(ow)
            keys.index = np.arange(len(keys))

    valid_ids = []
    for i in range(len(keys)):
        if keys.loc[i, 'PMID'] not in valid_ids:
            valid_ids.append(keys.loc[i, 'PMID'])

    unik = pd.DataFrame()
    for i in range(len(valid_ids)):
        vid = valid_ids[i]
        q = keys[keys['PMID'] == vid]
        q.index = np.arange(len(q))
        keyss = str()
        for i in range(len(q)):
            if keyss == '':
                keyss = q.loc[i, 'Key']
            else:
                keyss = keyss + ', ' + q.loc[i, 'Key']
        row = {'PMID': [vid], 'Keys': [keyss]}
        row = pd.DataFrame(row)
        unik = unik.append(row)
        unik.index = np.arange(len(unik))

    return unik

In [10]:
to_pubtator_format(df)
pubtator_annotator(folder)
unik = get_keys(folder)

['test_10.PubTator', 'test_11.PubTator', 'test_24.PubTator', 'test_25.PubTator', 'test_4.PubTator', 'test_5.PubTator', 'test_9.PubTator', 'test_8.PubTator', 'test_29.PubTator', 'test_28.PubTator', 'test_17.PubTator', 'test_16.PubTator', 'test_23.PubTator', 'test_22.PubTator', 'test_3.PubTator', 'test_2.PubTator', 'test_30.PubTator', 'test_31.PubTator', 'test_27.PubTator', 'test_26.PubTator', 'test_7.PubTator', 'test_6.PubTator', 'test_19.PubTator', 'test_18.PubTator', 'test_13.PubTator', 'test_12.PubTator', 'test_20.PubTator', 'test_21.PubTator', 'test_1.PubTator', 'test_14.PubTator', 'test_15.PubTator']
started 1 file. Total files: 31
started 2 file. Total files: 31
started 3 file. Total files: 31
started 4 file. Total files: 31
started 5 file. Total files: 31
started 6 file. Total files: 31
started 7 file. Total files: 31
started 8 file. Total files: 31
started 9 file. Total files: 31
started 10 file. Total files: 31
started 11 file. Total files: 31
started 12 file. Total files: 31
s

In [11]:
df['Valid'] = 0
df['Keys'] = 'NaN'
for i in tnrange(len(unik)):
    if len(df[df['PMID'] == unik.loc[i, 'PMID']]) != 0:
        for j in range(len(df)):
            if str(df.loc[j, 'PMID']) == str(unik.loc[i, 'PMID']):
                df.at[j, 'Valid'] = 1
                df.at[j, 'Keys'] = str(unik.loc[i, 'Keys'])

HBox(children=(IntProgress(value=0, max=283), HTML(value='')))




In [12]:
df = df.fillna('nan')
res1 = df['Valid'] == 1
res2 = df['EMails'] != 'nan'
f_df = df[res1 & res2]
f_df.to_csv(folder + '/' + str(folder) + '_processed.csv')