In [5]:
import csv
import glob
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import dask.dataframe as dd
import multiprocessing as mp
import xml.etree.ElementTree as ET

In [44]:
def _author_display_name(author):
    """Return ``'<first> <last>'`` for a single ``Author`` element.

    Name parts are looked up by tag instead of by position, because the
    second child of an author is not always the first name (it can be
    initials, a suffix, an identifier or affiliation info).

    * last name: ``LastName``, else ``CollectiveName`` (group authors);
      the string ``'None'`` when neither carries text.
    * first name: ``ForeName``, else ``Initials``, else ``'-'``.
    """
    last_el = author.find('LastName')
    if last_el is None:
        # Consortium / group authors carry a CollectiveName instead.
        last_el = author.find('CollectiveName')

    first_el = author.find('ForeName')
    if first_el is None:
        first_el = author.find('Initials')

    lastname = last_el.text if last_el is not None else None
    # Not all authors have their first name listed.
    firstname = first_el.text if first_el is not None else '-'
    return str(firstname) + ' ' + str(lastname)


def article_author(myroot):
    """Build a long-format author/paper table from a parsed PubMed XML root.

    Can be used to create a bipartite author-paper network.

    Parameters
    ----------
    myroot : xml.etree.ElementTree.Element
        Root element whose children are article records. The first child
        of each record is treated as the citation, and the first child of
        the citation as the PubMed id.

    Returns
    -------
    pandas.DataFrame
        One row per (paper, author) pair; column ``0`` is the PubMed id
        and column ``1`` the author's full name. Papers without an
        ``AuthorList`` produce no rows.
    """
    pubmed_id = []
    full_name = []

    # First loop over every article record in the xml file.
    for record in myroot:
        if len(record) == 0:
            # Nothing to read from an empty record; skip instead of raising.
            continue
        citation = record[0]
        for article in citation:
            if article.tag != 'Article':
                continue
            for author_list in article:
                if author_list.tag != 'AuthorList':
                    continue
                # One output row per author of this paper.
                for author in author_list:
                    pubmed_id.append(citation[0].text)
                    full_name.append(_author_display_name(author))

    # Turn both lists into a two-column dataframe.
    db = pd.DataFrame((pubmed_id, full_name)).T

    return db

In [None]:
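# Draft (disabled): code to add the publish date. The `elif` below presumably extends the `second_child.tag` checks inside article_author; it is not wired in yet.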
    # elif second_child.tag == 'Journal':
    #     for third_child in second_child[1]:
    #         if third_child.tag == 'PubDate':
    #             year = third_child[0].text
    #             try:
    #                 month = third_child[1].text
    #             except:
    #                 month = ''
    #             try:
    #                 day = third_child[2].text
    #             except:
    #                 day = ''

    #             publishDate = year + ' ' + month + ' ' + day
    #             publish_date.append(publishDate)

In [50]:
def citation_graph(myroot):
    """Build a citing-paper / cited-paper edge list from a parsed PubMed XML root.

    Can be used to create a citation graph/network.

    Only papers that cite at least one identifiable reference appear in
    the result; papers without references are left out.

    Parameters
    ----------
    myroot : xml.etree.ElementTree.Element
        Root element whose children are article records. The first child
        of a record is treated as the citation (its first child being the
        PubMed id) and the second child as the container that holds any
        ``ReferenceList`` elements.

    Returns
    -------
    pandas.DataFrame
        One row per reference; column ``0`` is the citing PubMed id and
        column ``1`` the id of the cited work. References without any
        ``ArticleId`` are skipped because they cannot form an edge.
    """
    pubmed_id = []
    references = []

    # First loop over every article record in the xml file.
    for record in myroot:
        # Records without a second child carry no reference lists, and a
        # citation without children has no id to attribute edges to.
        if len(record) < 2 or len(record[0]) == 0:
            continue
        citing_id = record[0][0].text

        for ref_list in record[1]:
            if ref_list.tag != 'ReferenceList':
                continue
            # iter() also reaches references inside nested reference lists
            # and ignores non-Reference children such as a list title.
            for reference in ref_list.iter('Reference'):
                # Prefer the PubMed id; fall back to the first id listed.
                cited = reference.find("ArticleIdList/ArticleId[@IdType='pubmed']")
                if cited is None:
                    cited = reference.find('ArticleIdList/ArticleId')
                if cited is None:
                    continue
                pubmed_id.append(citing_id)
                references.append(cited.text)

    # Combine the lists into a two-column dataframe.
    ref_db = pd.DataFrame((pubmed_id, references)).T

    return ref_db

In [6]:
# Directory that holds the PubMed XML dumps.
path = '/data/dataprocessing/NCBI/PubMed/'
# glob returns matches in arbitrary order; sort them so that positional
# picks such as files[70] refer to the same file on every run.
files = sorted(glob.glob(path + '*.xml'))

In [7]:
# Parse a single XML file (index 70 of the globbed list) and keep its root
# element; the cells below pass `myroot` to the extraction functions.
mytree = ET.parse(files[70])
myroot = mytree.getroot()

In [14]:
# Author/paper table for the parsed file (one row per author of each paper).
df = article_author(myroot)

In [51]:
# Paper/reference table for the parsed file (one row per reference).
df2 = citation_graph(myroot)

In [73]:
def keyword_collector(myroot=None):
    """Build a table of PubMed ids and their MeSH keywords.

    Known issue: only papers that have a ``MeshHeadingList`` are included;
    papers without keywords are left out.

    Parameters
    ----------
    myroot : xml.etree.ElementTree.Element, optional
        Root element whose children are article records. When omitted, the
        module-level ``myroot`` variable is used, which keeps existing
        zero-argument calls working.

    Returns
    -------
    pandas.DataFrame
        One row per paper with keywords; column ``0`` is the PubMed id and
        column ``1`` a list with the text of the first child of every
        heading in the paper's ``MeshHeadingList``.

    Raises
    ------
    NameError
        If no root is passed and no global ``myroot`` exists.
    """
    if myroot is None:
        # Fall back to the notebook-level variable the function used to
        # depend on implicitly. `is None` is deliberate: an Element's
        # truthiness depends on its number of children.
        try:
            myroot = globals()['myroot']
        except KeyError:
            raise NameError("name 'myroot' is not defined") from None

    keyword_list = []
    pubmed_id = []

    # Looping over all the article records.
    for record in myroot:
        if len(record) == 0:
            # Nothing to read from an empty record; skip instead of raising.
            continue
        citation = record[0]
        for child in citation:
            if child.tag != 'MeshHeadingList':
                continue
            # Keywords of this paper; headings without children are skipped.
            keyword_list.append(
                [heading[0].text for heading in child if len(heading)]
            )
            # The PubMed id the keywords belong to.
            pubmed_id.append(citation[0].text)

    db = pd.DataFrame((pubmed_id, keyword_list)).T
    return db

In [41]:
def _article_author_names(article):
    """Return the full names of all authors listed under one ``Article`` element.

    Name parts are looked up by tag rather than by position: the last name
    comes from ``LastName`` (else ``CollectiveName``), the first name from
    ``ForeName`` (else ``Initials``, else ``'-'``).
    """
    names = []
    for author_list in article:
        if author_list.tag != 'AuthorList':
            continue
        for author in author_list:
            last_el = author.find('LastName')
            if last_el is None:
                # Group authors carry a CollectiveName instead.
                last_el = author.find('CollectiveName')
            first_el = author.find('ForeName')
            if first_el is None:
                first_el = author.find('Initials')

            lastname = last_el.text if last_el is not None else None
            # Not all authors have their first name listed.
            firstname = first_el.text if first_el is not None else '-'
            names.append(str(firstname) + ' ' + str(lastname))
    return names


def _reference_list_ids(reference_list):
    """Return the ids of all references under one ``ReferenceList`` element.

    The PubMed-typed ``ArticleId`` is preferred, falling back to the first
    id listed; references without any id are skipped.
    """
    ids = []
    for reference in reference_list.iter('Reference'):
        cited = reference.find("ArticleIdList/ArticleId[@IdType='pubmed']")
        if cited is None:
            cited = reference.find('ArticleIdList/ArticleId')
        if cited is not None:
            ids.append(cited.text)
    return ids


def dataframer(myroot):
    """Extract authors, keywords and references in one pass over the XML root.

    Unlike the long-format helpers, every table here holds one row per
    paper with a *list* in the second column. Will require some
    reformatting before use.

    Parameters
    ----------
    myroot : xml.etree.ElementTree.Element
        Root element whose children are article records. The first child
        of a record is treated as the citation (its first child being the
        PubMed id); the second child, when present, is searched for
        ``ReferenceList`` elements.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ref_db, key_db, auth_db)``. Each frame has the PubMed id in
        column ``0`` and a list (reference ids, keywords, author names) in
        column ``1``. ``ref_db`` and ``key_db`` only contain papers that
        have a reference list / keyword list.
    """
    full_name = []
    pubmed_id = []

    keyword_list = []
    key_pubmed_id = []

    references = []
    ref_pubmed_id = []

    # Looping over all the article records.
    for record in myroot:
        if len(record) == 0:
            # Nothing to read from an empty record; skip instead of raising.
            continue
        citation = record[0]

        for child in citation:
            if child.tag == 'Article':
                # One row per article, even when it lists no authors.
                pubmed_id.append(citation[0].text)
                full_name.append(_article_author_names(child))
            elif child.tag == 'MeshHeadingList':
                # Keywords of this paper; headings without children are skipped.
                keyword_list.append(
                    [heading[0].text for heading in child if len(heading)]
                )
                key_pubmed_id.append(citation[0].text)

        # References live under the record's second child, which may be absent.
        if len(record) < 2:
            continue
        for child in record[1]:
            if child.tag == 'ReferenceList':
                ref_pubmed_id.append(citation[0].text)
                references.append(_reference_list_ids(child))

    # Combine the lists into dataframes.
    auth_db = pd.DataFrame((pubmed_id, full_name)).T

    ref_db = pd.DataFrame((ref_pubmed_id, references)).T

    key_db = pd.DataFrame((key_pubmed_id, keyword_list)).T

    return ref_db, key_db, auth_db

In [18]:
def data_merger(myroot):
    """Join the three per-paper tables from ``dataframer`` into one frame.

    Authors form the left side of both joins, so every paper that has an
    ``Article`` entry is kept; references and keywords are attached where
    available and are missing (NaN) otherwise.

    Parameters
    ----------
    myroot : xml.etree.ElementTree.Element
        Parsed XML root, passed straight through to ``dataframer``.

    Returns
    -------
    pandas.DataFrame
        Columns ``pubmedID``, ``Authors``, ``Refs`` and ``keywords``.
    """
    ref_db, key_db, auth_db = dataframer(myroot)

    # Label the columns up front so the joins need no suffix bookkeeping.
    authors = auth_db.rename(columns={0: 'pubmedID', 1: 'Authors'})
    references = ref_db.rename(columns={0: 'pubmedID', 1: 'Refs'})
    keywords = key_db.rename(columns={0: 'pubmedID', 1: 'keywords'})

    with_refs = authors.merge(references, how='left', on='pubmedID')
    return with_refs.merge(keywords, how='left', on='pubmedID')

In [42]:
# Tuple of the three per-paper tables: (references, keywords, authors).
test2 = dataframer(myroot)


In [44]:
# Single merged frame with one row per paper in the parsed file.
test = data_merger(myroot)

In [46]:
# Summarise the merged frame: entry count plus non-null count and dtype per column.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 0 to 29999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   pubmedID  30000 non-null  object
 1   Authors   30000 non-null  object
 2   Refs      2713 non-null   object
 3   keywords  29999 non-null  object
dtypes: object(4)
memory usage: 1.1+ MB
