<a href="https://colab.research.google.com/github/siliang625/text_mining_health/blob/master/parsing_xml_20200604.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# !pip install BeautifulSoup4
# !pip install fastprogress  #tqdm



In [0]:
%matplotlib inline
import pandas as pd
import requests
from urllib.request import urlopen
from fastprogress.fastprogress import master_bar, progress_bar
from urllib.parse import quote
from bs4 import BeautifulSoup
import os
import time

In [0]:
# Re-import the logging module before configuring it -- presumably so that
# basicConfig() below takes effect even if the notebook host has already
# configured logging; TODO confirm this is still needed.
from importlib import reload  # Not needed in Python 2
import logging
reload(logging)
#logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')
# Emit everything from DEBUG upwards. No `format` is passed, so records print
# as LEVEL:logger-name:message and `datefmt` has no visible effect.
logging.basicConfig(level=logging.DEBUG, datefmt='%I:%M:%S')
# Named logger used by the cells below.
logger = logging.getLogger("Parsing_Data")
# logger = logging.getLogger("Parsing_Data")
# logger.setLevel(logging.DEBUG)

In [0]:
# NCBI E-utilities URL pieces; request URLs are assembled from these by
# plain string concatenation.
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
fetch_url = "efetch.fcgi?db=pmc"            # record download endpoint (db=pmc)
search_url = "esearch.fcgi?db=pmc&term="    # search endpoint; the query is appended

# Caller-identification query parameters, kept separate from the endpoints.
email = "&email=siliang.liu@alumni.ubc.ca"
application = "&tool=digital_public_health"

# key words:

Loading in key words and taking a look:

In [9]:
# Read the keyword table: the first row is data (header=None) and empty cells
# stay as "" rather than NaN, so every entry remains a plain string.
path = "./search_term_1.csv"
search_terms_df = pd.read_csv(
    path,
    keep_default_na=False,
    header=None,
)
search_terms_df

Unnamed: 0,0,1
0,mhealth or m-health,public health
1,ehealth or e-health,health promotion
2,virtual health,health prevention
3,mobile health,health protection
4,online health,health policy
5,internet-based health,health determinants
6,computer-based health,health evaluation
7,health informatics,health economics
8,social media,public health ethics
9,predictive algorithms,risk assessment


In [0]:
# Split the table into its two term lists: first column and second column.
search_terms_c1, search_terms_c2 = search_terms_df.iloc[:, 0], search_terms_df.iloc[:, 1]

# Define search query

Let's define the query: we will use the OR operator to connect all terms within column 0, and likewise all terms within column 1, and then use the AND operator to join the column 0 subquery with the column 1 subquery.

For instance, if we want to search ("virtual health" OR "mobile health") AND ("public health" OR "health promotion"), we would use the query: "(virtual+health[tw] OR mobile+health[tw]) AND (public+health[tw] OR health+promotion[tw])".

Note: [tw] is a "Field Tag" that restricts the search to specific fields of a PubMed record; it stands for "text words" [1].

First, let's build the query

In [0]:
def form_query_term(df):
    '''
    Turn a column of raw search phrases into tagged "text word" query terms.

    arg:
        df: iterable of strings (e.g. one column of the search-term dataframe).
            An entry may hold several alternatives separated by " or ",
            eg: "mhealth or m-health".
    return:
        list of processed search terms, one per alternative, with the words
        joined by "+" and the "[tw]" tag appended,
        eg: ["mhealth[tw]", "m-health[tw]", "virtual+health[tw]"]

    Entries containing a "*" wildcard are skipped entirely. Blank entries and
    blank alternatives (empty or whitespace only) are skipped as well, so the
    result never contains a bare "[tw]".
    '''
    tag = "[tw]"
    query_arr = []

    for term in df:
        if "*" in term:
            continue

        # A term without " or " yields a single piece, so both cases share one path.
        for piece in term.split(" or "):
            # split() trims the ends and collapses repeated whitespace, which
            # avoids terms such as "virtual++health[tw]".
            words = piece.split()
            if not words:
                continue
            query_arr.append("+".join(words) + tag)

    return query_arr


In [12]:
# Terms within one column are alternatives (OR); the two column groups must
# both match (AND).
connector = " OR "
query_arr_a = form_query_term(search_terms_c1)
query_arr_b = form_query_term(search_terms_c2)
query_string = "({}) AND ({})".format(
    connector.join(query_arr_a),
    connector.join(query_arr_b),
)
logger.debug(query_string)
    

DEBUG:Parsing_Data:(mhealth[tw] OR m-health[tw] OR ehealth[tw] OR e-health[tw] OR virtual+health[tw] OR mobile+health[tw] OR online+health[tw] OR internet-based+health[tw] OR computer-based+health[tw] OR health+informatics[tw] OR social+media[tw] OR predictive+algorithms[tw] OR artificial+intelligence[tw] OR machine+learning+methods[tw] OR big+data[tw] OR electronic+health[tw] OR telemedicine[tw]) AND (public+health[tw] OR health+promotion[tw] OR health+prevention[tw] OR health+protection[tw] OR health+policy[tw] OR health+determinants[tw] OR health+evaluation[tw] OR health+economics[tw] OR public+health+ethics[tw] OR risk+assessment[tw] OR epidemiology[tw] OR community+health[tw] OR emergency+preparedness[tw] OR emergency+response[tw] OR health+equity[tw] OR social+justice[tw] OR social+determinants[tw] OR surveillance[tw])


Second, let's extract all the related ids given the query

In [0]:
def extract_ids(query_term, return_size=10):
    '''
    Run a search request for query_term and collect the ids it returns.

    args:
        query_term: query string; it is percent-encoded before being sent
        return_size: value sent as RetMax, i.e. how many ids to ask for
    return:
        (count, ids) where count is the text of the first <Count> element of
        the response (a string, not an int) and ids is the list of the text
        of every <Id> element
    raises:
        ValueError: if the response contains no <Count> element
    '''
    ret_max_term = "&RetMax=" + str(return_size)

    # The tool/email identification parameters are deliberately left off here.
    request_url = base_url + search_url + quote(query_term) + ret_max_term #+ application + email

    # Parse inside the with-block so the HTTP response is closed once it is read.
    with urlopen(request_url) as page_kw:
        soup_kw = BeautifulSoup(page_kw, "xml")

    count_tag = soup_kw.find('Count')
    if count_tag is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError("search response has no <Count> element: " + request_url)

    return_ids = [id_tag.text for id_tag in soup_kw.find_all('Id')]

    return count_tag.text, return_ids

In [14]:
# First request (default RetMax) is only used to read the number of matches.
return_size = extract_ids(query_string)[0]
logger.debug("there are {} related papers".format(return_size))
# Second request repeats the search with RetMax set to that number and keeps the ids.
# NOTE(review): the server may cap RetMax -- confirm len(return_ids) matches return_size.
return_ids = extract_ids(query_string, return_size)[1]

DEBUG:Parsing_Data:there are 78390 related papers


# Extract article information in xml

In [0]:
# Create the download folder; exist_ok keeps the cell re-runnable once the
# folder is already there.
os.makedirs("full_content", exist_ok=True)

In [0]:
def extract_xml(paper_id, timeout=60):
    '''
    Download one record and save the raw response body to
    ./full_content/<paper_id>_paper.xml.

    args:
        paper_id: record id, appended to the request as "&id=<paper_id>"
        timeout: seconds to wait for the server before giving up
    return: None
    raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never stored as if it were a paper
        requests.Timeout: if the server does not answer within `timeout`
    '''
    query_id = "&id=" + str(paper_id)
    base_url_content = base_url + fetch_url + query_id

    response = requests.get(base_url_content, timeout=timeout)
    response.raise_for_status()

    # The with-block closes the file; no explicit close() is needed.
    with open('./full_content/{}_paper.xml'.format(paper_id), 'wb') as file:
        file.write(response.content)

    # Pause after each download -- presumably to throttle requests to the server.
    time.sleep(1)

In [0]:
for each_id in progress_bar(return_ids):
    # Skip papers already saved by an earlier run so this long-running cell
    # can be interrupted and resumed; empty files are fetched again.
    target = './full_content/{}_paper.xml'.format(each_id)
    if os.path.isfile(target) and os.path.getsize(target) > 0:
        continue
    logger.debug("########" + each_id)
    extract_xml(each_id)
    

DEBUG:Parsing_Data:########7269166
DEBUG:Parsing_Data:########7269107
DEBUG:Parsing_Data:########7268900
DEBUG:Parsing_Data:########7268189
DEBUG:Parsing_Data:########7268184
DEBUG:Parsing_Data:########7268180
DEBUG:Parsing_Data:########7267760
DEBUG:Parsing_Data:########7267758
DEBUG:Parsing_Data:########7267748
DEBUG:Parsing_Data:########7267744
DEBUG:Parsing_Data:########7267671
DEBUG:Parsing_Data:########7267659
DEBUG:Parsing_Data:########7267631
DEBUG:Parsing_Data:########7267625
DEBUG:Parsing_Data:########7267610
DEBUG:Parsing_Data:########7267603
DEBUG:Parsing_Data:########7267600
DEBUG:Parsing_Data:########7267583
DEBUG:Parsing_Data:########7267579
DEBUG:Parsing_Data:########7267576
DEBUG:Parsing_Data:########7267574
DEBUG:Parsing_Data:########7267542
DEBUG:Parsing_Data:########7267534
DEBUG:Parsing_Data:########7267529
DEBUG:Parsing_Data:########7267513
DEBUG:Parsing_Data:########7267493
DEBUG:Parsing_Data:########7267488
DEBUG:Parsing_Data:########7267458
DEBUG:Parsing_Data:#

# TODO

- how to treat search terms such as "mhealth or m-health" and "web*"

# Reference

[1]Search Field Tags:  https://www.nlm.nih.gov/bsd/disted/pubmedtutorial/020_710.html