In [253]:
# !pip install BeautifulSoup4
# !pip install lxml
# !pip install fastprogress  #tqdm



In [261]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
np.set_printoptions(threshold=80, edgeitems=50)
from bs4 import BeautifulSoup
from collections import namedtuple
import codecs
import re
import time
import json
import requests
from urllib.request import urlopen
import pandas as pd
from fastprogress.fastprogress import master_bar, progress_bar

In [199]:
import logging

# Attach a default handler to the root logger if it has none yet.  Without any
# handler, DEBUG records from this notebook are dropped on a fresh kernel
# (logging's last-resort handler only emits WARNING and above).  basicConfig()
# is a no-op when the root logger is already configured.
logging.basicConfig()

# Notebook-wide logger; DEBUG so per-term and per-article progress is shown.
logger = logging.getLogger("Parsing_Data")
logger.setLevel(logging.DEBUG)


In [249]:
# global variables
# Common prefix of every request URL; the two suffixes below select the
# esearch / efetch endpoint and pin the database to "pmc".
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
search_url = "esearch.fcgi?db=pmc"
fetch_url = "efetch.fcgi?db=pmc"

# Number of ids requested per search term (sent as the RetMax query parameter).
return_size = 10

from collections import defaultdict
# Filled by the search step: one list of article ids per search term.
term_ids_mapping = defaultdict(list)  # {key_term: [list of article ids]}
# Filled by extract_rich_info; the article body itself is written to
# ./full_content/<id>_paper.txt rather than stored here.
id_paper_mapping = {}      # {id: {"title": str, "authors": [str, ...], "abstract": str}}

# key words:

Loading in key words

In [12]:
# Load the search terms. header=None: the first CSV row is data, not column
# names, so the columns are labelled 0, 1, ...
path = "./search_term_1.csv"
search_terms_df = pd.read_csv(path, header=None)
search_terms_df

Unnamed: 0,0,1
0,mhealth or m-health,public health
1,ehealth or e-health,health promotion
2,virtual health,health prevention
3,mobile health,health protection
4,online health,health policy
5,internet-based health,health determinants
6,computer-based health,health evaluation
7,health informatics,health economics
8,social media,public health ethics
9,predictive algorithms,risk assessment


In [15]:
# Pull the first two columns out by position; each one is searched separately below.
search_terms_c1 = search_terms_df.iloc[:,0]
search_terms_c2 = search_terms_df.iloc[:,1]

In [37]:
# Log every entry of the first term column for a quick visual check
# (missing cells show up as nan).
for each in search_terms_c1:
    logger.debug(each)

mhealth or m-health
ehealth or e-health
virtual health 
mobile health 
online health
internet-based health
computer-based health
health informatics
social media
predictive algorithms
artificial intelligence
machine learning methods
big data
electronic health
web*
digit*
telemedicine
nan


In [209]:
def extract_ids(term, return_size):
    '''
    Run an ESearch query against the "pmc" database and return the matching ids.

    Parameters
    ----------
    term : str
        Search term. It is URL-encoded before being placed in the query string.
    return_size : int
        Maximum number of ids to request (sent as the ``RetMax`` parameter).

    Returns
    -------
    list of str
        Text of every ``<Id>`` element in the response, in document order.
        Empty when the response contains no ``<Id>`` element.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    '''
    # Function-scope import so this cell does not depend on an extra import cell.
    from urllib.parse import quote_plus

    logger.debug("current term is: {}".format(term))

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&" or "#" that would otherwise corrupt the query.
    query_term = "&term=" + quote_plus(term)
    ret_max_term = "&RetMax=" + str(return_size)
    request_url = base_url + search_url + query_term + ret_max_term

    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(request_url) as response:
        soup_kw = BeautifulSoup(response, "xml")

    #TODO: might be duplicate ids
    return [id_tag.text for id_tag in soup_kw.find_all('Id')]

Let's extract all the related ids for each of the search terms:

In [271]:
from collections import defaultdict
# Only plain phrases are sent to the search endpoint for now.
# TODO: mhealth or m-health
# TODO: how to deal with regex&*
# "or" has to be matched as a whole word: a bare substring test also rejects
# terms that merely contain the letters, e.g. "health informatics" or
# "predictive algorithms".
_boolean_or = re.compile(r"\bor\b", re.IGNORECASE)


def _is_plain_term(term):
    """Return True for a non-missing string with no boolean "or" and no "*" wildcard."""
    return isinstance(term, str) and "*" not in term and _boolean_or.search(term) is None


for term_column in (search_terms_c1, search_terms_c2):
    for cur_term in progress_bar(term_column):
        if not _is_plain_term(cur_term):
            continue
        # Assign rather than extend, so re-running this cell does not pile up
        # duplicate ids for the same term.
        term_ids_mapping[cur_term] = extract_ids(cur_term, return_size)
        time.sleep(1)  # pause between consecutive requests

logger.debug("term and corresponding ids are:")
logger.debug(term_ids_mapping)

DEBUG:Parsing_Data:current term is: public health
DEBUG:Parsing_Data:current term is: health promotion
DEBUG:Parsing_Data:current term is: health prevention
DEBUG:Parsing_Data:current term is: health protection
DEBUG:Parsing_Data:current term is: health policy
DEBUG:Parsing_Data:current term is: health determinants
DEBUG:Parsing_Data:current term is: health evaluation
DEBUG:Parsing_Data:current term is: health economics
DEBUG:Parsing_Data:current term is: public health ethics
DEBUG:Parsing_Data:current term is: risk assessment
DEBUG:Parsing_Data:current term is: epidemiology
DEBUG:Parsing_Data:current term is: community health
DEBUG:Parsing_Data:current term is: emergency preparedness
DEBUG:Parsing_Data:current term is: emergency response
DEBUG:Parsing_Data:current term is: health equity
DEBUG:Parsing_Data:current term is: social justice
DEBUG:Parsing_Data:current term is: social determinants
DEBUG:Parsing_Data:current term is: surveillance
DEBUG:Parsing_Data:term and corresponding ids

It looks like some search terms return overlapping sets of related articles.

# article full information

In [117]:
def get_title(soup):
    '''
    Return the text of ``<article-title>`` inside the first ``<title-group>``.

    Returns the string "NaN" when either element is missing, the same
    placeholder get_abstract uses, instead of raising AttributeError on
    incomplete records.
    '''
    title_group = soup.find('title-group')
    if title_group is None:
        return "NaN"

    article_title = title_group.find('article-title')
    if article_title is None:
        return "NaN"

    return article_title.text

In [263]:
def get_authors(soup):
    '''
    Return the author names listed in the first ``<contrib-group>``.

    Each entry is "<surname> <given-names>", built from the ``<name>`` element
    of a ``<contrib contrib-type="author">``. Contributors without a ``<name>``
    are skipped. When a name carries only one of the two parts, that part alone
    is used instead of raising AttributeError.

    Returns an empty list when there is no ``<contrib-group>``.
    '''
    contrib = soup.find('contrib-group')
    if contrib is None:
        return []

    author_list = []
    # "contrib-type" is not a valid Python identifier, hence the dict unpacking.
    for author in contrib.find_all('contrib', **{'contrib-type': "author"}):
        name = author.find('name')
        if name is None:
            continue
        parts = [name.find(tag) for tag in ('surname', 'given-names')]
        full_name = ' '.join(part.text for part in parts if part is not None)
        if full_name:
            author_list.append(full_name)

    return author_list


In [119]:
def get_abstract(soup):
    '''
    Return the text of the first ``<abstract>`` element, or the string "NaN"
    when the document has none.
    '''
    abstract_tag = soup.find("abstract")
    return "NaN" if abstract_tag is None else abstract_tag.text

In [216]:
def get_content(paper_id, soup):
    '''
    Write the article body to ./full_content/<paper_id>_paper.txt.

    The text of every ``<p>`` element under ``<body>`` is written, one
    paragraph per line. When the document has no ``<body>`` the file is still
    created, but empty.

    Parameters
    ----------
    paper_id : str or int
        Article id, used to build the output file name.
    soup : BeautifulSoup
        Parsed article XML.
    '''
    body = soup.find("body")
    paragraphs = body.find_all('p') if body is not None else []

    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs("./full_content", exist_ok=True)

    # Explicit UTF-8 so non-ASCII article text does not depend on the platform
    # default codec; the with-block closes the file even if a write fails.
    with open("./full_content/{}_paper.txt".format(paper_id), "w", encoding="utf-8") as f:
        for p in paragraphs:
            # Trailing newline keeps consecutive paragraphs from running together.
            f.write(p.text + "\n")

In [62]:
#!mkdir full_content

In [244]:
def extract_rich_info(paper_id):
    '''
    Fetch one article through efetch and record its metadata and body text.

    The title, authors and abstract are stored in
    ``id_paper_mapping[paper_id]``; the body paragraphs are written to disk by
    get_content. The function sleeps for one second before returning.

    Parameters
    ----------
    paper_id : str or int
        Article id, appended to the request as ``&id=<paper_id>``.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    '''
    base_url_content = base_url + fetch_url + "&id=" + str(paper_id)

    # Log before issuing the request so a failing URL is visible in the output.
    logger.debug(base_url_content)

    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(base_url_content) as page:
        soup = BeautifulSoup(page, "xml")

    record = {
        "title": get_title(soup),
        "authors": get_authors(soup),
        "abstract": get_abstract(soup),
    }

    ## extract paper content
    get_content(paper_id, soup)

    # Register the article only after every step succeeded. Callers skip ids
    # already present in id_paper_mapping, so a half-processed entry would
    # never be retried.
    id_paper_mapping[paper_id] = record

    time.sleep(1)

Let's extract the full information of all articles

In [272]:
# term_ids_mapping has the shape {search_term: [article ids]}.
# Walk the id list of every search term and fetch each article once: ids that
# are already in id_paper_mapping are skipped.

mb = master_bar(term_ids_mapping.items())
for _term, related_ids in mb:
    for each_id in progress_bar(related_ids, parent=mb):
        if each_id not in id_paper_mapping:
            logger.debug("########" + each_id)
            extract_rich_info(each_id)
    
#logger.debug(id_paper_mapping)


DEBUG:Parsing_Data:########7261601
DEBUG:Parsing_Data:https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=7261601
DEBUG:Parsing_Data:########7261420
DEBUG:Parsing_Data:https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=7261420
DEBUG:Parsing_Data:########7261395
DEBUG:Parsing_Data:https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=7261395
DEBUG:Parsing_Data:########7255464
DEBUG:Parsing_Data:https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=7255464
DEBUG:Parsing_Data:########7255454
DEBUG:Parsing_Data:https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=7255454
DEBUG:Parsing_Data:########7255042
DEBUG:Parsing_Data:https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=7255042
DEBUG:Parsing_Data:########7254992
DEBUG:Parsing_Data:https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=7254992
DEBUG:Parsing_Data:########7253138
DEBUG:Parsing_Data:https://eutils.ncbi.nlm.nih.gov/entr

In [275]:
# Spot check: show the author list recorded for one fetched article.
id_paper_mapping["7255022"]["authors"]

['Zhao Shengzhe', 'Yu Xujiang', 'Qian Yuna', 'Chen Wei', 'Shen Jianliang']

# Conclusion

- currently returning 10 related article_ids per search term
- how to treat search terms such as "mhealth or m-health" and "web*"
- script could be optimized to check for overlapping article ids from different search terms
- the XML is not well standardized, e.g.:
    - XML has no 'body' tag and only provides the abstract; currently, this script writes an empty txt file
        - https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=6494757
        - certain publishers (e.g. American Public Health Association): `<!--The publisher of this article does not allow downloading of the full text in XML form.-->`
    - XML has no 'author' tag, but places the authors under the 'body' tag...
        - https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=7256479
        
- adding application name and email as query parameters, just in case...
   

## 