In [1]:
import pandas as pd
import numpy as np
import re
import time
from os.path import exists
import pickle

In [2]:
# Import libraries
import requests
from bs4 import BeautifulSoup

In [3]:
# Treat +/-inf as missing (NA) in addition to None/NaN; this changes what isna()/dropna() consider missing.
# NOTE(review): this option does not turn empty strings into NA, and `mode.use_inf_as_na` is
# deprecated in recent pandas releases — confirm it is still wanted and available in the installed version.
pd.options.mode.use_inf_as_na = True
# Make Jupyter display the value of every expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
# NOTE(review): not referenced by any of the visible cells — confirm whether it is still needed.
MAX_USER_PROFILES = 30

In [None]:
def fetch_num_citations(url):
    """Return the citation count displayed on the page at ``url``.

    Pauses one second, downloads ``url`` and reads the text of the element
    with class ``amount`` nested inside the element with class
    ``citedby-articles``.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    int
        The parsed count, or 0 when the page has no ``citedby-articles``
        element, no ``amount`` element inside it, or an empty amount.

    Raises
    ------
    ValueError
        If the amount text is present but is not an integer.
    requests.RequestException
        Propagated from ``requests.get`` (connection errors, timeout).
    """
    time.sleep(1)
    # A timeout keeps one stalled connection from hanging the whole run.
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    cited_by_section = soup.find(class_='citedby-articles')
    if cited_by_section is None:
        # No "cited by" section on this page.
        return 0

    amount_element = cited_by_section.find(class_='amount')
    if amount_element is None:
        # Section present but without a count; previously this raised TypeError.
        return 0

    # get_text() gathers all nested text, so the count is found even when it
    # is wrapped in a child tag; thousands separators and whitespace are removed.
    amount_text = re.sub(r'[,\s]', '', amount_element.get_text())
    if amount_text == '':
        return 0
    return int(amount_text)



In [None]:
# Publication-type filters appended to every search URL, in request order.
_PUBMED_PUBLICATION_TYPES = (
    'clinicalconference',
    'clinicalstudy',
    'clinicaltrial',
    'clinicaltrialprotocol',
    'clinicaltrialphasei',
    'clinicaltrialphaseii',
    'clinicaltrialphaseiii',
    'clinicaltrialphaseiv',
    'comparativestudy',
    'controlledclinicaltrial',
    'editorial',
    'meta-analysis',
    'observationalstudy',
    'practiceguideline',
    'pragmaticclinicaltrial',
    'randomizedcontrolledtrial',
    'researchsupportamericanrecoveryandreinvestmentact',
    'researchsupportnihextramural',
    'researchsupportnihintramural',
    'researchsupportnonusgovt',
    'researchsupportusgovtnonphs',
    'researchsupportusgovtphs',
    'researchsupportusgovernment',
    'validationstudy',
)

# Tags whose value is a single text field; the field name doubles as the
# parser state used to attach wrapped continuation lines.
_PUBMED_TEXT_FIELDS = {
    'PMC': 'pmc',
    'TI': 'title',
    'DP': 'published_date',
    'SO': 'reference',
    'AB': 'abstract',
    'JT': 'journal',
}


def _new_pubmed_article(pmid, ranking):
    """Return an empty article record for ``pmid`` carrying rank ``ranking``."""
    return {
        'pmid': pmid,
        'pmc': '',
        'url': f'https://pubmed.ncbi.nlm.nih.gov/{pmid}/',
        'title': '',
        'published_date': '',
        'abstract': '',
        'reference': '',
        'article_type': [],
        'authors': [],
        'journal': '',
        'mesh_terms': [],
        'citations': 0,
        'rank': ranking,
    }


def _inherit_last_affiliation(authors):
    """Give the last author the previous author's affiliation when theirs is empty."""
    if len(authors) > 1 and authors[-1]['affiliations'] == '':
        authors[-1]['affiliations'] = authors[-2]['affiliations']


def _start_pubmed_author(article, full_name=''):
    """Close the previous author entry and append a new one named ``full_name``."""
    _inherit_last_affiliation(article['authors'])
    article['authors'].append({
        'full_name': full_name,
        'initial_name': '',
        'affiliations': '',
    })


def _finalize_pubmed_article(article):
    """Turn each author's ';'-joined affiliation string into a clean list; return the record."""
    _inherit_last_affiliation(article['authors'])
    for author in article['authors']:
        pieces = (piece.strip() for piece in author['affiliations'].split(';'))
        author['affiliations'] = [piece for piece in pieces if piece != '']
    # Per-article citation lookup (fetch_num_citations(article['url'])) is disabled.
    article['citations'] = 0
    return article


def _apply_pubmed_tag(article, tag, value, current_tag):
    """Store ``value`` of a tagged line in ``article`` and return the new parser state."""
    if tag in _PUBMED_TEXT_FIELDS:
        field = _PUBMED_TEXT_FIELDS[tag]
        article[field] = value
        return field
    if tag == 'MH':
        article['mesh_terms'].append(value.replace('*', '').strip())
        return 'mesh_terms'
    if tag == 'PT':
        article['article_type'].append(value)
        return 'article_type'
    if tag == 'FAU':
        _start_pubmed_author(article, value)
        return 'full_name'
    if tag == 'AU':
        authors = article['authors']
        # AU normally follows its FAU line; if there is no open author entry
        # (or it already has initials) start a new one instead of failing.
        if not authors or authors[-1]['initial_name'] != '':
            _start_pubmed_author(article)
        article['authors'][-1]['initial_name'] = value
        return 'initial_name'
    if tag == 'AD':
        if current_tag == 'affiliations':
            # Consecutive AD lines are additional affiliations of the same author.
            article['authors'][-1]['affiliations'] += ';' + value
        else:
            if not article['authors']:
                # AD without a preceding FAU: keep the affiliation on an unnamed author.
                _start_pubmed_author(article)
            article['authors'][-1]['affiliations'] = value
        return 'affiliations'
    # Any other tag (OWN, MHDA, ...) is ignored, and so are its continuation lines.
    return ''


def _extend_pubmed_field(article, current_tag, text):
    """Append a wrapped continuation line to the field named by ``current_tag``."""
    if current_tag in ('title', 'abstract', 'reference', 'journal'):
        article[current_tag] += ' ' + text
    elif current_tag == 'mesh_terms':
        article['mesh_terms'][-1] += ' ' + text.replace('*', '').strip()
    elif current_tag == 'article_type':
        article['article_type'][-1] += ' ' + text
    elif current_tag == 'affiliations':
        article['authors'][-1]['affiliations'] += ' ' + text


def _parse_pubmed_text(text, ranking):
    """Parse tagged PubMed-format text into article dicts; return ``(articles, ranking)``."""
    articles = []
    article = None
    current_tag = ''

    # Accept both CRLF and bare LF line endings.
    for raw_line in re.split(r'\r?\n', text):
        line = raw_line.strip()
        try:
            if line == '':
                # A blank line ends the current record.
                if article is not None:
                    finished, article = article, None
                    articles.append(_finalize_pubmed_article(finished))
                current_tag = 'new'
                continue

            tag_match = re.match(r'^([A-Z]+)\s*\-\s+(.*)$', line)
            if tag_match is None:
                # Untagged line: continuation of the previous field.
                if article is not None:
                    _extend_pubmed_field(article, current_tag, line)
                continue

            tag = tag_match.group(1)
            value = tag_match.group(2).strip()
            if tag == 'PMID':
                # A new record starts; keep the previous one even when the
                # separating blank line is missing.
                if article is not None:
                    finished, article = article, None
                    articles.append(_finalize_pubmed_article(finished))
                ranking += 1
                article = _new_pubmed_article(value, ranking)
                current_tag = 'pmid'
            elif article is None:
                # Tagged line before any PMID: nothing to attach it to.
                current_tag = ''
            else:
                current_tag = _apply_pubmed_tag(article, tag, value, current_tag)
        except Exception as inst:
            # Best effort: report the offending line and carry on with the next one.
            print(f'\tEXCEPTION: {line}')
            print(inst)
            print(type(inst))    # the exception instance
            print(inst.args)     # arguments stored in .args

    # The text may end without a trailing blank line; keep the last record.
    if article is not None:
        articles.append(_finalize_pubmed_article(article))

    return (articles, ranking)


def scrape_pubmed_results_page(query, pageno=0, ranking=0):
    """Download one page of PubMed search results and parse it into article dicts.

    Builds the search URL for ``query`` with the fixed publication-type
    filters, 200 results per page and ``format=pubmed``, prints it, pauses
    one second, downloads the page and parses the text of the element with
    class ``search-results-chunk``.

    Parameters
    ----------
    query : str
        Search term inserted verbatim into the URL (it is not URL-encoded
        here, so callers pass it already joined with ``+``).
    pageno : int
        Value of the ``page`` URL parameter.
    ranking : int
        Running record counter; every parsed record increments it and stores
        the new value under ``'rank'``.

    Returns
    -------
    tuple(list, int)
        The parsed article dicts and the updated ``ranking``. The list is
        empty when the page has no ``search-results-chunk`` element.

    Notes
    -----
    Each article dict has the keys ``pmid``, ``pmc``, ``url``, ``title``,
    ``published_date``, ``abstract``, ``reference``, ``article_type``,
    ``authors``, ``journal``, ``mesh_terms``, ``citations`` (always 0) and
    ``rank``. Each author is a dict with ``full_name``, ``initial_name`` and
    ``affiliations`` (a list of strings). Wrapped continuation lines extend
    the title, abstract, reference, journal, last MeSH term, last article
    type and last affiliation. A line that cannot be processed is reported
    on stdout and skipped.
    """
    filters = ''.join(f'&filter=pubt.{pub_type}' for pub_type in _PUBMED_PUBLICATION_TYPES)
    url = (
        f'https://pubmed.ncbi.nlm.nih.gov/?term={query}'
        + filters
        + f'&show_snippets=off&size=200&page={pageno}&format=pubmed'
    )
    print(f'\npage={pageno},url={url}')
    time.sleep(1)
    # A timeout keeps one stalled connection from hanging the whole run.
    page = requests.get(url, timeout=30)

    soup = BeautifulSoup(page.text, 'html.parser')
    results_chunk = soup.find(class_='search-results-chunk')
    if results_chunk is None:
        # done with all pages
        return ([], ranking)

    return _parse_pubmed_text(results_chunk.get_text(), ranking)


In [8]:
def get_search_term_mapping(path='../data/who_search_terms_mapping.pkl'):
    """Load and return the pickled search-term mapping.

    Parameters
    ----------
    path : str or path-like
        Location of the pickle file. Defaults to the notebook's original
        relative path, so existing calls without arguments are unchanged.

    Returns
    -------
    object
        Whatever object was pickled in the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # SECURITY: pickle.load can execute arbitrary code while loading; only
    # point this at files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

In [14]:
# Preview every mapping entry with commas and ampersands replaced by a space and
# runs of whitespace collapsed to a single space. Nothing is written back to
# the mapping; the cleaned text is only printed.
search_space = get_search_term_mapping()
for key, vals in search_space.items():
    vals = [re.sub(r'\s+', ' ', re.sub(r'\s*,\s*|\s*&\s*', ' ', val)) for val in vals]
    key = re.sub(r'\s+', ' ', re.sub(r'\s*,\s*|\s*&\s*', ' ', key))
    print(key, '\t', vals)
    

gliomas glioneuronal tumours neuronal tumours 	 ['central nervous system tumours']
neuronal tumours 	 ['central nervous system tumours']
glioneuronal tumours 	 ['central nervous system tumours']
gliomas 	 ['gliomas glioneuronal tumours neuronal tumours', 'gliomas', 'paediatric tumours', 'eye tumours', 'glioneuronal tumours', 'optic nerve tumours', 'neuronal tumours', 'central nervous system tumours', 'cns tumours']
adult-type diffuse gliomas 	 ['gliomas glioneuronal tumours neuronal tumours', 'gliomas', 'glioneuronal tumours', 'neuronal tumours', 'central nervous system tumours']
astrocytoma 	 ['gliomas glioneuronal tumours neuronal tumours', 'adult-type diffuse gliomas', 'tumours optic disc optic nerve', 'tumours', 'gliomas', 'primary tumours', 'eye tumours', 'retina', 'neuroepithelium', 'glioneuronal tumours', 'neurosensory retina', 'tumours neurosensory retina', 'tumours retina neuroepithelium', 'optic disc optic nerve', 'neuronal tumours', 'central nervous system tumours']
oligoden

oncocytic tumours 	 ['urinary male genital tumours', 'tumours kidney', 'epithelial tumours', 'tumours', 'oncocytic chromophobe renal tumours', 'oncocytic', 'renal cell tumours', 'eye tumours', 'kidney', 'tumours lacrimal gland', 'renal tumours', 'lacrimal gland', 'male genital tumours']
collecting duct tumours 	 ['urinary male genital tumours', 'tumours kidney', 'tumours', 'renal cell tumours', 'kidney', 'male genital tumours']
duct 	 ['urinary male genital tumours', 'tumours kidney', 'tumours', 'renal cell tumours', 'kidney', 'male genital tumours']
collecting duct carcinoma 	 ['urinary male genital tumours', 'duct', 'tumours kidney', 'tumours', 'renal cell tumours', 'collecting duct tumours', 'kidney', 'male genital tumours']
duct carcinoma 	 ['urinary male genital tumours', 'duct', 'tumours kidney', 'tumours', 'renal cell tumours', 'collecting duct tumours', 'kidney', 'male genital tumours']
clear cell papillary renal cell tumour 	 ['urinary male genital tumours', 'tumours kidney', 

germinoma/seminoma 	 ['paediatric tumours', 'germ cell tumours', 'germinoma-family tumours']
dysgerminoma/seminoma 	 ['paediatric tumours', 'germ cell tumours', 'germinoma-family tumours']
non-germinomatous germ cell tumours 	 ['paediatric tumours', 'germ cell tumours']
mature cystic teratoma 	 ['paediatric tumours', 'germ cell tumours', 'non-germinomatous germ cell tumours']
extragonadal teratoma 	 ['paediatric tumours', 'germ cell tumours', 'non-germinomatous germ cell tumours']
female gonadal 	 ['paediatric tumours', 'germ cell tumours', 'non-germinomatous germ cell tumours']
monodermal teratomas 	 ['female genital tumours', 'tumours', 'paediatric tumours', 'tumours ovary', 'ovary', 'germ cell tumours', 'non-germinomatous germ cell tumours']
monodermal teratomas ( female gonadal ) 	 ['paediatric tumours', 'germ cell tumours', 'non-germinomatous germ cell tumours']
immature teratoma ( female gonadal ) 	 ['paediatric tumours', 'germ cell tumours', 'non-germinomatous germ cell tumours'

acute myeloid leukaemia without maturation 	 ['related precursor neoplasms', 'tumours', 'acute myeloid leukaemia related precursor neoplasms', 'acute myeloid leukaemia', 'haematopoietic lymphoid tissues', 'tumours haematopoietic lymphoid tissues']
maturation 	 ['related precursor neoplasms', 'tumours', 'acute myeloid leukaemia related precursor neoplasms', 'acute myeloid leukaemia', 'haematopoietic lymphoid tissues', 'tumours haematopoietic lymphoid tissues']
acute myeloid leukaemia maturation 	 ['related precursor neoplasms', 'tumours', 'acute myeloid leukaemia related precursor neoplasms', 'acute myeloid leukaemia', 'haematopoietic lymphoid tissues', 'tumours haematopoietic lymphoid tissues']
acute myelomonocytic leukaemia 	 ['related precursor neoplasms', 'tumours', 'acute myeloid leukaemia related precursor neoplasms', 'acute myeloid leukaemia', 'haematopoietic lymphoid tissues', 'tumours haematopoietic lymphoid tissues']
acute monoblastic monocytic leukaemia 	 ['related precursor 

In [9]:
# Preview every mapping entry in query form: commas and ampersands are replaced
# by a space, then each run of whitespace becomes a single '+'. Nothing is
# written back to the mapping; the converted text is only printed.
search_space = get_search_term_mapping()
for key, vals in search_space.items():
    vals = [re.sub(r'\s+', '+', re.sub(r'\s*,\s*|\s*&\s*', ' ', val)) for val in vals]
    key = re.sub(r'\s+', '+', re.sub(r'\s*,\s*|\s*&\s*', ' ', key))
    print(key, '--->', vals)
    

gliomas+glioneuronal+tumours+neuronal+tumours ---> ['central+nervous+system+tumours']
neuronal+tumours ---> ['central+nervous+system+tumours']
glioneuronal+tumours ---> ['central+nervous+system+tumours']
gliomas ---> ['gliomas+glioneuronal+tumours+neuronal+tumours', 'gliomas', 'paediatric+tumours', 'eye+tumours', 'glioneuronal+tumours', 'optic+nerve+tumours', 'neuronal+tumours', 'central+nervous+system+tumours', 'cns+tumours']
adult-type+diffuse+gliomas ---> ['gliomas+glioneuronal+tumours+neuronal+tumours', 'gliomas', 'glioneuronal+tumours', 'neuronal+tumours', 'central+nervous+system+tumours']
astrocytoma ---> ['gliomas+glioneuronal+tumours+neuronal+tumours', 'adult-type+diffuse+gliomas', 'tumours+optic+disc+optic+nerve', 'tumours', 'gliomas', 'primary+tumours', 'eye+tumours', 'retina', 'neuroepithelium', 'glioneuronal+tumours', 'neurosensory+retina', 'tumours+neurosensory+retina', 'tumours+retina+neuroepithelium', 'optic+disc+optic+nerve', 'neuronal+tumours', 'central+nervous+system+

clear+cell+sarcoma+kidney ---> ['urinary+male+genital+tumours', 'tumours+kidney', 'tumours', 'renal+mesenchymal+tumours', 'metanephric+tumours', 'mesenchymal+renal+tumours', 'paediatric+tumours', 'renal+male+genital+tumours', 'kidney', 'paediatric+renal+mesenchymal+tumours', 'renal+tumours', 'male+genital+tumours']
embryonal+neoplasms+kidney ---> ['urinary+male+genital+tumours', 'tumours+kidney', 'tumours', 'kidney', 'male+genital+tumours']
embryonal+neoplasms ---> ['urinary+male+genital+tumours', 'tumours+kidney', 'tumours', 'kidney', 'male+genital+tumours']
nephroblastic+tumours ---> ['kidney', 'urinary+male+genital+tumours', 'tumours+kidney', 'tumours', 'embryonal+neoplasms+kidney', 'embryonal+neoplasms', 'male+genital+tumours']
nephrogenic+rests ---> ['embryonal+neoplasms', 'urinary+male+genital+tumours', 'nephroblastic+tumours', 'tumours+kidney', 'tumours', 'embryonal+neoplasms+kidney', 'kidney', 'male+genital+tumours']
paediatric+cystic+nephroma ---> ['kidney', 'urinary+male+geni

tumours+uncertain+derivation ---> ['soft+tissue+bone+tumours', 'eye+tumours', 'paediatric+tumours', 'soft+tissue+tumours', 'mesenchymal+tumours']
superficial+angiomyxoma ---> ['tumours+uncertain+differentiation', 'female+genital+tumours', 'lower+genital+tract', 'soft+tissue+bone+tumours', 'uncertain+differentiation', 'tumours', 'uncertain+derivation', 'paediatric+tumours', 'mesenchymal+tumours+lower+genital+tract', 'soft+tissue+tumours', 'mesenchymal+tumours', 'tumours+uncertain+derivation']
deep+angiomyxoma ---> ['soft+tissue+bone+tumours', 'tumours', 'uncertain+derivation', 'paediatric+tumours', 'soft+tissue+tumours', 'tumours+uncertain+derivation']
myoepithelial+tumours+soft+tissue ---> ['soft+tissue+bone+tumours', 'tumours', 'uncertain+derivation', 'paediatric+tumours', 'soft+tissue+tumours', 'tumours+uncertain+derivation']
myoepithelial+tumours ---> ['soft+tissue+bone+tumours', 'epithelial+tumours', 'tumours', 'uncertain+derivation', 'paediatric+tumours', 'eye+tumours', 'tumours+l

acute+myeloid+leukaemia+gene+mutations ---> ['acute+myeloid+leukaemia+recurrent+genetic+abnormalities', 'related+precursor+neoplasms', 'recurrent+genetic+abnormalities', 'tumours', 'acute+myeloid+leukaemia+related+precursor+neoplasms', 'acute+myeloid+leukaemia', 'haematopoietic+lymphoid+tissues', 'tumours+haematopoietic+lymphoid+tissues']
acute+myeloid+leukaemia+mutated+npm1 ---> ['acute+myeloid+leukaemia+recurrent+genetic+abnormalities', 'related+precursor+neoplasms', 'recurrent+genetic+abnormalities', 'tumours', 'acute+myeloid+leukaemia+related+precursor+neoplasms', 'acute+myeloid+leukaemia', 'haematopoietic+lymphoid+tissues', 'tumours+haematopoietic+lymphoid+tissues']
mutated+npm1 ---> ['acute+myeloid+leukaemia+recurrent+genetic+abnormalities', 'related+precursor+neoplasms', 'recurrent+genetic+abnormalities', 'tumours', 'acute+myeloid+leukaemia+related+precursor+neoplasms', 'acute+myeloid+leukaemia', 'haematopoietic+lymphoid+tissues', 'tumours+haematopoietic+lymphoid+tissues']
acute

In [None]:
# Reload the search-term mapping from its pickle file.
search_space = get_search_term_mapping()

In [None]:
# Display the whole mapping; the output can be very long.
search_space

In [None]:
# Scratch check of the affiliation-tag regex on a sample 'AD' line.
line = 'AD  - Writing Committee J Skinner East Anglia region; MP Maslanyj; TJ Mee and SG Allen'
# Removes the leading "AD  - " tag so only the affiliation text remains.
re.sub(r'^AD\s*\-\s*', '', line).strip()

In [None]:
%%time
# Build the search URL for one query and page and display it for inspection;
# no request is sent from this cell.
query = 'central+nervous+system+tumours'
pageno=1
url = f'https://pubmed.ncbi.nlm.nih.gov/?term={query}&filter=pubt.clinicalconference&filter=pubt.clinicalstudy&filter=pubt.clinicaltrial&filter=pubt.clinicaltrialprotocol&filter=pubt.clinicaltrialphasei&filter=pubt.clinicaltrialphaseii&filter=pubt.clinicaltrialphaseiii&filter=pubt.clinicaltrialphaseiv&filter=pubt.comparativestudy&filter=pubt.controlledclinicaltrial&filter=pubt.editorial&filter=pubt.meta-analysis&filter=pubt.observationalstudy&filter=pubt.practiceguideline&filter=pubt.pragmaticclinicaltrial&filter=pubt.randomizedcontrolledtrial&filter=pubt.researchsupportamericanrecoveryandreinvestmentact&filter=pubt.researchsupportnihextramural&filter=pubt.researchsupportnihintramural&filter=pubt.researchsupportnonusgovt&filter=pubt.researchsupportusgovtnonphs&filter=pubt.researchsupportusgovtphs&filter=pubt.researchsupportusgovernment&filter=pubt.validationstudy&show_snippets=off&size=200&page={pageno}&format=pubmed'
url
# NOTE(review): disabled call; no function named scrape_pubmed_results is defined
# in the visible cells (the scraper there is scrape_pubmed_results_page).
#scrape_pubmed_results(query)

In [None]:
%%time
# Scrape result pages for one query, carrying the running rank from page to page.
query = 'central+nervous+system+tumours'
# pageno is incremented before the first request, so fetching starts at page 11.
pageno = 10
ranking = 0

all_pubmed_articles = []
while(True):
    pageno += 1
    
    pubmed_articles, ranking = scrape_pubmed_results_page(query, pageno, ranking)
    print(f'\tpage={pageno}, ranking={ranking}, length={len(pubmed_articles)}')
    
    #break
    # Stop as soon as a page yields no articles.
    if len(pubmed_articles) == 0:
        break
    all_pubmed_articles.extend(pubmed_articles)
    # NOTE(review): this unconditional break ends the loop after the first page,
    # so only one page is ever collected — confirm whether it is a debugging leftover.
    break


In [None]:
# Number of articles collected by the scraping loop.
len(all_pubmed_articles)

In [None]:
# Inspect the first collected record; raises IndexError when nothing was collected.
all_pubmed_articles[0]

In [None]:
# Number of keys in the search-term mapping.
len(search_space)

In [None]:
# Display the whole search-term mapping; the output can be very long.
search_space