# title: "what does us this abstract (title) tell"

In [None]:
#inspired by: 
# https://medium.com/@kliang933/scraping-big-data-from-public-research-repositories-e-g-pubmed-arxiv-2-488666f6f29b
# https://marcobonzanini.com/2015/01/12/searching-pubmed-with-python/

In [1]:
from Bio import Entrez

## define some helper functions here

In [153]:
def search(query):
    Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed', 
                            sort='relevance', 
                            retmax='20',
                            retmode='xml', 
                            term=query)
    results = Entrez.read(handle)
    return results

In [3]:
def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

## plan:

In [None]:
# the idea is to collect titles (later we can also collect abstracts) from NCBI from different journals
# In the next steps this data should be processed by NLP and key words (non stop words) will be extracted
# In the last step this data can be used to test if we can come up with a machine learning approach which will give us a prediction if the title (or abstract) is from a high impact journal of not

#

## scraping data v1

In [None]:
GroupA=["Nature", "Science", "PNAS","JACS"]
GroupB=["E-life", "Mol Cell","Plos Computational","Nature communication","Cell"]
GroupC=["Nature Biotech", "Nature Chem Bio", "Nature Str Bio", "Nature Methods"]

In [180]:
GroupA=["Nature"]
GroupB=["Mol Cell"]
GroupC=["Cells"]

years = [2018, 2019]

journals = {'A': GroupA, 'B': GroupB, 'C': GroupC}

res_d = {}

for year in years:
    
    for group,items in journals.items():

        for j in journals[group]:

            # define the search term:
            pub_date = '"{}/01/01"[Date - Publication] : "{}/01/01"[Date - Publication])'.format(year, year+1)
            search_term = "{} AND {}[TA]".format(pub_date, j)
            print(search_term)
            
            # search the data base: 
            results = search(search_term) # query
            id_list = results['IdList'] # list of UIDs - this will give us the article IDs
            chunk_size = 50 # how much data you want to read in one instance - there is a limit to get server answers

            for chunk_i in range(0, len(id_list), chunk_size):
                chunk = id_list[chunk_i:chunk_i + chunk_size]
                try: 
                    papers = fetch_details(chunk)
                    for i, paper in enumerate(papers['PubmedArticle']):
                        do_something() # perhaps write to a csv file
                except: # occasionally a chunk might annoy your parser
                    pass

        titles_list = []
        for articles in papers["PubmedArticle"]:
            titles_list.append(articles["MedlineCitation"]["Article"]["ArticleTitle"])
        
        #key = str(j) + 
        res_d[str(group) + str(year)] = {j:[titles_list, search_term]}

"2018/01/01"[Date - Publication] : "2019/01/01"[Date - Publication]) AND Nature[TA]
"2018/01/01"[Date - Publication] : "2019/01/01"[Date - Publication]) AND Mol Cell[TA]
"2018/01/01"[Date - Publication] : "2019/01/01"[Date - Publication]) AND Cells[TA]
"2019/01/01"[Date - Publication] : "2020/01/01"[Date - Publication]) AND Nature[TA]
"2019/01/01"[Date - Publication] : "2020/01/01"[Date - Publication]) AND Mol Cell[TA]
"2019/01/01"[Date - Publication] : "2020/01/01"[Date - Publication]) AND Cells[TA]


In [181]:
res_d

{'A2018': {'Nature': [['Structure of the post-translational protein translocation machinery of the ER membrane.',
    '3D printing mimics metals.',
    'Brexit impacts, quitting coal and Lassa-fever outbreak.',
    'Beyond the periodic table.',
    'Japan should put the brakes on stem-cell sales.',
    'Dating of hominin discoveries at Denisova.',
    "Protect Madagascar's national parks from pillage.",
    'UN could lead debate on gene editing.',
    'Experimentally trained statistical models boost nuclear-fusion performance.',
    'Secondary organic aerosol reduced by mixture of atmospheric vapours.',
    'Timing of archaic hominin occupation of Denisova Cave in southern Siberia.',
    'Tripled yield in direct-drive laser fusion through statistical modelling.',
    "Ethiopia's church forests are a last refuge for dwindling biodiversity.",
    'To learn inclusion skills, make it personal.',
    'Scientists relieved but wary as US shutdown ends.',
    'Drones unleashed against invasive

In [None]:
##### NEXT STEPS #####
# refine the journal list - high impact yes or now - is there a data base and a source for a classification of high impact?
# refine the handle :
#     handle = Entrez.esearch(db='pubmed', 
                            # sort='relevance',  - do we want to search it by relevance? probably not, maybe we want to take random 10000 or something  
                            # retmax='20', - performance, is it possible to querry 10000 with one run? needs to be tested
                            # retmode='xml', 
                            # term=query)