# Inhibition and Interference in the deep brain

### Code for lit. searches and meta-analytical statistics

In [1]:
import sys
import os
from os.path import join as opj
import pandas as pd
import numpy as np
from docx import Document
import pymed as pm # for pubmed querying
from Bio import Entrez
import neurosynth as ns # for neurosynth querying (still in development phase [caution!])
#import pygraphviz as pgv
import pydot


In [2]:
# change width of notebook
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# change max row pandas will spit out
pd.set_option('display.max_rows', 5000)

In [31]:
# define some random necessary parameters
email = 'scottifict@gmail.com'
Entrez.email = email

Define keywords

In [32]:
# set keyword arguments for pubmed
keyword_arguments = [["interference",
                      "interference control",
                      "conflict",
                      "conflict control",
                      "cognitive control", 
                      "stroop",
                      "simon",
                      "flanker",
                      "stop-signal",
                      "stop signal",
                      "stop task",
                      "stop-signal reaction time",
                      "stop signal reaction time",
                      "go/no go",
                      "go no go",
                      "go-no go",
                      "go/nogo",
                      "go/no-go",
                      "go-no-go",
                      "selective inhibition",
                      "global inhibition",
                      "inhibition",
                      "response inhibition",
                      "inhibitory control",
                      "multi source interference task",
                      "msit",
                      "multi-source interference task"],
                      ["fmri", "functional mri", "functional magnetic resonance imaging"]]

In [33]:
# Neurosynth already uses only fmri data, so just pass it keyword_arguments[0]
neurosynth_keywords = keyword_arguments[0]

# Separate by search engine (Phase 1)

Notes:
    
Because PubMed does not necessarily use the query terms that you pass it, we will first use the Bio.Entrez module to query search PubMed and compare the actual searched query to the original qurery that we inputted. Then, we can extract the PubMed IDs that Entrez finds, and use this to accurately find the articles of interest. 

run code to list attributes of records output:
for record in records:
    print(record)

Info to save for flowchart showing search method: 1) number of total pm articles found for all keywords, 2) number of pm articles found after duplciates removed, 3) number of total neurosynth articles found for all keywords 4) number of neurosynth articles found after duplciates removed 5) total number of ns and pm articles to read abstracts of in phase 1

Summary:

1) Using Entrez, extract PMIDs for articles of interest

2) Using Pymed, download abstracts and output to excel docs

### Attempt new verion using bio.entrez instead of pymed

In [34]:
# Set the saving directory for all pubmed data
save_dir = opj(os.getcwd(), 'query_results/pubmed_results')

In [39]:
# Use Entrez to query search and output articles of interest
pm_search = pd.DataFrame(columns = ["pmid","query_orig","query_trans"])

for args in keyword_arguments[0]: # loop over first level keyword arguments
    for method in keyword_arguments[1]: # loop over second level keyword arguments
        
        query_term = args + ' ' + method # merge keyword arguments
        
        # setup Entrez search module
        handle = Entrez.esearch(db="pubmed", term=query_term, retmax=100000, idtype="acc")
        records = Entrez.read(handle)
        handle.close()
        
        # loop over found articles, append to dataframe
        for res in records["IdList"]:
            
            pm_search = pm_search.append(pd.DataFrame({"pmid":[res],"query_orig":[query_term],"query_trans":[records["QueryTranslation"]]}))
        
        # Print some info on the findings
        num_records = len(records["IdList"])    
        print(f"Query search for {query_term} argument found {num_records} articles")
        
        searched = str(records["QueryTranslation"])
        print(f"original search term was {query_term}, the used search term was {searched}\n")
        

Query search for interference fmri argument found 2672 articles
original search term was interference fmri, the used search term was interference[All Fields] AND ("magnetic resonance imaging"[MeSH Terms] OR ("magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "magnetic resonance imaging"[All Fields] OR "fmri"[All Fields])

Query search for interference functional mri argument found 2643 articles
original search term was interference functional mri, the used search term was interference[All Fields] AND ("magnetic resonance imaging"[MeSH Terms] OR ("magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "magnetic resonance imaging"[All Fields] OR ("functional"[All Fields] AND "mri"[All Fields]) OR "functional mri"[All Fields])

Query search for interference functional magnetic resonance imaging argument found 2595 articles
original search term was interference functional magnetic resonance imaging, the used search term was interferenc

Query search for cognitive control functional mri argument found 11014 articles
original search term was cognitive control functional mri, the used search term was ("Cogn Int Conf Adv Cogn Technol Appl"[Journal] OR "cognitive"[All Fields]) AND ("prevention and control"[Subheading] OR ("prevention"[All Fields] AND "control"[All Fields]) OR "prevention and control"[All Fields] OR "control"[All Fields] OR "control groups"[MeSH Terms] OR ("control"[All Fields] AND "groups"[All Fields]) OR "control groups"[All Fields]) AND ("magnetic resonance imaging"[MeSH Terms] OR ("magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "magnetic resonance imaging"[All Fields] OR ("functional"[All Fields] AND "mri"[All Fields]) OR "functional mri"[All Fields])

Query search for cognitive control functional magnetic resonance imaging argument found 10699 articles
original search term was cognitive control functional magnetic resonance imaging, the used search term was ("Cogn Int C

Query search for stop task fmri argument found 420 articles
original search term was stop task fmri, the used search term was stop[All Fields] AND task[All Fields] AND ("magnetic resonance imaging"[MeSH Terms] OR ("magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "magnetic resonance imaging"[All Fields] OR "fmri"[All Fields])

Query search for stop task functional mri argument found 398 articles
original search term was stop task functional mri, the used search term was stop[All Fields] AND task[All Fields] AND ("magnetic resonance imaging"[MeSH Terms] OR ("magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "magnetic resonance imaging"[All Fields] OR ("functional"[All Fields] AND "mri"[All Fields]) OR "functional mri"[All Fields])

Query search for stop task functional magnetic resonance imaging argument found 389 articles
original search term was stop task functional magnetic resonance imaging, the used search term was stop[A

Query search for go-no go functional magnetic resonance imaging argument found 407 articles
original search term was go-no go functional magnetic resonance imaging, the used search term was go-no[All Fields] AND go[All Fields] AND ("magnetic resonance imaging"[MeSH Terms] OR ("magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "magnetic resonance imaging"[All Fields] OR ("functional"[All Fields] AND "magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "functional magnetic resonance imaging"[All Fields])

Query search for go/nogo fmri argument found 183 articles
original search term was go/nogo fmri, the used search term was go/nogo[All Fields] AND ("magnetic resonance imaging"[MeSH Terms] OR ("magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "magnetic resonance imaging"[All Fields] OR "fmri"[All Fields])

Query search for go/nogo functional mri argument found 172 articles
original search term was go

Query search for inhibition fmri argument found 5237 articles
original search term was inhibition fmri, the used search term was ("inhibition, psychological"[MeSH Terms] OR ("inhibition"[All Fields] AND "psychological"[All Fields]) OR "psychological inhibition"[All Fields] OR "inhibition"[All Fields]) AND ("magnetic resonance imaging"[MeSH Terms] OR ("magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "magnetic resonance imaging"[All Fields] OR "fmri"[All Fields])

Query search for inhibition functional mri argument found 5148 articles
original search term was inhibition functional mri, the used search term was ("inhibition, psychological"[MeSH Terms] OR ("inhibition"[All Fields] AND "psychological"[All Fields]) OR "psychological inhibition"[All Fields] OR "inhibition"[All Fields]) AND ("magnetic resonance imaging"[MeSH Terms] OR ("magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "magnetic resonance imaging"[All Fields] OR ("f

Query search for msit functional magnetic resonance imaging argument found 33 articles
original search term was msit functional magnetic resonance imaging, the used search term was msit[All Fields] AND ("magnetic resonance imaging"[MeSH Terms] OR ("magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "magnetic resonance imaging"[All Fields] OR ("functional"[All Fields] AND "magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "functional magnetic resonance imaging"[All Fields])

Query search for multi-source interference task fmri argument found 44 articles
original search term was multi-source interference task fmri, the used search term was multi-source[All Fields] AND interference[All Fields] AND task[All Fields] AND ("magnetic resonance imaging"[MeSH Terms] OR ("magnetic"[All Fields] AND "resonance"[All Fields] AND "imaging"[All Fields]) OR "magnetic resonance imaging"[All Fields] OR "fmri"[All Fields])

Query search for multi-s

In [40]:
print(f'{len(pm_search)} articles found overall')
all_pm_search = pm_search

112056 articles found overall


In [41]:
all_pm_search.head()

Unnamed: 0,pmid,query_orig,query_trans
0,32356855,interference fmri,"interference[All Fields] AND (""magnetic resona..."
0,32342465,interference fmri,"interference[All Fields] AND (""magnetic resona..."
0,32314048,interference fmri,"interference[All Fields] AND (""magnetic resona..."
0,32311411,interference fmri,"interference[All Fields] AND (""magnetic resona..."
0,32299791,interference fmri,"interference[All Fields] AND (""magnetic resona..."


In [42]:
pm_search.drop_duplicates(subset="pmid", keep="first", inplace=True)
print(f'{len(pm_search)} unique articles found from all pubmed searches')

26478 unique articles found from all pubmed searches


In [8]:
# Remove duplicates 
print(f'{len(pm_search)} articles found overall')
pm_search.drop_duplicates(subset="pmid", keep="first", inplace=True)
print(f'{len(pm_search)} unique articles found from all pubmed searches')
entrez_total = len(pm_search)

26391 articles found overall
26391 unique articles found from all pubmed searches


In [53]:
# give new index and check the new dataframe
pm_search.reset_index(drop=True,inplace=True)
pm_search.head()

Unnamed: 0,pmid,query_orig,query_trans
0,32314048,interference fmri,"interference[All Fields] AND (""magnetic resona..."
1,32311411,interference fmri,"interference[All Fields] AND (""magnetic resona..."
2,32299791,interference fmri,"interference[All Fields] AND (""magnetic resona..."
3,32285357,interference fmri,"interference[All Fields] AND (""magnetic resona..."
4,32285159,interference fmri,"interference[All Fields] AND (""magnetic resona..."


In [55]:
# save search terms and PMIDs
pm_search.to_excel(opj(save_dir, 'excel', 'pm_search.xlsx'), sheet_name = 'pubmed_info')

In [70]:
pm_search = pd.read_excel(opj(save_dir, 'excel', 'pm_search.xlsx'))

## PubMed

Due to the amount of searches that need to be done (26391) the code keeps crashing (probably because of the amount of qeury searches in a small amount of time). Therefore, it is probably easiest to split the search into 27 ~1000 batches and run it. 

In [10]:
import time

In [262]:
pubmed = pm.PubMed(tool="MyTool", email = email)

duplicate_checker_pm = [] # setup list incase multiple articles are found (should only find one per PMID)

all_pm_articles = pd.DataFrame(columns = ["title","authors","pmid","keywords","pubdate","abstract"]) # create pd object to append to

counter = 1

print(f'There are {entrez_total} articles to extract')

for pmid_args in pm_search.pmid: # loop over all keyword argmuents
    
    results = pubmed.query(query=pmid_args, max_results=10) # run search query
        
    for article in results: # loop over all found articles
    
        if '\n' in article.pubmed_id: # extract pubmed ID
            article_id = article.pubmed_id.split('\n')[0]
        else:
            article_id = article.pubmed_id
                
        article_title = article.title # extract title

        if hasattr(article,'keywords'):
            if article.keywords: # find keywords if they exist
                if None in article.keywords:
                    article.keywords.remove(None)
                article_keywords = '", "'.join(article.keywords)
            else:
                article_keywords = "Article has no keywords"
            
        article_publication_date = article.publication_date # extract publication date
        article_abstract = article.abstract # extract abstract

        article_authors = []
        for auths in range(len(article.authors)):
            if auths == len(article.authors)-1:
                if article.authors[auths]['lastname']==None or article.authors[auths]['firstname']==None:
                    print('Skipping author, nonetype')
                else:
                    article_authors.append(article.authors[auths]['lastname'] + ', ' + article.authors[auths]['initials'])
            else:
                if article.authors[auths]['lastname']==None or article.authors[auths]['firstname']==None:
                    print('Skipping author, nonetype')
                else:
                    article_authors.append(article.authors[auths]['lastname'] + ', ' + article.authors[auths]['initials'] + '.,')
        
        # Throw all this info into a pandas dataframe so we can export it to excel
        all_pm_articles = all_pm_articles.append(pd.DataFrame({"title":[article_title],"authors":[str(' '.join(article_authors))],"pmid":[article_id],
                                          "keywords":[article_keywords],"pubdate":[article_publication_date],
                                          "abstract":[article_abstract]}))
        
        # provide some output so we know what stage the code is at
        if counter%50==0:
            print(f'Processing article {counter}...')
        counter += 1
        
        time.sleep(0.5)
        
        # SAVE EXCEL DOCUMENT every iteration incase the code crashes
        all_pm_articles.to_excel(opj(save_dir, 'excel', 'pubmed_articles_fifth_sunday.xlsx'), sheet_name = 'pmid_pubmed')
        
print('Finished')

There are 26391 articles to extract
Finished


In [1]:
print(f'{entrez_total} articles were suppossed to be found')
print(f'{len(all_pm_articles)} were found')

NameError: name 'entrez_total' is not defined

In [263]:
# load, combine and save the different search batches
first = pd.read_excel(opj(save_dir, 'excel', 'pubmed_articles_first_sunday.xlsx'),index_col=0)
second = pd.read_excel(opj(save_dir, 'excel', 'pubmed_articles_second_sunday.xlsx'),index_col=0)
third = pd.read_excel(opj(save_dir, 'excel', 'pubmed_articles_third_sunday.xlsx'),index_col=0)
fourth = pd.read_excel(opj(save_dir, 'excel', 'pubmed_articles_fourth_sunday.xlsx'),index_col=0)
fifth = pd.read_excel(opj(save_dir, 'excel', 'pubmed_articles_fifth_sunday.xlsx'),index_col=0)
all_pms = pd.concat([first,second,third,fourth,fifth])
all_pms.drop_duplicates(subset="pmid", keep="first", inplace=True)
all_pms.reset_index(inplace=True,drop=True)
all_pms.to_excel(opj(save_dir, 'excel', 'pubmed_articles_combined.xlsx'), sheet_name = 'pmid_pubmed')

In [264]:
# make some columns strings to make it easier to index
all_pms["pubdate"] = all_pms["pubdate"].astype(str)
all_pms["pmid"] = all_pms["pmid"].astype(str)
pm_search["pmid"] = pm_search["pmid"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [266]:
# remove all articles that were before 2010
all_pms["pubdate"] = all_pms["pubdate"].astype(str)
all_pms = all_pms[all_pms['pubdate'].str.startswith(('2020','2019','2018','2017','2016','2015','2014','2013','2012','2011','2010'),na=False)]

## Neurosynth

In [267]:
from neurosynth import Dataset

In [268]:
# First thing to do - Download the current Neurosynth database
ns_save_path = 'query_results/neurosynth_results/database_info'
ns.dataset.download(path=ns_save_path, unpack=True) # download the current dataset and unpack into 'database.txt' and 'features.txt'

Downloading the latest Neurosynth files: https://github.com/neurosynth/neurosynth-data/blob/master/current_data.tar.gz?raw=true bytes: 1


8192  [819200.00%16384  [1638400.00%24576  [2457600.00%32768  [3276800.00%40960  [4096000.00%49152  [4915200.00%57344  [5734400.00%65536  [6553600.00%73728  [7372800.00%81920  [8192000.00%90112  [9011200.00%98304  [9830400.00%106496  [10649600.00%114688  [11468800.00%122880  [12288000.00%131072  [13107200.00%139264  [13926400.00%147456  [14745600.00%155648  [15564800.00%163840  [16384000.00%172032  [17203200.00%180224  [18022400.00%188416  [18841600.00%196608  [19660800.00%204800  [20480000.00%212992  [21299200.00%221184  [22118400.00%229376  [22937600.00%237568  [23756800.00%245760  [24576000.00%253952  [25395200.00%262144  [26214400.00%270336  [27033600.00%278528  [27852800.00%286720  [28672000.00%294912  [29491200.00%303104  [30310400.00%311296  [31129600.00%319488  [31948800.00%327680  [32768000.00%335872  [33587200.00%344064  [34406400.00%352256  [35225600.00%360448  [36044800.00%368640  [36864000.00%376832  [37683200.00%385024  [38502400.00%393216  [39321600.00%401408  [40140800.

15949824  [1594982400.00%]15958016  [1595801600.00%]15966208  [1596620800.00%]15974400  [1597440000.00%]15982592  [1598259200.00%]15990784  [1599078400.00%]15998976  [1599897600.00%]16007168  [1600716800.00%]16015360  [1601536000.00%]16023552  [1602355200.00%]16031744  [1603174400.00%]16039936  [1603993600.00%]16048128  [1604812800.00%]16056320  [1605632000.00%]16064512  [1606451200.00%]16072704  [1607270400.00%]16080896  [1608089600.00%]16089088  [1608908800.00%]16097280  [1609728000.00%]

In [269]:
# Setup up the dataset object to search through
from neurosynth.base.dataset import Dataset
dataset = Dataset(opj(ns_save_path,'database.txt'))
dataset.add_features(opj(ns_save_path,'features.txt'))

# only run code below if you want to save the neurosynth database search
# this is mainly important when running to code for real
date_when_saving = input('What is the date? (e.g., 030420) ')
dataset.save(opj(ns_save_path, 'dataset_' + str(date_when_saving) + '.pkl'))

What is the date? (e.g., 030420) 040520


In [272]:
# Get list of possible feature names to search through 
ns_feature_list = dataset.get_feature_names()
len(ns_feature_list)

3228

In [273]:
# check if any of of our keywords are in the feature list and make new list to feature search
ns_features = []
for i in neurosynth_keywords:
    if i in ns_feature_list:
        ns_features.append(i)

Neurosynth does not have a way to keyword search, they only have feature search.
Therefore, we will first feature search based on our keyword list.
Then, we will download all abstracts and keyword search the abstracts

In [274]:
# Download all neurosynth abstracts
ns_abstracts = ns.base.dataset.download_abstracts(dataset, email = email)

In [275]:
# create list of all IDs of interest
ns_interest = pd.DataFrame(columns=['pmid','keyword'])
for feat_search in ns_features:
    
    ns_ids = dataset.get_studies(features=feat_search, frequency_threshold=0.001) # use a pretty liberal frequency threshold
    ns_add_interest = pd.DataFrame({'pmid':ns_ids,'keyword':feat_search}) # save the IDs that were found
    ns_interest = ns_interest.append(ns_add_interest) # append it to an overall dataframe

print(f"Total number of features of interest found, with duplicates: {len(ns_interest)}")
# Now we want to remove the redundency of IDs from the search query results
ns_interest.drop_duplicates(subset = "pmid", keep = 'first', inplace = True)

print(f"Total number of features of interest found, without duplicates: {len(ns_interest)}")

Total number of features of interest found, with duplicates: 2551
Total number of features of interest found, without duplicates: 1679


In [276]:
# We now have a dataframe of all of the PubMed IDs that we want to look at
# Next: Keyword search the abstracts of all of the neurosynth datasets

ns_interest_abstract = pd.DataFrame(columns = ['pmid','abstract','keyword'])
for keyword_search in neurosynth_keywords:
    for index, row in ns_abstracts.iterrows():
        if keyword_search in row['abstract']:
             ns_interest_abstract = ns_interest_abstract.append(pd.concat([ns_abstracts.iloc[[index]].reset_index(drop=True),
                                                                pd.DataFrame({'keyword':[keyword_search]})],axis=1))

print(f"Total number of abstracts of interest found, with duplicates: {len(ns_interest_abstract)}")
# Now we want to remove the redundency of IDs from the search query results again
ns_interest_abstract.drop_duplicates(subset = "pmid", keep = 'first', inplace = True)

print(f"Total number of abstracts of interest found, without duplicates: {len(ns_interest_abstract)}")

Total number of abstracts of interest found, with duplicates: 2620
Total number of abstracts of interest found, without duplicates: 1750


In [277]:
# Now, we can compare the pubmed IDs of these two dataframes and remove redundency
ns_interest["abstract"] = 'None' # create 'abstract' column for cancatenation
ns_interest_abstract = ns_interest_abstract[['pmid','keyword','abstract']] # adapt column orders

ns_final_interest = pd.concat([ns_interest, ns_interest_abstract]).reset_index(drop=True) # concatenate
print(f'Total combined number of studies, with duplicates = {len(ns_final_interest)}')

ns_final_interest['pmid'] = ns_final_interest['pmid'].astype('str')
ns_final_interest.drop_duplicates(subset = "pmid", keep = 'first', inplace = True) # drop duplicates
print(f'Total combined number of studies, without duplicates = {len(ns_final_interest)}')
ns_total = len(ns_final_interest)

Total combined number of studies, with duplicates = 3429
Total combined number of studies, without duplicates = 1832


In [278]:
# We now have PubMed IDs for all of the Neurosynth articles of interest
# We can use the PudMed query searcher again to extract the information we want from the articles

In [284]:
save_dir = opj(os.getcwd(), 'query_results/neurosynth_results')

pubmed = pm.PubMed(tool="MyTool", email = 'scott@leeclan.net')

# pubmed = pm.PubMed(tool="MyTool", email = email) # this is already done above

duplicate_checker = [] # setup list incase multiple articles are found (should only find one per PMID)

article_info = pd.DataFrame(columns = ["title","authors","pmid","keywords","pubdate","abstract"]) # create pd object to append to

counter = 1

print(f'There are {ns_total} articles to extract')

for pmid_args in ns_final_interest.pmid: # loop over all keyword argmuents
    
    results = pubmed.query(query=pmid_args, max_results=2) # run search query
    lengthable = pubmed.query(query=pmid_args, max_results=2) # non-elegant method to measure length of results
    
    len_of_results = sum(1 for x in lengthable) # only way to get length of results output
    
    if len_of_results > 1: # skip this PMID query search if more than one article comes up
        duplicate_checker = duplicate_checker.append(pmid_args)
        continue
    elif len_of_results == 0:
        print(f"PMID: {pmid_args} not found")
        
    for article in results: # loop over all found articles
    
        if '\n' in article.pubmed_id: # extract pubmed ID
            article_id = article.pubmed_id.split('\n')[0]
        else:
            article_id = article.pubmed_id # extract info from each article
            
        article_title = article.title # extract title

        if hasattr(article,'keywords'):
            if article.keywords: # find keywords if they exist
                if None in article.keywords:
                    article.keywords.remove(None)
                article_keywords = '", "'.join(article.keywords)
            else:
                article_keywords = "Article has no keywords"
            
        article_publication_date = article.publication_date # extract publication date
        article_abstract = article.abstract # extract abstract

        article_authors = []
        for auths in range(len(article.authors)):
            if auths == len(article.authors)-1:
                if article.authors[auths]['lastname']==None or article.authors[auths]['firstname']==None:
                    print('Skipping author, nonetype')
                else:
                    article_authors.append(article.authors[auths]['lastname'] + ', ' + article.authors[auths]['initials'])
            else:
                if article.authors[auths]['lastname']==None or article.authors[auths]['firstname']==None:
                    print('Skipping author, nonetype')
                else:
                    article_authors.append(article.authors[auths]['lastname'] + ', ' + article.authors[auths]['initials'] + '.,')
        
        # Throw all this info into a pandas dataframe so we can export it to excel
        article_info = article_info.append(pd.DataFrame({"title":[article_title],"authors":[str(' '.join(article_authors))],"pmid":[article_id],
                                          "keywords":[article_keywords],"pubdate":[article_publication_date],
                                          "abstract":[article_abstract]}))
        
        # provide some output so we know what stage the code is at
        if counter%50==0:
            print(f'Processing article {counter}...')
        counter += 1
        
        time.sleep(0.5)
        
# SAVE EXCEL DOCUMENT
article_info.to_excel(opj(save_dir, 'excel', 'neurosynth_articles.xlsx'), sheet_name = 'pmid_neurosynth')

print('Finished')

There are 1832 articles to extract
Processing article 50...
Processing article 100...
Processing article 150...
Processing article 200...
Skipping author, nonetype
Processing article 250...
Processing article 300...
Skipping author, nonetype
Processing article 350...
Processing article 400...
Processing article 450...
Processing article 500...
Processing article 550...
Skipping author, nonetype
Processing article 600...
Processing article 650...
Processing article 700...
Processing article 750...
Processing article 800...
Processing article 850...
Processing article 900...
Processing article 950...
Processing article 1000...
Processing article 1050...
Processing article 1100...
Processing article 1150...
Processing article 1200...
Processing article 1250...
Processing article 1300...
Processing article 1350...
Processing article 1400...
Processing article 1450...
Processing article 1500...
Processing article 1550...
Processing article 1600...
Skipping author, nonetype
Processing articl

In [286]:
# check ns query serach ran correctly 
print(f'Length of article_info ({len(article_info)}) should be the same as ns_total ({ns_total})')

Length of article_info (1832) should be the same as ns_total (1832)


In [289]:
# remove articles prior to 2010
article_info["pubdate"] = article_info["pubdate"].astype(str)
article_info = article_info[article_info['pubdate'].str.startswith(('2020','2019','2018','2017','2016','2015','2014','2013','2012','2011','2010'),na=False)]

In [290]:
len(article_info)
# there are 1254 articles after or during 2010

1254

## Remove duplicates between pubmed and neurosynth searches

In [295]:
# compare article_info from neurosynth search to all_pm_articles from pubmed search to remove duplicates
# First: concatenate articles
all_pms['type'] = 'pm'
article_info['type'] = 'ns'
all_articles = pd.concat([all_pms, article_info])

In [292]:
# check concatenation was successful
w_dupe = len(all_articles)
print(f'The total number of articles should be {ns_total} + {pm_total} = {ns_total + pm_total}')
print(f'The total number of articles is {len(all_articles)}')

NameError: name 'pm_total' is not defined

In [296]:
# Remove duplicates
all_articles.drop_duplicates(subset = "pmid", keep = 'first', inplace = True)
all_articles.reset_index(drop=True, inplace=True)

In [297]:
print(f'The total number of articles was {w_dupe}, with duplicates')
print(f'There are now {len(all_articles)} articles from both ns and pm')

The total number of articles was 1257, with duplicates
There are now 19598 articles from both ns and pm


In [298]:
# add colmuns to dataframe so we know whether it should be kept or removed for phase 2
# split this based on 2 raters
# e.g. 'y' for yes (keep), 'n' for no (remove)
all_articles['rater'] = '-'

In [301]:
# save all pm and ns articles
all_articles.to_excel(opj(save_dir, 'excel', 'all_pm_ns_articles.xlsx'), sheet_name = 'pmid_all')

In [302]:
# Randomly shuffle the dataframe so that they are in no particular order for each rater
all_articles_rater1 = all_articles.sample(frac=1).reset_index(drop=True)
all_articles_rater2 = all_articles.sample(frac=1).reset_index(drop=True)

In [308]:
# save thse dataframe for both raters
save_dir = opj(os.getcwd(), 'query_results')
all_articles_rater1.to_excel(opj(save_dir, 'articles_to_check_sct.xlsx'), sheet_name = 'pmid_sct')
all_articles_rater2.to_excel(opj(save_dir, 'articles_to_check_mck.xlsx'), sheet_name = 'pmid_mck')

In [309]:
# Save excel file with all info and abstracts for both query routes
all_articles.to_excel(opj(save_dir, 'all_articles_to_check.xlsx'), sheet_name = 'searches')

# The code below loads, rechecks and gives info on the searches above
Useful instead of running code above again

In [56]:
# start with pubmed
save_dir = opj(os.getcwd(), 'query_results/pubmed_results')

# load and combine the different search batches from the pubmed articles
first = pd.read_excel(opj(save_dir, 'excel', 'pubmed_articles_first_sunday.xlsx'),index_col=0)
second = pd.read_excel(opj(save_dir, 'excel', 'pubmed_articles_second_sunday.xlsx'),index_col=0)
third = pd.read_excel(opj(save_dir, 'excel', 'pubmed_articles_third_sunday.xlsx'),index_col=0)
fourth = pd.read_excel(opj(save_dir, 'excel', 'pubmed_articles_fourth_sunday.xlsx'),index_col=0)
fifth = pd.read_excel(opj(save_dir, 'excel', 'pubmed_articles_fifth_sunday.xlsx'),index_col=0)
all_pms_with_duplicates = pd.concat([first,second,third,fourth,fifth])

# save this as excel for future use/checks
#all_pms_with_duplicates.to_excel(opj(save_dir, 'excel', 'pubmed_articles_all_with_duplicates.xlsx'), sheet_name = 'pmid_pubmed')
n_pm_with_duplicates = len(all_pms_with_duplicates)

# drop the duplicates from this and save again
all_pms_no_duplicates = all_pms_with_duplicates.drop_duplicates(subset="pmid", keep="first", inplace=False)
all_pms_no_duplicates.reset_index(inplace=True,drop=True)
n_pm_no_duplicates = len(all_pms_no_duplicates)
#all_pms_no_duplicates.to_excel(opj(save_dir, 'excel', 'pubmed_articles_no_duplicates.xlsx'), sheet_name = 'pmid_pubmed')

# remove articles from before 2010 and save again
# make some columns strings to make it easier to index
all_pms_no_duplicates["pubdate"] = all_pms_no_duplicates["pubdate"].astype(str)
all_pms_no_duplicates["pmid"] = all_pms_no_duplicates["pmid"].astype(str)

all_pms_no_duplicates_post2010 = all_pms_no_duplicates[all_pms_no_duplicates['pubdate'].str.startswith(('2020','2019','2018','2017','2016','2015','2014','2013','2012','2011','2010'),na=False)]
#all_pms_no_duplicates_post2010.to_excel(opj(save_dir, 'excel', 'pubmed_articles_all_no_duplicates_post_2010.xlsx'), sheet_name = 'pmid_pubmed')
n_pm_post_2010 = len(all_pms_no_duplicates_post2010)

# add type column and save as final sheet for pubmed articles
all_pms_no_duplicates_post2010['type'] = 'pm'
all_pms_final = all_pms_no_duplicates_post2010
#all_pms_final.to_excel(opj(save_dir, 'excel', 'pubmed_articles_all_final.xlsx'), sheet_name = 'pmid_pubmed'
n_pm_final = len(all_pms_final)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [57]:
# then do neurosynth
save_dir = opj(os.getcwd(), 'query_results/neurosynth_results')

# load article sheet 
ns_check = pd.read_excel(opj(save_dir, 'excel', 'neurosynth_articles.xlsx'), index_col=0)
#ns_check.to_excel(opj(save_dir, 'excel', 'neurosynth_articles_all_with_duplicates.xlsx'), sheet_name = 'pmid_ns')
n_ns_with_duplicates = len(ns_check)

# make some columns strings to make it easier to index
ns_check["pubdate"] = ns_check["pubdate"].astype(str)
ns_check["pmid"] = ns_check["pmid"].astype(str)

# drop the duplicates from this and save again
all_ns_no_duplicates = ns_check.drop_duplicates(subset="pmid", keep="first", inplace=False)
all_ns_no_duplicates.reset_index(inplace=True,drop=True)
n_ns_no_duplicates = len(all_ns_no_duplicates)
#all_ns_no_duplicates.to_excel(opj(save_dir, 'excel', 'neurosynth_articles_no_duplicates.xlsx'), sheet_name = 'pmid_neurosynth')

# remove all articles that were before 2010
all_ns_no_duplicates_post_2010 = all_ns_no_duplicates[all_ns_no_duplicates['pubdate'].str.startswith(('2020','2019','2018','2017','2016','2015','2014','2013','2012','2011','2010'),na=False)]
#all_ns_no_duplicates_post_2010.to_excel(opj(save_dir, 'excel', 'neurosynth_articles_all_no_duplicates_post_2010.xlsx'), sheet_name = 'pmid_neurosynth')
n_ns_post_2010 = len(all_ns_no_duplicates_post_2010)

# add type column and save as final sheet for neurosynth articles
all_ns_no_duplicates_post_2010['type'] = 'ns'
all_ns_final = all_ns_no_duplicates_post_2010
#all_ns_final.to_excel(opj(save_dir, 'excel', 'neurosynth_articles_all_final.xlsx'), sheet_name = 'pmid_neurosynth'
n_ns_final = len(all_ns_final)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [58]:
# combine them and recheck 
combine_check = pd.concat([all_pms_final, all_ns_final])
n_combined_with_duplicates = len(combine_check)

combine_check["pmid"] = combine_check["pmid"].astype(str)

# drop duplicates and measure again
combined_without_duplicates = combine_check.drop_duplicates(subset = "pmid", keep = 'first', inplace = False)
n_combined_without_duplicates = len(combined_without_duplicates)

In [55]:
# now print all the measurements/stages for clarity
# print(f"There were {n_pm_with_duplicates} articles found through all pubmed searches with duplicates")
print(f"There were {n_pm_no_duplicates} pubmed articles left after the removal of duplicates")
print(f"The final amount of pubmed articles found, after removing all from before 2010 is {n_pm_final}\n")

print(f"There were {n_ns_with_duplicates} articles found through all neurosynth searches with duplicates")
print(f"There were {n_ns_no_duplicates} pubmed articles left after the removal of duplicates")
print(f"The final amount of pubmed articles found, after removing all from before 2010 is {n_ns_final}\n")

print(f"After combing the results of pubmed and neurosynth, there are {n_combined_with_duplicates} articles")
print(f"After removing duplicates, the final number of articles to check in stage 1 is: {n_combined_without_duplicates}")

There were 26391 pubmed articles left after the removal of duplicates
The final amount of pubmed articles found, after removing all from before 2010 is 19527

There were 1832 articles found through all neurosynth searches with duplicates
There were 1832 pubmed articles left after the removal of duplicates
The final amount of pubmed articles found, after removing all from before 2010 is 1254

After combing the results of pubmed and neurosynth, there are 20781 articles
After removing duplicates, the final number of articles to check in stage 1 is: 19598


## Print and save data pertaining to overview

In [82]:
# Create, print and save overview of all steps taken in this search
# All numbers were calculated after removal of duplicates
print(f"""Total number of articles found in all searches (Phase 1): {pm_total + ns_total}
Number of articles found in PubMed search (Phase 1): {pm_total}
Number of articles found in Neurosynth search (Phase 1): {ns_total}
Final number of articles after removing duplicates (Phase 1): {len(all_articles)}
""")

#overview = pd.DataFrame(columns)

Total number of articles found in all searches (Phase 1): 8776
Number of articles found in PubMed search (Phase 1): 7386
Number of articles found in Neurosynth search (Phase 1): 1390
Final number of articles after removing duplicates (Phase 1): 8405



# Remove articles based on meta-analysis criteria (Phase 2)

Criteria:

1) 3x3x3mm or less voxel resolution

2) 3T or higher MRI

3) publiation during or after 2010

4) Non patient groups (control/healthy individuals only)

# Create flow chart for query search methods

In [103]:
from graphviz import Digraph

graph_dir = opj(os.getcwd(), 'query_results')

g = Digraph('G', filename=opj(graph_dir, 'query_results.gv'))

g.edge('Keywords Pubmed', 'Abstracts')
g.edge('Abstracts', 'Re-check abstracts')
g.edge('Re-check abstracts', 'Read full article')
g.edge('Read full article', 'Include')

g.view()

AttributeError: 'Digraph' object has no attribute 'label'

In [104]:
from graphviz import Source
temp = """
digraph G{
edge [dir=forward]
node [shape=plaintext]

0 [label="Keywords PubMed"]
0 -> 2 
1 [label="Keywords Neurosynth"]
1 -> 2
2 [label="Abstracts"]



1 [label="1 (Hello)"]
2 [label="2 (how)"]
2 -> 1 [label="advmod"]
3 [label="3 (are)"]
4 [label="4 (you)"]
5 [label="5 (doing)"]
5 -> 3 [label="aux"]
5 -> 2 [label="advmod"]
5 -> 4 [label="nsubj"]
}
"""
s = Source(temp, filename="test.gv", format="png")
s.view()

'test.gv.png'

In [105]:
from graphviz import Source
temp = """
digraph g1 {

  graph [splines=false];

  // invisible nodes
  node[ shape = point, width=0, height=0] ;
  i1 [ style="invis"];
  i2 [ style="invis"];
  i3 [ style="invis"];
  i4 [ style="invis"];

  node[fontsize=15, color = black, shape = box, width=3, height=1] ;
  a[color=blue, label="a"];
  b[color=green, label="b"];
  c[color=orange, label="c"]; 
  d[color=red, label="d"] ;       

  {rank=same; a -> b -> c};

  c -> i1[arrowhead=none];
  i1 -> d[label="  FOR EACH\n\n"]; 
  d -> i2[arrowhead=none];

  {rank=same; i3 -> i2[arrowhead=none, minlen = 7 ] };

  b -> i4[style="invis"];
  i4 -> i3[arrowhead=none];

  {rank=same; i4 -> i1};

}
"""
s = Source(temp, filename="test.gv", format="png")
s.view()

'test.gv.png'

# Old code/not to use/notes

In [None]:
# use features to search for articles we want to include
ids = dataset.get_studies(features='conflict control', frequency_threshold=0.001)

In [155]:
len(ids)

0

In [128]:
ids

[10666562,
 11571223,
 11804576,
 12631562,
 12742674,
 12902389,
 12925284,
 12944513,
 14766185,
 15183394,
 15473975,
 15574744,
 15590917,
 15627596,
 15734361,
 15795135,
 15834861,
 15862220,
 15925092,
 15955496,
 15964211,
 15994228,
 16000652,
 16019232,
 16033888,
 16111898,
 16125977,
 16126415,
 16407479,
 16414280,
 16452666,
 16490365,
 16520064,
 16553630,
 16571373,
 16624583,
 16641225,
 16650775,
 16672649,
 16683265,
 16718653,
 16764897,
 16766210,
 16769743,
 16859648,
 16978881,
 17046722,
 17112745,
 17118409,
 17276088,
 17292590,
 17306437,
 17321151,
 17346989,
 17370344,
 17544015,
 17553704,
 17585887,
 17588776,
 17596416,
 17655834,
 17719567,
 17765572,
 17765933,
 17855604,
 17888409,
 17913474,
 17919934,
 17979124,
 18033768,
 18094962,
 18155254,
 18172852,
 18261931,
 18308710,
 18445602,
 18495886,
 18559283,
 18578603,
 18601941,
 18632120,
 18635157,
 18674847,
 18678764,
 18685833,
 18689860,
 18706701,
 18799601,
 18954965,
 19015088,
 19166943,

In [156]:
neurosynth_keywords

{'conflict control',
 'flanker',
 'global inhibition',
 'go no go',
 'go-no go',
 'go/no go',
 'go/nogo',
 'interference',
 'selective inhibition',
 'simon',
 'stop signal',
 'stop-signal',
 'stroop'}

In [92]:
# Okay, so now we have the PubMed IDs for each study that Neurosynth finds
# We can then used a PubMEd search again for these specific IDs (as all neurosynth studies have a PubMed ID)


TypeError: 'Dataset' object is not callable

In [129]:
abstracts = dataset.download_abstracts(ids)

AttributeError: 'Dataset' object has no attribute 'download_abstracts'

In [107]:
neurosynth

NameError: name 'neurosynth' is not defined

In [131]:
test_ab = ns.base.dataset.download_abstracts(ids, email='scottifict@gmail.com')

In [120]:
dir(dataset)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_load_activations',
 'activations',
 'add_features',
 'create_image_table',
 'feature_table',
 'get_feature_counts',
 'get_feature_data',
 'get_feature_names',
 'get_image_data',
 'get_studies',
 'image_table',
 'load',
 'masker',
 'r',
 'save',
 'transformer']

In [136]:
test_ab.abstract[0]

'When a switch between two tasks has to be carried out, performance is slower than in trials where the same task is performed repeatedly. This finding has been attributed to time-consuming control processes required for task switching. Previous results of other paradigms investigating cognitive control processes suggested that prefrontal cortex is involved in executive control. We used event-related fMRI to investigate prefrontal cortex involvement in task switching. Regions in the lateral prefrontal and premotor cortex bilaterally, the anterior insula bilaterally, the left intraparietal sulcus, the SMA/pre-SMA region and the cuneus/precuneus were activated by the task repetition condition and showed additional activation in the task switch condition. This confirmed the hypothesis that lateral prefrontal cortex is involved in task switching. However, the results also showed that this region is neither the only region involved in task switching nor a region specifically involved in task

In [138]:
dataset.id

AttributeError: 'Dataset' object has no attribute 'id'

In [151]:
test_id = dataset.get_studies(features='conflict', frequency_threshold=0.001)

In [152]:
len(test_id)

337

In [182]:
ns_abstracts

Unnamed: 0,pmid,abstract


In [None]:
# this was the original code, but did not remove duplicates so was moved to 'notes' section

save_dir = opj(os.getcwd(), 'query_results/pubmed_results')

pubmed = pm.PubMed(tool="MyTool", email = email)

pm_total = 0

all_pm_articles = pd.DataFrame(columns = ["title","authors","pmid","keywords","pubdate","abstract"])

for args in keyword_arguments: # loop over all keyword argmuents
    
    results = pubmed.query(query=args, max_results=2) # run search query
    lengthable = pubmed.query(query=args, max_results=2) # non-elegant method to measure length of results
    
    len_of_results = sum(1 for x in lengthable) # only way to get length of results output
    pm_total += len_of_results
    
    document = Document() # create the word document to save to
    document.add_heading(args, 0) # make keyword the title of the document
    document.add_paragraph(f"Number of articles found: {len_of_results}")
    
    article_info = pd.DataFrame(columns = ["title","authors","pmid","keywords","pubdate","abstract"])
    
    for article in results: # loop over all found articles
        
        article_id = article.pubmed_id # extract info from each article
        article_title = article.title # extract title

        if article.keywords: # find keywords if they exist
            if None in article.keywords:
                article.keywords.remove(None)
            article_keywords = '", "'.join(article.keywords)
        else:
            article_keywords = "Article has no keywords"
            
        article_publication_date = article.publication_date # extract publication date
        article_abstract = article.abstract # extract abstract
        
        article_authors = []
        for auths in range(len(article.authors)):
            if auths == len(article.authors)-1:
                article_authors.append(article.authors[auths]['lastname'] + ', ' + article.authors[auths]['initials'])
            else:
                article_authors.append(article.authors[auths]['lastname'] + ', ' + article.authors[auths]['initials'] + '.,')
        
        # Throw all this info into a pandas dataframe so we can export it to excel
        article_info = article_info.append(pd.DataFrame({"title":[article_title],"authors":[str(' '.join(article_authors))],"pmid":[article_id],
                                          "keywords":[article_keywords],"pubdate":[article_publication_date],
                                          "abstract":[article_abstract]}))
        
        # Save one dataframe object with every found article so we can remove duplicates
        all_pm_articles = all_pm_articles.append(pd.DataFrame({"title":[article_title],"authors":[str(' '.join(article_authors))],"pmid":[article_id],
                                  "keywords":[article_keywords],"pubdate":[article_publication_date],
                                  "abstract":[article_abstract]}))
        
        # append info to word document
        document.add_page_break()
        document.add_heading(str(article_title))
        document.add_paragraph(str(' '.join(article_authors)))
        document.add_paragraph(str(article_id))
        document.add_paragraph(str(article_keywords))
        document.add_paragraph(str(article_publication_date))
        document.add_paragraph(str(article_abstract))
        
    # SAVE WORD DOCUMENT    
    if "/" not in args:
        document.save(opj(save_dir, 'word', str(args) + '.docx')) # save the documents for each query
    else:
        new_string = args.replace('/','_')
        document.save(opj(save_dir, 'word', str(new_string) + '.docx')) # save the documents for each query
        
    # SAVE EXCEL DOCUMENT
    if "/" not in args:
        article_info.to_excel(opj(save_dir, 'excel', str(args) + '.xlsx'), sheet_name = args)
    else:
        article_info.to_excel(opj(save_dir, 'excel', str(new_string) + '.xlsx'), sheet_name = new_string)


In [None]:
# moved here on 23/04/20 to be replaced by code incorporating bio.entrez module
save_dir = opj(os.getcwd(), 'query_results/pubmed_results')

pm_total_w_dupe = 0 # to count number of articles found

all_pm_articles = pd.DataFrame(columns = ["title","authors","pmid","keywords","pubdate","abstract"])

pm_IDs = []

counter = 1

for args in keyword_arguments[0]: # loop over all keyword argmuents
    for method in keyword_arguments[1]:
    
        to_search = args + ' ' + method
        
        results = pubmed.query(query=to_search, max_results=1) # run search query
        lengthable = pubmed.query(query=to_search, max_results=1) # non-elegant method to measure length of results

        len_of_results = sum(1 for x in lengthable) # only way to get length of results output
        pm_total_w_dupe += len_of_results

        article_info = pd.DataFrame(columns = ["title","authors","pmid","keywords","pubdate","abstract"])

        for article in results: # loop over all found articles

            if '\n' in article.pubmed_id: # extract pubmed ID
                article_id = article.pubmed_id.split('\n')[0]
            else:
                article_id = article.pubmed_id

            article_title = article.title # extract title

            if hasattr(article,'keywords'):
                if article.keywords: # find keywords if they exist
                    if None in article.keywords:
                        article.keywords.remove(None)
                    article_keywords = '", "'.join(article.keywords)
                else:
                    article_keywords = "Article has no keywords"

            article_publication_date = article.publication_date # extract publication date
            article_abstract = article.abstract # extract abstract

            article_authors = []
            for auths in range(len(article.authors)):
                if auths == len(article.authors)-1:
                    if article.authors[auths]['lastname']==None or article.authors[auths]['firstname']==None:
                        print('Skipping author, nonetype')
                    else:
                        article_authors.append(article.authors[auths]['lastname'] + ', ' + article.authors[auths]['initials'])
                else:
                    if article.authors[auths]['lastname']==None or article.authors[auths]['firstname']==None:
                        print('Skipping author, nonetype')
                    else:
                        article_authors.append(article.authors[auths]['lastname'] + ', ' + article.authors[auths]['initials'] + '.,')

            # Throw all this info into a pandas dataframe so we can export it to excel
            article_info = article_info.append(pd.DataFrame({"title":[article_title],"authors":[str(' '.join(article_authors))],"pmid":[article_id],
                                              "keywords":[article_keywords],"pubdate":[article_publication_date],
                                              "abstract":[article_abstract]}))

            # Save one dataframe object with every found article so we can remove duplicates
            all_pm_articles = all_pm_articles.append(pd.DataFrame({"title":[article_title],"authors":[str(' '.join(article_authors))],"pmid":[article_id],
                                      "keywords":[article_keywords],"pubdate":[article_publication_date],
                                      "abstract":[article_abstract]}))

            # provide some output so we know what stage the code is at
            if counter%50==0:
                print(f'Processing article {counter} for {args} argument...')
            counter += 1
        
print('Finished')
