In [2]:
import pandas as pd
import urllib
import requests
import re
import glob
import json

# Get the pubmed papers

In [3]:
# BioC-PMC file list, parsed as whitespace-delimited text.
# The separator is written as a raw string: '\s' inside a plain literal is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on newer Pythons).
files_to_get = pd.read_table('/home/riddleta/ac_knowl/data/filelist.txt', sep=r'\s+')
files_to_get.shape

  


(598386, 4)

In [4]:
# Commercial-use file list: tab-separated, first line of the file skipped,
# no header inferred -- the column names are assigned explicitly afterwards.
oa_comm_use_files = pd.read_csv(
    '/home/riddleta/ac_knowl/data/oa_comm_use_file_list.txt',
    sep='\\t',
    engine='python',
    header=None,
    skiprows=1,
)
oa_comm_use_files.columns = ['File', 'ref', 'PMCID', 'PMID', 'license']
oa_comm_use_files.shape

(1479862, 5)

In [5]:
# Open-access file list: tab-separated, first line of the file skipped,
# no header inferred -- the column names are assigned explicitly afterwards.
oa_files = pd.read_table(
    '/home/riddleta/ac_knowl/data/oa_file_list.txt',
    sep='\\t',
    engine='python',
    header=None,
    skiprows=1,
)
oa_files.columns = ['File', 'ref', 'PMCID', 'PMID', 'license']
oa_files.shape

  


(2408932, 5)

In [6]:
# Stack the PMCID columns of the three listings and keep each id once.
_pmcid_columns = [oa_files['PMCID'], oa_comm_use_files['PMCID'], files_to_get['PMCID']]
unique_ids = pd.concat(_pmcid_columns).unique()

## Main function to return data from fulltext search

In [7]:
# Search terms counted per passage, listed in the order their counts appear in
# the tuple returned by return_useful_data. Compiled once here rather than on
# every passage.
_HIT_PATTERNS = (
    re.compile(r'github'),         # git_hits
    re.compile(r'osf\.io'),        # osf_hits
    re.compile(r'nda\.nih\.gov'),  # nda_hits
    re.compile(r'openneuro'),      # open_neuro
    re.compile(r'fmri'),           # fmri
    re.compile(r'results'),        # res
)

# Years (as strings, the form they are compared in) whose passages are scanned.
# A tuple rather than a set so an unhashable `year` value cannot raise.
_SCANNED_YEARS = tuple(str(year) for year in range(2008, 2020))


def _first_passage_infon(r_out, key):
    """Return infons[key] of the first passage of the first document, else None."""
    try:
        return r_out['documents'][0]['passages'][0]['infons'][key]
    except (KeyError, IndexError, TypeError):
        # Missing key, empty list, or a non-subscriptable value along the path.
        return None


def return_useful_data(r):
    """Summarise one BioC JSON response as a flat tuple of metadata and hit counts.

    Parameters
    ----------
    r : object with a ``.json()`` method
        The decoded payload must contain 'date' and 'source'; 'documents' is
        read for the DOI, the year and the passages.

    Returns
    -------
    tuple
        ``(date, src, doi, git_hits, osf_hits, nda_hits, open_neuro, fmri, res, yr)``.
        Each ``*_hits`` value is the number of passages whose lower-cased text
        matches the corresponding pattern (a passage counts at most once per
        pattern). ``doi`` and ``yr`` are None when absent from the first
        passage's infons. When ``yr`` is not one of '2008'..'2019' the passages
        are not scanned and all counts are 0.

    Raises
    ------
    KeyError
        If 'date' or 'source' is missing, or a scanned passage has no 'text'.
    """
    r_out = r.json()
    date = r_out['date']
    src = r_out['source']
    doi = _first_passage_infon(r_out, 'article-id_doi')
    yr = _first_passage_infon(r_out, 'year')

    counts = [0] * len(_HIT_PATTERNS)
    if yr in _SCANNED_YEARS:
        for passage in r_out['documents'][0]['passages']:
            # Lower-case once per passage instead of once per pattern.
            text = passage['text'].lower()
            for slot, pattern in enumerate(_HIT_PATTERNS):
                if pattern.search(text):
                    counts[slot] += 1

    git_hits, osf_hits, nda_hits, open_neuro, fmri, res = counts
    return (date, src, doi, git_hits, osf_hits, nda_hits, open_neuro, fmri, res, yr)

## loop to get everything
Realistically, this took about two weeks. I ran it in small batches and saved the results to files iteratively; the second code block below gives an example. In the end there were 168 papers that we couldn't retrieve.

In [None]:
#git_links = []
#missed = []

#for i, j in enumerate(unique_ids):
#    pth = 'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/' + j + '/unicode'
#    try:
#        r = requests.get(pth)
#        git_links.append(return_useful_data(r) + (j, i))
#    except:
#        missed.append([i,j])

#df = pd.DataFrame(git_links, columns=['date', 'src', 'doi', 'git_hits', 
#                                      'osf_hits', 'nda_hits', 'open_neuro', 
#                                      'fmri', 'res', 'yr', 'pmcid', 'idx'])
#df.to_csv('/home/riddleta/ac_knowl/output/bionlp_03.csv')

In [None]:
# like this:

#git_links = []
#missed = []
#progress = [i for i in range(0, 15000, 2500)]

#for i, j in enumerate(unique_ids[625000:]):
#    if i in progress:
#        print(i)
#    pth = 'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/' + j + '/unicode'
#    try:
#        r = requests.get(pth)
#        git_links.append(return_useful_data(r) + (j, i))
#    except:
#        missed.append([i,j])
#    if i==15000:
#        break

## target the subset of papers I want for a full text download

In [8]:
# Read every CSV matching bio*.csv in the output directory and stack them into
# one frame. glob returns paths in arbitrary, filesystem-dependent order, so
# they are sorted to make the row order of `frame` reproducible between runs.
files = sorted(glob.glob('/home/riddleta/ac_knowl/output/bio*.csv'))
file_list = []
for filename in files:
    df = pd.read_csv(filename, index_col=None, header=0)
    file_list.append(df)

# ignore_index=True discards the per-file row labels in favour of 0..n-1.
frame = pd.concat(file_list, axis=0, ignore_index=True)

  interactivity=interactivity, compiler=compiler, result=result)


### Some of the years were formatted incorrectly or missed entirely. This resolves that.

In [9]:
# Copy the year column, rewriting the malformed value '2016;' to '2016'.
frame['yr_fixed'] = frame['yr'].replace(to_replace='2016;', value='2016')

In [10]:
# PMCIDs of the rows that still have no year after the fix-up above.
_year_missing = frame['yr_fixed'].isna()
pmids = frame.loc[_year_missing, 'pmcid'].tolist()
len(pmids)

28

In [11]:
# Re-query the API for each paper without a recorded year and take the year
# from the infons of its first few passages.
for pmid in pmids:
    pth = 'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/' + pmid + '/unicode'
    r = requests.get(pth)
    r_out = r.json()
    # Reset per paper: otherwise a paper with no 'year' entry would silently
    # inherit the previous paper's value (or raise NameError on the first one).
    year = None
    # Slice rather than range(0, 5) so a document with fewer than five passages
    # does not raise IndexError. As before, the last match among them wins.
    for passage in r_out['documents'][0]['passages'][:5]:
        if 'year' in passage['infons']:
            year = passage['infons']['year']

    if year is not None:
        # One .loc assignment instead of chained indexing, which triggers
        # SettingWithCopyWarning and may write to a temporary copy.
        frame.loc[frame.pmcid == pmid, 'yr_fixed'] = year
    # When no year was found the row is left missing rather than given a wrong value.
            

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [12]:
# Integer version of the cleaned year, used for numeric comparisons.
frame['yr_ints'] = frame['yr_fixed'].astype(int)

In [13]:
# Keep papers published after 2008.
# NOTE(review): the strict '>' drops 2008 itself, while return_useful_data's
# year list includes '2008' -- confirm which boundary is intended.
_after_2008 = frame['yr_ints'] > 2008
df_targets = frame.loc[_after_2008]

In [14]:
# Display (rows, columns) of the year-filtered frame.
df_targets.shape

(2600884, 15)

In [36]:
def already_retrieved_papes(dirs):
    """Collect the PMCIDs of the papers stored in saved JSON dumps.

    dirs: iterable of paths to JSON files. Each file holds a list of paper
        records; a record's id is read from ``paper['documents'][0]['id']``.

    Returns a list with every id prefixed by 'PMC', in the order encountered.
    """
    raw_ids = []
    for json_path in dirs:
        with open(json_path) as handle:
            raw_ids.extend(paper['documents'][0]['id'] for paper in json.load(handle))

    return ['PMC' + raw_id for raw_id in raw_ids]

def missed_papes(dirs):
    """Collect the PMCIDs listed in 'misses' text files, one id per line.

    dirs: iterable of paths to text files with one PMCID per line.

    Returns the ids as a list, in the order encountered. Surrounding whitespace
    (including the line terminator) is stripped and blank lines are skipped.
    """
    pmcids = []
    for d in dirs:
        with open(d, 'r') as infile:
            for line in infile:
                # Strip instead of slicing a fixed 10 characters: PMCIDs are
                # not all the same length, so the slice kept the trailing
                # newline on shorter ids and truncated longer ones.
                pmcid = line.strip()
                if pmcid:
                    pmcids.append(pmcid)
    return pmcids

In [16]:
# Find every saved full-text dump, then list the PMCIDs they contain.
already_retrieved = glob.glob('/home/riddleta/ac_knowl/output/full_texts/papes*')
gots = already_retrieved_papes(dirs=already_retrieved)

In [38]:
# Find every 'misses' file, then list the PMCIDs recorded in them.
missed = glob.glob('/home/riddleta/ac_knowl/output/full_texts/misses*')
misses = missed_papes(dirs=missed)
len(misses)

15710

In [17]:
# Target papers whose PMCID is not among those already on disk.
_already_have = df_targets['pmcid'].isin(gots)
df_targets_sub = df_targets.loc[~_already_have]
df_targets_sub.shape

(1408, 15)

In [17]:
# This cell defines already_retrieved_papes again; the name is also defined in
# an earlier cell of this notebook.
def already_retrieved_papes(dirs):
    """Return 'PMC'-prefixed ids of all papers found in the given JSON dumps.

    dirs: iterable of paths to JSON files, each containing a list of paper
        records with the id at ``paper['documents'][0]['id']``.
    """
    collected = []
    for dump_path in dirs:
        with open(dump_path) as dump:
            papers = json.load(dump)
        collected += [paper['documents'][0]['id'] for paper in papers]

    return ['PMC' + bare_id for bare_id in collected]

In [15]:
# Find every saved full-text dump, then list the PMCIDs they contain.
already_retrieved = glob.glob('/home/riddleta/ac_knowl/output/full_texts/papes*')
gots = already_retrieved_papes(dirs=already_retrieved)

In [16]:
# Target papers whose PMCID is not among those already on disk.
_already_have = df_targets['pmcid'].isin(gots)
df_targets_sub = df_targets.loc[~_already_have]
df_targets_sub.shape

(791283, 15)

In [17]:
# Positions (every 2500 from 1827500 up to, but excluding, 2700000) at which
# the download loop writes its accumulated batch to disk.
file_writes = list(range(1827500, 2700000, 2500))


In [None]:
# Offset added to the loop position when naming checkpoint files; it equals the
# first entry of `file_writes`.
INDEX_OFFSET = 1827500
FULL_TEXT_DIR = '/home/riddleta/ac_knowl/output/full_texts/'
# Seconds to wait on the server before giving up on one paper; without a
# timeout a single stalled connection blocks the whole run indefinitely.
REQUEST_TIMEOUT_S = 60


def _flush_batch(label, papers, failures):
    """Write one checkpoint: papers as a JSON list, failed PMCIDs one per line."""
    with open(FULL_TEXT_DIR + 'papes_' + str(label) + '.txt', 'w') as fout:
        json.dump(papers, fout)
    if len(failures) > 0:
        with open(FULL_TEXT_DIR + 'misses_' + str(label) + '.txt', 'w') as f:
            for item in failures:
                f.write("%s\n" % item)


outdat = []
missed = []
checkpoints = set(file_writes)  # set membership instead of scanning the list each iteration
for i, pmcid in enumerate(df_targets_sub.pmcid):
    try:
        pth = 'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/' + pmcid + '/unicode'
        r = requests.get(pth, timeout=REQUEST_TIMEOUT_S)
        r_out = r.json()
        outdat.append(r_out)
    except Exception:
        # Any failure (network error, timeout, non-JSON body) is recorded as a
        # miss. `Exception` rather than a bare except so that KeyboardInterrupt
        # can still stop this long-running loop.
        missed.append(pmcid)
    if i + INDEX_OFFSET in checkpoints:
        _flush_batch(i + INDEX_OFFSET, outdat, missed)
        outdat = []
        missed = []

# Persist whatever accumulated after the last checkpoint; previously this tail
# only existed in memory. The in-memory lists are left populated afterwards.
if outdat or missed:
    _flush_batch(i + INDEX_OFFSET, outdat, missed)


In [18]:
# Load one saved checkpoint back into memory.
_checkpoint_path = '/home/riddleta/ac_knowl/output/full_texts/papes_7500.txt'
with open(_checkpoint_path) as infile:
    dat = json.load(infile)


0***of***16988
1000***of***16988
2000***of***16988
3000***of***16988
4000***of***16988
5000***of***16988
6000***of***16988
7000***of***16988
8000***of***16988
9000***of***16988
10000***of***16988
11000***of***16988
12000***of***16988
13000***of***16988
14000***of***16988
15000***of***16988
16000***of***16988


In [19]:
# Display the first record held in `outdat`.
outdat[0]

{'date': '20190111',
 'source': 'PMC',
 'infons': {},
 'documents': [{'passages': [{'text': 'First evidence of hybridization between golden jackal (Canis aureus) and domestic dog (Canis familiaris) as revealed by genetic markers',
     'offset': 0,
     'relations': [],
     'infons': {'name_2': 'surname:Caniglia;given-names:Romolo',
      'name_3': 'surname:Arbanasić;given-names:Haidi',
      'name_0': 'surname:Galov;given-names:Ana',
      'name_1': 'surname:Fabbri;given-names:Elena',
      'name_6': 'surname:Bošković;given-names:Ivica',
      'alt-title': 'Golden jackal - dog hybrids',
      'article-id_publisher-id': 'rsos150450',
      'name_5': 'surname:Florijančić;given-names:Tihomir',
      'name_8': 'surname:Randi;given-names:Ettore',
      'year': '2015',
      'kwd': 'Canis interspecific hybridization gene introgression major histocompatibility complex melanism β-defensin CDB103',
      'name_4': 'surname:Lapalombella;given-names:Silvana',
      'article-id_doi': '10.1098/rs

In [15]:
# First PMCID among the target papers, used as a sample for a single request.
pmid = df_targets['pmcid'].iloc[0]

In [16]:
# Fetch the BioC JSON document for the sample PMCID and decode it.
pth = ''.join([
    'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/',
    pmid,
    '/unicode',
])
r = requests.get(pth)
r_out = r.json()


In [20]:
# Display the sample PMCID.
pmid

'PMC5139083'

In [19]:
# Alternative lookup kept for reference (year in the first passage's infons):
#r_out['documents'][0]['passages'][0]['infons']['year']
r_out['documents'][0]['id']  # the document id is the PMCID without its 'PMC' prefix

'5139083'

In [69]:
# Number of papers per distinct year value; show ranks 26-50 by frequency.
yr_counts = frame['yr_fixed'].value_counts()
yr_counts.iloc[25:50]

1996      3558
1994      3383
1995      3292
1993      3168
1992      2730
1991      2724
1990      2675
1989      2554
2015      2220
1988      2170
1899      2120
1983      2101
1898      2083
1910.0    2073
1894      2069
1982      2058
1920.0    2036
1902.0    2034
1986      2026
1987      2019
1904      2004
1985      1959
1911      1959
1984      1948
1909.0    1942
Name: yr_fixed, dtype: int64