In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

import requests
from bs4 import BeautifulSoup
import re
import os
import time

import matplotlib.pyplot as plt
%matplotlib inline

# Determine articles to scrape

Settings used for the previous scrape (which forgot to include Sci Data):

```
journals_keep = ['Nat Commun', 'Nat Neurosci', 'Nat Methods',
                 'PLoS One', 'PLoS Comput Biol', 'Proc Natl Acad Sci U S A']
terms = ['python', 'matlab', 'public', 'open', 'code', 'source', 'github']
```

In [2]:
# Article restrictions: which journals, which metadata columns and which
# publication years are considered in scope for the scrape.
journals1 = [
    'PLoS One',
    'Sci Rep',
    'Proc Natl Acad Sci U S A',
    'Nat Commun',
    'PLoS Comput Biol',
]
journals2 = [
    'eLife', 'J Neurosci', 'Front Hum Neurosci', 'Front Neurosci',
    'F1000Res', 'J Neurophysiol', 'Nature', 'Neuroimage', 'Neuron', 'Science',
    'Brain', 'eNeuro', 'Neuroscience', 'Front Syst Neurosci', 'Nat Methods',
]
# Metadata columns retained from the article database.
cols_keep = ['Journal Title', 'Year', 'PMCID', 'PMID']
# Earliest publication year (inclusive) to keep.
year_min = 2014

In [3]:
# Load database of available articles.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk; NOTE(review): the stray warning text under this
# cell looks like a mixed-dtype DtypeWarning from read_csv - confirm it is gone.
df = pd.read_csv('/gh/data/opencode/PMC-ids.csv', low_memory=False)

# Rows published in one of the journals2 titles, from year_min onwards.
in_scope = df['Journal Title'].isin(journals2) & (df['Year'] >= year_min)

# Keep only the cols_keep columns, remove articles without a PMID (not read),
# and renumber the rows from 0. Built as one non-mutating chain so the cell is
# idempotent and never modifies a slice of df in place.
# reset_index() (without drop=True) keeps the original row label of df in an
# 'index' column, as before.
df_keep = (
    df.loc[in_scope, cols_keep]
    .dropna(subset=['PMID'])
    .reset_index()
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first rows of the filtered article table.
df_keep.head()

Unnamed: 0,index,Journal Title,Year,PMCID,PMID
0,2991923,F1000Res,2014,PMC3799545,24358873.0
1,3057806,J Neurosci,2014,PMC3866478,24381264.0
2,3057807,J Neurosci,2014,PMC3866479,24381272.0
3,3057808,J Neurosci,2014,PMC3866480,24381273.0
4,3057809,J Neurosci,2014,PMC3866481,24381276.0


In [5]:
# Number of articles retained after filtering.
len(df_keep)

27027

# Scrape all

In [38]:
# Scrape configuration.

# Key read from a local file so it is not stored in the notebook (presumably an
# NCBI E-utilities API key - confirm). The context manager closes the file
# handle, and strip() drops the trailing newline most editors add, which would
# otherwise end up inside any URL built from the key.
with open('apikey.txt', 'r') as key_file:
    apikey = key_file.read().strip()

# Database name and efetch endpoint used to build each request URL.
db = 'pmc'
base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'

# Search terms matched (case-insensitively) against each article's text.
terms = ['python', 'matlab', 'code', 'github']
# Number of characters of context kept on each side of a term match.
span_buffer = 100
# Row index of the checkpoint to resume from (0 = start from scratch).
N_previous = 27020
# Accumulated results are reset, and the checkpoint file kept, every N_chunk rows.
N_chunk = 30000
# A checkpoint CSV is written every N_save rows.
N_save = 10

In [40]:
# Resumable scrape of every article in df_keep.
#
# For each row the article is fetched from `base`, and three kinds of records
# are accumulated in dfs_articles (a mapping key -> list of DataFrames):
#   'aff'     : text of each <aff> tag
#   'subject' : first child of each <subject> tag
#   <term>    : text surrounding every case-insensitive match of each search term
# Every N_save rows the accumulated frames are written to a checkpoint CSV named
# '<key>_<row index>.csv'; the previous checkpoint is deleted unless it sits on
# an N_chunk boundary, where the accumulators are also reset.
temp_csv = '/gh/data2/opencode/temp/{:s}_{:d}.csv'

# Load previous computation
if N_previous > 0:
    other = ['aff', 'subject']
    dfs_articles = {}
    for k in terms + other:
        dfs_articles[k] = [pd.read_csv(temp_csv.format(k, N_previous), index_col=0)]
    # Row N_previous is already contained in the checkpoint; resume after it.
    first_row = N_previous + 1
else:
    dfs_articles = defaultdict(list)
    # Fresh start: begin at row 0 (starting at N_previous + 1 would skip it).
    first_row = 0

i = None  # stays None when there is nothing left to scrape
for i, row in df_keep.loc[first_row:].iterrows():
    # Get full text of 1 paper. The API key is sent as the api_key parameter;
    # strip() guards against a trailing newline in the key file.
    pmcid = row['PMCID']
    url = '{:s}db={:s}&id={:s}&api_key={:s}'.format(base, db, pmcid, apikey.strip())
    out = requests.get(url)
    bs = BeautifulSoup(out.content, 'lxml')
    # Decode the response once; Response.text re-decodes on every access.
    full_text = out.text

    # Affiliations and subject dfs
    dict_affs = defaultdict(list)
    for aff_tag in bs.find_all('aff'):
        dict_affs['PMCID'].append(pmcid)
        dict_affs['aff'].append(aff_tag.text)
    dfs_articles['aff'].append(pd.DataFrame(dict_affs))

    dict_subjects = defaultdict(list)
    for subject_tag in bs.find_all('subject'):
        # Skip empty <subject/> tags, which have no first child to record.
        if not subject_tag.contents:
            continue
        dict_subjects['PMCID'].append(pmcid)
        dict_subjects['subject'].append(subject_tag.contents[0])
    dfs_articles['subject'].append(pd.DataFrame(dict_subjects))

    # DFs of terms
    for term in terms:
        dict_term = defaultdict(list)
        for match in re.finditer(term, full_text, re.IGNORECASE):
            start, end = match.span()
            # Clamp at 0: a negative start would be read as an offset from the
            # end of the text and return an empty or unrelated snippet.
            sent = full_text[max(0, start - span_buffer):end + span_buffer]

            dict_term['PMCID'].append(pmcid)
            dict_term['sentence'].append(sent)
        dfs_articles[term].append(pd.DataFrame(dict_term))

    # Save output every N
    if i > 0 and i % N_save == 0:
        print(i)
        for k in dfs_articles.keys():
            pd.concat(dfs_articles[k]).to_csv(temp_csv.format(k, i))

            # Delete the superseded checkpoint unless it closes a chunk
            previous_csv = temp_csv.format(k, i - N_save)
            if (i - N_save) % N_chunk > 0 and os.path.exists(previous_csv):
                os.remove(previous_csv)

        # Start a new chunk: this checkpoint stays on disk as the chunk's file
        if i % N_chunk == 0:
            dfs_articles = defaultdict(list)

# Save output when finished
if i is not None:
    # Most recent row index at which a periodic checkpoint was written.
    last_checkpoint = (i // N_save) * N_save
    for k in dfs_articles.keys():
        pd.concat(dfs_articles[k]).to_csv(temp_csv.format(k, i))

        # Remove the checkpoint this final file supersedes, but never the file
        # just written (last_checkpoint == i) and never a chunk-boundary file.
        stale_csv = temp_csv.format(k, last_checkpoint)
        if (last_checkpoint != i
                and last_checkpoint % N_chunk > 0
                and os.path.exists(stale_csv)):
            os.remove(stale_csv)