In [2]:
import pandas as pd
import re
import glob
import json
from bs4 import BeautifulSoup
import random
import numpy as np
from urlextract import URLExtract

# Making datasets for labeling
There are several stages to this, and I'm going to pull samples to label from three sources.

First, I'm limiting this to the subset of papers that are funded by the NIMH.

Using that subset, I'm going to label 'passages' for whether they contain an instance of data sharing. A passage roughly corresponds to a paragraph in the paper, but sometimes is a footnote or a reference or a title. It's an organizational unit within the PMC fulltext databnase.

The sources that I'm pulling these passages from are:

1. Contexts previously identified as possible data-sharing using regular expressions.
2. A list of papers described as having shared data using NDAR. For this, I'll attempt to use any passages where data sharing is described
3. A random selection of passsages that are unlikely to contain data-sharing.

In [200]:
#function to pull segments for labeling
extractor = URLExtract()

def hit_contexts(r_out, include_urls = False):
    out_dat = []
    
    reg_matches = re.compile(r"""(github)|(osf\.io)|(nda\.nih\.gov)|(openneuro)|(\sndar)|
                                 (national database for autism research)|(brain-map\.org)|
                                 (humanconnectome\.org)|(balsa\.wustl\.edu)|(loni\.usc\.edu)|
                                 (ida\.loni\.usc\.edu)|(fmridc)|(ccrns)|(datalad)|(dataverse)|
                                 (dbgap)|(nih\.gov\/gap)|(dryad)|(figshare)|(fcon_1000\.projects)|
                                 (nitrc)|(mcgill\.ca\/bic\/resources\/omega)|(xnat\.org)|
                                 (zenodo)|(opendata\.aws)""", re.X)
    

    try:
        tmp_doi = r_out['documents'][0]['passages'][0]['infons']['article-id_doi']
    except:
        tmp_doi = None
    try:
        tmp_pmcid = r_out['documents'][0]['id']
    except:
        tmp_pmcid = None
        
    for passage in r_out['documents'][0]['passages']:
        passage_marked = 0
        m = re.finditer(reg_matches, passage['text'].lower())
        if m:
            segments = [(max(0, item.start()-175),
                         min(len(passage['text']), item.end(0)+125),
                         item.group()) for item in m]
            
            try:
                section_type = passage['infons']['section_type']
            except:
                section_type=None
            
            for seg in segments:
                out_dat.append([passage['text'], #delete the seg[0]:seg[1] and you'll have the full segment
                                seg[2], #ids which repo
                                passage['offset'], #how far into the paper?
                                tmp_pmcid, 
                                tmp_doi, 
                                section_type])
            passage_marked = 1
        
        if include_urls:
            if extractor.has_urls(passage['text'].lower()):
                try:
                    section_type = passage['infons']['section_type']
                except:
                    section_type=None

                out_dat.append([passage['text'],
                                'url_hit',
                                passage['offset'],
                                tmp_pmcid,
                                tmp_doi,
                                section_type])
                passage_marked = 1
            
        if passage_marked == 0:
            out_dat.append([None, None, None, tmp_pmcid, tmp_doi, None])
        
    return(out_dat)

def sample_section(paper):
    candidate = []
    for i in paper['documents'][0]['passages']:
        if i['infons']['section_type'] != 'REF':
            candidate.append(i)
    
    selection = random.choice(candidate)
    
    return(selection)

## Contexts previously identified as probable data sharing (via regex)

In [65]:
# load previously identified hits and papers funded by nimh
hits = pd.read_csv('output/hit_contexts.csv') #need the subset that is in nimh funded papers
nimh_papers = pd.read_csv('output/nimh_papers.csv')
#load file index
file_ix = pd.read_csv('output/file_index.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [52]:
#filter the hits to just nimh papers
nimh_hits = hits[hits.pmcid.isin(nimh_papers.pmcid)]
nimh_hits.shape

(5726, 5)

In [96]:
#pull a sample of 100 hits
sampled_papers = pd.Series(nimh_hits.pmcid.unique()).sample(100, replace=False)
sampled_papers.shape

(100,)

In [97]:
#get location of sampled hits
file_locs = file_ix[file_ix.pmcid.isin(sampled_papers)]

In [98]:
file_locs.shape

(93, 3)

In [76]:
data_collect = []
for i, file_path in enumerate(file_locs.file):
    with open(file_path[24:]) as infile:
        dat = json.load(infile)
        paper = dat[file_locs.paper_number.iloc[i]]
        out = hit_contexts(paper)
        data_collect.extend(out)

In [99]:
df_to_label = pd.DataFrame(data_collect, columns=['text', 'repo', 'paper_offset', 'pmcid', 'doi', 'section'])
df_to_label = df_to_label.drop_duplicates(subset=['pmcid'])
df_to_label.to_csv('output/labeled_data/regex_hits.csv', index=False)

## Papers identified through NDAR

This is a convoluted process. First, I pull out all the papers mentioned in scraped NDA collections as being relevant. Next, I got a sample of those papers, and manually labeled them for containing instances of data sharing. With those labeled papers, I pulled out the section of text in each paper that mentions data sharing. If there was no such mention, then I just pulled out a random section (leaving out anything that was a reference). I then put these data together and wrote them as a csv file (`ndar_labs.csv`). Though I aimed for a sample of 100, I ended up with 65 due to a few papers being embargoed, and some papers listed in NDAR not being in the full-text database that I have.

In [2]:
ndar_collections = glob.glob('output/ndar_collections/*')
ndar_collections.sort()

In [54]:
len(ndar_collections)

1203

In [10]:
l = []
for collection in ndar_collections:
    soup = BeautifulSoup(open(collection), "html.parser")
    table = soup.find_all(id='publication-table')[0]
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [tr.text for tr in td]
        l.append(row)
        
collection_pubs = pd.DataFrame(l, columns = ['pmid', 'study', 'title', 'journal', 'authors', 'date', 'status'])

In [39]:
#pick out the pmids for the studies marked as relevant
collection_pmids = pd.Series(collection_pubs.pmid[collection_pubs.status=='Relevant'].unique())

In [40]:
#read in the linking file and convert pmids to integer
pmid_to_pmcid = pd.read_csv('data/PMC-ids.csv')
pmid_to_pmcid['pmid'] = pmid_to_pmcid.PMID.fillna(0.0).astype(int)

  interactivity=interactivity, compiler=compiler, result=result)


In [41]:
pmid_to_pmcid.head()

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date,pmid
0,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,55,,PMC13900,11250746.0,,live,11250746
1,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,61,,PMC13901,11250747.0,,live,11250747
2,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,66,,PMC13902,11250748.0,,live,11250748
3,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,59,10.1186/bcr29,PMC13911,11056684.0,,live,11056684
4,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,64,,PMC13912,11400682.0,,live,11400682


In [43]:
#keep the rows in the linking file that have a match from ndar; strip out letters from pmcid
referenced_papers = pmid_to_pmcid[pmid_to_pmcid.pmid.isin(collection_pmids)]
referenced_papers['pmcid'] = referenced_papers.PMCID.apply(lambda x: x[3:])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [50]:
#pull a sample of 100 hits
sampled_papers = pd.Series(referenced_papers.pmcid.unique()).sample(100, replace=False)
#this file was manually labeled so that I can get specific passages
sampled_papers.to_csv('output/ndar_pmcids.csv', index=False) 

  after removing the cwd from sys.path.


In [58]:
sampled_papers = pd.read_csv('output/ndar_pmcids.csv')
#split out those that have shared and not shared data
no_shares = sampled_papers[sampled_papers.data_sharing=='0']
shares = sampled_papers[sampled_papers.data_sharing=='1']

In [109]:
no_shares_locs = file_ix[file_ix.pmcid.isin(no_shares.PMCID.tolist())]
text = []
repo = []
paper_offset = []
pmcid = []
doi = []
section_type = []
for i, file_path in enumerate(no_shares_locs.file):
    with open(no_shares_locs.file.iloc[i][24:]) as infile:
        dat = json.load(infile)
        paper = dat[no_shares_locs.paper_number.iloc[i]]
        section = sample_section(paper)
        text.append(section['text'])
        repo.append(np.nan)
        paper_offset.append(section['offset'])
        pmcid.append(no_shares_locs.pmcid.iloc[i])
        doi.append(np.nan)
        section_type.append(section['infons']['section_type'])
        
dat = pd.DataFrame({'text':text,
                    'repo':repo,
                    'paper_offset':paper_offset,
                    'pmcid':pmcid,
                    'doi':doi,
                    'section':section_type})

In [165]:
shares_locs = file_ix[file_ix.pmcid.isin(shares.PMCID.tolist())]
text = []
repo = []
paper_offset = []
pmcid = []
doi = []
section_type = []


In [166]:
with open(shares_locs.file.iloc[0][24:]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[0]]
    text.append(paper['documents'][0]['passages'][77]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][77]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[0])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][77]['infons']['section_type'])

In [167]:
with open(shares_locs.file.iloc[2][24:]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[2]]
    text.append(paper['documents'][0]['passages'][62]['text'])
    repo.append('GEO')
    paper_offset.append(paper['documents'][0]['passages'][62]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[2])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][62]['infons']['section_type'])

In [168]:
with open(shares_locs.file.iloc[3][24:]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[3]]
    text.append(paper['documents'][0]['passages'][88]['text'])
    repo.append('OSF')
    paper_offset.append(paper['documents'][0]['passages'][88]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[3])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][88]['infons']['section_type'])

In [169]:
with open(shares_locs.file.iloc[4][24:]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[4]]
    text.append(paper['documents'][0]['passages'][47]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][47]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[4])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][47]['infons']['section_type'])

In [170]:
with open(shares_locs.file.iloc[5][24:]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[5]]
    text.append(paper['documents'][0]['passages'][91]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][91]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[5])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][91]['infons']['section_type'])

In [172]:
dat2 = pd.DataFrame({'text':text,
                    'repo':repo,
                    'paper_offset':paper_offset,
                    'pmcid':pmcid,
                    'doi':doi,
                    'section':section_type})

In [175]:
ndar_labs = pd.concat([dat, dat2])

In [176]:
ndar_labs.to_csv('output/labeled_data/ndar_labs.csv', index=False)

## Now do a random selection of other passages

In [177]:
nimh_papers = pd.read_csv('output/nimh_papers.csv')
#load file index
file_ix = pd.read_csv('output/file_index.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [178]:
#pull a sample of 100 hits
sampled_papers = pd.Series(nimh_papers.pmcid.unique()).sample(100, replace=False)
sampled_papers.shape

(100,)

In [179]:
file_locs = file_ix[file_ix.pmcid.isin(sampled_papers)]
file_locs.shape

(89, 3)

In [180]:
out_dat = []
for i, file_path in enumerate(file_locs.file):
    with open(file_path[24:]) as infile:
        dat = json.load(infile)
        paper = dat[file_locs.paper_number.iloc[i]]
        sec = sample_section(paper)
        out_dat.append([sec['text'], #delete the seg[0]:seg[1] and you'll have the full segment
                np.nan, #ids which repo
                sec['offset'], #how far into the paper?
                file_locs.pmcid.iloc[i], 
                np.nan, 
                sec['infons']['section_type']])
        


In [183]:
temp = pd.DataFrame(out_dat, columns=['text', 'repo', 'paper_offset', 'pmcid', 'doi', 'section'])
temp.to_csv('output/labeled_data/random_selections.csv', index=False)

# We need to add some additional cases

First, I'm going to pull out some of the instances identified via regex matches.

In [186]:
# load previously identified hits and papers funded by nimh
hits = pd.read_csv('output/hit_contexts.csv') #need the subset that is in nimh funded papers
nimh_papers = pd.read_csv('output/nimh_papers.csv')
#load file index
file_ix = pd.read_csv('output/file_index.csv')
#filter the hits to just nimh papers
nimh_hits = hits[hits.pmcid.isin(nimh_papers.pmcid)]
nimh_hits.shape

  interactivity=interactivity, compiler=compiler, result=result)


(5726, 5)

In [187]:
# filter out the previously sampled cases
excludes = pd.read_csv('output/labeled_data/regex_hits.csv')
nimh_hits = nimh_hits[~nimh_hits.pmcid.isin(excludes.pmcid)]
print(nimh_hits.shape)
#pull a sample of 100 hits
sampled_papers = pd.Series(nimh_hits.pmcid.unique()).sample(100, replace=False)
sampled_papers.shape

(5463, 5)


(100,)

In [191]:
#get location of sampled hits
file_locs = file_ix[file_ix.pmcid.isin(sampled_papers)]
data_collect = []
for i, file_path in enumerate(file_locs.file):
    with open(file_path[24:]) as infile:
        dat = json.load(infile)
        paper = dat[file_locs.paper_number.iloc[i]]
        out = hit_contexts(paper)
        data_collect.extend(out)

In [192]:
df_to_label = pd.DataFrame(data_collect, columns=['text', 'repo', 'paper_offset', 'pmcid', 'doi', 'section'])
df_to_label = df_to_label.drop_duplicates(subset=['pmcid'])
df_to_label.to_csv('output/labeled_data/regex_hits2.csv', index=False)

## Additional cases - NDAR selections2

In [4]:
ndar_collections = glob.glob('output/ndar_collections/*')
ndar_collections.sort()

In [5]:
l = []
for collection in ndar_collections:
    soup = BeautifulSoup(open(collection), "html.parser")
    table = soup.find_all(id='publication-table')[0]
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [tr.text for tr in td]
        l.append(row)
        
collection_pubs = pd.DataFrame(l, columns = ['pmid', 'study', 'title', 'journal', 'authors', 'date', 'status'])

In [36]:
#pick out the pmids for the studies marked as relevant
collection_pmids = pd.Series(collection_pubs.pmid[collection_pubs.status=='Relevant'].unique())

#read in the linking file and convert pmids to integer
pmid_to_pmcid = pd.read_csv('data/PMC-ids.csv')
pmid_to_pmcid['pmid'] = pmid_to_pmcid.PMID.fillna(0.0).astype(int)

#keep the rows in the linking file that have a match from ndar; strip out letters from pmcid
referenced_papers = pmid_to_pmcid[pmid_to_pmcid.pmid.isin(collection_pmids)]
referenced_papers['pmcid'] = referenced_papers.PMCID.apply(lambda x: x[3:])
 

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [37]:
# filter to just nimh funded work
nimh_papers = pd.read_csv('output/nimh_papers.csv')
nimh_pmcids = nimh_papers.pmcid.astype('str').tolist()

referenced_papers = referenced_papers[referenced_papers.pmcid.isin(nimh_pmcids)]

In [38]:
#filter out the already-labeled ndar data
df_ndar_labs = pd.read_csv('output/labeled_data/ndar_labs.csv')
labeled_ndars = df_ndar_labs.pmcid.astype('str').tolist()
referenced_papers = referenced_papers[~referenced_papers.pmcid.isin(labeled_ndars)]

(313, 14)

In [39]:
#get the file location for the matches
#pull a sample of 100 hits
sampled_papers = pd.Series(referenced_papers.pmcid.unique()).sample(100, replace=False)

#this file was manually labeled so that I can get specific passages
sampled_papers.to_csv('output/ndar_pmcids2.csv', index=False)

  


In [359]:
sampled_papers = pd.read_csv('output/ndar_pmcids2.csv')
file_ix = pd.read_csv('output/file_index.csv')
#split out those that have shared and not shared data
no_shares = sampled_papers[sampled_papers.open_data==0]
shares = sampled_papers[sampled_papers.open_data==1]

  interactivity=interactivity, compiler=compiler, result=result)


In [360]:
no_shares_locs = file_ix[file_ix.pmcid.astype('str').isin(no_shares.PMCID.astype('str').tolist())]
text = []
repo = []
paper_offset = []
pmcid = []
doi = []
section_type = []
for i, file_path in enumerate(no_shares_locs.file):
    with open(no_shares_locs.file.iloc[i][24:]) as infile:
        dat = json.load(infile)
        paper = dat[no_shares_locs.paper_number.iloc[i]]
        section = sample_section(paper)
        text.append(section['text'])
        repo.append(np.nan)
        paper_offset.append(section['offset'])
        pmcid.append(no_shares_locs.pmcid.iloc[i])
        doi.append(np.nan)
        section_type.append(section['infons']['section_type'])
        
dat = pd.DataFrame({'text':text,
                    'repo':repo,
                    'paper_offset':paper_offset,
                    'pmcid':pmcid,
                    'doi':doi,
                    'section':section_type})

In [361]:
shares_locs = file_ix[file_ix.pmcid.astype('str').isin(shares.PMCID.astype('str').tolist())]
text = []
repo = []
paper_offset = []
pmcid = []
doi = []
section_type = []

In [362]:
i = 0
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][61]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][61]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][61]['infons']['section_type'])

In [363]:
i = 1
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][16]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][16]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][16]['infons']['section_type'])

In [364]:
i = 2
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][50]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][50]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][50]['infons']['section_type'])

In [365]:
i = 3
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][15]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][15]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][15]['infons']['section_type'])

In [366]:
i = 4
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][42]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][42]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][42]['infons']['section_type'])

In [367]:
i = 5
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][11]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][11]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][11]['infons']['section_type'])

In [368]:
i = 6
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][49]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][49]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][49]['infons']['section_type'])

In [369]:
i = 7
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][83]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][83]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][83]['infons']['section_type'])

In [370]:
i = 8
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][26]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][26]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][26]['infons']['section_type'])

In [371]:
i = 9
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][43]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][43]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][43]['infons']['section_type'])

In [372]:
i = 10
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][11]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][11]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][11]['infons']['section_type'])

In [373]:
i = 11
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][44]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][44]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][44]['infons']['section_type'])

In [374]:
i = 12
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][60]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][60]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][60]['infons']['section_type'])

In [375]:
i = 13
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][54]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][54]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][54]['infons']['section_type'])

In [376]:
i = 14
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][45]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][45]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][45]['infons']['section_type'])

In [377]:
i = 15
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][11]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][11]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][11]['infons']['section_type'])

In [378]:
i = 16
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][54]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][54]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][54]['infons']['section_type'])

In [379]:
i = 17
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][51]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][51]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][51]['infons']['section_type'])

In [380]:
dat2 = pd.DataFrame({'text':text,
                    'repo':repo,
                    'paper_offset':paper_offset,
                    'pmcid':pmcid,
                    'doi':doi,
                    'section':section_type})

In [384]:
ndar_labs = pd.concat([dat, dat2])

In [385]:
sampled_papers['pmcid'] = sampled_papers.PMCID.astype('str')
ndar_labs['pmcid'] = ndar_labs.pmcid.astype('str')
ndar_labs = ndar_labs.merge(sampled_papers[['pmcid', 'data_sharing', 'open_data']], how='left', on='pmcid')

In [386]:
ndar_labs.to_csv('output/labeled_data/ndar_labs2.csv', index=False)

# all nimh passages

In [201]:
nimh_papers = pd.read_csv('output/nimh_papers.csv')
#load file index
file_ix = pd.read_csv('output/file_index.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [202]:
file_ix['pmcid'] = file_ix.pmcid.astype('str')

In [203]:
nimh_papers['pmcid'] = nimh_papers.pmcid.astype('str')

In [204]:
target_papers = file_ix[file_ix.pmcid.isin(nimh_papers.pmcid)]
target_papers.shape

(57692, 3)

In [205]:
target_papers = target_papers.sort_values('file')

In [206]:
target_papers.head()

Unnamed: 0,file,pmcid,paper_number
102548,/home/riddleta/ac_knowl/output/full_texts/pape...,4388653,49
104899,/home/riddleta/ac_knowl/output/full_texts/pape...,4392339,2400
104875,/home/riddleta/ac_knowl/output/full_texts/pape...,4392315,2376
104785,/home/riddleta/ac_knowl/output/full_texts/pape...,4392168,2286
104484,/home/riddleta/ac_knowl/output/full_texts/pape...,4391730,1985


In [207]:
status_prints = range(0, len(target_papers.file.tolist()), 250)
len(status_prints)

231

In [218]:
data_collect = []
last_file = np.nan
for i, file in enumerate(target_papers.file.tolist()):
    if i in status_prints:
        print(i)
    if file == last_file:
        paper = dat[target_papers.paper_number.iloc[i]]
        out_dat = hit_contexts(paper, include_urls=True)
        data_collect.extend(out_dat)
    else:
        with open(file) as infile:
            dat = json.load(infile)
            paper = dat[target_papers.paper_number.iloc[i]]
            out_dat = hit_contexts(paper, include_urls=True)
            data_collect.extend(out_dat)
            last_file = file

0
250
500
750
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500
4750
5000
5250
5500
5750
6000
6250
6500
6750
7000
7250
7500
7750
8000
8250
8500
8750
9000
9250
9500
9750
10000
10250
10500
10750
11000
11250
11500
11750
12000
12250
12500
12750
13000
13250
13500
13750
14000
14250
14500
14750
15000
15250
15500
15750
16000
16250
16500
16750
17000
17250
17500
17750
18000
18250
18500
18750
19000
19250
19500
19750
20000
20250
20500
20750
21000
21250
21500
21750
22000
22250
22500
22750
23000
23250
23500
23750
24000
24250
24500
24750
25000
25250
25500
25750
26000
26250
26500
26750
27000
27250
27500
27750
28000
28250
28500
28750
29000
29250
29500
29750
30000
30250
30500
30750
31000
31250
31500
31750
32000
32250
32500
32750
33000
33250
33500
33750
34000
34250
34500
34750
35000
35250
35500
35750
36000
36250
36500
36750
37000
37250
37500
37750
38000
38250
38500
38750
39000
39250
39500
39750
40000
40250
40500
40750
41000
41250
41500
41750
42000
42250
42500
42750
43000
43250
43

In [219]:
len(data_collect)

49376

In [220]:
df = pd.DataFrame(data_collect)
df.columns = ['context', 'repo_hit', 'paper_offset', 'pmcid', 'doi', 'section']
df.head()

Unnamed: 0,context,repo_hit,paper_offset,pmcid,doi,section
0,While the number of simultaneously-recorded ne...,url_hit,1663,4392339,10.1016/j.neuron.2015.01.028,INTRO
1,Recent rapid progress with on-head digital mul...,github,23918,4392339,10.1016/j.neuron.2015.01.028,INTRO
2,Recent rapid progress with on-head digital mul...,url_hit,23918,4392339,10.1016/j.neuron.2015.01.028,INTRO
3,Spectacular progress has been made over the pa...,url_hit,36421,4392339,10.1016/j.neuron.2015.01.028,CONCL
4,"Obtaining high-quality, high-density data from...",url_hit,37891,4392339,10.1016/j.neuron.2015.01.028,CONCL


In [221]:
df.to_csv('output/hit_contexts_nimh_v2.csv', index=False)

In [92]:
out_dat['pmcid_url'] = out_dat.pmcid.apply(lambda x: 'http://ncbi.nlm.nih.gov/pmc/articles/PMC'+str(x))
out_dat['doi_url'] = out_dat.doi.apply(lambda x: 'http://doi.org/'+str(x))
out_dat.to_excel('output/hit_contexts.nimh.xlsx', index=False)

In [89]:
len(out_dat.pmcid.value_counts())

18449

## Pull pubs out of HCP

In [96]:
soup = BeautifulSoup(open('data/HCP_Publications.htm'), "html.parser")
pubs = soup.find_all('div', {"class": "publication-data-wrapper"})

In [123]:
l = []
for i in pubs:
    title = i.find('h4').text.strip()
    auths = i.find('div', {'class':'publication-authors'}).text.strip()
    journal = i.find('span', {'class':'publication-data-name'}).text.strip()
    date = i.find('span', {'class':'publication-data-date'}).text.strip()
    try:
        pmid = i.find('span', {'class':'publication-data-pmid'}).text.strip()[6:]
    except:
        pmid = np.nan
    try:
        doi_href = i.find('span', {'class':'publication-data-name'}).find('a')['href']
    except:
        doi_href = np.nan
    l.append([title, auths, journal, date, pmid, doi_href])

In [128]:
df = pd.DataFrame(l, columns = ['title', 'auths', 'journal', 'date', 'pmid', 'doi_href'])

In [129]:
df.to_csv('output/hcp_pubs.csv', index=False)

## Pull out some passages identified via regex (again)

In [27]:
# load previously identified hits and papers funded by nimh
hits = pd.read_csv('output/hit_contexts.csv') #need the subset that is in nimh funded papers
nimh_papers = pd.read_csv('output/nimh_papers.csv')
#load file index
file_ix = pd.read_csv('output/file_index.csv')
#filter the hits to just nimh papers
nimh_hits = hits[hits.pmcid.isin(nimh_papers.pmcid)]
nimh_hits.shape

(5726, 5)

In [28]:
# filter out the previously sampled cases
excludes = pd.read_csv('output/labeled_data/regex_hits.csv')
nimh_hits = nimh_hits[~nimh_hits.pmcid.isin(excludes.pmcid)]
print(nimh_hits.shape)
#pull a sample of 100 hits
sampled_papers = pd.Series(nimh_hits.pmcid.unique()).sample(100, replace=False)
sampled_papers.shape

(5470, 5)


(100,)

In [29]:
#get location of sampled hits
file_locs = file_ix[file_ix.pmcid.isin(sampled_papers)]
data_collect = []
for i, file_path in enumerate(file_locs.file):
    with open(file_path[24:]) as infile:
        dat = json.load(infile)
        paper = dat[file_locs.paper_number.iloc[i]]
        out = hit_contexts(paper)
        data_collect.extend(out)

In [30]:
df_to_label = pd.DataFrame(data_collect, columns=['text', 'repo', 'paper_offset', 'pmcid', 'doi', 'section'])
df_to_label = df_to_label.drop_duplicates(subset=['pmcid'])
#df_to_label.to_csv('output/labeled_data/regex_hits3.csv', index=False)
df_to_label.to_csv('output/labeled_data/regex_hits4.csv', index=False)

In [22]:
df_to_label.head()

Unnamed: 0,text,repo,paper_offset,pmcid,doi,section
0,MRI data from the Human Connectome Project (ht...,humanconnectome.org,16764,5837394,10.1093/brain/awx309,METHODS
1,Variability in single-subject whole-brain func...,nitrc,19580,4146649,10.1038/nn.3778,METHODS
2,Genotype calls and hybridization intensity dat...,github,31722,4751547,10.1534/g3.115.022087,METHODS
4,Gene expression was measured by RNAseq and qua...,github,8101,5561488,10.1002/ajmg.a.38327,METHODS
5,The GTEx genotype and gene expression data wer...,dbgap,40358,4609956,10.1038/srep15145,METHODS


## Additional cases - NDAR selections3

In [31]:
ndar_collections = glob.glob('output/ndar_collections/*')
ndar_collections.sort()

In [32]:
l = []
for collection in ndar_collections:
    soup = BeautifulSoup(open(collection), "html.parser")
    table = soup.find_all(id='publication-table')[0]
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [tr.text for tr in td]
        l.append(row)
        
collection_pubs = pd.DataFrame(l, columns = ['pmid', 'study', 'title', 'journal', 'authors', 'date', 'status'])

In [33]:
#pick out the pmids for the studies marked as relevant
collection_pmids = pd.Series(collection_pubs.pmid[collection_pubs.status=='Relevant'].unique())

#read in the linking file and convert pmids to integer
pmid_to_pmcid = pd.read_csv('data/PMC-ids.csv')
pmid_to_pmcid['pmid'] = pmid_to_pmcid.PMID.fillna(0.0).astype(int)

#keep the rows in the linking file that have a match from ndar; strip out letters from pmcid
referenced_papers = pmid_to_pmcid[pmid_to_pmcid.pmid.isin(collection_pmids)]
referenced_papers['pmcid'] = referenced_papers.PMCID.apply(lambda x: x[3:])
 

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [34]:
# filter to just nimh funded work
nimh_papers = pd.read_csv('output/nimh_papers.csv')
nimh_pmcids = nimh_papers.pmcid.astype('str').tolist()

referenced_papers = referenced_papers[referenced_papers.pmcid.isin(nimh_pmcids)]

In [35]:
#filter out the already-labeled ndar data
ndar_lab_files = glob.glob('output/labeled_data/ndar*.csv')
df_ndar_labs = pd.concat((pd.read_csv(f) for f in ndar_lab_files)).reset_index()
labeled_ndars = df_ndar_labs.pmcid.astype('str').tolist()
referenced_papers = referenced_papers[~referenced_papers.pmcid.isin(labeled_ndars)]

In [36]:
#get the file location for the matches
#pull a sample of 100 hits
sampled_papers = pd.Series(referenced_papers.pmcid.unique()).sample(100, replace=False)

#this file was manually labeled so that I can get specific passages
sampled_papers.to_csv('output/ndar_pmcids3.csv', index=False)

  


In [195]:
sampled_papers = pd.read_csv('output/ndar_pmcids3.csv')
sampled_papers['pmcid'] = sampled_papers.PMCID.astype('int').astype('str')
file_ix = pd.read_csv('output/file_index.csv')
#split out those that have shared and not shared data
no_shares = sampled_papers[sampled_papers.open_data==0]
shares = sampled_papers[sampled_papers.open_data==1]

In [56]:
no_shares_locs = file_ix[file_ix.pmcid.astype('str').isin(no_shares.PMCID.astype('int').astype('str').tolist())]
text = []
repo = []
paper_offset = []
pmcid = []
doi = []
section_type = []
for i, file_path in enumerate(no_shares_locs.file):
    with open(no_shares_locs.file.iloc[i][24:]) as infile:
        dat = json.load(infile)
        paper = dat[no_shares_locs.paper_number.iloc[i]]
        section = sample_section(paper)
        text.append(section['text'])
        repo.append(np.nan)
        paper_offset.append(section['offset'])
        pmcid.append(no_shares_locs.pmcid.iloc[i])
        doi.append(np.nan)
        section_type.append(section['infons']['section_type'])
        
dat = pd.DataFrame({'text':text,
                    'repo':repo,
                    'paper_offset':paper_offset,
                    'pmcid':pmcid,
                    'doi':doi,
                    'section':section_type})

In [59]:
shares_locs = file_ix[file_ix.pmcid.astype('str').isin(shares.PMCID.astype('int').astype('str').tolist())]
text = []
repo = []
paper_offset = []
pmcid = []
doi = []
section_type = []

In [60]:
shares_locs.shape

(11, 3)

In [186]:
i = 0
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][31]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][31]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][31]['infons']['section_type'])
i = 1
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][14]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][14]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][14]['infons']['section_type'])
i = 2
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][0]['infons']['notes'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][0]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append('NOTES')
i = 3
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][26]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][26]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][26]['infons']['section_type'])
i = 4
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][40]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][40]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][40]['infons']['section_type'])
i = 5
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][19]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][19]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][19]['infons']['section_type'])
i = 6
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][15]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][15]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][15]['infons']['section_type'])
i = 7
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][47]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][47]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][47]['infons']['section_type'])
i = 8
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][77]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][77]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][77]['infons']['section_type'])
i = 9
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][34]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][34]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][34]['infons']['section_type'])
i = 10
with open(shares_locs.file.iloc[i]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[i]]
    text.append(paper['documents'][0]['passages'][84]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][84]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[i])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][84]['infons']['section_type'])

In [187]:
dat2 = pd.DataFrame({'text':text,
                    'repo':repo,
                    'paper_offset':paper_offset,
                    'pmcid':pmcid,
                    'doi':doi,
                    'section':section_type})

In [199]:
ndar_labs = pd.concat([dat, dat2])
ndar_labs['pmcid'] = ndar_labs.pmcid.astype('str')
ndar_labs = ndar_labs.merge(sampled_papers[['pmcid', 'data_sharing', 'open_data']], how='left', on='pmcid')
ndar_labs.to_csv('output/labeled_data/ndar_labs3.csv', index=False)