In [98]:
import pandas as pd
import re
import glob
import json
from bs4 import BeautifulSoup
import random
import numpy as np

# Making datasets for labeling
There are several stages to this, and I'm going to pull samples to label from three sources.

First, I'm limiting this to the subset of papers that are funded by the NIMH.

Using that subset, I'm going to label 'passages' for whether they contain an instance of data sharing. A passage roughly corresponds to a paragraph in the paper, but sometimes is a footnote or a reference or a title. It's an organizational unit within the PMC fulltext databnase.

The sources that I'm pulling these passages from are:

1. Contexts previously identified as possible data-sharing using regular expressions.
2. A list of papers described as having shared data using NDAR. For this, I'll attempt to use any passages where data sharing is described
3. A random selection of passsages that are unlikely to contain data-sharing.

In [190]:
#function to pull segments for labeling
def hit_contexts(r_out):
    out_dat = []
    
    reg_matches = re.compile(r"""(github)|(osf\.io)|(nda\.nih\.gov)|(openneuro)|(\sndar)|
                                 (national database for autism research)|(brain-map\.org)|
                                 (humanconnectome\.org)|(balsa\.wustl\.edu)|(loni\.usc\.edu)|
                                 (ida\.loni\.usc\.edu)|(fmridc)|(ccrns)|(datalad)|(dataverse)|
                                 (dbgap)|(nih\.gov\/gap)|(dryad)|(figshare)|(fcon_1000\.projects)|
                                 (nitrc)|(mcgill\.ca\/bic\/resources\/omega)|(xnat\.org)|
                                 (zenodo)|(opendata\.aws)""", re.X)
    

    try:
        tmp_doi = r_out['documents'][0]['passages'][0]['infons']['article-id_doi']
    except:
        tmp_doi = None
    try:
        tmp_pmcid = r_out['documents'][0]['id']
    except:
        tmp_pmcid = None
        
    for passage in r_out['documents'][0]['passages']:
        m = re.finditer(reg_matches, passage['text'].lower())
        if m:
            segments = [(max(0, item.start()-175),
                         min(len(passage['text']), item.end(0)+125),
                         item.group()) for item in m]
            
            try:
                section_type = passage['infons']['section_type']
            except:
                section_type=None
            
            for seg in segments:
                out_dat.append([passage['text'], #delete the seg[0]:seg[1] and you'll have the full segment
                                seg[2], #ids which repo
                                passage['offset'], #how far into the paper?
                                tmp_pmcid, 
                                tmp_doi, 
                                section_type])
        else:
            out_dat.append([None, None, tmp_pmcid, tmp_doi, None])
        
    return(out_dat)

def sample_section(paper):
    candidate = []
    for i in paper['documents'][0]['passages']:
        if i['infons']['section_type'] != 'REF':
            candidate.append(i)
    
    selection = random.choice(candidate)
    
    return(selection)

## Contexts previously identified as probable data sharing (via regex)

In [65]:
# load previously identified hits and papers funded by nimh
hit_contexts = pd.read_csv('output/hit_contexts.csv') #need the subset that is in nimh funded papers
nimh_papers = pd.read_csv('output/nimh_papers.csv')
#load file index
file_ix = pd.read_csv('output/file_index.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [52]:
#filter the hits to just nimh papers
nimh_hits = hit_contexts[hit_contexts.pmcid.isin(nimh_papers.pmcid)]
nimh_hits.shape

(5726, 5)

In [96]:
#pull a sample of 100 hits
sampled_papers = pd.Series(nimh_hits.pmcid.unique()).sample(100, replace=False)
sampled_papers.shape

(100,)

In [97]:
#get location of sampled hits
file_locs = file_ix[file_ix.pmcid.isin(sampled_papers)]

In [98]:
file_locs.shape

(93, 3)

In [76]:
data_collect = []
for i, file_path in enumerate(file_locs.file):
    with open(file_path[24:]) as infile:
        dat = json.load(infile)
        paper = dat[file_locs.paper_number.iloc[i]]
        out = hit_contexts(paper)
        data_collect.extend(out)

In [99]:
df_to_label = pd.DataFrame(data_collect, columns=['text', 'repo', 'paper_offset', 'pmcid', 'doi', 'section'])
df_to_label = df_to_label.drop_duplicates(subset=['pmcid'])
df_to_label.to_csv('output/labeled_data/regex_hits.csv', index=False)

## Papers identified through NDAR

This is a convoluted process. First, I pull out all the papers mentioned in scraped NDA collections as being relevant. Next, I got a sample of those papers, and manually labeled them for containing instances of data sharing. With those labeled papers, I pulled out the section of text in each paper that mentions data sharing. If there was no such mention, then I just pulled out a random section (leaving out anything that was a reference). I then put these data together and wrote them as a csv file (`ndar_labs.csv`). Though I aimed for a sample of 100, I ended up with 65 due to a few papers being embargoed, and some papers listed in NDAR not being in the full-text database that I have.

In [2]:
ndar_collections = glob.glob('output/ndar_collections/*')
ndar_collections.sort()

In [54]:
len(ndar_collections)

1203

In [10]:
l = []
for collection in ndar_collections:
    soup = BeautifulSoup(open(collection), "html.parser")
    table = soup.find_all(id='publication-table')[0]
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [tr.text for tr in td]
        l.append(row)
        
collection_pubs = pd.DataFrame(l, columns = ['pmid', 'study', 'title', 'journal', 'authors', 'date', 'status'])

In [39]:
#pick out the pmids for the studies marked as relevant
collection_pmids = pd.Series(collection_pubs.pmid[collection_pubs.status=='Relevant'].unique())

In [40]:
#read in the linking file and convert pmids to integer
pmid_to_pmcid = pd.read_csv('data/PMC-ids.csv')
pmid_to_pmcid['pmid'] = pmid_to_pmcid.PMID.fillna(0.0).astype(int)

  interactivity=interactivity, compiler=compiler, result=result)


In [41]:
pmid_to_pmcid.head()

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date,pmid
0,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,55,,PMC13900,11250746.0,,live,11250746
1,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,61,,PMC13901,11250747.0,,live,11250747
2,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,66,,PMC13902,11250748.0,,live,11250748
3,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,59,10.1186/bcr29,PMC13911,11056684.0,,live,11056684
4,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,64,,PMC13912,11400682.0,,live,11400682


In [43]:
#keep the rows in the linking file that have a match from ndar; strip out letters from pmcid
referenced_papers = pmid_to_pmcid[pmid_to_pmcid.pmid.isin(collection_pmids)]
referenced_papers['pmcid'] = referenced_papers.PMCID.apply(lambda x: x[3:])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [50]:
#get the file location for the matches
#pull a sample of 100 hits
sampled_papers = pd.Series(referenced_papers.pmcid.unique()).sample(100, replace=False)
#this file was manually labeled so that I can get specific passages
sampled_papers.to_csv('output/ndar_pmcids.csv', index=False) 

  after removing the cwd from sys.path.


In [58]:
sampled_papers = pd.read_csv('output/ndar_pmcids.csv')
#split out those that have shared and not shared data
no_shares = sampled_papers[sampled_papers.data_sharing=='0']
shares = sampled_papers[sampled_papers.data_sharing=='1']

In [109]:
no_shares_locs = file_ix[file_ix.pmcid.isin(no_shares.PMCID.tolist())]
text = []
repo = []
paper_offset = []
pmcid = []
doi = []
section_type = []
for i, file_path in enumerate(no_shares_locs.file):
    with open(no_shares_locs.file.iloc[i][24:]) as infile:
        dat = json.load(infile)
        paper = dat[no_shares_locs.paper_number.iloc[i]]
        section = sample_section(paper)
        text.append(section['text'])
        repo.append(np.nan)
        paper_offset.append(section['offset'])
        pmcid.append(no_shares_locs.pmcid.iloc[i])
        doi.append(np.nan)
        section_type.append(section['infons']['section_type'])
        
dat = pd.DataFrame({'text':text,
                    'repo':repo,
                    'paper_offset':paper_offset,
                    'pmcid':pmcid,
                    'doi':doi,
                    'section':section_type})

In [165]:
shares_locs = file_ix[file_ix.pmcid.isin(shares.PMCID.tolist())]
text = []
repo = []
paper_offset = []
pmcid = []
doi = []
section_type = []


In [166]:
with open(shares_locs.file.iloc[0][24:]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[0]]
    text.append(paper['documents'][0]['passages'][77]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][77]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[0])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][77]['infons']['section_type'])

In [167]:
with open(shares_locs.file.iloc[2][24:]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[2]]
    text.append(paper['documents'][0]['passages'][62]['text'])
    repo.append('GEO')
    paper_offset.append(paper['documents'][0]['passages'][62]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[2])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][62]['infons']['section_type'])

In [168]:
with open(shares_locs.file.iloc[3][24:]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[3]]
    text.append(paper['documents'][0]['passages'][88]['text'])
    repo.append('OSF')
    paper_offset.append(paper['documents'][0]['passages'][88]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[3])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][88]['infons']['section_type'])

In [169]:
with open(shares_locs.file.iloc[4][24:]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[4]]
    text.append(paper['documents'][0]['passages'][47]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][47]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[4])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][47]['infons']['section_type'])

In [170]:
with open(shares_locs.file.iloc[5][24:]) as infile:
    share_dat = json.load(infile)
    paper = share_dat[shares_locs.paper_number.iloc[5]]
    text.append(paper['documents'][0]['passages'][91]['text'])
    repo.append('NDAR')
    paper_offset.append(paper['documents'][0]['passages'][91]['offset'])
    pmcid.append(shares_locs.pmcid.iloc[5])
    doi.append(np.nan)
    section_type.append(paper['documents'][0]['passages'][91]['infons']['section_type'])

In [172]:
dat2 = pd.DataFrame({'text':text,
                    'repo':repo,
                    'paper_offset':paper_offset,
                    'pmcid':pmcid,
                    'doi':doi,
                    'section':section_type})

In [175]:
ndar_labs = pd.concat([dat, dat2])

In [176]:
ndar_labs.to_csv('output/labeled_data/ndar_labs.csv', index=False)

## Now do a random selection of other passages

In [177]:
nimh_papers = pd.read_csv('output/nimh_papers.csv')
#load file index
file_ix = pd.read_csv('output/file_index.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [178]:
#pull a sample of 100 hits
sampled_papers = pd.Series(nimh_papers.pmcid.unique()).sample(100, replace=False)
sampled_papers.shape

(100,)

In [179]:
file_locs = file_ix[file_ix.pmcid.isin(sampled_papers)]
file_locs.shape

(89, 3)

In [180]:
out_dat = []
for i, file_path in enumerate(file_locs.file):
    with open(file_path[24:]) as infile:
        dat = json.load(infile)
        paper = dat[file_locs.paper_number.iloc[i]]
        sec = sample_section(paper)
        out_dat.append([sec['text'], #delete the seg[0]:seg[1] and you'll have the full segment
                np.nan, #ids which repo
                sec['offset'], #how far into the paper?
                file_locs.pmcid.iloc[i], 
                np.nan, 
                sec['infons']['section_type']])
        


In [183]:
temp = pd.DataFrame(out_dat, columns=['text', 'repo', 'paper_offset', 'pmcid', 'doi', 'section'])
temp.to_csv('output/labeled_data/random_selections.csv', index=False)

# We need to add some additional cases

First, I'm going to pull out some of the instances identified via regex matches.

In [186]:
# load previously identified hits and papers funded by nimh
hit_contexts = pd.read_csv('output/hit_contexts.csv') #need the subset that is in nimh funded papers
nimh_papers = pd.read_csv('output/nimh_papers.csv')
#load file index
file_ix = pd.read_csv('output/file_index.csv')
#filter the hits to just nimh papers
nimh_hits = hit_contexts[hit_contexts.pmcid.isin(nimh_papers.pmcid)]
nimh_hits.shape

  interactivity=interactivity, compiler=compiler, result=result)


(5726, 5)

In [187]:
# filter out the previously sampled cases
excludes = pd.read_csv('output/labeled_data/regex_hits.csv')
nimh_hits = nimh_hits[~nimh_hits.pmcid.isin(excludes.pmcid)]
print(nimh_hits.shape)
#pull a sample of 100 hits
sampled_papers = pd.Series(nimh_hits.pmcid.unique()).sample(100, replace=False)
sampled_papers.shape

(5463, 5)


(100,)

In [191]:
#get location of sampled hits
file_locs = file_ix[file_ix.pmcid.isin(sampled_papers)]
data_collect = []
for i, file_path in enumerate(file_locs.file):
    with open(file_path[24:]) as infile:
        dat = json.load(infile)
        paper = dat[file_locs.paper_number.iloc[i]]
        out = hit_contexts(paper)
        data_collect.extend(out)

In [192]:
df_to_label = pd.DataFrame(data_collect, columns=['text', 'repo', 'paper_offset', 'pmcid', 'doi', 'section'])
df_to_label = df_to_label.drop_duplicates(subset=['pmcid'])
df_to_label.to_csv('output/labeled_data/regex_hits2.csv', index=False)