In [1]:
import pandas as pd
import glob
import json
import re
from urlextract import URLExtract
# None means "never truncate cell text" so full contexts are shown. The older
# -1 spelling has been deprecated since pandas 1.0 and is no longer accepted
# by current releases.
pd.set_option('display.max_colwidth', None)

# Pulling the context of a hit

This notebook contains code to go through the aggregated data and pull out the context of each hit. This makes it easier to check how often our regex patterns lead to false positives.

In [2]:
# Single URLExtract instance, created once here and read as a module-level
# name by hit_contexts (via extractor.has_urls) for every passage it scans.
extractor = URLExtract()

# Repository names / URL fragments whose mentions count as a hit.  The pattern
# is compiled in verbose mode (re.X), where unescaped whitespace inside the
# pattern is ignored; the spaces in the NDAR full name are therefore written
# as \s+ -- written as plain spaces that alternative could never match.
# Compiled once here instead of on every call.
_REPO_PATTERN = re.compile(r"""(github)|(osf\.io)|(nda\.nih\.gov)|(openneuro)|(\sndar)|
                               (national\s+database\s+for\s+autism\s+research)|(brain-map\.org)|
                               (humanconnectome\.org)|(balsa\.wustl\.edu)|(loni\.usc\.edu)|
                               (ida\.loni\.usc\.edu)|(fmridc)|(ccrns)|(datalad)|(dataverse)|
                               (dbgap)|(nih\.gov\/gap)|(dryad)|(figshare)|(fcon_1000\.projects)|
                               (nitrc)|(mcgill\.ca\/bic\/resources\/omega)|(xnat\.org)|
                               (zenodo)|(opendata\.aws)""", re.X)

# Errors that mean "this field is not present in the record".  Catching only
# these (rather than a bare except) lets KeyboardInterrupt and genuine bugs
# propagate during the long batch run.
_LOOKUP_ERRORS = (KeyError, IndexError, TypeError)


def _section_type(passage):
    """Return passage['infons']['section_type'], or None when it is absent."""
    try:
        return passage['infons']['section_type']
    except _LOOKUP_ERRORS:
        return None


def hit_contexts(r_out, before=175, after=125):
    """Collect the text surrounding every repository mention in one paper.

    Parameters
    ----------
    r_out : dict
        One paper record as loaded from the full-text JSON files.  Only the
        first entry of r_out['documents'] is read: its 'id' and its list of
        'passages' (each with a 'text' string and an 'infons' dict).
    before, after : int
        Number of characters of context kept before the start and after the
        end of each regex match.  Defaults reproduce the 175 / 125 window the
        original code computed.

    Returns
    -------
    list of list
        Rows of [context, repo_hit, pmcid, doi, section]:
        * one row per regex match, where context is the window of passage
          text around the match and repo_hit is the lowercase matched text;
        * one row [full passage text, 'url_hit', ...] for each passage in
          which the module-level `extractor` reports a URL;
        * one row [None, None, pmcid, doi, None] for each passage in which
          it reports none.
        An empty list is returned when the record has no readable document
        or passages.
    """
    out_dat = []

    try:
        document = r_out['documents'][0]
        passages = document['passages']
    except _LOOKUP_ERRORS:
        # Nothing to scan; previously this surfaced as an uncaught error.
        return out_dat
    if not passages:
        return out_dat

    # The DOI is read from the infons of the first passage only.
    try:
        tmp_doi = passages[0]['infons']['article-id_doi']
    except _LOOKUP_ERRORS:
        tmp_doi = None
    try:
        tmp_pmcid = document['id']
    except _LOOKUP_ERRORS:
        tmp_pmcid = None

    for passage in passages:
        text = passage['text']
        lowered = text.lower()
        section_type = _section_type(passage)

        # Match offsets come from the lowercased copy and are applied to the
        # original text so the stored context keeps its original casing.
        for match in _REPO_PATTERN.finditer(lowered):
            window_start = max(0, match.start() - before)
            window_end = min(len(text), match.end() + after)
            out_dat.append([text[window_start:window_end],
                            match.group(),
                            tmp_pmcid,
                            tmp_doi,
                            section_type])

        if extractor.has_urls(lowered):
            out_dat.append([text,
                            'url_hit',
                            tmp_pmcid,
                            tmp_doi,
                            section_type])
        else:
            # NOTE(review): this placeholder is emitted for every passage
            # without a URL, even when the passage produced regex hits above.
            # Kept as in the original; it is likely why the collected output
            # is dominated by empty rows -- confirm whether one placeholder
            # per paper (or none) was intended.
            out_dat.append([None, None, tmp_pmcid, tmp_doi, None])

    return out_dat

In [3]:
# Every retrieved full-text file, in sorted path order.
already_retrieved = sorted(
    glob.glob('/home/riddleta/ac_knowl/output/full_texts/papes*')
)
# Every third path; the collection loop prints a path when it appears in this
# list, giving a coarse progress indicator.
status_prints = already_retrieved[0::3]
len(status_prints)

350

In [4]:
# Run hit_contexts over every paper of every retrieved file and pool the rows.
data_collect = []
for i in already_retrieved:
    # Print the paths listed in status_prints as a progress indicator.
    if i in status_prints:
        print(i)
    with open(i) as infile:
        dat = json.load(infile)
    # The file has been parsed completely, so the handle is released before
    # the papers are scanned.
    for j, paper in enumerate(dat):
        out_dat = hit_contexts(paper)
        data_collect += out_dat
              

/home/riddleta/ac_knowl/output/full_texts/papes_0.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1000000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1007500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1015000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1022500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1027500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1035000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1042500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_105000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1055000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1062500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1070000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1075000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1082500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1090000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1097500.txt
/home/riddleta/ac_knowl/output/full_texts/papes

/home/riddleta/ac_knowl/output/full_texts/papes_1922500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1927500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1935000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1942500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_195000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1955000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1962500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1970000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1975000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1982500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1990000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_1997500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_2000000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_2007500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_2015000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_2022500.txt
/home/riddleta/ac_knowl/output/full_texts

/home/riddleta/ac_knowl/output/full_texts/papes_497500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_500000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_507500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_515000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_522500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_527500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_535000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_542500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_55000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_555000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_562500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_570000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_575000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_582500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_590000.txt
/home/riddleta/ac_knowl/output/full_texts/papes_597500.txt
/home/riddleta/ac_knowl/output/full_texts/papes_602500.tx

In [105]:
# Same collection as the full run, restricted to the second retrieved file.
data_collect = []
with open(already_retrieved[1]) as infile:
    dat = json.load(infile)
# Parsing is finished, so the papers are scanned after the file is closed.
for j, paper in enumerate(dat):
    out_dat = hit_contexts(paper)
    data_collect += out_dat

In [5]:
# Number of rows accumulated in data_collect (one list per row returned by hit_contexts).
len(data_collect)

287704732

In [125]:
# Column names are given to the constructor instead of being assigned to
# df.columns afterwards: the assignment raises a length-mismatch ValueError
# when data_collect is empty, whereas this form yields an empty frame with
# the expected five columns.
df = pd.DataFrame(data_collect,
                  columns=['context', 'repo_hit', 'pmcid', 'doi', 'section'])
# Persist the rows without the positional index.
df.to_csv('/home/riddleta/ac_knowl/output/hit_contexts.csv', index=False)

In [132]:
# Count rows per distinct repo_hit value, most frequent first.
df.repo_hit.value_counts()

github                           54824
dryad                            19217
figshare                         13044
dbgap                            6927 
zenodo                           4954 
nitrc                            3964 
osf.io                           3606 
brain-map.org                    2108 
loni.usc.edu                     1602 
dataverse                        954  
 ndar                            927  
nih.gov/gap                      626  
fcon_1000.projects               595  
humanconnectome.org              424  
fmridc                           230  
openneuro                        105  
ccrns                            86   
ida.loni.usc.edu                 77   
balsa.wustl.edu                  70   
xnat.org                         65   
datalad                          37   
\tndar                           13   
\nndar                           9    
 ndar                            2    
opendata.aws                     1    
mcgill.ca/bic/resources/o

In [207]:
#github - all kinds of stuff
#dryad - mostly posting data!
#figshare - mostly posting data!
#dbgap - mixture of posting data and reuse
#zenodo - mixture? zenodo seems to host lots of things
#nitrc - too general of a search term? includes lots of refs to BrainNet Viewer
#osf - mostly data!
#brain-map.org - mostly refs to already existing data
#loni - refs to existing data
#dataverse - mixture. seems like there are multiple things with the title 'dataverse'
#ndar - not many direct links. Lots of discussion. Plus a couple of things that aren't ndar: (nDart1 (gene?), tGuet Ndar (?), numpy ndarray...)
#nih.gov/gap - lots of reuse
#fcon_1000 - looks like lots of reuse
#humanconnectome - lots of reuse
#fmridc - seems mostly people talking about fmridc
#openneuro - looks like data posting
#ccrns - seems like an acronym for something else
#ida.loni.usc.edu - downloads
#balsa - data posting!
#xnat - some data sharing? lots of things that don't appear to be data
#datalad - posting data and talking about datalad
#opendata.aws - one instance of reuse
#omega - one instance of just a link (appears to be reuse)

# % of 2.6 mil that are nimh funded
# percentage of all funded nimh that is in the 2.6 million https://federalreporter.nih.gov/FileDownload
#other tools to see if we can get a sense of all papers from the same timeframe.

# Keep only the rows whose hit label is 'zenodo', then show five of them
# chosen at random for manual inspection.
sample_hits = df.loc[df['repo_hit'] == 'zenodo']
sample_hits.sample(n=5)

Unnamed: 0,context,repo_hit,pmcid,doi,section
58643,"chive (http://www.ebi.ac.uk/ena) under accession number PRJEB14185. Additionally, aligned reads in BAM format for all Strand-seq libraries used in this study are available at Zenodo (doi: 10.5281/zenodo.1203703). PacBio reads are available from the Sequence Reads Archive (https://www.ncbi.nlm.nih.gov/sra/",zenodo,6022540,10.1093/bioinformatics/bty290,INTRO
63114,"Fluxes were determined through computer assisted metabolic engineering and optimization (CAMEO) toolbox (doi: 10.5281/zenodo.18400) using parsimonious flux balance analysis (pFBA). A modified iTO977 yeast model was used, for which the fluxes were bo",zenodo,4791802,10.1186/s12934-016-0451-5,METHODS
99298,ailable via GitHub (https://github.com/egking/QTLbiasSIM). The same set of code with nearly all raw and intermediate data files is available at Zenodo: http://doi.org/10.5281/zenodo.438140.,zenodo,5473746,10.1534/g3.117.041426,METHODS
104774,The datasets analysed in the current study are available at https://doi.org/10.5281/zenodo.846863 and in the OpenfMRI repository at https://openfmri.org/dataset/ds000201/.,zenodo,5612991,10.1038/s41598-017-12098-9,METHODS
80856,"Pritt J, Chen N, Langmead B. Forge software. 2018d. Zenodo. 10.5281/zenodo.1482926.",zenodo,6296055,10.1186/s13059-018-1595-x,REF


In [3]:
# Load one full-text file so individual paper records can be inspected.
with open('/home/riddleta/ac_knowl/output/full_texts/papes_1402500.txt') as infile:
    dat = json.load(infile)

In [7]:
# Display the record at index 2439 of the loaded file; the trailing comment is
# a drill-down path into a single passage that can be re-enabled by hand.
dat[2439]#['documents'][0]['passages'][17]

{'date': '20190204',
 'source': 'PMC',
 'infons': {},
 'documents': [{'passages': [{'text': 'Diagnosis of cysticercosis in endemic regions',
     'offset': 0,
     'relations': [],
     'infons': {'name_2': 'surname:Gilman;given-names:R.',
      'name_3': 'surname:Herrera;given-names:G.',
      'name_0': 'surname:Garcia;given-names:H. H.',
      'name_1': 'surname:Martinez;given-names:M.',
      'name_6': 'surname:Diaz;given-names:F.',
      'name_7': 'surname:Verastegui;given-names:M.',
      'name_4': 'surname:Tsang;given-names:V. C. W.',
      'name_5': 'surname:Pilcher;given-names:J. B.',
      'name_8': 'surname:Gallo;given-names:C.',
      'name_9': 'surname:Porras;given-names:M.',
      'lpage': '551',
      'fpage': '549',
      'year': '2010',
      'section_type': 'TITLE',
      'article-id_pmid': '1678809',
      'issue': '8766',
      'volume': '338',
      'article-id_pmc': '2913119',
      'article-id_manuscript': 'UKMS31615',
      'license': '\n          This file is av