In [2]:
import csv
import collections
import pprint
import lxml.etree as ET
from copy import deepcopy
import json
import functools

In [3]:
with open('data/cleanup/venue_citations_graph.csv') as f:
    rdr = csv.reader(f)
    head = next(rdr)
    rows = [r for r in rdr]
rows

[['http://vivo.brown.edu/individual/n52190',
  'http://vivo.brown.edu/ontology/citation#date',
  '2009-09-01',
  'http://vitro.mannlib.cornell.edu/default/vitro-kb-2'],
 ['http://vivo.brown.edu/individual/n52190',
  'http://vivo.brown.edu/ontology/citation#volume',
  '18',
  'http://vitro.mannlib.cornell.edu/default/vitro-kb-2'],
 ['http://vivo.brown.edu/individual/n52190',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
  'http://vivo.brown.edu/ontology/citation#Citation',
  'http://vitro.mannlib.cornell.edu/default/vitro-kb-inf'],
 ['http://vivo.brown.edu/individual/n52190',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
  'http://www.w3.org/2002/07/owl#Thing',
  'http://vitro.mannlib.cornell.edu/default/vitro-kb-inf'],
 ['http://vivo.brown.edu/individual/n52190',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
  'http://vivo.brown.edu/ontology/citation#Article',
  'http://vitro.mannlib.cornell.edu/default/vitro-kb-2'],
 ['http://vivo.brown.edu/individual/n52190',
  '

In [4]:
short_ids = [ r[2] for r in rows if r[1] == 'http://vivo.brown.edu/ontology/citation#hasContributor']
','.join([ "'{}'".format(s[33:]) for s in short_ids])

"'dsavitz','fdomini','ssun','dwazermd','jheadiii','jwandsmd','ll7','dsavitz','gkarniad','cshu','ktkelsey','dcoustan','ssun','mbossy','jwandsmd','bsandste','mranney','wwarrenj','ktkelsey','dkielmd','johsilve','gwessel','ktkelsey','jheadiii','eupfal'"

In [5]:
cite_props = { r[1] for r in rows if r[1] != 'http://vivo.brown.edu/ontology/citation#venueFor'}
cite_props

{'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
 'http://vivo.brown.edu/ontology/citation#authorList',
 'http://vivo.brown.edu/ontology/citation#date',
 'http://vivo.brown.edu/ontology/citation#doi',
 'http://vivo.brown.edu/ontology/citation#eissn',
 'http://vivo.brown.edu/ontology/citation#hasContributor',
 'http://vivo.brown.edu/ontology/citation#hasVenue',
 'http://vivo.brown.edu/ontology/citation#issn',
 'http://vivo.brown.edu/ontology/citation#issue',
 'http://vivo.brown.edu/ontology/citation#pages',
 'http://vivo.brown.edu/ontology/citation#pmcid',
 'http://vivo.brown.edu/ontology/citation#pmid',
 'http://vivo.brown.edu/ontology/citation#publishedIn',
 'http://vivo.brown.edu/ontology/citation#url',
 'http://vivo.brown.edu/ontology/citation#volume',
 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
 'http://www.w3.org/2000/01/rdf-schema#label'}

In [6]:
decl_props = { r[1] for r in rows if r[3] != 'http://vitro.mannlib.cornell.edu/default/vitro-kb-inf'}
decl_props

{'http://vivo.brown.edu/ontology/citation#authorList',
 'http://vivo.brown.edu/ontology/citation#date',
 'http://vivo.brown.edu/ontology/citation#doi',
 'http://vivo.brown.edu/ontology/citation#eissn',
 'http://vivo.brown.edu/ontology/citation#hasContributor',
 'http://vivo.brown.edu/ontology/citation#hasVenue',
 'http://vivo.brown.edu/ontology/citation#issn',
 'http://vivo.brown.edu/ontology/citation#issue',
 'http://vivo.brown.edu/ontology/citation#pages',
 'http://vivo.brown.edu/ontology/citation#pmcid',
 'http://vivo.brown.edu/ontology/citation#pmid',
 'http://vivo.brown.edu/ontology/citation#publishedIn',
 'http://vivo.brown.edu/ontology/citation#url',
 'http://vivo.brown.edu/ontology/citation#volume',
 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
 'http://www.w3.org/2000/01/rdf-schema#label'}

In [7]:
inf_props = { r[1] for r in rows if r[3] == 'http://vitro.mannlib.cornell.edu/default/vitro-kb-inf' }
inf_props

{'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
 'http://vivo.brown.edu/ontology/citation#venueFor',
 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'}

In [8]:
inf_types = { r[2] for r in rows if r[3] == 'http://vitro.mannlib.cornell.edu/default/vitro-kb-inf'
            and r[1] == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'}
inf_types

{'http://vivo.brown.edu/ontology/citation#Citation',
 'http://www.w3.org/2002/07/owl#Thing'}

In [9]:
inf_types = { r[2] for r in rows if r[3] != 'http://vitro.mannlib.cornell.edu/default/vitro-kb-inf'
            and r[1] == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'}
inf_types

{'http://vivo.brown.edu/ontology/citation#Article',
 'http://vivo.brown.edu/ontology/citation#Citation',
 'http://vivo.brown.edu/ontology/citation#NoID',
 'http://vivo.brown.edu/ontology/citation#Review',
 'http://vivo.brown.edu/ontology/citation#Venue'}

In [10]:
limited = [ r for r in rows if r[1]== 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'
          and r[3] != 'http://vitro.mannlib.cornell.edu/default/vitro-kb-inf' ]
limited

[]

In [11]:
cprops = collections.defaultdict(set)
for r in rows:
    cprops[r[0]].add(r[1])
print(len(cprops))
for c in cprops:
    print(c)
    pprint.pprint(cprops[c])

25
http://vivo.brown.edu/individual/n52190
{'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
 'http://vivo.brown.edu/ontology/citation#authorList',
 'http://vivo.brown.edu/ontology/citation#date',
 'http://vivo.brown.edu/ontology/citation#doi',
 'http://vivo.brown.edu/ontology/citation#eissn',
 'http://vivo.brown.edu/ontology/citation#hasContributor',
 'http://vivo.brown.edu/ontology/citation#hasVenue',
 'http://vivo.brown.edu/ontology/citation#issn',
 'http://vivo.brown.edu/ontology/citation#issue',
 'http://vivo.brown.edu/ontology/citation#pages',
 'http://vivo.brown.edu/ontology/citation#pmcid',
 'http://vivo.brown.edu/ontology/citation#pmid',
 'http://vivo.brown.edu/ontology/citation#venueFor',
 'http://vivo.brown.edu/ontology/citation#volume',
 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
 'http://www.w3.org/2000/01/rdf-schema#label'}
http://vivo.brown.edu/individual/n12312
{'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
 'http://vivo.brow

In [12]:
shared = set()
for c in cprops:
    if not shared:
        shared = cprops[c]
    else:
        shared &= cprops[c]
shared

{'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
 'http://vivo.brown.edu/ontology/citation#date',
 'http://vivo.brown.edu/ontology/citation#hasContributor',
 'http://vivo.brown.edu/ontology/citation#hasVenue',
 'http://vivo.brown.edu/ontology/citation#issue',
 'http://vivo.brown.edu/ontology/citation#pages',
 'http://vivo.brown.edu/ontology/citation#venueFor',
 'http://vivo.brown.edu/ontology/citation#volume',
 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
 'http://www.w3.org/2000/01/rdf-schema#label'}

In [13]:
crabids = { r[0]: {} for r in rows }
id_props = { 'http://www.w3.org/2000/01/rdf-schema#label',
            'http://vivo.brown.edu/ontology/citation#doi',
            'http://vivo.brown.edu/ontology/citation#pmid',
           'http://vivo.brown.edu/ontology/citation#url'}
for r in rows:
    if r[1] in id_props:
        crabids[r[0]][r[1][r[1].index('#')+1:]] = r[2]
crabids

{'http://vivo.brown.edu/individual/n52190': {'pmid': '19757520',
  'doi': '10.1089/jwh.2008.1102',
  'label': 'Arthritis and rheumatism'},
 'http://vivo.brown.edu/individual/n12312': {'doi': '10.1167/1.3.328',
  'label': 'Journal of Vision'},
 'http://vivo.brown.edu/individual/n38511': {'pmid': '20297818',
  'doi': '10.1021/ja1009629',
  'label': 'J. Appl. Phys.'},
 'http://vivo.brown.edu/individual/n79283': {'pmid': '18716592',
  'doi': '10.1120/jacmp.v9i3.2775',
  'label': 'Brachytherapy'},
 'http://vivo.brown.edu/individual/n79541': {'doi': '10.1130/g24382a.1',
  'label': 'Science'},
 'http://vivo.brown.edu/individual/n855be37b19f14d619c2adf820ae33f32': {'doi': '10.1158/1538-7445.am2013-890',
  'label': "Alzheimer's & Dementia"},
 'http://vivo.brown.edu/individual/n5101': {'doi': '10.1109/jproc.2014.2298231',
  'label': 'RF and Microwave Hardware Challenges for Future Radio Spectrum Access'},
 'http://vivo.brown.edu/individual/n59735': {'pmid': '20696650',
  'doi': '10.1093/aje/kwq1

In [14]:
for c in crabids:
    if not set(['doi','pmid']) & set(crabids[c].keys()):
        print(crabids[c])

{'url': 'http://www.journals.uchicago.edu/doi/full/10.1086/692251', 'label': 'Speculum'}


In [15]:
labels = collections.defaultdict(set)
for r in rows:
    if r[1] == 'http://www.w3.org/2000/01/rdf-schema#label':
        labels[r[0]].add(r[2])
for l in labels:
    if len(labels[l]) > 1:
        print(l)
        print("\t", labels[l])

http://vivo.brown.edu/individual/n5101
	 {'RF and Microwave Hardware Challenges for Future Radio Spectrum Access', 'The 2005 IEEE Annual Conference Wireless and Micrwave Technology, 2005.'}
http://vivo.brown.edu/individual/n21627
	 {'The Johns Hopkins medical journal', 'Hepatitis B virus mutants associated with 3TC and famciclovir administration are replication defective.'}
http://vivo.brown.edu/individual/n16928
	 {'Base excision repair genes and risk of lung cancer among San Francisco Bay Area Latinos and African-Americans.', 'Chemico-biological interactions'}
http://vivo.brown.edu/individual/n57636
	 {'Advances in prostaglandin, thromboxane, and leukotriene research', 'Calcium intake and hip fracture risk in men and women: a meta-analysis of prospective cohort studies and randomized controlled trials.'}
http://vivo.brown.edu/individual/n44730
	 {'Recent Ice Ages on Mars: The role of radiatively active clouds and cloud microphysics', 'Earth Moon Planet'}
http://vivo.brown.edu/individ

In [16]:
for l in labels:
    if len(labels[l]) < 2:
        print(l)
        print("\t", labels[l])

http://vivo.brown.edu/individual/n52190
	 {'Arthritis and rheumatism'}
http://vivo.brown.edu/individual/n12312
	 {'Journal of Vision'}
http://vivo.brown.edu/individual/n38511
	 {'J. Appl. Phys.'}
http://vivo.brown.edu/individual/n79283
	 {'Brachytherapy'}
http://vivo.brown.edu/individual/n79541
	 {'Science'}
http://vivo.brown.edu/individual/n855be37b19f14d619c2adf820ae33f32
	 {"Alzheimer's & Dementia"}
http://vivo.brown.edu/individual/n59735
	 {'Paediatric and perinatal epidemiology'}
http://vivo.brown.edu/individual/n3917
	 {'Computer Methods in Applied Mechanics and Engineering'}
http://vivo.brown.edu/individual/n91166
	 {'Journal of Computational Physics'}
http://vivo.brown.edu/individual/n67856
	 {'Human Genetics'}
http://vivo.brown.edu/individual/n28550
	 {'Neuropsychologia'}
http://vivo.brown.edu/individual/n2053
	 {'Nanoscale'}
http://vivo.brown.edu/individual/n8081cd271207487a9ab47e57c0a2caee
	 {'Speculum'}
http://vivo.brown.edu/individual/n27567
	 {'SIAM Journal on Mathematica

In [17]:
tree = ET.parse('data/cleanup/describe_cites.rdf')
root = tree.getroot()
ns = root.nsmap
qn = { 'rdf' : '{{{0}}}'.format(ns['rdf']), 'bcite' : '{{{0}}}'.format(ns['bcite']),
     'rdfs' : '{{{0}}}'.format(ns['rdfs']), 'vitro' : '{{{0}}}'.format(ns['vitro'])}

In [18]:
add_node = ET.Element(qn['rdf'] + "RDF", nsmap=ns)
rmv_node = ET.Element(qn['rdf'] + "RDF", nsmap=ns)

skip_tags = { qn['bcite']+'venueFor', qn['bcite']+'issn', qn['bcite']+'eissn',
            qn['vitro']+'mostSpecificType' }
venue_titles = { 'The 2005 IEEE Annual Conference Wireless and Micrwave Technology, 2005.',
               'The Johns Hopkins medical journal', 'Chemico-biological interactions',
               'Advances in prostaglandin, thromboxane, and leukotriene research',
               'Earth Moon Planet', 'Biocomputing 2012'}
skip_rsc = { 'http://vivo.brown.edu/ontology/citation#Venue',
            'http://www.w3.org/2002/07/owl#Thing'}

nodes = root.findall('rdf:Description', ns)
for n in nodes:
    anode = ET.SubElement(add_node, qn['rdf'] + "Description")
    rnode = ET.SubElement(rmv_node, qn['rdf'] + "Description")
    rnode.set(qn['rdf']+'about', n.get(qn['rdf']+'about'))
    type_node = None
    noid_node = None
    spec_type = 'http://vivo.brown.edu/ontology/citation#Citation'
    for child in n:
        if child.tag not in skip_tags:
            rsc = child.get(qn['rdf']+'resource')
            if (rsc in skip_rsc or child.text in venue_titles):
                continue
            if (child.tag == qn['rdf'] + 'type'):
                if rsc == 'http://vivo.brown.edu/ontology/citation#NoID':
                    noid_node = deepcopy(child)
                    continue
                if spec_type == 'http://vivo.brown.edu/ontology/citation#Citation':
                    spec_type = child.get(qn['rdf']+'resource')
                    type_node = deepcopy(child)
                continue
            anode.append(deepcopy(child))
            rnode.append(deepcopy(child))
    anode.append(deepcopy(type_node))
    rnode.append(deepcopy(type_node))
    if noid_node is not None:
        rnode.append(noid_node)

In [19]:
for e, a in enumerate(add_node):
    print("{}: {} of {}".format(e, a.tag, a.get(qn['rdf'] + 'about')))
    for c in a:
        print('\t', c.tag, c.attrib, c.text)
    print('-'*100)
print('\n\n'+ '='*100 + '\n\n')
for e, a in enumerate(rmv_node):
    print("{}: {} of {}".format(e, a.tag, a.get(qn['rdf'] + 'about')))
    for c in a:
        print('\t', c.tag, c.attrib, c.text)
    print('-'*100)

0: {http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description of None
	 {http://vivo.brown.edu/ontology/citation#}pages {} 411
	 {http://vivo.brown.edu/ontology/citation#}volume {} 36
	 {http://vivo.brown.edu/ontology/citation#}authorList {} Dickson, James L., Head, James W., Marchant, David R.
	 {http://vivo.brown.edu/ontology/citation#}doi {} 10.1130/g24382a.1
	 {http://vivo.brown.edu/ontology/citation#}hasVenue {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': 'http://vivo.brown.edu/individual/n38102'} None
	 {http://www.w3.org/2000/01/rdf-schema#}label {} Science
	 {http://vivo.brown.edu/ontology/citation#}issue {} 5
	 {http://vivo.brown.edu/ontology/citation#}hasContributor {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': 'http://vivo.brown.edu/individual/jheadiii'} None
	 {http://vivo.brown.edu/ontology/citation#}date {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}datatype': 'http://www.w3.org/2001/XMLSchema#date'} 2008-01-01
	 {http://www.w3.org/1999/02/22-rdf-syn

	 {http://vivo.brown.edu/ontology/citation#}authorList {} Madeleine, J.-B., Head, J. W., Forget, F., Navarro, T., Millour, E., Spiga, A., Colaïtis, A., Määttänen, A., Montmessin, F., Dickson, J. L.
	 {http://vivo.brown.edu/ontology/citation#}volume {} 41
	 {http://vivo.brown.edu/ontology/citation#}hasContributor {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': 'http://vivo.brown.edu/individual/jheadiii'} None
	 {http://vivo.brown.edu/ontology/citation#}pages {} 4873-4879
	 {http://vivo.brown.edu/ontology/citation#}doi {} 10.1002/2014gl059861
	 {http://www.w3.org/1999/02/22-rdf-syntax-ns#}type {'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': 'http://vivo.brown.edu/ontology/citation#Article'} None
----------------------------------------------------------------------------------------------------
11: {http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description of http://vivo.brown.edu/individual/n38511
	 {http://vivo.brown.edu/ontology/citation#}issue {} 14
	 {http://vivo.b

In [20]:
contribs = collections.defaultdict(set)
for r in rows:
    if r[1] == 'http://vivo.brown.edu/ontology/citation#hasContributor':
        contribs[r[2][33:]].add(r[0])
contribs

defaultdict(set,
            {'dsavitz': {'http://vivo.brown.edu/individual/n52190',
              'http://vivo.brown.edu/individual/n59735'},
             'fdomini': {'http://vivo.brown.edu/individual/n12312'},
             'ssun': {'http://vivo.brown.edu/individual/n2053',
              'http://vivo.brown.edu/individual/n38511'},
             'dwazermd': {'http://vivo.brown.edu/individual/n79283'},
             'jheadiii': {'http://vivo.brown.edu/individual/n44730',
              'http://vivo.brown.edu/individual/n79541'},
             'jwandsmd': {'http://vivo.brown.edu/individual/n21627',
              'http://vivo.brown.edu/individual/n855be37b19f14d619c2adf820ae33f32'},
             'll7': {'http://vivo.brown.edu/individual/n5101'},
             'gkarniad': {'http://vivo.brown.edu/individual/n3917'},
             'cshu': {'http://vivo.brown.edu/individual/n91166'},
             'ktkelsey': {'http://vivo.brown.edu/individual/n16928',
              'http://vivo.brown.edu/individual

In [21]:
pkeys = []
for c, sids in contribs.items():
    for s in sids:
        tmpl = { 'short_id': c, 'rabid': s, 'doi':'', 'pmid': '', 'url': ''}
        for k,v in crabids[s].items():
            if k in tmpl:
                tmpl[k] = v
        pkeys.append(tmpl)
pkeys

[{'short_id': 'dsavitz',
  'rabid': 'http://vivo.brown.edu/individual/n59735',
  'doi': '10.1093/aje/kwq181',
  'pmid': '20696650',
  'url': ''},
 {'short_id': 'dsavitz',
  'rabid': 'http://vivo.brown.edu/individual/n52190',
  'doi': '10.1089/jwh.2008.1102',
  'pmid': '19757520',
  'url': ''},
 {'short_id': 'fdomini',
  'rabid': 'http://vivo.brown.edu/individual/n12312',
  'doi': '10.1167/1.3.328',
  'pmid': '',
  'url': ''},
 {'short_id': 'ssun',
  'rabid': 'http://vivo.brown.edu/individual/n38511',
  'doi': '10.1021/ja1009629',
  'pmid': '20297818',
  'url': ''},
 {'short_id': 'ssun',
  'rabid': 'http://vivo.brown.edu/individual/n2053',
  'doi': '10.1002/anie.201300276',
  'pmid': '23426846',
  'url': ''},
 {'short_id': 'dwazermd',
  'rabid': 'http://vivo.brown.edu/individual/n79283',
  'doi': '10.1120/jacmp.v9i3.2775',
  'pmid': '18716592',
  'url': ''},
 {'short_id': 'jheadiii',
  'rabid': 'http://vivo.brown.edu/individual/n44730',
  'doi': '10.1002/2014gl059861',
  'pmid': '',
  '

In [24]:
dois = { p['doi']: p for p in pkeys if p['doi'] != ''}
pmids = { p['pmid']: p for p in pkeys if p['pmid'] != ''}
print("DOIs: ",len(dois))
print("PMIDs: ",len(pmids))

DOIs:  22
PMIDs:  14


In [25]:
doi_match = {}
pmid_match = {}
matched = set()
for v in vmgr:
    if v['doi'] in dois:
        doi_match[v['doi']] = v
        matched.add(frozenset(dois[v['doi']].values()))
    if v['pmid'].lower() in pmids:
        pmid_match[v['pmid']] = v['pmid']
        matched.add(frozenset(pmids[v['pmid']].values()))

In [26]:
len(matched)

24

In [27]:
matched

{frozenset({'',
            '10.1002/hep.510270243',
            '9462667',
            'http://vivo.brown.edu/individual/n21627',
            'jwandsmd'}),
 frozenset({'',
            '10.1038/sj.onc.1204302',
            '11313923',
            'http://vivo.brown.edu/individual/n67856',
            'ktkelsey'}),
 frozenset({'',
            '10.1109/jproc.2014.2298231',
            'http://vivo.brown.edu/individual/n5101',
            'll7'}),
 frozenset({'',
            '10.1016/j.jde.2009.06.010',
            'bsandste',
            'http://vivo.brown.edu/individual/n27567'}),
 frozenset({'',
            '10.1093/carcin/bgn261',
            '19029194',
            'http://vivo.brown.edu/individual/n16928',
            'ktkelsey'}),
 frozenset({'',
            '10.1002/2014gl059861',
            'http://vivo.brown.edu/individual/n44730',
            'jheadiii'}),
 frozenset({'',
            '10.2337/dc11-1687',
            '22301123',
            'dcoustan',
            'http://vivo.

In [28]:
for d in dois:
    if d not in doi_match:
        print(dois[d])

{'short_id': 'dsavitz', 'rabid': 'http://vivo.brown.edu/individual/n52190', 'doi': '10.1089/jwh.2008.1102', 'pmid': '19757520', 'url': ''}
{'short_id': 'dwazermd', 'rabid': 'http://vivo.brown.edu/individual/n79283', 'doi': '10.1120/jacmp.v9i3.2775', 'pmid': '18716592', 'url': ''}


In [29]:
for d in pmids:
    if d not in pmid_match:
        print(pmids[d])

{'short_id': 'ssun', 'rabid': 'http://vivo.brown.edu/individual/n38511', 'doi': '10.1021/ja1009629', 'pmid': '20297818', 'url': ''}
{'short_id': 'ssun', 'rabid': 'http://vivo.brown.edu/individual/n2053', 'doi': '10.1002/anie.201300276', 'pmid': '23426846', 'url': ''}
{'short_id': 'mranney', 'rabid': 'http://vivo.brown.edu/individual/n1814', 'doi': '10.5811/westjem.2017.6.34849', 'pmid': '28874943', 'url': ''}
{'short_id': 'wwarrenj', 'rabid': 'http://vivo.brown.edu/individual/n7b205dc5b041431da1f101ae2c08499c', 'doi': '10.1016/j.humov.2006.05.001', 'pmid': '16859793', 'url': ''}
{'short_id': 'gwessel', 'rabid': 'http://vivo.brown.edu/individual/nd6e87771a19c4065bfc50964259ed905', 'doi': '10.1093/molehr/gav064', 'pmid': '26590170', 'url': ''}


In [30]:
len(doi_match)

20

In [31]:
len(pmid_match)

9

In [32]:
c = 1
for p in pkeys:
    if p['doi'] in doi_match:
        print(c)
        print(json.loads(doi_match[p['doi']]['display'])['citation']['uri'])
        c+=1

1
http://vivo.brown.edu/individual/n89483
2
http://vivo.brown.edu/individual/n415
3
http://vivo.brown.edu/individual/n81532
4
http://vivo.brown.edu/individual/n48063
5
http://vivo.brown.edu/individual/n44730
6
http://vivo.brown.edu/individual/n87370
7
http://vivo.brown.edu/individual/n64700
8
http://vivo.brown.edu/individual/n21627
9
http://vivo.brown.edu/individual/n5101
10
http://vivo.brown.edu/individual/n68447
11
http://vivo.brown.edu/individual/n98010
12
http://vivo.brown.edu/individual/n5868
13
http://vivo.brown.edu/individual/n16928
14
http://vivo.brown.edu/individual/n28054
15
http://vivo.brown.edu/individual/n8655
16
http://vivo.brown.edu/individual/n17882
17
http://vivo.brown.edu/individual/n10978
18
http://vivo.brown.edu/individual/n32902
19
http://vivo.brown.edu/individual/n79206
20
http://vivo.brown.edu/individual/n32612


In [33]:
'10.1002/ANIE.201300276'.lower()

'10.1002/anie.201300276'

In [35]:
def get_node_data(node):
    return (node.tag, node.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource') or node.text)

In [36]:
def get_node_uri(node):
    return node.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')

In [37]:
def filter_node(node, matches, on_property=False):
    if on_property:
        return get_node_data(node)[0] in matches
    else:
        return get_node_data(node) in matches

In [233]:
def filter_citation_titles(node):
    extra_titles = {'RF and Microwave Hardware Challenges for Future Radio Spectrum Access',
                    'Hepatitis B virus mutants associated with 3TC and famciclovir administration are replication defective.'
                    'Base excision repair genes and risk of lung cancer among San Francisco Bay Area Latinos and African-Americans.',
                    'Calcium intake and hip fracture risk in men and women: a meta-analysis of prospective cohort studies and randomized controlled trials.',
                    'Recent Ice Ages on Mars: The role of radiatively active clouds and cloud microphysics',
                    'Database-support for continuous prediction queries over streaming data' }
    bad_data = { ('{http://www.w3.org/2000/01/rdf-schema#}label', t) for t in extra_titles }
    return filter_node(node, bad_data)

In [234]:
def filter_venue_type(node):
    venue_data = { '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}type','http://vivo.brown.edu/ontology/citation#Venue' }
    return not filter_node(node, venue_data)

In [39]:
def filter_venue_props(node):
    venue_props = { '{http://vivo.brown.edu/ontology/citation#}venueFor',
                   '{http://vivo.brown.edu/ontology/citation#}issn',
                   '{http://vivo.brown.edu/ontology/citation#}eissn' }
    return not filter_node(node, venue_props, on_property=True)

In [40]:
def is_citation_data(node):
    return filter_venue_props(node) and filter_venue_type(node)

In [41]:
def filter_inferred_data(node):
    inf_data = {
        ('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}type','http://www.w3.org/2002/07/owl#Thing'),
        ('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}type','http://vivo.brown.edu/ontology/citation#Citation')
    }
    return not filter_node(node, inf_data)

In [42]:
def filter_inferred_props(node):
    inf_props = { '{http://vitro.mannlib.cornell.edu/ns/vitro/0.7#}mostSpecificType' }
    return not filter_node(node, inf_props, on_property=True)

In [43]:
def is_asserted(node):
    return filter_inferred_data(node) and filter_inferred_props(node)

In [53]:
def get_sample_citation_type(nodes):
    for n in nodes:
        if get_node_data(n[1]) == ('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}type','http://vivo.brown.edu/ontology/citation#Citation'):
            return deepcopy(n[1])

In [60]:
def add_assertion(nodes, samp):
    asserted = [ d for d in nodes if get_node_data(d)[0] == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}type' ]
    if not asserted:
        nodes.add(deepcopy(samp))
    return nodes

In [56]:
def remove_assertion(nodes, test):
    return [ d for d in nodes if get_node_data(d) != test ]

In [57]:
def make_new_node(uri, elems):
    node = ET.Element('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description')
    node.set('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about', uri)
    for e in elems:
        node.append(deepcopy(e))
    return node

In [65]:
def duplicate_nodes(uri, elems):
    a = make_new_node('',
        remove_assertion(elems, ('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}type','http://vivo.brown.edu/ontology/citation#NoID')))
    r = make_new_node(uri, elems)
    return (a, r)

In [176]:
def get_node_identifiers(node):
    ids = { 'doi': '', 'pmid': '', 'url':'' }
    props = {'{http://vivo.brown.edu/ontology/citation#}doi',
            '{http://vivo.brown.edu/ontology/citation#}pmid',
            '{http://vivo.brown.edu/ontology/citation#}url'}
    for n in node:
        p, o = get_node_data(n)
        if p in props:
            ids[ p[p.index('#')+2:] ] = o
    return ids

In [170]:
def filter_json(jdata):
    try:
        d = json.loads(jdata)
        return True
    except:
        return False

In [171]:
def get_pub_data(pub):
    jdata = json.loads(pub['display'])
    return (
        ( pub['doi'], pub['pmid'], jdata['url'] ),
        ( pub['short_id'], jdata['citation'].get('title'), jdata['citation'].get('uri') )
    )

In [222]:
def map_node_to_pub(nodeIds, node, doiDict, pmidDict, urlDict):
    if nodeIds['doi'] in doiDict:
        return (node, doiDict[nodeIds['doi']])
    if nodeIds['pmid'] in pmidDict:
        return (node, pmidDict[nodeIds['pmid']])
    if nodeIds['url'] in urlDict:
        return (node, urlDict[nodeIds['url']])
    return (node, False)

In [229]:
def check_author(node, data):
    nd = [ get_node_data(n) for n in node ]
    auths = [ n[1] for n in nd if n[0] == '{http://vivo.brown.edu/ontology/citation#}hasContributor']
    return data[0] in auths

In [224]:
with open('data/cleanup/pubs.tsv') as f:
    rdr = csv.DictReader(f, delimiter='\t')
    vmgr = [ r for r in rdr ]

for v in vmgr:
    v['doi'] = v['doi'].lower()

In [225]:
tree = ET.parse('data/cleanup/describe_cites.rdf')
root = tree.getroot()
ns = root.nsmap

In [232]:
nodes = root.findall('rdf:Description', ns)
cite_data = [ (get_node_uri(n), d) for n in nodes for d in n if is_citation_data(d) ]
cite_data = [ (get_node_uri(n), d) for n in nodes for d in n if is_citation_data(d) ]
asserted_data = [ (uri, d) for uri, d in cite_data if is_asserted(d) ]

fused = collections.defaultdict(set)
for data in asserted_data:
    fused[data[0]].add(data[1])
sample_cite = get_sample_citation_type(cite_data)
fully_typed = [ (uri, add_assertion(fused[uri], sample_cite)) for uri in fused ]
split_nodes = [ duplicate_nodes(uri, assrts) for uri, assrts in fully_typed ]

to_remove = [ n[1] for n in split_nodes ]
wrapped_nodes = [ (get_node_identifiers(n[0]), n[0]) for n in split_nodes ]

good_pubs = [ p for p in vmgr if filter_json(p['display']) ]
wrapped_pubs = [ get_pub_data(pub) for pub in good_pubs ]
doi_pubs = { d[0][0]: d[1] for d in wrapped_pubs }
pmid_pubs = { d[0][1]: d[1] for d in wrapped_pubs }
url_pubs = { d[0][2]: d[1] for d in wrapped_pubs }

mapped = [ map_node_to_pub(node_ids, node, doi_pubs, pmid_pubs, url_pubs)
          for node_ids, node in wrapped_nodes ]
for m in mapped:
    if not m[1]:
        for n in m[0]:
            print(get_node_data(n))
checked = [ check_author(node[0], node[1]) for node in mapped ]
# add_node = ET.Element('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF', nsmap=ns)
# rmv_node = ET.Element('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF', nsmap=ns)
# add_rmv = [ make_ for ]

('{http://vivo.brown.edu/ontology/citation#}hasContributor', 'http://vivo.brown.edu/individual/mbossy')
('{http://vivo.brown.edu/ontology/citation#}url', 'http://www.journals.uchicago.edu/doi/full/10.1086/692251')
('{http://vivo.brown.edu/ontology/citation#}pages', '795-796')
('{http://vivo.brown.edu/ontology/citation#}issue', '3')
('{http://vivo.brown.edu/ontology/citation#}date', '2017-01-01')
('{http://www.w3.org/2000/01/rdf-schema#}label', 'Speculum')
('{http://vivo.brown.edu/ontology/citation#}hasVenue', 'http://vivo.brown.edu/individual/n8081cd271207487a9ab47e57c0a2caee')
('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}type', 'http://vivo.brown.edu/ontology/citation#Review')
('{http://vivo.brown.edu/ontology/citation#}volume', '92')


TypeError: 'bool' object is not subscriptable

In [None]:
qn = { 'rdf' : '{{{0}}}'.format(ns['rdf']), 'bcite' : '{{{0}}}'.format(ns['bcite']),
     'rdfs' : '{{{0}}}'.format(ns['rdfs']), 'vitro' : '{{{0}}}'.format(ns['vitro'])}
add_node = ET.Element(qn['rdf'] + "RDF", nsmap=ns)
rmv_node = ET.Element(qn['rdf'] + "RDF", nsmap=ns)

skip_tags = { qn['bcite']+'venueFor', qn['bcite']+'issn', qn['bcite']+'eissn',
            qn['vitro']+'mostSpecificType' }
venue_titles = { 'The 2005 IEEE Annual Conference Wireless and Micrwave Technology, 2005.',
               'The Johns Hopkins medical journal', 'Chemico-biological interactions',
               'Advances in prostaglandin, thromboxane, and leukotriene research',
               'Earth Moon Planet', 'Biocomputing 2012'}
skip_rsc = { ,
            'http://www.w3.org/2002/07/owl#Thing'}

nodes = root.findall('rdf:Description', ns)
for n in nodes:
    anode = ET.SubElement(add_node, qn['rdf'] + "Description")
    rnode = ET.SubElement(rmv_node, qn['rdf'] + "Description")
    rnode.set(qn['rdf']+'about', n.get(qn['rdf']+'about'))
    type_node = None
    noid_node = None
    spec_type = 'http://vivo.brown.edu/ontology/citation#Citation'
    for child in n:
        if child.tag not in skip_tags:
            rsc = child.get(qn['rdf']+'resource')
            if (rsc in skip_rsc or child.text in venue_titles):
                continue
            if (child.tag == qn['rdf'] + 'type'):
                if rsc == 'http://vivo.brown.edu/ontology/citation#NoID':
                    noid_node = deepcopy(child)
                    continue
                if spec_type == 'http://vivo.brown.edu/ontology/citation#Citation':
                    spec_type = child.get(qn['rdf']+'resource')
                    type_node = deepcopy(child)
                continue
            anode.append(deepcopy(child))
            rnode.append(deepcopy(child))
    anode.append(deepcopy(type_node))
    rnode.append(deepcopy(type_node))
    if noid_node is not None:
        rnode.append(noid_node)