In [1]:
import pandas as pd
from collections import namedtuple, defaultdict

In [14]:
# Read the list of citation property URIs, one per line.
with open('data/cite_properties.csv') as f:
    # Discard the header row (tolerating an empty file), then strip
    # leading/trailing whitespace, including the newline, from each line.
    next(f, None)
    cite_props = [line.strip() for line in f]

cite_props

['http://vivo.brown.edu/ontology/citation#date',
 'http://vivo.brown.edu/ontology/citation#volume',
 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
 'http://vivo.brown.edu/ontology/citation#hasContributor',
 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
 'http://vivo.brown.edu/ontology/citation#authorList',
 'http://vivo.brown.edu/ontology/citation#pmid',
 'http://vivo.brown.edu/ontology/citation#issue',
 'http://vivo.brown.edu/ontology/citation#doi',
 'http://vivo.brown.edu/ontology/citation#hasVenue',
 'http://vivo.brown.edu/ontology/citation#pages',
 'http://www.w3.org/2000/01/rdf-schema#label',
 'http://vivo.brown.edu/ontology/citation#pmcid',
 'http://vivo.brown.edu/ontology/citation#issn',
 'http://vivo.brown.edu/ontology/citation#venueFor',
 'http://vivo.brown.edu/ontology/citation#eissn',
 'http://vivo.brown.edu/ontology/citation#publishedIn',
 'http://vivo.brown.edu/ontology/citation#book',
 'http://vivo.brown.edu/ontology/citation#hasLocation',
 'http:

In [15]:
# Namespace prefix shared by the citation-ontology properties kept below.
CITE_NS = 'http://vivo.brown.edu/ontology/citation#'

# Map each property URI in the citation namespace to its local name
# (the part after the '#'); properties from other namespaces are dropped.
cite_prop_map = {}
for prop_uri in cite_props:
    if prop_uri.startswith(CITE_NS):
        cite_prop_map[prop_uri] = prop_uri[len(CITE_NS):]

# Extra identity entry so 'rabid' shows up among the mapped column names.
cite_prop_map['rabid'] = 'rabid'
cite_prop_map.values()

dict_values(['date', 'volume', 'hasContributor', 'authorList', 'pmid', 'issue', 'doi', 'hasVenue', 'pages', 'pmcid', 'issn', 'venueFor', 'eissn', 'publishedIn', 'book', 'hasLocation', 'editorList', 'chapter', 'hasPublisher', 'isbn', 'url', 'hasConferenceLocation', 'conferenceDate', 'hasConference', 'reviewOf', 'title', 'number', 'version', 'hasAssignee', 'hasCountry', 'hasAuthority', 'patentNumber', 'rabid'])

In [37]:
# Record type with one field per mapped property name, in alphabetical order.
citation_fields = sorted(cite_prop_map.values())
RABCitation = namedtuple('RABCitation', citation_fields)

In [38]:
# Load the raw N-Triples dump, one statement per line.
# N-Triples is specified as UTF-8, so decode explicitly instead of relying on
# the platform default encoding (which would garble non-ASCII titles/authors
# on systems whose locale is not UTF-8).
with open('data/cite_data.nt', encoding='utf-8') as f:
    data = f.readlines()

In [39]:
def clean_data_prop(oData):
    """Return the bare value of the object part of an N-Triples statement.

    ``oData`` is everything after the predicate on a statement line, e.g.
    ``'"Some title." .\\n'``, ``'<http://example.org/x> .\\n'`` or
    ``'"2015-06-01"^^<http://www.w3.org/2001/XMLSchema#date> .\\n'``.

    * IRI terms are returned without the surrounding ``<`` and ``>``.
    * Literal terms are returned without the surrounding quotes and without
      any datatype (``^^<...>``) or language-tag (``@xx``) suffix; spaces and
      newlines around the literal text are trimmed.
    * Anything else falls back to stripping quote/bracket/dot/space characters
      from both ends.

    Only the statement terminator and the term delimiters are removed, so
    punctuation that belongs to the value itself (a trailing ``.`` in an
    author list, ``<``/``>`` inside a title) is preserved. Escape sequences
    inside literals are left as written.
    """
    term = oData.strip()
    # Drop the statement-terminating '.' (and the whitespace before it).
    if term.endswith('.'):
        term = term[:-1].rstrip()
    # IRI: <...>
    if term.startswith('<') and term.endswith('>'):
        return term[1:-1]
    # Literal: "..." optionally followed by ^^<datatype> or @lang. Neither
    # suffix can contain a double quote, so the last quote closes the literal.
    if term.startswith('"'):
        closing = term.rfind('"')
        if closing > 0:
            return term[1:closing].strip(' \n')
    # Unrecognised shape (e.g. blank node, malformed input): best-effort trim.
    return term.strip('\"\n .<>')

In [40]:
def parse_triple(rawRow):
    """Split one raw statement line into a (subject, predicate, object) tuple.

    The line is split on its first two spaces only, so the object part keeps
    any spaces it contains. Angle brackets are stripped from the subject and
    predicate; the object is cleaned with ``clean_data_prop``.

    Raises ValueError if the line has fewer than three space-separated parts.
    """
    subject, predicate, raw_object = rawRow.split(' ', maxsplit=2)
    return (
        subject.strip('<>'),
        predicate.strip('<>'),
        clean_data_prop(raw_object),
    )

In [41]:
# Parse every raw line into a (subject, predicate, object) tuple.
cite_triples = [parse_triple(line) for line in data]

# Show the first parsed triple as a quick sanity check.
cite_triples[0]

('http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b',
 'http://www.w3.org/2000/01/rdf-schema#label',
 'Erratum to: Global magnetic confinement for the 1.5D Vlasov-Maxwell system')

In [42]:
# Group triples by subject: {subject: {short property name: value}}.
# Triples whose predicate is not in cite_prop_map are ignored; if a subject
# repeats a property, the last value seen wins.
cite_dicts = defaultdict(dict)
for subject, predicate, value in cite_triples:
    short_name = cite_prop_map.get(predicate)
    if short_name is not None:
        cite_dicts[subject][short_name] = value

In [43]:
# Template row: every known column present, defaulting to the empty string.
empty_row = dict.fromkeys(cite_prop_map.values(), '')

# Turn each subject's property dict into a fully populated RABCitation.
rab_rows = []
for subject, props in cite_dicts.items():
    # Record the subject under 'rabid' (this also updates the entry held in
    # cite_dicts).
    props['rabid'] = subject
    # Values found for this subject override the empty defaults.
    rab_rows.append(RABCitation(**{**empty_row, **props}))

# Preview the first few rows.
for citation in rab_rows[:5]:
    print(citation)

RABCitation(authorList='Nguyen, Toan T., Nguyen, Truyen V., Strauss, Walter A', book='', chapter='', conferenceDate='', date='2015-06-01', doi='10.3934/krm.2015.8.615', editorList='', eissn='', hasAssignee='', hasAuthority='', hasConference='', hasConferenceLocation='', hasContributor='http://vivo.brown.edu/individual/wstrauss', hasCountry='', hasLocation='', hasPublisher='', hasVenue='http://vivo.brown.edu/individual/n6086eb8fe7824cad9423547d403a958d', isbn='', issn='', issue='3', number='', pages='615-616', patentNumber='', pmcid='', pmid='', publishedIn='Kinetic and Related Models', rabid='http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b', reviewOf='', title='', url='', venueFor='', version='', volume='8')
RABCitation(authorList='Li, Jin', book='', chapter='', conferenceDate='', date='2004-01-01', doi='10.1037/0012-1649.40.4.595', editorList='', eissn='', hasAssignee='', hasAuthority='', hasConference='', hasConferenceLocation='', hasContributor='http://vivo.brown.

In [44]:
# Build a DataFrame from the list of RABCitation rows and preview the first rows.
rc_df = pd.DataFrame(rab_rows)
rc_df.head()

Unnamed: 0,authorList,book,chapter,conferenceDate,date,doi,editorList,eissn,hasAssignee,hasAuthority,...,pmcid,pmid,publishedIn,rabid,reviewOf,title,url,venueFor,version,volume
0,"Nguyen, Toan T., Nguyen, Truyen V., Strauss, W...",,,,2015-06-01,10.3934/krm.2015.8.615,,,,,...,,,Kinetic and Related Models,http://vivo.brown.edu/individual/n5c6cae127059...,,,,,,8
1,"Li, Jin",,,,2004-01-01,10.1037/0012-1649.40.4.595,,,,,...,,15238046.0,,http://vivo.brown.edu/individual/n52747,,,,,,40
2,"Brown JL, Sales JM, DiClemente RJ, Salazar LF,...",,,,2012-08-01,10.1007/s10461-012-0163-8,,,,,...,PMC3471653,22323006.0,,http://vivo.brown.edu/individual/n8301,,,,,,16
3,"Sears WL, Goto-Mandeville R, Mirapuri M, Braun L",,,,1995-07-01,10.1002/mc.2940130304,,,,,...,,7619217.0,,http://vivo.brown.edu/individual/n98528,,,,,,13
4,"Graves, Hannah, Hernandez, Lynn, Kahler, Chris...",,,,2015-01-01,10.1016/j.drugalcdep.2014.09.265,,,,,...,,,,http://vivo.brown.edu/individual/n52835,,,,,,146
