In [73]:
import csv

import pandas as pd

from collections import namedtuple, defaultdict
from functools import reduce

In [52]:
# Load the citation property list: one entry per line, first line is a header.
with open('data/rab/cite_properties.csv') as props_file:
    data = list(props_file)
# Drop the header row and trim surrounding whitespace (incl. the newline).
fin_cite_props = list(map(str.strip, data[1:]))

In [53]:
# Keep every raw line of the N-Triples dump (newlines included) for later parsing.
with open('data/rab/cite_data.nt') as triples_file:
    fin_rab_cites = list(triples_file)

In [82]:
# Load the FAR articles export: `header` holds the first row, `fin_far_arts`
# holds every remaining row as a list of string fields.
#
# The csv module requires files to be opened with newline='' so that newlines
# embedded inside quoted fields are handed to the parser untouched.
#
# NOTE(review): rows that come back with more fields than the header appear to
# contain backslash-escaped quotes (\"), which the default dialect does not
# understand. Passing escapechar='\\' to csv.reader would likely parse them as
# single fields -- confirm against the raw file before changing the dialect.
with open('data/far/articles.csv', newline='') as f:
    rdr = csv.reader(f)
    header = next(rdr)
    fin_far_arts = list(rdr)
len(fin_far_arts)

6392

In [79]:
# Widest row (in number of fields) among the parsed articles; 0 if there are none.
max(map(len, fin_far_arts), default=0)

20

In [84]:
# Collect (position, row) pairs for every row carrying more than 15 fields.
extra = [
    (row_idx, fields)
    for row_idx, fields in enumerate(fin_far_arts)
    if len(fields) > 15
]
print(extra)

[(153, ['174', '781', 'BOOK', '', '2016-01-09 17:25:02', '2016-01-09 17:25:02', '\\Repairable Men\\"', ' by John Carr Walker"', 'Rain Taxi Review of Books', '', '', '0000-00-00', '', 'PUB', 'OTHER', '']), (154, ['175', '781', 'BOOK', '', '2016-01-09 17:26:30', '2016-01-09 17:26:30', '\\No Stones in Heaven', '\\" by Arlene Swift Jones"', 'Friends Journal', '', '', '0000-00-00', '', 'PUB', 'OTHER', '']), (155, ['176', '781', 'BOOK', '', '2016-01-09 17:27:48', '2016-01-09 17:27:48', '\\By Faith and By Love: Martin and Mabel’s Journey', '\\" by Beverly England Williams"', 'Friends Journal', '', '', '0000-00-00', '', 'PUB', 'OTHER', '']), (267, ['292', '353', 'CONF', '', '2016-01-11 15:15:19', '2016-01-11 15:16:33', '  \\Vitality and Obsolescence in the Theatres of the Humanities: Or', ' #SandraBland and Hamlet.\\""', 'Humanities Futures, http://humanitiesfutures.org/papers/vitality-and-obsolesence-in-the-theatre-of-the-humanities-or-sandrabland-and-hamlet/', '', '', '0000-00-00', '', 'PRES

In [54]:
# Map each full citation-ontology property URI to its local name, i.e. the
# part after the namespace prefix (the prefix is 40 characters long).
cite_ns = 'http://vivo.brown.edu/ontology/citation#'
cite_prop_map = {
    prop_uri: prop_uri[len(cite_ns):]
    for prop_uri in fin_cite_props
    if prop_uri.startswith(cite_ns)
}
# Extra identity entry so 'rabid' becomes a column alongside the ontology properties.
cite_prop_map['rabid'] = 'rabid'
cite_prop_map.values()

dict_values(['date', 'volume', 'hasContributor', 'authorList', 'pmid', 'issue', 'doi', 'hasVenue', 'pages', 'pmcid', 'issn', 'venueFor', 'eissn', 'publishedIn', 'book', 'hasLocation', 'editorList', 'chapter', 'hasPublisher', 'isbn', 'url', 'hasConferenceLocation', 'conferenceDate', 'hasConference', 'reviewOf', 'title', 'number', 'version', 'hasAssignee', 'hasCountry', 'hasAuthority', 'patentNumber', 'rabid'])

In [55]:
# Record type with one field per mapped property name, in alphabetical order.
RABCitation = namedtuple(
    typename='RABCitation',
    field_names=sorted(cite_prop_map.values()),
)

In [56]:
def clean_data_prop(oData):
    """Extract the bare value from the object part of an N-Triples line.

    Parameters
    ----------
    oData : str
        Everything after the predicate on a triple line, e.g.
        '"2015-06-01"^^<http://www.w3.org/2001/XMLSchema#date> .' followed
        by a newline, or '<http://example.org/x> .'.

    Returns
    -------
    str
        * for a quoted literal: the text between the opening and the closing
          quote, with surrounding whitespace trimmed; any datatype
          (``^^<...>``) or language tag (``@en``) suffix is discarded;
        * for an IRI: the IRI without its angle brackets;
        * for anything else: the result of the legacy blanket strip.

    Notes
    -----
    The previous implementation stripped the characters ``" \\n . < >`` from
    both ends of the whole term, which also removed punctuation belonging to
    the value itself (a trailing period in "J. Biol. Chem.", a closing
    escaped quote) and left language tags glued to the text. Escape
    sequences inside literals are still returned verbatim, as before.
    """
    term = oData.strip()
    # The statement terminator sits outside the term; remove only that dot.
    if term.endswith('.'):
        term = term[:-1].rstrip()

    # Literal: keep exactly what lies between the first and the last quote,
    # so datatype / language-tag suffixes fall away without touching the text.
    if term.startswith('"'):
        closing = term.rfind('"')
        if closing > 0:
            return term[1:closing].strip()

    # IRI reference: drop the enclosing angle brackets.
    if len(term) >= 2 and term.startswith('<') and term.endswith('>'):
        return term[1:-1]

    # Unrecognised shape (blank node, malformed literal, bare text):
    # fall back to the original behaviour.
    return oData.rsplit('"^^<http://www.w3.org/2001/XMLSchema#', maxsplit=1)[0].strip('\"\n .<>')

In [57]:
def parse_triple(rawRow):
    """Break one raw triple line into a ``(subject, predicate, object)`` tuple.

    The line is split on its first two spaces. Angle brackets are stripped
    from the ends of the subject and predicate; the remainder of the line is
    passed through ``clean_data_prop`` to obtain the object value.
    """
    subject, predicate, remainder = rawRow.split(' ', maxsplit=2)
    return (
        subject.strip('<>'),
        predicate.strip('<>'),
        clean_data_prop(remainder),
    )

In [58]:
# Parse every raw line into a (subject, predicate, object) tuple.
cite_triples = [parse_triple(raw_line) for raw_line in fin_rab_cites]

cite_triples[0]

('http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b',
 'http://www.w3.org/2000/01/rdf-schema#label',
 'Erratum to: Global magnetic confinement for the 1.5D Vlasov-Maxwell system')

In [59]:
# Group object values by subject, keyed by the short property name.
# Triples whose predicate is not in cite_prop_map are ignored; a repeated
# property on the same subject keeps the last value seen.
cite_dicts = defaultdict(dict)
for subject, predicate, value in cite_triples:
    if predicate not in cite_prop_map:
        continue
    cite_dicts[subject][cite_prop_map[predicate]] = value

In [61]:
# Template with every known field set to the empty string, so each record
# has a value for every RABCitation field.
empty_row = dict.fromkeys(cite_prop_map.values(), '')

rab_rows = []
for subject_uri, props in cite_dicts.items():
    # The subject URI becomes the record's 'rabid' (written into the
    # cite_dicts entry itself, as the original loop did).
    props['rabid'] = subject_uri
    rab_rows.append(RABCitation(**{**empty_row, **props}))

print(rab_rows[0])

RABCitation(authorList='Nguyen, Toan T., Nguyen, Truyen V., Strauss, Walter A', book='', chapter='', conferenceDate='', date='2015-06-01', doi='10.3934/krm.2015.8.615', editorList='', eissn='', hasAssignee='', hasAuthority='', hasConference='', hasConferenceLocation='', hasContributor='http://vivo.brown.edu/individual/wstrauss', hasCountry='', hasLocation='', hasPublisher='', hasVenue='http://vivo.brown.edu/individual/n6086eb8fe7824cad9423547d403a958d', isbn='', issn='', issue='3', number='', pages='615-616', patentNumber='', pmcid='', pmid='', publishedIn='Kinetic and Related Models', rabid='http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b', reviewOf='', title='', url='', venueFor='', version='', volume='8')


In [62]:
# Build the frame from the citation records, then move the identifier
# columns to the front; the remaining columns keep their existing order.
df_cites_rab = pd.DataFrame(rab_rows)

id_atts = [ 'rabid','doi','pmid','pmcid','issn','eissn']
cols = id_atts + [
    name for name in df_cites_rab.columns.tolist() if name not in id_atts
]
df_cites_rab = df_cites_rab[ cols ]
df_cites_rab.head()

Unnamed: 0,rabid,doi,pmid,pmcid,issn,eissn,authorList,book,chapter,conferenceDate,...,number,pages,patentNumber,publishedIn,reviewOf,title,url,venueFor,version,volume
0,http://vivo.brown.edu/individual/n5c6cae127059...,10.3934/krm.2015.8.615,,,,,"Nguyen, Toan T., Nguyen, Truyen V., Strauss, W...",,,,...,,615-616,,Kinetic and Related Models,,,,,,8
1,http://vivo.brown.edu/individual/n52747,10.1037/0012-1649.40.4.595,15238046.0,,,,"Li, Jin",,,,...,,595-605,,,,,,,,40
2,http://vivo.brown.edu/individual/n8301,10.1007/s10461-012-0163-8,22323006.0,PMC3471653,,,"Brown JL, Sales JM, DiClemente RJ, Salazar LF,...",,,,...,,1491-500,,,,,,,,16
3,http://vivo.brown.edu/individual/n98528,10.1002/mc.2940130304,7619217.0,,,,"Sears WL, Goto-Mandeville R, Mirapuri M, Braun L",,,,...,,146-56,,,,,,,,13
4,http://vivo.brown.edu/individual/n52835,10.1016/j.drugalcdep.2014.09.265,,,,,"Graves, Hannah, Hernandez, Lynn, Kahler, Chris...",,,,...,,e127-e128,,,,,,,,146
