In [1]:
import csv

import pandas as pd

from collections import namedtuple, defaultdict
from functools import reduce

In [2]:
with open('data/rab/cite_properties.csv') as f:
    data = f.readlines()
    # skip header, strip trailing whitespace
    fin_cite_props = [ d.strip() for d in data[1:] ]

In [3]:
with open('data/rab/cite_data.nt') as f:
    fin_rab_cites = f.readlines()

In [24]:
with open('data/far/articles.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_cites_header = next(rdr)
    fin_far_arts = [ r for r in rdr ]

In [27]:
df_cites_far = pd.DataFrame(fin_far_arts, columns=far_cites_header)
df_cites_far.head()

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,date,coauthors,book_status_id,article_id_type_id,page_numbers
0,6,15,PEER,10.1117/1.nph.2.3.031202,2016-01-07 16:51:12,2016-01-07 16:51:12,Modified toolbox for optogenetics in the nonhu...,Neurophotonics,3,2,2015-05-29,"Dai, Ji and Ozden, Ilker and Brooks, Daniel I....",PUB,DOI,031202
1,7,25,PEER,10.1162/neco_a_00681,2016-01-07 17:07:06,2016-01-07 17:07:58,Spatiotemporal Conditional Inference and Hypot...,Neural Computation,1,27,0000-00-00,"Harrison, Matthew T. and Amarasingham, Asohan ...",PUB,DOI,104-150
2,9,760,PEER,10.1038/nature14105,2016-01-07 17:08:08,2016-01-07 17:08:08,Impact jetting as the origin of chondrules,Nature,7534,517,2015-01-14,"Johnson, Brandon C. and Minton, David A. and M...",PUB,DOI,339-341
3,10,760,PEER,10.1002/2015gl065022,2016-01-07 17:08:19,2016-01-07 17:10:52,The fractured Moon: Production and saturation ...,Geophysical Research Letters,17,42,2015-09-10,"Soderblom, Jason M. and Evans, Alexander J. an...",PUB,DOI,6939-6944
4,11,25,PEER,10.1073/pnas.1506400112,2016-01-07 17:08:37,2016-01-07 17:08:50,Ambiguity and nonidentifiability in the statis...,Proc Natl Acad Sci USA,20,112,0000-00-00,"Amarasingham, Asohan and Geman, Stuart and Har...",PUB,DOI,6455-6460


In [44]:
cite_prop_map = { c: c[40:] for c in fin_cite_props if c.startswith('http://vivo.brown.edu/ontology/citation#') }
cite_prop_map['rabid'] = 'rabid'
cite_prop_map['http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'] = 'type'
cite_prop_map.values()

dict_values(['date', 'volume', 'hasContributor', 'authorList', 'pmid', 'issue', 'doi', 'hasVenue', 'pages', 'pmcid', 'issn', 'venueFor', 'eissn', 'publishedIn', 'book', 'hasLocation', 'editorList', 'chapter', 'hasPublisher', 'isbn', 'url', 'hasConferenceLocation', 'conferenceDate', 'hasConference', 'reviewOf', 'title', 'number', 'version', 'hasAssignee', 'hasCountry', 'hasAuthority', 'patentNumber', 'rabid', 'type'])

In [45]:
RABCitation = namedtuple('RABCitation', sorted(cite_prop_map.values()))

In [46]:
def clean_data_prop(oData):
    return oData.rsplit('"^^<http://www.w3.org/2001/XMLSchema#', maxsplit=1)[0].strip('\"\n .<>')

In [47]:
def parse_triple(rawRow):
    s,p,o = rawRow.split(' ',maxsplit=2)
    s = s.strip('<>')
    p = p.strip('<>')
    o = clean_data_prop(o)
    return (s,p,o)

In [48]:
cite_triples = []
for t in fin_rab_cites:
    cite_triples.append(parse_triple(t))

cite_triples[0]

('http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b',
 'http://www.w3.org/2000/01/rdf-schema#label',
 'Erratum to: Global magnetic confinement for the 1.5D Vlasov-Maxwell system')

In [49]:
cite_dicts = defaultdict(dict)
for t in cite_triples:
    if t[1] in cite_prop_map:
        cite_dicts[t[0]][cite_prop_map[t[1]]] = t[2]

In [50]:
empty_row = { cite_prop_map[p]: '' for p in cite_prop_map }
rab_rows = []
for c in cite_dicts:
    d = cite_dicts[c]
    d['rabid'] = c
    row = empty_row.copy()
    row.update(d)
    rab_rows.append(RABCitation(**row))

print(rab_rows[0])

RABCitation(authorList='Nguyen, Toan T., Nguyen, Truyen V., Strauss, Walter A', book='', chapter='', conferenceDate='', date='2015-06-01', doi='10.3934/krm.2015.8.615', editorList='', eissn='', hasAssignee='', hasAuthority='', hasConference='', hasConferenceLocation='', hasContributor='http://vivo.brown.edu/individual/wstrauss', hasCountry='', hasLocation='', hasPublisher='', hasVenue='http://vivo.brown.edu/individual/n6086eb8fe7824cad9423547d403a958d', isbn='', issn='', issue='3', number='', pages='615-616', patentNumber='', pmcid='', pmid='', publishedIn='Kinetic and Related Models', rabid='http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b', reviewOf='', title='', type='http://vivo.brown.edu/ontology/citation#Article', url='', venueFor='', version='', volume='8')


In [97]:
df_cites_rab = pd.DataFrame(rab_rows)

cols = df_cites_rab.columns.tolist()
id_atts = [ 'rabid','type','doi','pmid','pmcid','issn','eissn' ]
cols = [ c for c in cols if c not in id_atts ]
cols = id_atts + cols
df_cites_rab = df_cites_rab[ cols ]
df_cites_rab.type = df_cites_rab.type.str.rsplit('#').str.get(1)
df_cites_rab.head()

Unnamed: 0,rabid,type,doi,pmid,pmcid,issn,eissn,authorList,book,chapter,...,number,pages,patentNumber,publishedIn,reviewOf,title,url,venueFor,version,volume
0,http://vivo.brown.edu/individual/n5c6cae127059...,Article,10.3934/krm.2015.8.615,,,,,"Nguyen, Toan T., Nguyen, Truyen V., Strauss, W...",,,...,,615-616,,Kinetic and Related Models,,,,,,8
1,http://vivo.brown.edu/individual/n52747,Article,10.1037/0012-1649.40.4.595,15238046.0,,,,"Li, Jin",,,...,,595-605,,,,,,,,40
2,http://vivo.brown.edu/individual/n8301,Article,10.1007/s10461-012-0163-8,22323006.0,PMC3471653,,,"Brown JL, Sales JM, DiClemente RJ, Salazar LF,...",,,...,,1491-500,,,,,,,,16
3,http://vivo.brown.edu/individual/n98528,Article,10.1002/mc.2940130304,7619217.0,,,,"Sears WL, Goto-Mandeville R, Mirapuri M, Braun L",,,...,,146-56,,,,,,,,13
4,http://vivo.brown.edu/individual/n52835,Article,10.1016/j.drugalcdep.2014.09.265,,,,,"Graves, Hannah, Hernandez, Lynn, Kahler, Chris...",,,...,,e127-e128,,,,,,,,146


In [98]:
df_cites_rab.type.value_counts()

Article            42162
Citation            3548
ConferencePaper      837
NoID                 823
BookSection          513
Book                 496
Review               165
Abstract              71
WorkingPaper          23
Venue                 13
Patent                 3
Name: type, dtype: int64

In [104]:
df_cites_far.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6392 entries, 0 to 6391
Data columns (total 15 columns):
id                    6392 non-null object
activity_report_id    6392 non-null object
article_type_id       6392 non-null object
identifier            6392 non-null object
created_at            6392 non-null object
updated_at            6392 non-null object
title                 6392 non-null object
journal               6392 non-null object
number                6392 non-null object
volume                6392 non-null object
date                  6392 non-null object
coauthors             6392 non-null object
book_status_id        6392 non-null object
article_id_type_id    6392 non-null object
page_numbers          6392 non-null object
dtypes: object(15)
memory usage: 749.2+ KB


In [102]:
len(set(df_cites_rab[ (df_cites_rab.type=='Article') | (df_cites_rab.type=='Citation')].doi) & set(df_cites_far[ df_cites_far.article_id_type_id == 'DOI' ].identifier))

1830

In [101]:
df_cites_rab[ (df_cites_rab.type !='Article') & (df_cites_rab.doi != '') ].type.value_counts()

Citation           3258
ConferencePaper     803
Review               98
BookSection          86
Book                 32
Venue                13
Abstract             10
WorkingPaper          3
Name: type, dtype: int64