# FAR-VIVO Citation Data Analysis

In [33]:
import csv

import pandas as pd

from collections import namedtuple, defaultdict
from functools import reduce

## FAR Publication Data

In [34]:
with open('data/far/articles.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_arts_header = next(rdr)
    fin_far_arts = [ r for r in rdr ]
df_articles_far = pd.DataFrame(fin_far_arts, columns=far_arts_header)

In [35]:
with open('data/far/books.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_books_header = next(rdr)
    fin_far_books = [ r for r in rdr ]
df_books_far = pd.DataFrame(fin_far_books, columns=far_books_header)

In [36]:
with open('data/far/chapters.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_chapters_header = next(rdr)
    fin_far_chapters = [ r for r in rdr ]
df_chaps_far = pd.DataFrame(fin_far_chapters, columns=far_chapters_header)

In [37]:
with open('data/far/critical_reviews.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_revs_header = next(rdr)
    fin_far_revs = [ r for r in rdr ]
df_revs_far = pd.DataFrame(fin_far_revs, columns=far_revs_header)

In [38]:
with open('data/far/papers.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_papers_header = next(rdr)
    fin_far_papers = [ r for r in rdr ]
df_papers_far = pd.DataFrame(fin_far_papers, columns=far_papers_header)

In [39]:
with open('data/far/patents.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_patents_header = next(rdr)
    fin_far_patents = [ r for r in rdr ]
df_patents_far = pd.DataFrame(fin_far_patents, columns=far_patents_header)

In [40]:
with open('data/far/ph_abstracts.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_abst_header = next(rdr)
    fin_far_abst = [ r for r in rdr ]
df_abst_far = pd.DataFrame(fin_far_abst, columns=far_abst_header)

In [41]:
far_dfs = [ df_abst_far, df_articles_far, df_books_far, df_chaps_far,
           df_papers_far, df_patents_far, df_revs_far]
far_types = ['Abstracts', 'Articles', 'Books', 'Chapters', 'Papers', 'Patents', 'Reviews' ]
for e, df in enumerate(far_dfs):
    print("{}: {}".format(far_types[e], df.shape[0]))
    print("Columns:\n\t{}".format('\n\t'.join(df.columns)))
    print("Sample:\n\t{}".format(' | '.join(df.iloc[0].values)))
    print("\n" + '-'*100 + '\n')

Abstracts: 314
Columns:
	id
	activity_report_id
	presentation_type_id
	conference
	coauthors
	title
	abstract_date
	created_at
	updated_at
Sample:
	2 | 530 | CO | CROI (Conference on Retroviruses and Opportunistic Infections) | Agaba P, Genberg BL, Sagay S, Agbaji O, Dadem N, Kolawole G, Okonkwo P, Meloni S, Kanki P, Ware N. | Retention in a decentralized HIV care and treatment program in north central Nigeria.  | 2015-02-25 | 2016-01-19 15:57:00 | 2016-01-19 15:57:16

----------------------------------------------------------------------------------------------------

Articles: 6392
Columns:
	id
	activity_report_id
	article_type_id
	identifier
	created_at
	updated_at
	title
	journal
	number
	volume
	date
	coauthors
	book_status_id
	article_id_type_id
	page_numbers
Sample:
	6 | 15 | PEER | 10.1117/1.nph.2.3.031202 | 2016-01-07 16:51:12 | 2016-01-07 16:51:12 | Modified toolbox for optogenetics in the nonhuman primate | Neurophotonics | 3 | 2 | 2015-05-29 | Dai, Ji and Ozden, Ilker and B

## VIVO Data

In [57]:
with open('data/rab/cite_properties.csv') as f:
    data = f.readlines()
    # skip header, strip trailing whitespace
    fin_cite_props = [ d.strip() for d in data[1:] ]

In [58]:
with open('data/rab/cite_data.nt') as f:
    fin_rab_cites = f.readlines()

In [59]:
cite_prop_map = { c: c[40:] for c in fin_cite_props if c.startswith('http://vivo.brown.edu/ontology/citation#') }
cite_prop_map['rabid'] = 'rabid'
cite_prop_map['http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'] = 'type'
cite_prop_map.values()

dict_values(['date', 'volume', 'hasContributor', 'authorList', 'pmid', 'issue', 'doi', 'hasVenue', 'pages', 'pmcid', 'issn', 'venueFor', 'eissn', 'publishedIn', 'book', 'hasLocation', 'editorList', 'chapter', 'hasPublisher', 'isbn', 'url', 'hasConferenceLocation', 'conferenceDate', 'hasConference', 'reviewOf', 'title', 'number', 'version', 'hasAssignee', 'hasCountry', 'hasAuthority', 'patentNumber', 'rabid', 'type'])

In [60]:
RABCitation = namedtuple('RABCitation', sorted(cite_prop_map.values()))

In [61]:
def clean_data_prop(oData):
    return oData.rsplit('"^^<http://www.w3.org/2001/XMLSchema#', maxsplit=1)[0].strip('\"\n .<>')

In [62]:
def parse_triple(rawRow):
    s,p,o = rawRow.split(' ',maxsplit=2)
    s = s.strip('<>')
    p = p.strip('<>')
    o = clean_data_prop(o)
    return (s,p,o)

In [63]:
cite_triples = []
for t in fin_rab_cites:
    cite_triples.append(parse_triple(t))

cite_triples[0]

('http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b',
 'http://www.w3.org/2000/01/rdf-schema#label',
 'Erratum to: Global magnetic confinement for the 1.5D Vlasov-Maxwell system')

In [77]:
# Analyzing citations with more than 1 most specific type

no_ids = [ c for c in cite_triples
          if c[1] == 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType' 
          and c[2] == 'http://vivo.brown.edu/ontology/citation#NoID' ]
msts = defaultdict(set)
for c in cite_triples:
    if c[1] == 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType':
        msts[c[0]].add(c[2])
mlts = set()
for m,v in msts.items():
    if len(v) != 1:
        mlts.add(frozenset(v))
for m in mlts:
    print(list(m))

['http://vivo.brown.edu/ontology/citation#NoID', 'http://vivo.brown.edu/ontology/citation#Patent']
['http://vivo.brown.edu/ontology/citation#BookSection', 'http://vivo.brown.edu/ontology/citation#NoID']
['http://vivo.brown.edu/ontology/citation#Article', 'http://vitro.mannlib.cornell.edu/ns/vitro/public#File']
['http://vivo.brown.edu/ontology/citation#NoID', 'http://vivo.brown.edu/ontology/citation#Review']
['http://vivo.brown.edu/ontology/citation#Article', 'http://vivo.brown.edu/ontology/citation#Review']
['http://vivo.brown.edu/ontology/citation#NoID', 'http://vivo.brown.edu/ontology/citation#WorkingPaper']
['http://vivo.brown.edu/ontology/citation#NoID', 'http://vivo.brown.edu/ontology/citation#Article']
['http://vivo.brown.edu/ontology/citation#Citation', 'http://vivo.brown.edu/ontology/citation#Venue']
['http://vivo.brown.edu/ontology/citation#NoID', 'http://vivo.brown.edu/ontology/citation#Abstract']
['http://vivo.brown.edu/ontology/citation#Book', 'http://vivo.brown.edu/ontolog

In [64]:
cite_dicts = defaultdict(dict)
for t in cite_triples:
    if t[1] in cite_prop_map:
        cite_dicts[t[0]][cite_prop_map[t[1]]] = t[2]

In [65]:
empty_row = { cite_prop_map[p]: '' for p in cite_prop_map }
rab_rows = []
for c in cite_dicts:
    d = cite_dicts[c]
    d['rabid'] = c
    row = empty_row.copy()
    row.update(d)
    rab_rows.append(RABCitation(**row))

print(rab_rows[0])

RABCitation(authorList='Nguyen, Toan T., Nguyen, Truyen V., Strauss, Walter A', book='', chapter='', conferenceDate='', date='2015-06-01', doi='10.3934/krm.2015.8.615', editorList='', eissn='', hasAssignee='', hasAuthority='', hasConference='', hasConferenceLocation='', hasContributor='http://vivo.brown.edu/individual/wstrauss', hasCountry='', hasLocation='', hasPublisher='', hasVenue='http://vivo.brown.edu/individual/n6086eb8fe7824cad9423547d403a958d', isbn='', issn='', issue='3', number='', pages='615-616', patentNumber='', pmcid='', pmid='', publishedIn='Kinetic and Related Models', rabid='http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b', reviewOf='', title='', type='http://vivo.brown.edu/ontology/citation#Article', url='', venueFor='', version='', volume='8')


In [66]:
df_cites_rab = pd.DataFrame(rab_rows)

cols = df_cites_rab.columns.tolist()
id_atts = [ 'rabid','type','doi','pmid','pmcid','issn','eissn' ]
cols = [ c for c in cols if c not in id_atts ]
cols = id_atts + cols
df_cites_rab = df_cites_rab[ cols ]
df_cites_rab.type = df_cites_rab.type.str.rsplit('#').str.get(1)
df_cites_rab.head()

Unnamed: 0,rabid,type,doi,pmid,pmcid,issn,eissn,authorList,book,chapter,...,number,pages,patentNumber,publishedIn,reviewOf,title,url,venueFor,version,volume
0,http://vivo.brown.edu/individual/n5c6cae127059...,Article,10.3934/krm.2015.8.615,,,,,"Nguyen, Toan T., Nguyen, Truyen V., Strauss, W...",,,...,,615-616,,Kinetic and Related Models,,,,,,8
1,http://vivo.brown.edu/individual/n52747,Article,10.1037/0012-1649.40.4.595,15238046.0,,,,"Li, Jin",,,...,,595-605,,,,,,,,40
2,http://vivo.brown.edu/individual/n8301,Article,10.1007/s10461-012-0163-8,22323006.0,PMC3471653,,,"Brown JL, Sales JM, DiClemente RJ, Salazar LF,...",,,...,,1491-500,,,,,,,,16
3,http://vivo.brown.edu/individual/n98528,Article,10.1002/mc.2940130304,7619217.0,,,,"Sears WL, Goto-Mandeville R, Mirapuri M, Braun L",,,...,,146-56,,,,,,,,13
4,http://vivo.brown.edu/individual/n52835,Article,10.1016/j.drugalcdep.2014.09.265,,,,,"Graves, Hannah, Hernandez, Lynn, Kahler, Chris...",,,...,,e127-e128,,,,,,,,146


## 3rd-party IDs

In [None]:
pd.melt(df_cites_rab, id_vars=['rabid', 'type'], value_vars=['']

In [67]:
df_cites_rab.type.value_counts()

Article            42162
Citation            3548
ConferencePaper      837
NoID                 823
BookSection          513
Book                 496
Review               165
Abstract              71
WorkingPaper          23
Venue                 13
Patent                 3
Name: type, dtype: int64

In [53]:
df_cites_far.info()

NameError: name 'df_cites_far' is not defined

In [None]:
len(set(df_cites_rab[ (df_cites_rab.type=='Article') | (df_cites_rab.type=='Citation')].doi) & set(df_articles_far[ df_articles_far.article_id_type_id == 'DOI' ].identifier))

In [None]:
df_cites_rab[ (df_cites_rab.type !='Article') & (df_cites_rab.doi != '') ].type.value_counts()

In [None]:
df_cites_rab.type.value_counts()