# FAR-VIVO Citation Data Analysis

In [2]:
import csv

import requests
import pandas as pd

from collections import namedtuple, defaultdict
from functools import reduce

# Connection settings are read from environment variables through the IPython
# %env magic, so no endpoint or credential is hard-coded in the notebook.
# vstage is passed as the query endpoint, and vuser / vpass are sent as the
# 'email' / 'password' form fields of each query POST.
# NOTE(review): vprod is not referenced in the visible cells - confirm it is still needed.
# Do not add trailing comments to these lines: the rest of the line is handed to %env.
vprod = %env VIVO_PRODUCTION
vstage = %env VIVO_STAGING
vuser = %env VIVO_USER
vpass = %env VIVO_PASSWORD

## Contents
* [Acquire](#Acquire)
* [Load FAR Data](#FAR-Publication-Data)
* [Load VIVO Data](#VIVO-Data)

## Acquire
[back](#Contents)

In [2]:
def get_citation_properties(endpoint):
    """Fetch the distinct predicates used on Citation resources.

    POSTs a SPARQL SELECT query to ``endpoint``, sending the module-level
    ``vuser`` / ``vpass`` values as the ``email`` / ``password`` form fields
    and asking for a CSV response.

    Returns:
        The response body as text when the status code is 200; otherwise the
        response body is printed and False is returned.
    """
    # The doubled braces are sent as written; no str.format call is applied.
    sparql = """
    SELECT DISTINCT ?prop
    WHERE {{
        ?cite a <http://vivo.brown.edu/ontology/citation#Citation> .
        ?cite ?prop ?o.
    }}
    """
    payload = { 'email': vuser, 'password': vpass, 'query': sparql }
    response = requests.post(
        endpoint,
        data=payload,
        headers={ 'Accept': 'text/csv', 'charset': 'utf-8' })
    if response.status_code != 200:
        print(response.text)
        return False
    return response.text

In [3]:
# Run the query BEFORE opening the output file: opening in write mode
# truncates it, so a failed query used to wipe a previously saved copy and
# then crash with a TypeError when False was passed to f.write().
props_csv = get_citation_properties(vstage)
if props_csv is False:
    raise RuntimeError(
        'Citation property query failed; data/rab/query_properties.csv was left untouched')
with open('data/rab/query_properties.csv', 'w') as f:
    f.write(props_csv)

In [4]:
def get_citation_data(endpoint):
    """Fetch a description of every Citation resource.

    POSTs a SPARQL DESCRIBE query to ``endpoint``, sending the module-level
    ``vuser`` / ``vpass`` values as the ``email`` / ``password`` form fields
    and asking for a ``text/plain`` response.

    Returns:
        The response body as text when the status code is 200; otherwise the
        response body is printed and False is returned.
    """
    # The doubled braces are sent as written; no str.format call is applied.
    sparql = """
    DESCRIBE ?cite
    WHERE {{ ?cite a <http://vivo.brown.edu/ontology/citation#Citation> .}}
    """
    payload = { 'email': vuser, 'password': vpass, 'query': sparql }
    response = requests.post(
        endpoint,
        data=payload,
        headers={ 'Accept': 'text/plain', 'charset': 'utf-8' })
    if response.status_code != 200:
        print(response.text)
        return False
    return response.text

In [5]:
# Run the query BEFORE opening the output file: opening in write mode
# truncates it, so a failed query used to wipe a previously saved copy and
# then crash with a TypeError when False was passed to f.write().
citations_nt = get_citation_data(vstage)
if citations_nt is False:
    raise RuntimeError(
        'Citation data query failed; data/rab/query_citations.nt was left untouched')
with open('data/rab/query_citations.nt', 'w') as f:
    f.write(citations_nt)

## FAR Publication Data
[back](#Contents)

In [6]:
# Load the FAR articles export into a DataFrame of strings.
# newline='' hands line-ending handling to the csv module (as its docs
# require) so newlines embedded in quoted fields are read correctly.
with open('data/far/articles.csv', newline='') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_arts_header = next(rdr)  # first row holds the column names
    fin_far_arts = list(rdr)     # remaining rows, one list of strings each
df_articles_far = pd.DataFrame(fin_far_arts, columns=far_arts_header)

In [7]:
# Load the FAR books export into a DataFrame of strings.
# newline='' hands line-ending handling to the csv module (as its docs
# require) so newlines embedded in quoted fields are read correctly.
with open('data/far/books.csv', newline='') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_books_header = next(rdr)  # first row holds the column names
    fin_far_books = list(rdr)     # remaining rows, one list of strings each
df_books_far = pd.DataFrame(fin_far_books, columns=far_books_header)

In [8]:
# Load the FAR chapters export into a DataFrame of strings.
# newline='' hands line-ending handling to the csv module (as its docs
# require) so newlines embedded in quoted fields are read correctly.
with open('data/far/chapters.csv', newline='') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_chapters_header = next(rdr)  # first row holds the column names
    fin_far_chapters = list(rdr)     # remaining rows, one list of strings each
df_chaps_far = pd.DataFrame(fin_far_chapters, columns=far_chapters_header)

In [9]:
# Load the FAR critical-reviews export into a DataFrame of strings.
# newline='' hands line-ending handling to the csv module (as its docs
# require) so newlines embedded in quoted fields are read correctly.
with open('data/far/critical_reviews.csv', newline='') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_revs_header = next(rdr)  # first row holds the column names
    fin_far_revs = list(rdr)     # remaining rows, one list of strings each
df_revs_far = pd.DataFrame(fin_far_revs, columns=far_revs_header)

In [10]:
# Load the FAR papers export into a DataFrame of strings.
# newline='' hands line-ending handling to the csv module (as its docs
# require) so newlines embedded in quoted fields are read correctly.
with open('data/far/papers.csv', newline='') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_papers_header = next(rdr)  # first row holds the column names
    fin_far_papers = list(rdr)     # remaining rows, one list of strings each
df_papers_far = pd.DataFrame(fin_far_papers, columns=far_papers_header)

In [11]:
# Load the FAR patents export into a DataFrame of strings.
# newline='' hands line-ending handling to the csv module (as its docs
# require) so newlines embedded in quoted fields are read correctly.
with open('data/far/patents.csv', newline='') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_patents_header = next(rdr)  # first row holds the column names
    fin_far_patents = list(rdr)     # remaining rows, one list of strings each
df_patents_far = pd.DataFrame(fin_far_patents, columns=far_patents_header)

In [12]:
# Load the FAR abstracts export into a DataFrame of strings.
# newline='' hands line-ending handling to the csv module (as its docs
# require) so newlines embedded in quoted fields are read correctly.
with open('data/far/ph_abstracts.csv', newline='') as f:
    rdr = csv.reader(f, escapechar='\\')
    far_abst_header = next(rdr)  # first row holds the column names
    fin_far_abst = list(rdr)     # remaining rows, one list of strings each
df_abst_far = pd.DataFrame(fin_far_abst, columns=far_abst_header)

In [13]:
# Quick sanity check of the FAR loads: for each frame print its row count,
# its column names and its first row. far_types labels far_dfs position by
# position, so the two lists must stay in the same order.
far_dfs = [ df_abst_far, df_articles_far, df_books_far, df_chaps_far,
           df_papers_far, df_patents_far, df_revs_far]
far_types = ['Abstracts', 'Articles', 'Books', 'Chapters', 'Papers', 'Patents', 'Reviews' ]
separator = "\n" + '-'*100 + '\n'
for far_type, df in zip(far_types, far_dfs):
    print("{}: {}".format(far_type, len(df)))
    print("Columns:\n\t{}".format('\n\t'.join(df.columns)))
    # Joining the first row works because every loaded value is a string.
    print("Sample:\n\t{}".format(' | '.join(df.iloc[0].values)))
    print(separator)

Abstracts: 314
Columns:
	id
	activity_report_id
	presentation_type_id
	conference
	coauthors
	title
	abstract_date
	created_at
	updated_at
Sample:
	2 | 530 | CO | CROI (Conference on Retroviruses and Opportunistic Infections) | Agaba P, Genberg BL, Sagay S, Agbaji O, Dadem N, Kolawole G, Okonkwo P, Meloni S, Kanki P, Ware N. | Retention in a decentralized HIV care and treatment program in north central Nigeria.  | 2015-02-25 | 2016-01-19 15:57:00 | 2016-01-19 15:57:16

----------------------------------------------------------------------------------------------------

Articles: 6392
Columns:
	id
	activity_report_id
	article_type_id
	identifier
	created_at
	updated_at
	title
	journal
	number
	volume
	date
	coauthors
	book_status_id
	article_id_type_id
	page_numbers
Sample:
	6 | 15 | PEER | 10.1117/1.nph.2.3.031202 | 2016-01-07 16:51:12 | 2016-01-07 16:51:12 | Modified toolbox for optogenetics in the nonhuman primate | Neurophotonics | 3 | 2 | 2015-05-29 | Dai, Ji and Ozden, Ilker and B

## VIVO Data
[back](#Contents)

In [3]:
# Read the saved property query result, one property URI per line.
with open('data/rab/query_properties.csv') as f:
    data = list(f)
# Drop the header row and trim surrounding whitespace / line endings.
fin_cite_props = [ line.strip() for line in data[1:] ]

In [4]:
# Read the saved citation dump; each element keeps its trailing newline.
with open('data/rab/query_citations.nt') as f:
    fin_rab_cites = list(f)

In [5]:
# Map each full property URI in the citation namespace to its short local
# name (the part after the '#'); properties from other namespaces are dropped.
citation_ns = 'http://vivo.brown.edu/ontology/citation#'
cite_prop_map = {
    prop: prop[len(citation_ns):]
    for prop in fin_cite_props
    if prop.startswith(citation_ns)
}
# Two extra columns: the citation's own URI and its most specific type.
cite_prop_map['rabid'] = 'rabid'
cite_prop_map['http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'] = 'type'
cite_prop_map.values()

dict_values(['date', 'volume', 'hasContributor', 'authorList', 'pmid', 'issue', 'doi', 'hasVenue', 'pages', 'pmcid', 'publishedIn', 'book', 'hasLocation', 'editorList', 'chapter', 'hasPublisher', 'isbn', 'url', 'hasConferenceLocation', 'conferenceDate', 'hasConference', 'issn', 'reviewOf', 'title', 'number', 'version', 'hasAssignee', 'hasCountry', 'hasAuthority', 'patentNumber', 'venueFor', 'rabid', 'type'])

In [6]:
# Record type with one field per short property name; sorting fixes the field order.
citation_fields = sorted(cite_prop_map.values())
RABCitation = namedtuple('RABCitation', citation_fields)

In [7]:
def clean_data_prop(oData):
    """Reduce the object part of a raw triple line to its bare value.

    Everything from the last XML Schema datatype marker
    (``"^^<http://www.w3.org/2001/XMLSchema#``) onward is dropped, then any
    leading/trailing double quotes, newlines, spaces, dots and angle brackets
    are stripped.

    NOTE(review): the strip also removes those characters when they belong to
    the value itself, e.g. a title ending in a full stop loses it.
    """
    xsd_marker = '"^^<http://www.w3.org/2001/XMLSchema#'
    marker_at = oData.rfind(xsd_marker)
    lexical = oData if marker_at == -1 else oData[:marker_at]
    return lexical.strip('"\n .<>')

In [8]:
def parse_triple(rawRow):
    """Split one raw triple line into a (subject, predicate, object) tuple.

    The line is split on its first two spaces only, so spaces inside the
    object are kept. Angle brackets are stripped from subject and predicate;
    the object is cleaned with ``clean_data_prop``.
    """
    subject, predicate, raw_object = rawRow.split(' ', maxsplit=2)
    return (subject.strip('<>'), predicate.strip('<>'), clean_data_prop(raw_object))

In [9]:
# Parse every raw line into a (subject, predicate, object) tuple.
cite_triples = [ parse_triple(line) for line in fin_rab_cites ]

cite_triples[0]

('http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b',
 'http://www.w3.org/2000/01/rdf-schema#label',
 'Erratum to: Global magnetic confinement for the 1.5D Vlasov-Maxwell system')

In [10]:
# Analyzing citations with more than 1 most specific type

mst_prop = 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'
no_id = 'http://vivo.brown.edu/ontology/citation#NoID'

# mostSpecificType triples whose value is the NoID class
no_ids = [ c for c in cite_triples
          if c[1] == mst_prop and c[2] == no_id ]

# citation URI -> set of all most specific types recorded for it
msts = defaultdict(set)
for c in cite_triples:
    if c[1] == mst_prop:
        msts[c[0]].add(c[2])

# distinct type combinations among citations that do not have exactly one type
mlts = set()
for m,v in msts.items():
    if len(v) != 1:
        mlts.add(frozenset(v))

for m in mlts:
    if no_id in m:
        print("With NoID: ", [ a for a in m if a != no_id])
    else:
        # Fixed: this was `"Redundant types: ". sorted(...)` (a dot instead of
        # a comma), which raises AttributeError as soon as the branch runs.
        print("Redundant types: ", sorted(m))

With NoID:  ['http://vivo.brown.edu/ontology/citation#Citation']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Abstract']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Book']
With NoID:  ['http://vivo.brown.edu/ontology/citation#ConferencePaper']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Review']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Patent']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Article']
With NoID:  ['http://vivo.brown.edu/ontology/citation#BookSection']
With NoID:  ['http://vivo.brown.edu/ontology/citation#WorkingPaper']


In [11]:
def triple_match(triple, prop=None, obj=None):
    """Tell whether a (subject, predicate, object) triple meets the given criteria.

    Args:
        triple: indexable whose item 1 is the predicate and item 2 the object.
        prop: predicate to require; a falsy value means "any predicate".
        obj: object to require; a falsy value means "any object".

    Returns:
        True when every supplied criterion equals the matching triple item
        (and therefore True when no criterion is supplied), else False.
    """
    if prop and triple[1] != prop:
        return False
    if obj and triple[2] != obj:
        return False
    return True

In [12]:
def filter_mst_no_id(triple):
    """Filter predicate: False only for triples that assign the NoID most specific type.

    Returns True for every other triple, so it can be used to keep rows.
    """
    is_no_id_type = triple_match(
        triple,
        prop='http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
        obj='http://vivo.brown.edu/ontology/citation#NoID')
    return not is_no_id_type

# Inline sanity checks: an ordinary type triple is kept, a NoID type triple is dropped.
mst_predicate = 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'
good_triple = ('foo', mst_predicate, 'bar')
bad_triple = ('foo', mst_predicate, 'http://vivo.brown.edu/ontology/citation#NoID')
assert filter_mst_no_id(good_triple)
assert not filter_mst_no_id(bad_triple)

In [13]:
# Keep every triple except the NoID most-specific-type assignments.
strip_msts = list(filter(filter_mst_no_id, cite_triples))

In [14]:
# Group triples by citation URI into {short property name: value} dicts.
# Predicates missing from cite_prop_map are ignored; when a citation has
# several values for one property, the last triple seen wins.
cite_dicts = defaultdict(dict)
for subject, predicate, value in strip_msts:
    short_name = cite_prop_map.get(predicate)
    if short_name is not None:
        cite_dicts[subject][short_name] = value

In [15]:
# Turn each citation dict into a full RABCitation record; properties a
# citation does not have default to the empty string.
empty_row = dict.fromkeys(cite_prop_map.values(), '')
rab_rows = []
for rabid, props in cite_dicts.items():
    # The citation URI is stored on the dict itself, as its 'rabid' value.
    props['rabid'] = rabid
    row = dict(empty_row, **props)
    rab_rows.append(RABCitation(**row))

print(rab_rows[0])

RABCitation(authorList='Nguyen, Toan T., Nguyen, Truyen V., Strauss, Walter A', book='', chapter='', conferenceDate='', date='2015-06-01', doi='10.3934/krm.2015.8.615', editorList='', hasAssignee='', hasAuthority='', hasConference='', hasConferenceLocation='', hasContributor='http://vivo.brown.edu/individual/wstrauss', hasCountry='', hasLocation='', hasPublisher='', hasVenue='http://vivo.brown.edu/individual/n6086eb8fe7824cad9423547d403a958d', isbn='', issn='', issue='3', number='', pages='615-616', patentNumber='', pmcid='', pmid='', publishedIn='Kinetic and Related Models', rabid='http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b', reviewOf='', title='', type='http://vivo.brown.edu/ontology/citation#Article', url='', venueFor='', version='', volume='8')


In [16]:
# Build the citation DataFrame with the identifier columns moved to the front.
id_atts = [ 'rabid','type','doi','pmid','pmcid','issn' ]
df_cites_rab = pd.DataFrame(rab_rows)

other_cols = [ c for c in df_cites_rab.columns.tolist() if c not in id_atts ]
cols = id_atts + other_cols
df_cites_rab = df_cites_rab[ cols ]
# Keep only the part of the type URI after the '#', e.g. 'Article'.
df_cites_rab['type'] = df_cites_rab['type'].str.rsplit('#').str.get(1)
df_cites_rab.head()

Unnamed: 0,rabid,type,doi,pmid,pmcid,issn,authorList,book,chapter,conferenceDate,...,number,pages,patentNumber,publishedIn,reviewOf,title,url,venueFor,version,volume
0,http://vivo.brown.edu/individual/n5c6cae127059...,Article,10.3934/krm.2015.8.615,,,,"Nguyen, Toan T., Nguyen, Truyen V., Strauss, W...",,,,...,,615-616,,Kinetic and Related Models,,,,,,8
1,http://vivo.brown.edu/individual/n52747,Article,10.1037/0012-1649.40.4.595,15238046.0,,,"Li, Jin",,,,...,,595-605,,,,,,,,40
2,http://vivo.brown.edu/individual/n8301,Article,10.1007/s10461-012-0163-8,22323006.0,PMC3471653,,"Brown JL, Sales JM, DiClemente RJ, Salazar LF,...",,,,...,,1491-500,,,,,,,,16
3,http://vivo.brown.edu/individual/n98528,Article,10.1002/mc.2940130304,7619217.0,,,"Sears WL, Goto-Mandeville R, Mirapuri M, Braun L",,,,...,,146-56,,,,,,,,13
4,http://vivo.brown.edu/individual/n52835,Article,10.1016/j.drugalcdep.2014.09.265,,,,"Graves, Hannah, Hernandez, Lynn, Kahler, Chris...",,,,...,,e127-e128,,,,,,,,146


In [17]:
# Column overview of the citation frame. Missing properties were filled with ''
# when the rows were built, so the non-null counts do not reveal empty fields.
df_cites_rab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49044 entries, 0 to 49043
Data columns (total 33 columns):
rabid                    49044 non-null object
type                     49044 non-null object
doi                      49044 non-null object
pmid                     49044 non-null object
pmcid                    49044 non-null object
issn                     49044 non-null object
authorList               49044 non-null object
book                     49044 non-null object
chapter                  49044 non-null object
conferenceDate           49044 non-null object
date                     49044 non-null object
editorList               49044 non-null object
hasAssignee              49044 non-null object
hasAuthority             49044 non-null object
hasConference            49044 non-null object
hasConferenceLocation    49044 non-null object
hasContributor           49044 non-null object
hasCountry               49044 non-null object
hasLocation              49044 non-null objec

## 3rd-party IDs

In [18]:
# Number of citations per type
df_cites_rab['type'].value_counts()

Article            42700
Citation            3708
ConferencePaper      930
BookSection          774
Book                 520
Review               245
Abstract             128
WorkingPaper          30
Patent                 9
Name: type, dtype: int64

In [22]:
# Citations carrying at least one of PMID / DOI ('' means "not present")
has_pmid_or_doi = (df_cites_rab['pmid'] != '') | (df_cites_rab['doi'] != '')
len(df_cites_rab[has_pmid_or_doi])

46799

In [24]:
# Type breakdown of citations that have neither a PMID nor a DOI
lacks_pmid_and_doi = (df_cites_rab['pmid'] == '') & (df_cites_rab['doi'] == '')
df_cites_rab[lacks_pmid_and_doi]['type'].value_counts()

BookSection        678
Article            644
Book               461
Abstract           118
Review             115
Citation           101
ConferencePaper     92
WorkingPaper        27
Patent               9
Name: type, dtype: int64

In [25]:
# Type breakdown of citations that have an ISBN
has_isbn = df_cites_rab['isbn'] != ''
df_cites_rab.loc[has_isbn, 'type'].value_counts()

Book           452
BookSection    224
Article          7
Name: type, dtype: int64

In [39]:
# Number of citations per type
df_cites_rab['type'].value_counts()

Article            42700
Citation            3708
ConferencePaper      930
BookSection          774
Book                 520
Review               245
Abstract             128
WorkingPaper          30
Patent                 9
Name: type, dtype: int64

In [29]:
# FIXME: `df_cites_far` is not defined in any visible cell of this notebook, so
# this cell raises NameError (see the saved output). Confirm which FAR frame
# was intended, or remove the cell.
df_cites_far.info()

NameError: name 'df_cites_far' is not defined

In [30]:
# Number of distinct DOIs present both in RAB (Article/Citation types) and in
# the FAR articles whose identifier is flagged as a DOI.
# NOTE(review): RAB rows without a DOI hold '', so '' is counted as a shared
# DOI if any FAR DOI identifier is also empty - confirm.
rab_article_like = df_cites_rab['type'].isin(['Article', 'Citation'])
rab_dois = set(df_cites_rab[rab_article_like].doi)
far_is_doi = df_articles_far['article_id_type_id'] == 'DOI'
far_dois = set(df_articles_far[far_is_doi].identifier)
len(rab_dois & far_dois)

1830

In [31]:
# Type breakdown of non-Article citations that nevertheless have a DOI
non_article_with_doi = (df_cites_rab['type'] != 'Article') & (df_cites_rab['doi'] != '')
df_cites_rab[non_article_with_doi]['type'].value_counts()

Citation           3361
ConferencePaper     838
Review              110
BookSection          90
Book                 32
Abstract             10
WorkingPaper          3
Name: type, dtype: int64

In [32]:
# Number of citations per type
# NOTE(review): the saved output for this cell lists a NoID type and different
# totals than the identical cells above; it looks like it comes from an earlier
# run - re-execute to confirm.
df_cites_rab['type'].value_counts()

Article            42354
Citation            3661
ConferencePaper      882
NoID                 843
BookSection          528
Book                 499
Review               177
Abstract              72
WorkingPaper          24
Patent                 4
Name: type, dtype: int64