# FAR-VIVO Citation Data Analysis

In [114]:
import csv
import os

import requests
import pandas as pd
import numpy as np

from collections import namedtuple, defaultdict, Counter
from functools import reduce

# Endpoint URLs and credentials are read from environment variables with the
# IPython `%env` magic, so none of them are hardcoded in the notebook.
vprod = %env VIVO_PRODUCTION
vstage = %env VIVO_STAGING
vuser = %env VIVO_USER
vpass = %env VIVO_PASSWORD

## Contents
* [Acquire](#Acquire)
* [Load FAR Data](#FAR-Publication-Data)
* [Load VIVO Data](#VIVO-Data)
* [3rd-party IDs](#3rd-party-IDs)

## Acquire
[back](#Contents)

In [2]:
def get_citation_properties(endpoint):
    """POST a SPARQL SELECT listing the distinct properties used on Citations.

    The request authenticates with the module-level ``vuser``/``vpass``
    values and asks for a CSV response.

    Returns the response body text on HTTP 200; on any other status the
    response body is printed and ``False`` is returned.
    """
    # NOTE(review): the string is never passed through str.format, so the
    # doubled braces are sent to the endpoint exactly as written.
    sparql = """
    SELECT DISTINCT ?prop
    WHERE {{
        ?cite a <http://vivo.brown.edu/ontology/citation#Citation> .
        ?cite ?prop ?o.
    }}
    """
    resp = requests.post(
        endpoint,
        data={ 'email': vuser, 'password': vpass, 'query': sparql },
        headers={ 'Accept': 'text/csv', 'charset': 'utf-8' })
    if resp.status_code != 200:
        print(resp.text)
        return False
    return resp.text

In [3]:
# Run the query before opening the output file: when the request fails,
# get_citation_properties() returns False, and opening first would truncate
# any existing export and then fail inside f.write(False) with a TypeError.
properties_csv = get_citation_properties(vstage)
if properties_csv is False:
    raise RuntimeError(
        'Citation property query failed; data/rab/query_properties.csv was not modified')
with open('data/rab/query_properties.csv','w+') as f:
    f.write(properties_csv)

In [4]:
def get_citation_data(endpoint):
    """POST a SPARQL DESCRIBE for every Citation resource.

    The request authenticates with the module-level ``vuser``/``vpass``
    values and asks for a ``text/plain`` response.

    Returns the response body text on HTTP 200; on any other status the
    response body is printed and ``False`` is returned.
    """
    # NOTE(review): the string is never passed through str.format, so the
    # doubled braces are sent to the endpoint exactly as written.
    sparql = """
    DESCRIBE ?cite
    WHERE {{ ?cite a <http://vivo.brown.edu/ontology/citation#Citation> .}}
    """
    resp = requests.post(
        endpoint,
        data={ 'email': vuser, 'password': vpass, 'query': sparql },
        headers={ 'Accept': 'text/plain', 'charset': 'utf-8' })
    if resp.status_code != 200:
        print(resp.text)
        return False
    return resp.text

In [5]:
# Run the query before opening the output file: when the request fails,
# get_citation_data() returns False, and opening first would truncate any
# existing dump and then fail inside f.write(False) with a TypeError.
citations_nt = get_citation_data(vstage)
if citations_nt is False:
    raise RuntimeError(
        'Citation DESCRIBE query failed; data/rab/query_citations.nt was not modified')
with open('data/rab/query_citations.nt', 'w+') as f:
    f.write(citations_nt)

## FAR Publication Data
[back](#Contents)

In [112]:
def wrap_far_row(row, dtype, idIdx):
    """Tag a FAR CSV row with its record type, modifying ``row`` in place.

    The cell at position ``idIdx`` becomes ``<dtype>_<original id>`` and
    ``dtype`` is added as a new trailing cell. The same list object that was
    passed in is returned.
    """
    prefixed_id = '_'.join((dtype, row[idIdx]))
    row[idIdx] = prefixed_id
    row += [dtype]
    return row

In [117]:
def make_far_df(dtype, fname):
    """Load one FAR CSV export from ``data/far/`` into a DataFrame.

    Parameters:
        dtype: record-type label (e.g. ``'article'``). It is prefixed to each
            row's ``id`` value and also becomes the name of an extra column
            whose value is the label itself (see ``wrap_far_row``).
        fname: file name inside ``data/far/``.

    Returns a DataFrame with the file's header columns plus the ``dtype``
    column; all values are strings as read by ``csv.reader``.

    Raises AssertionError if the header already has a column named
    ``dtype``, and ValueError if the header has no ``id`` column.
    """
    # newline='' hands raw line endings to the csv module, as its docs
    # require, so quoted fields containing line breaks are read intact.
    with open(os.path.join('data/far/', fname), newline='') as f:
        rdr = csv.reader(f, escapechar='\\')
        header = next(rdr)
        assert dtype not in header
        header.append(dtype)
        id_idx = header.index('id')
        rows = [ wrap_far_row(r, dtype, id_idx) for r in rdr ]
    return pd.DataFrame(rows, columns=header)

In [144]:
# (record type, CSV file name) for every FAR export to load.
far_files = [
    ('article', 'articles.csv'),
    ('book', 'books.csv'),
    ('chapter', 'chapters.csv'),
    ('review', 'critical_reviews.csv'),
    ('paper', 'papers.csv'),
    ('patent', 'patents.csv'),
    ('abstract', 'ph_abstracts.csv'),
]
dtypes = [ dtype for dtype, fname in far_files ]

# Stack the per-type frames. Each frame carries a marker column named after
# its own type, so after concatenation every row has exactly one of them set.
far_frames = [ make_far_df(dtype, fname) for dtype, fname in far_files ]
df_cites_far = pd.concat(far_frames, axis=0, ignore_index=True, sort=False)

# Collapse the marker columns into a single `type` column, looked up by id.
melted = pd.melt(df_cites_far, id_vars=['id'], value_vars=dtypes,
                 var_name='drop_me', value_name='type')
type_by_id = melted.dropna().drop(columns='drop_me').set_index('id')
df_cites_far = df_cites_far.join(type_by_id, on='id').drop(columns=dtypes)
df_cites_far.head()

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,other,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type
0,article_6,15,PEER,10.1117/1.nph.2.3.031202,2016-01-07 16:51:12,2016-01-07 16:51:12,Modified toolbox for optogenetics in the nonhu...,Neurophotonics,3,2,...,,,,,,,,,,article
1,article_7,25,PEER,10.1162/neco_a_00681,2016-01-07 17:07:06,2016-01-07 17:07:58,Spatiotemporal Conditional Inference and Hypot...,Neural Computation,1,27,...,,,,,,,,,,article
2,article_9,760,PEER,10.1038/nature14105,2016-01-07 17:08:08,2016-01-07 17:08:08,Impact jetting as the origin of chondrules,Nature,7534,517,...,,,,,,,,,,article
3,article_10,760,PEER,10.1002/2015gl065022,2016-01-07 17:08:19,2016-01-07 17:10:52,The fractured Moon: Production and saturation ...,Geophysical Research Letters,17,42,...,,,,,,,,,,article
4,article_11,25,PEER,10.1073/pnas.1506400112,2016-01-07 17:08:37,2016-01-07 17:08:50,Ambiguity and nonidentifiability in the statis...,Proc Natl Acad Sci USA,20,112,...,,,,,,,,,,article


In [145]:
# Column list with non-null counts for the combined FAR frame.
df_cites_far.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9689 entries, 0 to 9688
Data columns (total 33 columns):
id                      9689 non-null object
activity_report_id      9689 non-null object
article_type_id         6392 non-null object
identifier              6392 non-null object
created_at              9689 non-null object
updated_at              9689 non-null object
title                   9593 non-null object
journal                 6429 non-null object
number                  6392 non-null object
volume                  6392 non-null object
date                    6392 non-null object
coauthors               8281 non-null object
book_status_id          7871 non-null object
article_id_type_id      6392 non-null object
page_numbers            6392 non-null object
book_type_id            467 non-null object
press                   1479 non-null object
doi                     1479 non-null object
book_role_id            467 non-null object
book_title              1012 non-null ob

## VIVO Data
[back](#Contents)

In [3]:
# The first line of the saved query result is the CSV header; each remaining
# line is kept as one entry, with surrounding whitespace removed.
with open('data/rab/query_properties.csv') as f:
    data = f.readlines()
fin_cite_props = [ line.strip() for line in data[1:] ]

In [4]:
# Keep the saved DESCRIBE result as a list of raw lines; each line is parsed
# as one triple further down.
with open('data/rab/query_citations.nt') as f:
    fin_rab_cites = list(f)

In [62]:
# Map property URIs to short column names. Properties in the citation
# namespace keep only their local name; label and mostSpecificType get fixed
# names, and 'rabid' is a placeholder for the citation's own URI.
citation_ns = 'http://vivo.brown.edu/ontology/citation#'
cite_prop_map = {}
for prop_uri in fin_cite_props:
    if prop_uri.startswith(citation_ns):
        cite_prop_map[prop_uri] = prop_uri[len(citation_ns):]
cite_prop_map.update({
    'rabid': 'rabid',
    'http://www.w3.org/2000/01/rdf-schema#label': 'label',
    'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType': 'type',
})
cite_prop_map.values()

dict_values(['date', 'volume', 'hasContributor', 'authorList', 'pmid', 'issue', 'doi', 'hasVenue', 'pages', 'pmcid', 'publishedIn', 'book', 'hasLocation', 'editorList', 'chapter', 'hasPublisher', 'isbn', 'url', 'hasConferenceLocation', 'conferenceDate', 'hasConference', 'issn', 'reviewOf', 'title', 'number', 'version', 'hasAssignee', 'hasCountry', 'hasAuthority', 'patentNumber', 'venueFor', 'rabid', 'label', 'type'])

In [63]:
# One field per mapped property name, in sorted order.
rab_fields = sorted(cite_prop_map.values())
RABCitation = namedtuple('RABCitation', rab_fields)

In [64]:
def clean_data_prop(oData):
    """Extract the bare value from the object term of an N-Triples line.

    ``oData`` is the raw text that follows the predicate, for example
    ``<iri> .``, ``"text" .``, ``"text"@en .`` or ``"text"^^<datatype> .``.

    Returns the IRI without its angle brackets, or the literal's text without
    the enclosing quotes, language tag or datatype, trimmed of surrounding
    whitespace. Punctuation that is part of the literal itself (such as a
    final period) is kept; escape sequences are left as written.
    """
    term = oData.strip()
    # Remove the statement terminator first so it cannot be confused with a
    # period that ends the literal's own text.
    if term.endswith('.'):
        term = term[:-1].rstrip()
    if term.startswith('<') and term.endswith('>'):
        return term[1:-1]
    if term.startswith('"'):
        # The last quote closes the literal; whatever follows it is a
        # language tag or datatype suffix.
        closing = term.rfind('"')
        if closing > 0:
            return term[1:closing].strip()
    # Any other form (e.g. a blank-node label) only gets plain trimming.
    return term.strip('\"\n .<>')

In [65]:
def parse_triple(rawRow):
    """Split one raw N-Triples line into a ``(subject, predicate, object)`` tuple.

    The line is split on its first two spaces. Subject and predicate lose
    their angle brackets; the remainder is cleaned with ``clean_data_prop``.
    """
    subject, predicate, raw_object = rawRow.split(' ', maxsplit=2)
    return (subject.strip('<>'), predicate.strip('<>'), clean_data_prop(raw_object))

In [66]:
# Parse every raw line into a (subject, predicate, object) tuple.
cite_triples = [ parse_triple(raw) for raw in fin_rab_cites ]

cite_triples[0]

('http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b',
 'http://www.w3.org/2000/01/rdf-schema#label',
 'Erratum to: Global magnetic confinement for the 1.5D Vlasov-Maxwell system')

In [67]:
# Analyzing citations with more than 1 most specific type

mst_prop = 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'
no_id = 'http://vivo.brown.edu/ontology/citation#NoID'

no_ids = [ c for c in cite_triples if c[1] == mst_prop and c[2] == no_id ]

# Collect the set of most-specific types asserted for each subject.
msts = defaultdict(set)
for c in cite_triples:
    if c[1] == mst_prop:
        msts[c[0]].add(c[2])

# Distinct type combinations among subjects that have more than one type.
mlts = set()
for m,v in msts.items():
    if len(v) != 1:
        mlts.add(frozenset(v))

for m in mlts:
    if no_id in m:
        print("With NoID: ", [ a for a in m if a != no_id])
    else:
        # The label and the list are separate print arguments; writing
        # `"...". sorted(...)` is an attribute lookup on str and raises.
        print("Redundant types: ", sorted(m))

With NoID:  ['http://vivo.brown.edu/ontology/citation#Citation']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Abstract']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Book']
With NoID:  ['http://vivo.brown.edu/ontology/citation#ConferencePaper']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Review']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Patent']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Article']
With NoID:  ['http://vivo.brown.edu/ontology/citation#BookSection']
With NoID:  ['http://vivo.brown.edu/ontology/citation#WorkingPaper']


In [68]:
def triple_match(triple, prop=None, obj=None):
    """Report whether ``triple`` satisfies the given predicate/object filters.

    ``prop`` is compared with ``triple[1]`` and ``obj`` with ``triple[2]``.
    A filter that is falsy (``None`` or empty) is not applied, so calling
    with no filters always returns ``True``.
    """
    if prop and triple[1] != prop:
        return False
    if obj and triple[2] != obj:
        return False
    return True

In [69]:
def filter_mst_no_id(triple):
    """Return ``False`` only for triples that assert a mostSpecificType of NoID."""
    mst_prop = 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'
    no_id_type = 'http://vivo.brown.edu/ontology/citation#NoID'
    is_no_id_type = triple_match(triple, mst_prop, no_id_type)
    return not is_no_id_type

# Quick self-check: a different type passes the filter, the NoID type does not.
good_triple = ('foo', 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType', 'bar')
bad_triple = ('foo', 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
              'http://vivo.brown.edu/ontology/citation#NoID')
assert filter_mst_no_id(good_triple)
assert not filter_mst_no_id(bad_triple)

In [70]:
# Drop the mostSpecificType=NoID triples; every other triple is kept.
strip_msts = list(filter(filter_mst_no_id, cite_triples))

In [71]:
# Group the mapped properties by citation URI. A later triple for the same
# (citation, property) pair overwrites the earlier value.
cite_dicts = defaultdict(dict)
for subj, pred, val in strip_msts:
    short_name = cite_prop_map.get(pred)
    if short_name is not None:
        cite_dicts[subj][short_name] = val

In [72]:
# Template with every column present and blank, so citations that lack a
# property still produce a complete RABCitation.
empty_row = dict.fromkeys(cite_prop_map.values(), '')
rab_rows = []
for rabid, props in cite_dicts.items():
    # The citation's own URI is stored on its property dict as 'rabid'.
    props['rabid'] = rabid
    rab_rows.append(RABCitation(**dict(empty_row, **props)))

print(rab_rows[0])

RABCitation(authorList='Nguyen, Toan T., Nguyen, Truyen V., Strauss, Walter A', book='', chapter='', conferenceDate='', date='2015-06-01', doi='10.3934/krm.2015.8.615', editorList='', hasAssignee='', hasAuthority='', hasConference='', hasConferenceLocation='', hasContributor='http://vivo.brown.edu/individual/wstrauss', hasCountry='', hasLocation='', hasPublisher='', hasVenue='http://vivo.brown.edu/individual/n6086eb8fe7824cad9423547d403a958d', isbn='', issn='', issue='3', label='Erratum to: Global magnetic confinement for the 1.5D Vlasov-Maxwell system', number='', pages='615-616', patentNumber='', pmcid='', pmid='', publishedIn='Kinetic and Related Models', rabid='http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b', reviewOf='', title='', type='http://vivo.brown.edu/ontology/citation#Article', url='', venueFor='', version='', volume='8')


In [81]:
df_cites_rab = pd.DataFrame(rab_rows)

# Column order: identifiers first, then common bibliographic fields, then
# whatever remains, with the has* link columns last.
id_atts = [ 'rabid','type','label','doi','pmid','pmcid','isbn','issn' ]
common_atts = [ 'date','authorList','pages','issue','volume' ]
has_atts = ['hasContributor','hasVenue','hasConference','hasConferenceLocation',
            'hasCountry','hasLocation','hasPublisher','hasAssignee','hasAuthority']
grouped_atts = id_atts + common_atts + has_atts
other_atts = [ c for c in df_cites_rab.columns.tolist() if c not in grouped_atts ]
cols = id_atts + common_atts + other_atts + has_atts

df_cites_rab = (
    df_cites_rab[ cols ]
    # Keep only the part of the type URI after '#'.
    .assign(type=lambda df: df['type'].str.rsplit('#').str.get(1))
    # Empty strings become NaN so the null checks below see them as missing.
    .replace(r'^$', np.nan, regex=True)
)
df_cites_rab.head()

Unnamed: 0,rabid,type,label,doi,pmid,pmcid,isbn,issn,date,authorList,...,version,hasContributor,hasVenue,hasConference,hasConferenceLocation,hasCountry,hasLocation,hasPublisher,hasAssignee,hasAuthority
0,http://vivo.brown.edu/individual/n5c6cae127059...,Article,Erratum to: Global magnetic confinement for th...,10.3934/krm.2015.8.615,,,,,2015-06-01,"Nguyen, Toan T., Nguyen, Truyen V., Strauss, W...",...,,http://vivo.brown.edu/individual/wstrauss,http://vivo.brown.edu/individual/n6086eb8fe782...,,,,,,,
1,http://vivo.brown.edu/individual/n52747,Article,Learning as a Task or a Virtue: U.S. and Chine...,10.1037/0012-1649.40.4.595,15238046.0,,,,2004-01-01,"Li, Jin",...,,http://vivo.brown.edu/individual/jili,http://vivo.brown.edu/individual/n60865,,,,,,,
2,http://vivo.brown.edu/individual/n8301,Article,Predicting discordance between self-reports of...,10.1007/s10461-012-0163-8,22323006.0,PMC3471653,,,2012-08-01,"Brown JL, Sales JM, DiClemente RJ, Salazar LF,...",...,,http://vivo.brown.edu/individual/lbrownmd,http://vivo.brown.edu/individual/n79279,,,,,,,
3,http://vivo.brown.edu/individual/n98528,Article,Effects of 12-O-tetradecanoylphorbol-13-acetat...,10.1002/mc.2940130304,7619217.0,,,,1995-07-01,"Sears WL, Goto-Mandeville R, Mirapuri M, Braun L",...,,http://vivo.brown.edu/individual/lbraun,http://vivo.brown.edu/individual/n82319,,,,,,,
4,http://vivo.brown.edu/individual/n52835,Article,"Daily co-occurrences of marijuana use, alcohol...",10.1016/j.drugalcdep.2014.09.265,,,,,2015-01-01,"Graves, Hannah, Hernandez, Lynn, Kahler, Chris...",...,,http://vivo.brown.edu/individual/lh15,http://vivo.brown.edu/individual/n48368,,,,,,,


In [82]:
# Column list with non-null counts for the RAB citation frame.
df_cites_rab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49044 entries, 0 to 49043
Data columns (total 34 columns):
rabid                    49044 non-null object
type                     49044 non-null object
label                    49004 non-null object
doi                      43349 non-null object
pmid                     36381 non-null object
pmcid                    13399 non-null object
isbn                     683 non-null object
issn                     16 non-null object
date                     49034 non-null object
authorList               47600 non-null object
pages                    44188 non-null object
issue                    39882 non-null object
volume                   44166 non-null object
book                     744 non-null object
chapter                  148 non-null object
conferenceDate           36 non-null object
editorList               713 non-null object
number                   17 non-null object
patentNumber             9 non-null object
publishedIn        

## 3rd-party IDs

In [78]:
# Number of RAB citations per type.
df_cites_rab.type.value_counts()

Article            42700
Citation            3708
ConferencePaper      930
BookSection          774
Book                 520
Review               245
Abstract             128
WorkingPaper          30
Patent                 9
Name: type, dtype: int64

In [80]:
# Number of RAB citations that carry a PMID, a DOI, or both.
has_pmid_or_doi = df_cites_rab[['pmid', 'doi']].notnull().any(axis=1)
len(df_cites_rab[has_pmid_or_doi])

46799

In [83]:
# Type breakdown of the RAB citations that have neither a PMID nor a DOI.
lacks_pmid_and_doi = df_cites_rab[['pmid', 'doi']].isna().all(axis=1)
df_cites_rab.loc[lacks_pmid_and_doi, 'type'].value_counts()

BookSection        678
Article            644
Book               461
Abstract           118
Review             115
Citation           101
ConferencePaper     92
WorkingPaper        27
Patent               9
Name: type, dtype: int64

In [84]:
# Type breakdown of the RAB citations that carry an ISBN.
df_cites_rab.loc[df_cites_rab['isbn'].notnull(), 'type'].value_counts()

Book           452
BookSection    224
Article          7
Name: type, dtype: int64

In [39]:
# Number of RAB citations per type (same call as earlier in this section).
df_cites_rab.type.value_counts()

Article            42700
Citation            3708
ConferencePaper      930
BookSection          774
Book                 520
Review               245
Abstract             128
WorkingPaper          30
Patent                 9
Name: type, dtype: int64

In [29]:
# NOTE(review): the recorded output is a NameError, so this cell was last run
# in a session where df_cites_far had not been built. The FAR Publication
# Data section has to run before it.
df_cites_far.info()

NameError: name 'df_cites_far' is not defined

In [30]:
# DOIs found on both sides: RAB Article/Citation records and FAR articles
# whose identifier is typed as a DOI. The FAR rows come from df_cites_far
# because df_articles_far is not defined anywhere in this notebook.
# Missing values are dropped so NaN can never count as a shared DOI.
# NOTE(review): the comparison is case-sensitive; confirm that both sources
# store DOIs in the same case.
rab_dois = set(
    df_cites_rab.loc[df_cites_rab.type.isin(['Article', 'Citation']), 'doi'].dropna())
far_dois = set(
    df_cites_far.loc[
        (df_cites_far.type == 'article') & (df_cites_far.article_id_type_id == 'DOI'),
        'identifier'].dropna())
len(rab_dois & far_dois)

1830

In [31]:
# Type breakdown of non-Article citations that have a DOI. Missing DOIs are
# stored as NaN in df_cites_rab, so the test has to be notnull(); comparing
# against '' is True for every row and would filter nothing.
df_cites_rab[ (df_cites_rab.type !='Article') & (df_cites_rab.doi.notnull()) ].type.value_counts()

Citation           3361
ConferencePaper     838
Review              110
BookSection          90
Book                 32
Abstract             10
WorkingPaper          3
Name: type, dtype: int64

In [32]:
# NOTE(review): the recorded output lists a NoID type and counts that differ
# from the earlier value_counts cells, so it appears to come from a different
# run. Re-execute to refresh it.
df_cites_rab.type.value_counts()

Article            42354
Citation            3661
ConferencePaper      882
NoID                 843
BookSection          528
Book                 499
Review               177
Abstract              72
WorkingPaper          24
Patent                 4
Name: type, dtype: int64