# FAR-VIVO Citation Data Analysis

In [1]:
import csv
import os

import requests
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz, process

from collections import namedtuple, defaultdict, Counter
from functools import reduce

# Read the VIVO connection settings from the kernel's environment with the
# IPython `%env` magic, so no endpoint or credential is written into the notebook.
vprod = %env VIVO_PRODUCTION
vstage = %env VIVO_STAGING
vuser = %env VIVO_USER
vpass = %env VIVO_PASSWORD

## Contents
* [Acquire](#Acquire)
* [Load FAR Data](#FAR-Publication-Data)
* [Load VIVO Data](#VIVO-Data)

# Acquire
[back](#Contents)

In [11]:
def get_citation_properties(endpoint):
    """POST a SPARQL SELECT that lists every distinct predicate used on citations.

    The request authenticates with the notebook-level ``vuser`` / ``vpass``.

    Args:
        endpoint: URL the query is posted to.

    Returns:
        The response body (CSV is requested through the ``Accept`` header)
        when the status code is 200; otherwise the body is printed and
        ``False`` is returned.
    """
    sparql = """
    SELECT DISTINCT ?prop
    WHERE {{
        ?cite a <http://vivo.brown.edu/ontology/citation#Citation> .
        ?cite ?prop ?o.
    }}
    """
    resp = requests.post(
        endpoint,
        data=dict(email=vuser, password=vpass, query=sparql),
        headers={'Accept': 'text/csv', 'charset': 'utf-8'},
    )
    if resp.status_code != 200:
        # Surface the server's error text; callers must check for False.
        print(resp.text)
        return False
    return resp.text

In [3]:
# Fetch before opening the output file: mode 'w+' truncates it immediately, and
# get_citation_properties returns False on a failed request. Writing inside the
# `with` would therefore wipe an earlier download and die with a TypeError.
properties_csv = get_citation_properties(vstage)
if properties_csv is False:
    raise RuntimeError(
        'Citation property query failed; data/rab/query_properties.csv left untouched')
with open('data/rab/query_properties.csv','w+') as f:
    f.write(properties_csv)

In [4]:
def get_citation_data(endpoint):
    """POST a SPARQL DESCRIBE for every resource typed as a citation.

    The request authenticates with the notebook-level ``vuser`` / ``vpass``.

    Args:
        endpoint: URL the query is posted to.

    Returns:
        The response body (``text/plain`` is requested through the ``Accept``
        header) when the status code is 200; otherwise the body is printed
        and ``False`` is returned.
    """
    sparql = """
    DESCRIBE ?cite
    WHERE {{ ?cite a <http://vivo.brown.edu/ontology/citation#Citation> .}}
    """
    resp = requests.post(
        endpoint,
        data=dict(email=vuser, password=vpass, query=sparql),
        headers={'Accept': 'text/plain', 'charset': 'utf-8'},
    )
    if resp.status_code != 200:
        # Surface the server's error text; callers must check for False.
        print(resp.text)
        return False
    return resp.text

In [5]:
# Fetch before opening the output file: mode 'w+' truncates it immediately, and
# get_citation_data returns False on a failed request. Writing inside the
# `with` would therefore wipe an earlier download and die with a TypeError.
citations_nt = get_citation_data(vstage)
if citations_nt is False:
    raise RuntimeError(
        'Citation DESCRIBE query failed; data/rab/query_citations.nt left untouched')
with open('data/rab/query_citations.nt', 'w+') as f:
    f.write(citations_nt)

## FAR Publication Data
[back](#Contents)

In [2]:
def wrap_far_row(row, dtype, idIdx):
    """Tag one CSV row with its citation type, modifying the list in place.

    The id cell becomes ``<dtype>_<id>``, the untouched id is inserted right
    after it, and the capitalized ``dtype`` is appended as the last cell.

    Args:
        row: list of cell values for one record.
        dtype: citation type label, e.g. ``'article'``.
        idIdx: position of the id cell inside ``row``.

    Returns:
        The same ``row`` object, now two cells longer.
    """
    source_id = row[idIdx]
    row[idIdx] = '_'.join((dtype, source_id))
    row.insert(idIdx + 1, source_id)
    row += [dtype.capitalize()]
    return row

In [3]:
def make_far_df(dtype, fname):
    """Load one FAR CSV export from ``data/far/`` as a type-tagged DataFrame.

    The header gains two columns that mirror what ``wrap_far_row`` does to
    every data row: ``table_id`` right after ``id`` and a trailing column
    named after ``dtype``.

    Args:
        dtype: citation type label for this file, e.g. ``'article'``; must
            not already be a column name in the file.
        fname: file name inside ``data/far/``.

    Returns:
        A DataFrame of string cells with the extended header.

    Raises:
        AssertionError: if ``dtype`` collides with an existing column.
        ValueError: if the file has no ``id`` column.
    """
    # newline='' is what the csv module asks for on file objects; without it,
    # line breaks embedded in quoted fields are not interpreted correctly.
    with open(os.path.join('data/far/',fname), newline='') as f:
        rdr = csv.reader(f, escapechar='\\')
        header = next(rdr)
        assert dtype not in header
        # Same order as wrap_far_row: append the type column, then insert
        # table_id after id, so header and rows stay aligned.
        header.append(dtype)
        id_idx = header.index('id')
        header.insert(id_idx + 1, 'table_id')
        rows = [ wrap_far_row(r, dtype, id_idx) for r in rdr ]
    return pd.DataFrame(rows, columns=header)

In [4]:
# (type label, file name) for every FAR export that is loaded.
far_files = [ ('article', 'articles.csv'), ('book', 'books.csv'),
             ('chapter', 'chapters.csv'), ('review', 'critical_reviews.csv'),
             ('paper', 'papers.csv'), ('patent', 'patents.csv'),
             ('abstract', 'ph_abstracts.csv') ]
dtypes = [ f[0] for f in far_files ]

# Stack all per-type frames. Each one carries a marker column named after its
# own type label (added by make_far_df), so after the concat a row has a value
# only in the marker column of the file it came from.
df_cites_far = pd.concat(
    [ make_far_df(*f) for f in far_files ], axis=0, ignore_index=True, sort=False)
# Collapse the per-type marker columns into one long 'type' column keyed by id;
# dropna() then keeps just the one populated marker per id.
melted = pd.melt(df_cites_far, id_vars=['id'], value_vars=dtypes,
                var_name='drop_me', value_name='type')
df_cites_far = df_cites_far.join(
    melted.dropna().drop(columns='drop_me').set_index('id'), on='id')
# The marker columns are redundant once 'type' is joined in.
df_cites_far.drop(columns=dtypes, inplace=True)
# Cells that are empty or exactly 'N' become NaN.
# NOTE(review): 'N' is presumably the export's null marker after escape
# handling -- confirm against the dump format.
df_cites_far.replace(r'^(|N)$', np.nan, inplace=True, regex=True)
df_cites_far.head()

Unnamed: 0,id,table_id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,...,other,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type
0,article_6,6,15,PEER,10.1117/1.nph.2.3.031202,2016-01-07 16:51:12,2016-01-07 16:51:12,Modified toolbox for optogenetics in the nonhu...,Neurophotonics,3,...,,,,,,,,,,Article
1,article_7,7,25,PEER,10.1162/neco_a_00681,2016-01-07 17:07:06,2016-01-07 17:07:58,Spatiotemporal Conditional Inference and Hypot...,Neural Computation,1,...,,,,,,,,,,Article
2,article_9,9,760,PEER,10.1038/nature14105,2016-01-07 17:08:08,2016-01-07 17:08:08,Impact jetting as the origin of chondrules,Nature,7534,...,,,,,,,,,,Article
3,article_10,10,760,PEER,10.1002/2015gl065022,2016-01-07 17:08:19,2016-01-07 17:10:52,The fractured Moon: Production and saturation ...,Geophysical Research Letters,17,...,,,,,,,,,,Article
4,article_11,11,25,PEER,10.1073/pnas.1506400112,2016-01-07 17:08:37,2016-01-07 17:08:50,Ambiguity and nonidentifiability in the statis...,Proc Natl Acad Sci USA,20,...,,,,,,,,,,Article


In [5]:
# Column-by-column non-null counts and dtypes of the combined FAR frame.
df_cites_far.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9689 entries, 0 to 9688
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    9689 non-null   object 
 1   table_id              9689 non-null   object 
 2   activity_report_id    9689 non-null   object 
 3   article_type_id       6392 non-null   object 
 4   identifier            3897 non-null   object 
 5   created_at            9689 non-null   object 
 6   updated_at            9689 non-null   object 
 7   title                 9574 non-null   object 
 8   journal               6349 non-null   object 
 9   number                3599 non-null   object 
 10  volume                4441 non-null   object 
 11  date                  5839 non-null   object 
 12  coauthors             7029 non-null   object 
 13  book_status_id        7388 non-null   object 
 14  article_id_type_id    5263 non-null   object 
 15  page_numbers         

In [15]:
def _read_far_csv(fname):
    """Read one FAR CSV export from data/far/ into a DataFrame of string cells."""
    # newline='' is what the csv module asks for on file objects; without it,
    # line breaks embedded in quoted fields are not interpreted correctly.
    with open(os.path.join('data/far/', fname), newline='') as f:
        rdr = csv.reader(f, escapechar='\\')
        header = next(rdr)
        return pd.DataFrame(list(rdr), columns=header)

far_reports = _read_far_csv('activity_reports.csv')
far_users = _read_far_csv('users.csv')

# Pair every activity report with the user who owns it; both tables have an
# 'id' column, hence the suffixes.
far_ids = far_reports.merge(far_users, left_on='user_id', right_on='id', suffixes=('_report', '_user'))
# Only the report id and the owner's e-mail are needed downstream.
keep=['id_report','email']
far_ids = far_ids[keep].rename(columns={'id_report': 'report_id', 'email': 'user_email'})
far_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2386 entries, 0 to 2385
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   report_id   2386 non-null   object
 1   user_email  2386 non-null   object
dtypes: object(2)
memory usage: 55.9+ KB


In [16]:
# Every citation must point at an activity report, otherwise the join below
# cannot attach an owner.
assert len(df_cites_far[ df_cites_far.activity_report_id.isna() ]) == 0
# Drop a user_email left behind by an earlier run of this cell first, so that
# re-running it does not produce user_email_x / user_email_y columns.
df_cites_far = (
    df_cites_far
    .drop(columns=['user_email'], errors='ignore')
    .merge(far_ids, how='left', left_on='activity_report_id', right_on='report_id')
    .drop(columns=['report_id'])
)
# Fixed seed keeps the preview stable across Restart & Run All.
df_cites_far.sample(5, random_state=0)

Unnamed: 0,id,table_id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
8923,paper_1102,1102,1756,,,2017-02-09 22:53:26,2017-02-09 22:53:26,"Cellular mechanisms in Christianson syndrome, ...",,,...,Keystone Symposia Conference Neurological Diso...,2016-01-31,,,,,,,Paper,eric_morrow@brown.edu
9508,abstract_161,161,580,,,2016-03-09 10:13:50,2016-03-09 10:14:53,Hypothesis Test of Mediation Effect in Causal ...,,,...,Joint International Chinese Statistical Associ...,,,,,,CO,,Abstract,yen-tsung_huang@brown.edu
2821,article_3047,3047,468,PEER,,2016-02-26 10:52:16,2016-02-26 10:52:16,A Novel Approach to Realizing Routine HIV Scre...,JMIR Res Protoc,3.0,...,,,,,,,,,Article,amy_nunn@brown.edu
7676,chapter_840,840,1215,,,2017-01-30 21:57:53,2017-01-30 21:57:53,Guantánamo and Community: Visual Approaches to...,,,...,,,,,,,,,Chapter,esther_whitfield@brown.edu
841,article_913,913,307,PEER,10.1016/j.ntt.2015.11.002,2016-01-21 13:08:18,2016-01-21 14:34:33,Effects of embryonic exposure to polychlorinat...,Neurotoxicology and Teratology,,...,,,,,,,,,Article,robbert_creton_phd@brown.edu


In [32]:
# duplicated() flags only the second and later occurrences of an identifier
# (default keep='first'), so each count is the identifier's row count minus one.
df_cites_far[ df_cites_far.duplicated(subset='identifier') ].identifier.value_counts()

10.1016/j.physletb.2016.01.010    5
10.1016/j.physletb.2015.11.042    5
10.1007/jhep01(2016)096           5
10.1103/physrevlett.116.032301    5
10.1016/j.physletb.2015.12.039    5
                                 ..
10.1111/1475-6773.12437           1
10.1177/2325957415614646          1
10.1017/jfm.2015.700              1
10.1515/ngs-2015-0026             1
10.1140/epjc/s10052-016-4504-z    1
Name: identifier, Length: 537, dtype: int64

In [7]:
# Rows whose non-null `doi` also appears on another row; keep=False flags
# every member of a duplicate group, not just the repeats.
df_cites_far[ (df_cites_far.duplicated('doi', keep=False)) & (df_cites_far.doi.notnull()) ]

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,other,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type
6471,book_85,740,,,2016-01-21 14:25:28,2016-01-21 14:25:28,"Revisitar el costumbrismo: Cosmopolitismo, ped...",,,,...,,,,,,,,,,Book
6741,book_372,1403,,,2017-01-27 11:41:14,2017-01-27 11:41:14,Revisitar el costumbrismo,,,,...,,,,,,,,,,Book
6862,chapter_6,286,,,2016-01-08 19:35:32,2016-01-08 21:01:04,On the Sample Complexity of Cancer Pathways Id...,,,,...,,,,,,,,,,Chapter
6919,chapter_65,694,,,2016-01-11 17:13:30,2016-01-11 17:13:30,Experimental Mechanics for Graduate Students,,,,...,,,,,,,,,,Chapter
7312,chapter_469,435,,,2016-01-31 15:53:17,2016-01-31 15:53:17,On the Sample Complexity of Cancer Pathways Id...,,,,...,,,,,,,,,,Chapter
7323,chapter_480,508,,,2016-02-02 13:28:56,2016-02-02 13:28:56,ENGINEERING AND CLINICAL ASPECTS OF PHOTOPLETH...,,,,...,,,,,,,,,,Chapter
7422,chapter_583,1258,,,2017-01-10 22:54:29,2017-01-10 22:54:29,Wireless Neurotechnology for Neural Prostheses,,,,...,,,,,,,,,,Chapter
7432,chapter_593,929,,,2017-01-11 14:11:29,2017-01-11 14:11:29,The Aurora and Borealis Stream Processing Engines,,,,...,,,,,,,,,,Chapter
7651,chapter_815,1066,,,2017-01-30 10:45:18,2017-01-30 10:45:18,Experimental Mechanics for Graduate Students,,,,...,,,,,,,,,,Chapter
7776,chapter_943,1483,,,2017-02-02 11:05:57,2017-02-02 11:05:57,ENGINEERING AND CLINICAL ASPECTS OF PHOTOPLETH...,,,,...,,,,,,,,,,Chapter


In [14]:
# Bookkeeping columns that differ even when two faculty report the very same
# publication. table_id (the per-table row id) and user_email (the reporter)
# must be excluded as well, otherwise no two rows from different reports can
# ever compare equal.
far_meta = {'id','table_id','activity_report_id','created_at','updated_at','user_email'}
no_far_meta = set(df_cites_far.columns) - far_meta
# duplicated() documents `subset` as a label or sequence of labels, so hand
# it a list rather than the set.
dupes = df_cites_far[ df_cites_far.duplicated(subset=list(no_far_meta)) ]
dupes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 563 entries, 61 to 9460
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    563 non-null    object 
 1   activity_report_id    563 non-null    object 
 2   article_type_id       552 non-null    object 
 3   identifier            549 non-null    object 
 4   created_at            563 non-null    object 
 5   updated_at            563 non-null    object 
 6   title                 557 non-null    object 
 7   journal               549 non-null    object 
 8   number                376 non-null    object 
 9   volume                514 non-null    object 
 10  date                  552 non-null    object 
 11  coauthors             547 non-null    object 
 12  book_status_id        547 non-null    object 
 13  article_id_type_id    552 non-null    object 
 14  page_numbers          343 non-null    object 
 15  book_type_id         

In [15]:
# Number of distinct non-null titles in the FAR data.
df_cites_far['title'].nunique()

8262

In [16]:
def fuzzy_far(field, pool, threshold=90, limit=5):
    """Return the entries of ``pool`` that fuzzily match ``field``.

    Candidates are ranked by fuzzywuzzy's ``partial_ratio``; only the best
    ``limit`` are considered and only those scoring strictly above
    ``threshold`` are kept.

    Args:
        field: query string.
        pool: collection of candidate strings.
        threshold: minimum score (exclusive) for a candidate to be reported.
            Defaults to 90, the value this notebook has always used.
        limit: maximum number of candidates fuzzywuzzy returns. Defaults to 5,
            fuzzywuzzy's own default, so existing calls behave as before.

    Returns:
        The matching candidates joined with ``'||'``; an empty string when
        nothing clears the threshold.
    """
    candidates = process.extract(
        field, pool, scorer=fuzz.partial_ratio, limit=limit)
    # Index instead of unpacking: extract() yields longer tuples when the pool
    # is a mapping.
    return '||'.join([ c[0] for c in candidates if c[1] > threshold ])

In [17]:
# Pool of normalized (stripped, lower-cased) FAR titles for fuzzy matching.
# Non-string cells (NaN) are skipped, as is the literal title 'm'.
# NOTE(review): 'm' is presumably a junk entry -- confirm.
fuzzy_check = set(
    title.strip().lower()
    for title in df_cites_far.title.to_list()
    if isinstance(title, str) and title != 'm'
)
fuzzy_far('Revisitar el costumbrismo', fuzzy_check)

'revisitar el costumbrismo||revisitar el costumbrismo: cosmopolitismo, pedagogías y modernización en iberoamérica'

In [18]:
# One-column frame holding every non-null FAR title (original index kept).
df_far_titles = df_cites_far[['title']].dropna()
df_far_titles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9574 entries, 0 to 9688
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   9574 non-null   object
dtypes: object(1)
memory usage: 149.6+ KB


In [19]:
# fuzzy_far scans the whole pool for each query, and many rows share a title,
# so match each distinct normalized title once and map the result back.
# This is still one full pool scan per distinct title -- expect a long run.
normalized_titles = df_far_titles.title.map(lambda x: x.lower().strip())
title_matches = { t: fuzzy_far(t, fuzzy_check) for t in normalized_titles.unique() }
df_far_titles['matches'] = normalized_titles.map(title_matches)

KeyboardInterrupt: 

In [130]:
# fuzzy_far takes ONE query string; the previous form handed it the whole
# title Series as if it were a single query. Apply it per title instead.
df_far_titles['matches'] = df_far_titles.title.map(
    lambda title: fuzzy_far(title.lower().strip(), fuzzy_check))

KeyboardInterrupt: 

## VIVO Data
[back](#Contents)

In [7]:
with open('data/rab/query_properties.csv') as f:
    data = f.readlines()
# The first line is the CSV header; the rest are one predicate per line, with
# surrounding whitespace (including the line break) removed.
fin_cite_props = list(map(str.strip, data[1:]))

In [8]:
# Load the saved citation dump, one raw line per list element.
with open('data/rab/query_citations.nt') as f:
    fin_rab_cites = list(f)

In [9]:
# Map full predicate IRIs to short column names. Predicates in the citation
# namespace keep only the part after '#'; three extra keys are added by hand.
citation_ns = 'http://vivo.brown.edu/ontology/citation#'
cite_prop_map = {
    prop: prop[len(citation_ns):]
    for prop in fin_cite_props
    if prop.startswith(citation_ns)
}
cite_prop_map.update({
    'rabid': 'rabid',
    'http://www.w3.org/2000/01/rdf-schema#label': 'label',
    'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType': 'type',
})
cite_prop_map.values()

dict_values(['date', 'volume', 'hasContributor', 'authorList', 'pmid', 'issue', 'doi', 'hasVenue', 'pages', 'pmcid', 'publishedIn', 'book', 'hasLocation', 'editorList', 'chapter', 'hasPublisher', 'isbn', 'url', 'hasConferenceLocation', 'conferenceDate', 'hasConference', 'issn', 'reviewOf', 'title', 'number', 'version', 'hasAssignee', 'hasCountry', 'hasAuthority', 'patentNumber', 'venueFor', 'rabid', 'label', 'type'])

In [10]:
# Record type with one field per short property name, in alphabetical order.
RABCitation = namedtuple('RABCitation', sorted(cite_prop_map.values()))

In [11]:
def clean_data_prop(oData):
    """Extract the bare value from the object part of one N-Triples line.

    The earlier implementation stripped the characters ``" . < >`` from both
    ends, which also removed periods and quotes that belong to the value
    (``"J. Biol. Chem."`` lost its final period) and left language-tagged or
    non-XSD-typed literals mangled. This version removes the statement
    terminator first and then unwraps exactly one term.

    Args:
        oData: everything after the predicate on the line, e.g.
            ``'"8"^^<http://www.w3.org/2001/XMLSchema#string> .\\n'``.

    Returns:
        The IRI without angle brackets, or the literal's lexical form without
        quotes, datatype or language tag. Backslash escapes inside a literal
        are returned as written. Anything else (e.g. a blank node label) is
        returned with stray quotes and angle brackets stripped.
    """
    term = oData.strip()
    if term.endswith('.'):
        # Trailing '.' is the statement terminator, not part of the term.
        term = term[:-1].rstrip()
    if term.startswith('<') and term.endswith('>'):
        return term[1:-1]
    if term.startswith('"'):
        # Neither a datatype IRI nor a language tag can contain '"', so the
        # last quote on the line closes the literal.
        closing = term.rfind('"')
        if closing > 0:
            return term[1:closing]
    return term.strip('"<>')

In [12]:
def parse_triple(rawRow):
    """Split one raw statement line into a ``(subject, predicate, object)`` tuple.

    The line is cut at its first two spaces. Angle brackets are removed from
    the subject and predicate; the remainder is cleaned by ``clean_data_prop``.
    """
    subject, predicate, remainder = rawRow.split(' ', maxsplit=2)
    return (
        subject.strip('<>'),
        predicate.strip('<>'),
        clean_data_prop(remainder),
    )

In [13]:
# Parse every raw line of the dump into an (s, p, o) tuple.
cite_triples = [ parse_triple(raw_line) for raw_line in fin_rab_cites ]

cite_triples[0]

('http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b',
 'http://www.w3.org/2000/01/rdf-schema#label',
 'Erratum to: Global magnetic confinement for the 1.5D Vlasov-Maxwell system')

In [14]:
# Analyzing citations with more than 1 most specific type

mst_prop = 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'
no_id = 'http://vivo.brown.edu/ontology/citation#NoID'

# Triples that tag a citation as NoID.
no_ids = [ c for c in cite_triples if c[1] == mst_prop and c[2] == no_id ]

# Every mostSpecificType value seen per citation.
msts = defaultdict(set)
for c in cite_triples:
    if c[1] == mst_prop:
        msts[c[0]].add(c[2])

# Distinct type combinations among citations that do not have exactly one type.
mlts = { frozenset(v) for v in msts.values() if len(v) != 1 }

for m in mlts:
    if no_id in m:
        print("With NoID: ", [ a for a in m if a != no_id])
    else:
        # Was `"Redundant types: ". sorted(...)`: attribute access on a str,
        # which raises AttributeError the first time this branch is reached.
        print("Redundant types: ", sorted(m))

With NoID:  ['http://vivo.brown.edu/ontology/citation#ConferencePaper']
With NoID:  ['http://vivo.brown.edu/ontology/citation#BookSection']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Review']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Abstract']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Article']
With NoID:  ['http://vivo.brown.edu/ontology/citation#WorkingPaper']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Citation']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Book']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Patent']


In [15]:
def triple_match(triple, prop=None, obj=None):
    """Tell whether ``triple`` satisfies the given predicate/object filters.

    A filter that is omitted (or otherwise falsy) is ignored; with no filters
    every triple matches.

    Args:
        triple: ``(subject, predicate, object)`` sequence.
        prop: predicate the triple must carry, if given.
        obj: object the triple must carry, if given.

    Returns:
        ``True`` when every supplied filter matches, else ``False``.
    """
    if prop and triple[1] != prop:
        return False
    if obj and triple[2] != obj:
        return False
    return True

In [16]:
def filter_mst_no_id(triple):
    """Return ``False`` only for a ``mostSpecificType`` triple whose object is ``NoID``."""
    is_no_id_marker = triple_match(
        triple,
        prop='http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
        obj='http://vivo.brown.edu/ontology/citation#NoID')
    return not is_no_id_marker

# Inline sanity check: another mostSpecificType value is kept, NoID is rejected.
good_triple = ('foo', 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType', 'bar')
bad_triple = ('foo', 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
              'http://vivo.brown.edu/ontology/citation#NoID')
assert filter_mst_no_id(good_triple) is True
assert filter_mst_no_id(bad_triple) is False

In [17]:
# All triples except the mostSpecificType -> NoID markers.
strip_msts = list(filter(filter_mst_no_id, cite_triples))

In [18]:
# Group triples per citation as {subject: {short property name: value}}.
# Predicates without a short name are skipped; a repeated property keeps the
# value seen last.
cite_dicts = defaultdict(dict)
for subj, pred, value in strip_msts:
    if pred not in cite_prop_map:
        continue
    cite_dicts[subj][cite_prop_map[pred]] = value

In [19]:
# Template with every short property name set to '' so each record has a
# value for every RABCitation field.
empty_row = dict.fromkeys(cite_prop_map.values(), '')
rab_rows = []
for rabid, props in cite_dicts.items():
    # The subject IRI is stored on the citation's own dict as 'rabid'.
    props['rabid'] = rabid
    rab_rows.append(RABCitation(**{**empty_row, **props}))

print(rab_rows[0])

RABCitation(authorList='Nguyen, Toan T., Nguyen, Truyen V., Strauss, Walter A', book='', chapter='', conferenceDate='', date='2015-06-01', doi='10.3934/krm.2015.8.615', editorList='', hasAssignee='', hasAuthority='', hasConference='', hasConferenceLocation='', hasContributor='http://vivo.brown.edu/individual/wstrauss', hasCountry='', hasLocation='', hasPublisher='', hasVenue='http://vivo.brown.edu/individual/n6086eb8fe7824cad9423547d403a958d', isbn='', issn='', issue='3', label='Erratum to: Global magnetic confinement for the 1.5D Vlasov-Maxwell system', number='', pages='615-616', patentNumber='', pmcid='', pmid='', publishedIn='Kinetic and Related Models', rabid='http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b', reviewOf='', title='', type='http://vivo.brown.edu/ontology/citation#Article', url='', venueFor='', version='', volume='8')


In [20]:
df_cites_rab = pd.DataFrame(rab_rows)

# Column layout: identifiers first, then the common bibliographic fields, then
# whatever else exists, and the has* link columns last.
id_atts = [ 'rabid','type','label','doi','pmid','pmcid','isbn','issn' ]
common_atts = [ 'date','authorList','pages','issue','volume' ]
has_atts = ['hasContributor','hasVenue','hasConference','hasConferenceLocation',
            'hasCountry','hasLocation','hasPublisher','hasAssignee','hasAuthority']
grouped_atts = id_atts + common_atts + has_atts
remaining_atts = [ c for c in df_cites_rab.columns.tolist() if c not in grouped_atts ]
cols = id_atts + common_atts + remaining_atts + has_atts
df_cites_rab = df_cites_rab[ cols ]

# Keep only the part of the type IRI after '#'.
df_cites_rab.type = df_cites_rab.type.str.rsplit('#').str.get(1)
# Empty strings (the row template's default) become NaN.
df_cites_rab = df_cites_rab.replace(r'^$', np.nan, regex=True)
df_cites_rab.head()

Unnamed: 0,rabid,type,label,doi,pmid,pmcid,isbn,issn,date,authorList,...,version,hasContributor,hasVenue,hasConference,hasConferenceLocation,hasCountry,hasLocation,hasPublisher,hasAssignee,hasAuthority
0,http://vivo.brown.edu/individual/n5c6cae127059...,Article,Erratum to: Global magnetic confinement for th...,10.3934/krm.2015.8.615,,,,,2015-06-01,"Nguyen, Toan T., Nguyen, Truyen V., Strauss, W...",...,,http://vivo.brown.edu/individual/wstrauss,http://vivo.brown.edu/individual/n6086eb8fe782...,,,,,,,
1,http://vivo.brown.edu/individual/n52747,Article,Learning as a Task or a Virtue: U.S. and Chine...,10.1037/0012-1649.40.4.595,15238046.0,,,,2004-01-01,"Li, Jin",...,,http://vivo.brown.edu/individual/jili,http://vivo.brown.edu/individual/n60865,,,,,,,
2,http://vivo.brown.edu/individual/n8301,Article,Predicting discordance between self-reports of...,10.1007/s10461-012-0163-8,22323006.0,PMC3471653,,,2012-08-01,"Brown JL, Sales JM, DiClemente RJ, Salazar LF,...",...,,http://vivo.brown.edu/individual/lbrownmd,http://vivo.brown.edu/individual/n79279,,,,,,,
3,http://vivo.brown.edu/individual/n98528,Article,Effects of 12-O-tetradecanoylphorbol-13-acetat...,10.1002/mc.2940130304,7619217.0,,,,1995-07-01,"Sears WL, Goto-Mandeville R, Mirapuri M, Braun L",...,,http://vivo.brown.edu/individual/lbraun,http://vivo.brown.edu/individual/n82319,,,,,,,
4,http://vivo.brown.edu/individual/n52835,Article,"Daily co-occurrences of marijuana use, alcohol...",10.1016/j.drugalcdep.2014.09.265,,,,,2015-01-01,"Graves, Hannah, Hernandez, Lynn, Kahler, Chris...",...,,http://vivo.brown.edu/individual/lh15,http://vivo.brown.edu/individual/n48368,,,,,,,


In [21]:
# Column-by-column non-null counts and dtypes of the R@B citation frame.
df_cites_rab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49044 entries, 0 to 49043
Data columns (total 34 columns):
rabid                    49044 non-null object
type                     49044 non-null object
label                    49004 non-null object
doi                      43349 non-null object
pmid                     36381 non-null object
pmcid                    13399 non-null object
isbn                     683 non-null object
issn                     16 non-null object
date                     49034 non-null object
authorList               47600 non-null object
pages                    44188 non-null object
issue                    39882 non-null object
volume                   44166 non-null object
book                     744 non-null object
chapter                  148 non-null object
conferenceDate           36 non-null object
editorList               713 non-null object
number                   17 non-null object
patentNumber             9 non-null object
publishedIn        

## 3rd-party IDs

In [22]:
# Number of R@B citations per citation type.
df_cites_rab.type.value_counts()

Article            42700
Citation            3708
ConferencePaper      930
BookSection          774
Book                 520
Review               245
Abstract             128
WorkingPaper          30
Patent                 9
Name: type, dtype: int64

In [38]:
# A citation counts as identified when it carries a PMID or a DOI.
has_pmid_or_doi = df_cites_rab.pmid.notnull() | df_cites_rab.doi.notnull()
with_ids = int(has_pmid_or_doi.sum())
print("R@B Citations with DOIs or PMIDs: ", with_ids )
print("R@B Citations without: ", len(df_cites_rab) - with_ids)

Citations with DOIs or PMIDs:  46799
Citations without:  2245


In [31]:
# Type breakdown of R@B citations that have neither a PMID nor a DOI.
df_cites_rab[ ((df_cites_rab.pmid.isna()) & (df_cites_rab.doi.isna())) ].type.value_counts()

BookSection        678
Article            644
Book               461
Abstract           118
Review             115
Citation           101
ConferencePaper     92
WorkingPaper        27
Patent               9
Name: type, dtype: int64

In [33]:
# Same subset (no PMID, no DOI), narrowed to rows that do carry an ISBN.
df_cites_rab[ ((df_cites_rab.pmid.isna()) & (df_cites_rab.doi.isna())
               & (df_cites_rab.isbn.notnull())) ].type.value_counts()

Book           406
BookSection    201
Article          5
Name: type, dtype: int64

In [29]:
# Number of FAR citations per citation type.
df_cites_far.type.value_counts()

Article     6392
Paper       1371
Chapter     1012
Book         467
Abstract     314
Patent        96
Review        37
Name: type, dtype: int64

In [49]:
# Only the `doi` and `identifier` columns are tested here -- no PMID column is
# consulted -- so the label says what is actually counted.
with_ids = len(df_cites_far[ df_cites_far.doi.notnull() | df_cites_far.identifier.notnull() ])
print("FAR Citations with a DOI or identifier: ", with_ids )
print("FAR Citations without: ", len(df_cites_far) - with_ids)

FAR Citations with DOIs or PMIDs:  4932
FAR Citations without:  4757


In [52]:
# Type breakdown of FAR citations that have a `doi` or an `identifier`.
df_cites_far[ df_cites_far.doi.notnull() | df_cites_far.identifier.notnull() ].type.value_counts()

Article    3897
Chapter    1012
Book         23
Name: type, dtype: int64

In [69]:
# NOTE(review): same expression as the previous cell, yet the saved outputs
# disagree (Chapter 1012 vs 120), so the two were run against different states
# of df_cites_far. Re-run from a fresh kernel and keep only one of them.
df_cites_far[ df_cites_far.doi.notnull() | df_cites_far.identifier.notnull() ].type.value_counts()

Article    3897
Chapter     120
Book         23
Name: type, dtype: int64

In [72]:
# Which identifier kinds (article_id_type_id) occur on rows that have an identifier.
df_cites_far[ df_cites_far.identifier.notnull() ].article_id_type_id.value_counts()

DOI    3897
Name: article_id_type_id, dtype: int64

In [94]:
# lower-cased DOI -> DOI as written, so the two sources can be compared
# case-insensitively while the original spelling stays recoverable.
rab_doi_map = { doi.lower(): doi for doi in df_cites_rab.doi.dropna() }

far_identifier_map = { doi.lower(): doi for doi in df_cites_far.identifier.dropna() }
far_doi_column_map = { doi.lower(): doi for doi in df_cites_far.doi.dropna() }
# On a clash the spelling from the `doi` column wins over `identifier`.
far_doi_map = { **far_identifier_map, **far_doi_column_map }

rab_dois = set(rab_doi_map)
far_dois = set(far_doi_map)

In [95]:
# Distinct (case-insensitive) DOI counts per source and the size of their overlap.
print("RAB DOIs: ",len(rab_dois))
print("FAR DOIs: ",len(far_dois))
print("Shared DOIs: ", len(rab_dois & far_dois))

RAB DOIs:  41821
FAR DOIs:  3157
Shared DOIs:  1944


In [100]:
# FAR DOIs, in their original spelling, with no case-insensitive match in R@B.
only_in_far = { far_doi_map[d] for d  in far_dois - rab_dois }
# Sanity check: FAR-only count equals all FAR DOIs minus the shared ones.
assert len(only_in_far) == len(far_dois) - len(rab_dois & far_dois)

In [104]:
# The FAR/R@B comparison was done on lower-cased DOIs, so select rows the same
# way. Matching the raw columns against one stored spelling per DOI silently
# drops rows that write the same DOI with different letter case.
far_only_keys = far_dois - rab_dois
df_far_dois = df_cites_far[
    df_cites_far.identifier.str.lower().isin(far_only_keys)
    | df_cites_far.doi.str.lower().isin(far_only_keys) ]

In [120]:
# The 50 identifiers that occur on the most rows among the FAR-only citations.
df_far_dois.groupby('identifier').identifier.count().nlargest(50)

identifier
10.1007/jhep01(2016)006           6
10.1007/jhep01(2016)079           6
10.1007/jhep01(2016)096           6
10.1016/j.physletb.2015.10.067    6
10.1016/j.physletb.2015.11.042    6
10.1016/j.physletb.2015.12.017    6
10.1016/j.physletb.2015.12.020    6
10.1016/j.physletb.2015.12.039    6
10.1016/j.physletb.2016.01.010    6
10.1103/physrevd.93.012001        6
10.1103/physrevd.93.012003        6
10.1103/physrevlett.116.032301    6
10.1140/epjc/s10052-015-3853-3    6
10.1007/jhep01(2016)166           5
10.1007/jhep02(2016)122           5
10.1007/jhep02(2016)145           5
10.1007/jhep03(2016)125           5
10.1007/jhep04(2016)005           5
10.1007/jhep04(2016)010           5
10.1007/jhep04(2016)035           5
10.1007/jhep04(2016)073           5
10.1007/jhep06(2016)177           5
10.1007/jhep11(2016)056           5
10.1016/j.physletb.2016.01.056    5
10.1016/j.physletb.2016.02.002    5
10.1016/j.physletb.2016.02.047    5
10.1016/j.physletb.2016.03.039    5
10.1016/j.physlet