# FAR-VIVO Citation Data Analysis

In [1]:
import csv
import os

import requests
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz, process

from collections import namedtuple, defaultdict, Counter
from functools import reduce

# Endpoints and credentials are read from environment variables through the
# IPython %env magic, so none of them are written into the notebook source.
# vstage is the endpoint the acquire cells post to; vuser / vpass are sent
# with every query.
vprod = %env VIVO_PRODUCTION
vstage = %env VIVO_STAGING
vuser = %env VIVO_USER
vpass = %env VIVO_PASSWORD

## Contents
* [Acquire](#Acquire)
* [Load FAR Data](#FAR-Publication-Data)
* [Load VIVO Data](#VIVO-Data)
* [3rd-party IDs](#3rd-party-IDs)

# Acquire
[back](#Contents)

In [11]:
def get_citation_properties(endpoint):
    """Ask the endpoint which predicates are used on citation resources.

    A SPARQL SELECT is POSTed as form data together with the module-level
    ``vuser`` / ``vpass`` credentials, requesting a CSV response.

    Returns the response body as text when the status code is 200. For any
    other status the response body is printed and False is returned.
    """
    sparql = """
    SELECT DISTINCT ?prop
    WHERE {{
        ?cite a <http://vivo.brown.edu/ontology/citation#Citation> .
        ?cite ?prop ?o.
    }}
    """
    payload = dict(email=vuser, password=vpass, query=sparql)
    response = requests.post(
        endpoint,
        data=payload,
        headers={ 'Accept': 'text/csv', 'charset': 'utf-8' })
    # Guard clause: report the failure body and signal it with False.
    if response.status_code != 200:
        print(response.text)
        return False
    return response.text

In [3]:
# Run the query before touching the output file. get_citation_properties()
# returns False on a non-200 response; opening with 'w+' first would truncate
# any earlier export and then fail with a TypeError inside f.write(False).
properties_csv = get_citation_properties(vstage)
if properties_csv is False:
    raise RuntimeError(
        'Citation properties query failed; data/rab/query_properties.csv was not modified')
with open('data/rab/query_properties.csv','w+') as f:
    f.write(properties_csv)

In [4]:
def get_citation_data(endpoint):
    """Fetch a DESCRIBE of every citation resource from the endpoint.

    The SPARQL DESCRIBE query is POSTed as form data together with the
    module-level ``vuser`` / ``vpass`` credentials, requesting a
    ``text/plain`` response.

    Returns the response body as text when the status code is 200. For any
    other status the response body is printed and False is returned.
    """
    sparql = """
    DESCRIBE ?cite
    WHERE {{ ?cite a <http://vivo.brown.edu/ontology/citation#Citation> .}}
    """
    payload = dict(email=vuser, password=vpass, query=sparql)
    response = requests.post(
        endpoint,
        data=payload,
        headers={ 'Accept': 'text/plain', 'charset': 'utf-8' })
    # Guard clause: report the failure body and signal it with False.
    if response.status_code != 200:
        print(response.text)
        return False
    return response.text

In [5]:
# Run the query before touching the output file. get_citation_data() returns
# False on a non-200 response; opening with 'w+' first would truncate any
# earlier export and then fail with a TypeError inside f.write(False).
citations_nt = get_citation_data(vstage)
if citations_nt is False:
    raise RuntimeError(
        'Citation data query failed; data/rab/query_citations.nt was not modified')
with open('data/rab/query_citations.nt', 'w+') as f:
    f.write(citations_nt)

## FAR Publication Data
[contents](#Contents)
* [Deduplication](#Deduplicating-FAR-publications)

In [21]:
def wrap_far_row(row, dtype, idIdx):
    """Tag one FAR CSV row with its publication type, modifying it in place.

    The value at position ``idIdx`` gets ``dtype`` and an underscore
    prepended, and the capitalized ``dtype`` is added as a new last element.
    The same list object that was passed in is returned.
    """
    prefixed_id = '_'.join((dtype, row[idIdx]))
    row[idIdx] = prefixed_id
    row += [dtype.capitalize()]
    return row

In [22]:
def make_far_df(dtype, fname):
    """Load one FAR export from ``data/far/`` into a DataFrame.

    Parameters:
        dtype: type key for this export (e.g. 'article'). It prefixes every
            row's id and becomes the name of an extra column holding the
            capitalized type.
        fname: file name of the CSV inside ``data/far/``.

    Returns a DataFrame whose columns are the CSV header plus the ``dtype``
    column. All values are the strings produced by the csv reader.

    Raises AssertionError if the header already has a column named ``dtype``
    and ValueError if it has no 'id' column.
    """
    # The csv module documentation asks for newline='' so that line breaks
    # embedded in quoted fields reach the reader untouched.
    with open(os.path.join('data/far/',fname), newline='') as f:
        rdr = csv.reader(f, escapechar='\\')
        header = next(rdr)
        assert dtype not in header
        # Locate the id column once instead of searching the header per row.
        id_idx = header.index('id')
        header.append(dtype)
        rows = [ wrap_far_row(r, dtype, id_idx) for r in rdr ]
    return pd.DataFrame(rows, columns=header)

In [23]:
# (type key, file name) pairs, one per FAR export read from data/far/.
far_files = [ ('article', 'articles.csv'), ('book', 'books.csv'),
             ('chapter', 'chapters.csv'), ('review', 'critical_reviews.csv'),
             ('paper', 'papers.csv'), ('patent', 'patents.csv'),
             ('abstract', 'ph_abstracts.csv') ]
dtypes = [ f[0] for f in far_files ]

# Stack all exports into one frame. make_far_df() adds a column named after
# its type key, so after the concat each row is only populated in the type
# column belonging to the file it came from.
df_cites_far = pd.concat(
    [ make_far_df(*f) for f in far_files ], axis=0, ignore_index=True, sort=False)
# Collapse the per-type columns into a single 'type' column: melt them to long
# form, drop the empty cells, and join the surviving value back on 'id'.
melted = pd.melt(df_cites_far, id_vars=['id'], value_vars=dtypes,
                var_name='drop_me', value_name='type')
df_cites_far = df_cites_far.join(
    melted.dropna().drop(columns='drop_me').set_index('id'), on='id')
df_cites_far.drop(columns=dtypes, inplace=True)
# Turn cells that are empty or exactly 'N' into NaN.
# NOTE(review): 'N' is presumably what a \N null marker becomes after the
# reader's escapechar handling - confirm against the export format.
df_cites_far.replace(r'^(|N)$', np.nan, inplace=True, regex=True)
df_cites_far.head()

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,other,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type
0,article_6,15,PEER,10.1117/1.nph.2.3.031202,2016-01-07 16:51:12,2016-01-07 16:51:12,Modified toolbox for optogenetics in the nonhu...,Neurophotonics,3,2,...,,,,,,,,,,Article
1,article_7,25,PEER,10.1162/neco_a_00681,2016-01-07 17:07:06,2016-01-07 17:07:58,Spatiotemporal Conditional Inference and Hypot...,Neural Computation,1,27,...,,,,,,,,,,Article
2,article_9,760,PEER,10.1038/nature14105,2016-01-07 17:08:08,2016-01-07 17:08:08,Impact jetting as the origin of chondrules,Nature,7534,517,...,,,,,,,,,,Article
3,article_10,760,PEER,10.1002/2015gl065022,2016-01-07 17:08:19,2016-01-07 17:10:52,The fractured Moon: Production and saturation ...,Geophysical Research Letters,17,42,...,,,,,,,,,,Article
4,article_11,25,PEER,10.1073/pnas.1506400112,2016-01-07 17:08:37,2016-01-07 17:08:50,Ambiguity and nonidentifiability in the statis...,Proc Natl Acad Sci USA,20,112,...,,,,,,,,,,Article


In [24]:
# Per-column non-null counts and dtypes of the combined FAR frame.
df_cites_far.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9689 entries, 0 to 9688
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    9689 non-null   object 
 1   activity_report_id    9689 non-null   object 
 2   article_type_id       6392 non-null   object 
 3   identifier            3897 non-null   object 
 4   created_at            9689 non-null   object 
 5   updated_at            9689 non-null   object 
 6   title                 9574 non-null   object 
 7   journal               6349 non-null   object 
 8   number                3599 non-null   object 
 9   volume                4441 non-null   object 
 10  date                  5839 non-null   object 
 11  coauthors             7029 non-null   object 
 12  book_status_id        7388 non-null   object 
 13  article_id_type_id    5263 non-null   object 
 14  page_numbers          3620 non-null   object 
 15  book_type_id         

In [25]:
# Load the activity reports and the users with the same csv settings as the
# publication exports, keeping every value as a string.
with open('data/far/activity_reports.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    header = next(rdr)
    rows = [ r for r in rdr ]
    far_reports = pd.DataFrame(rows, columns=header)
    
with open('data/far/users.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    header = next(rdr)
    rows = [ r for r in rdr ]
    far_users = pd.DataFrame(rows, columns=header)

# Build a report id -> user email lookup: join each report to its user, then
# keep only the report's id and the email, under clearer column names.
far_ids = far_reports.merge(far_users, left_on='user_id', right_on='id', suffixes=('_report', '_user'))
keep=['id_report','email']
far_ids.drop(columns=[ c for c in far_ids.columns if c not in keep], inplace=True)
far_ids.rename(columns={'id_report': 'report_id', 'email': 'user_email'}, inplace=True)
far_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2386 entries, 0 to 2385
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   report_id   2386 non-null   object
 1   user_email  2386 non-null   object
dtypes: object(2)
memory usage: 55.9+ KB


In [26]:
# Every publication must reference an activity report before the lookup join.
assert len(df_cites_far[ df_cites_far.activity_report_id.isna() ]) == 0
# Left join: keep every publication and attach user_email where the report id
# is found in far_ids; the duplicated key column is dropped afterwards.
# NOTE(review): this cell reassigns df_cites_far from itself, so running it a
# second time merges far_ids again - restart and run all to get a clean frame.
df_cites_far = df_cites_far.merge(far_ids, how='left', left_on='activity_report_id', right_on='report_id')
df_cites_far.drop(columns=['report_id'], inplace=True)
df_cites_far.sample(5)

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
5204,article_5623,1164,PEER,10.1016/j.icarus.2016.07.007,2017-02-13 14:40:28,2017-02-13 14:40:28,Impact ejecta-induced melting of surface ice d...,Icarus,,280.0,...,,,,,,,,,Article,james_head_iii@brown.edu
8078,paper_178,1080,,,2017-01-12 11:32:44,2017-01-12 11:32:44,The Democracy Effect: a weights-based identifi...,,,,...,Fourth Political Economy Conference at Columbi...,2016-12-10,,,,,,,Paper,pedro_dal_bo@brown.edu
1595,article_1704,900,PEER,,2016-01-28 19:57:48,2016-01-28 19:58:23,Envisioning new information technology for fam...,Gerontechnology,,,...,,,,,,,,,Article,rosa_baier@brown.edu
5969,article_6650,1525,PEER,10.1002/pon.4255,2017-02-22 14:12:21,2017-02-22 14:12:21,Does a peer-led exercise intervention affect s...,Psycho-Oncology,,,...,,,,,,,,,Article,shira_dunsiger@brown.edu
1910,article_2041,721,PEER,10.1016/j.langcom.2014.11.004,2016-01-29 18:21:10,2016-01-29 18:21:10,"Singing for the dead, on and off line: Diversi...",Language & Communication,,44.0,...,,,,,,,,,Article,paja_faudree@brown.edu


### Deduplicating FAR publications
[top](#FAR-Publication-Data)

In [126]:
# Rows whose non-null identifier occurs more than once (keep=False flags every
# member of a group), sorted so that the members sit next to each other.
df_cites_far[ (df_cites_far.duplicated(subset='identifier', keep=False))
             & (df_cites_far.identifier.notnull())  ].sort_values(by='identifier')

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
6006,article_6688,1530,PEER,10.1001/jama.2016.9374,2017-02-23 16:06:44,2017-02-23 16:06:44,Tube Feeding in US Nursing Home Residents With...,JAMA,7,316,...,,,,,,,,,Article,pedro_gozalo@brown.edu
6041,article_6725,1535,PEER,10.1001/jama.2016.9374,2017-02-24 08:42:08,2017-02-24 08:42:08,Tube Feeding in US Nursing Home Residents With...,JAMA,7,316,...,,,,,,,,,Article,vincent_mor@brown.edu
6040,article_6724,1535,PEER,10.1001/jamainternmed.2015.6508,2017-02-24 08:42:06,2017-02-24 08:42:06,Accountability of Hospitals for Medicare Benef...,JAMA Internal Medicine,1,176,...,,,,,,,,,Article,vincent_mor@brown.edu
2114,article_2253,10,PEER,10.1001/jamainternmed.2015.6508,2016-02-02 09:13:11,2016-02-02 09:13:11,Accountability of Hospitals for Medicare Benef...,JAMA Intern Med,1,176,...,,,,,,,,,Article,momotazur_rahman@brown.edu
6338,article_7035,1780,PEER,10.1001/jamainternmed.2016.0267,2017-03-23 15:41:01,2017-03-23 15:41:01,Quality of Care for White and Hispanic Medicar...,JAMA Internal Medicine,6,176,...,,,,,,,,,Article,maricruz_rivera-hernandez@brown.edu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6206,article_6897,1526,PEER,10.5993/ajhb.40.2.11,2017-03-02 17:06:38,2017-03-02 17:06:38,Associations of Mindfulness with Glucose Regul...,American Journal of Health Behavior,2,40,...,,,,,,,,,Article,stephen_buka@brown.edu
966,article_1042,559,PEER,10.7326/m15-1059,2016-01-23 14:32:41,2016-01-23 14:32:41,Leukotriene-Receptor Antagonists Versus Placeb...,Ann Intern Med,10,163,...,,,,,,,,,Article,christopher_schmid@brown.edu
2602,article_2763,713,PEER,10.7326/m15-1059,2016-02-16 16:02:29,2016-02-17 09:48:01,Leukotriene-Receptor Antagonists Versus Placeb...,Ann Intern Med,10,163,...,,,,,,,,,Article,ethan_balk@brown.edu
2620,article_2781,525,PEER,10.7448/ias.18.1.20724,2016-02-17 14:49:10,2016-02-17 14:49:10,The dollars and sense of economic incentives t...,Journal of the International AIDS Society,1,18,...,,,,,,,,,Article,omar_galarraga@brown.edu


In [127]:
# Same check on the separate 'doi' column: rows whose non-null doi repeats.
df_cites_far[ (df_cites_far.duplicated('doi', keep=False)) & (df_cites_far.doi.notnull()) ].sort_values(by='doi')

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
7422,chapter_583,1258,,,2017-01-10 22:54:29,2017-01-10 22:54:29,Wireless Neurotechnology for Neural Prostheses,,,,...,,,,,,,,,Chapter,david_borton@brown.edu
7861,chapter_1031,955,,,2017-03-21 16:33:32,2017-03-21 16:33:32,Wireless Neurotechnology for Neural Prostheses,,,,...,,,,,,,,,Chapter,arto_nurmikko@brown.edu
6862,chapter_6,286,,,2016-01-08 19:35:32,2016-01-08 21:01:04,On the Sample Complexity of Cancer Pathways Id...,,,,...,,,,,,,,,Chapter,eliezer_upfal@brown.edu
7312,chapter_469,435,,,2016-01-31 15:53:17,2016-01-31 15:53:17,On the Sample Complexity of Cancer Pathways Id...,,,,...,,,,,,,,,Chapter,benjamin_raphael@brown.edu
6919,chapter_65,694,,,2016-01-11 17:13:30,2016-01-11 17:13:30,Experimental Mechanics for Graduate Students,,,,...,,,,,,,,,Chapter,christian_franck@brown.edu
7651,chapter_815,1066,,,2017-01-30 10:45:18,2017-01-30 10:45:18,Experimental Mechanics for Graduate Students,,,,...,,,,,,,,,Chapter,christian_franck@brown.edu
7432,chapter_593,929,,,2017-01-11 14:11:29,2017-01-11 14:11:29,The Aurora and Borealis Stream Processing Engines,,,,...,,,,,,,,,Chapter,Stan_Zdonik@brown.edu
7798,chapter_966,1299,,,2017-02-09 17:01:27,2017-02-09 17:01:27,The Aurora and Borealis Stream Processing Engines,,,,...,,,,,,,,,Chapter,ugur_cetintemel@brown.edu
7323,chapter_480,508,,,2016-02-02 13:28:56,2016-02-02 13:28:56,ENGINEERING AND CLINICAL ASPECTS OF PHOTOPLETH...,,,,...,,,,,,,,,Chapter,domenico_pacifici@brown.edu
7776,chapter_943,1483,,,2017-02-02 11:05:57,2017-02-02 11:05:57,ENGINEERING AND CLINICAL ASPECTS OF PHOTOPLETH...,,,,...,,,,,,,,,Chapter,domenico_pacifici@brown.edu


In [128]:
# Which identifier types are declared on the rows that carry an identifier.
df_cites_far[ df_cites_far.identifier.notnull() ].article_id_type_id.value_counts()

DOI    3897
Name: article_id_type_id, dtype: int64

In [129]:
# Count rows whose declared identifier type is PUBMED.
print("Articles with PUBMED-type id: {}".format(
    len(df_cites_far[ df_cites_far.article_id_type_id == "PUBMED" ])))

Articles with PUBMED-type id: 0


In [130]:
# Count rows declared as OTHER that nevertheless have an identifier value.
print("Articles with OTHER-type id and not-null IDENTIFIER value: {}".format(
    len(df_cites_far[ (df_cites_far.article_id_type_id == "OTHER") & df_cites_far.identifier.notnull()])))

Articles with OTHER-type id and not-null IDENTIFIER value: 0


In [131]:
# Number of rows with a value in the 'doi' column, followed by a random sample
# of them (the sample differs between runs because no random_state is set).
print("FAR publications with DOI: {}".format(
    len(df_cites_far[ df_cites_far.doi.notnull() ])))
df_cites_far[ df_cites_far.doi.notnull()].sample(5)

FAR publications with DOI: 143


Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
6832,book_473,1583,,,2017-02-14 05:42:03,2017-02-14 05:42:03,Interpreting Epidemiologic Evidence: Connectin...,,,,...,,,,,,,,,Book,david_savitz@brown.edu
7861,chapter_1031,955,,,2017-03-21 16:33:32,2017-03-21 16:33:32,Wireless Neurotechnology for Neural Prostheses,,,,...,,,,,,,,,Chapter,arto_nurmikko@brown.edu
6861,chapter_5,513,,,2016-01-08 15:03:13,2016-01-27 10:52:02,Hierarchical Models of the Visual System,,,,...,,,,,,,,,Chapter,thomas_serre@brown.edu
7760,chapter_927,1271,,,2017-02-01 01:47:13,2017-02-01 01:47:13,"Chapter 1. Transport of Ions, DNA Polymers, an...",,,,...,,,,,,,,,Chapter,derek_stein@brown.edu
7150,chapter_301,673,,,2016-01-26 15:53:31,2016-01-26 15:53:31,Population Encoding/Decoding,,,,...,,,,,,,,,Chapter,wilson_truccolo@brown.edu


In [132]:
# Number of rows with a value in the 'identifier' column, followed by a random
# sample of them (the sample differs between runs because no random_state is set).
print("FAR publications with IDENTIFIER: {}".format(
    len(df_cites_far[ df_cites_far.identifier.notnull() ])))
df_cites_far[ df_cites_far.identifier.notnull()].sample(5)

FAR publications with IDENTIFIER: 3897


Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
4486,article_4820,1480,PEER,10.1038/nplants.2016.43,2017-01-31 10:52:20,2017-01-31 10:52:20,The phosphorus cost of agricultural intensific...,Nature Plants,5,2,...,,,,,,,,,Article,leah_vanwey@brown.edu
2740,article_2965,266,PEER,10.1080/1067828x.2013.829013,2016-02-24 14:59:13,2016-02-24 14:59:13,Improving Parenting and Parent-Adolescent Comm...,Journal of Child & Adolescent Substance Abuse,5,24,...,,,,,,,,,Article,nancy_barnett@brown.edu
4883,article_5299,1363,PEER,10.1088/1741-2560/13/2/023001,2017-02-06 09:58:32,2017-02-06 09:58:32,Brain–computer interface devices for patients ...,Journal of Neural Engineering,2,13,...,,,,,,,,,Article,leigh_hochberg@brown.edu
333,article_378,387,PEER,10.1086/680680,2016-01-11 16:28:11,2016-01-11 16:28:11,Emergent Ghettos: Black Neighborhoods in New ...,American Journal of Sociology,4,120,...,,,,,,,,,Article,john_logan@brown.edu
4493,article_4827,1060,PEER,10.1103/physrevd.95.012011,2017-01-31 11:37:19,2017-01-31 11:37:19,Search for supersymmetry in events with one le...,Physical Review D,1,95,...,,,,,,,,,Article,greg_landsberg@brown.edu


In [187]:
# Columns needed to identify a publication and the user who reported it.
id_cols = ['id', 'activity_report_id','user_email', 'type', 'title', 'identifier', 'doi']
far_dedupe = df_cites_far[ id_cols ].copy()
# Consolidate the two DOI-bearing columns: keep 'doi' where present, otherwise
# fall back to 'identifier'. The result is assigned back explicitly because
# calling fillna(inplace=True) on a column selection is a chained in-place
# update, which pandas deprecates and which does nothing under Copy-on-Write.
far_dedupe['doi'] = far_dedupe['doi'].fillna(far_dedupe['identifier'])
# Sanity check: the merged column holds exactly as many values as both source
# columns together, i.e. no row had a value in both.
assert len(df_cites_far[df_cites_far.doi.notnull()]) + len(
    df_cites_far[df_cites_far.identifier.notnull()]) == len(far_dedupe[far_dedupe.doi.notnull()])
far_dedupe.drop(columns='identifier',inplace=True)
print("Total records: {}".format(len(far_dedupe)))
far_dedupe.sample(10)

Total records: 9689


Unnamed: 0,id,activity_report_id,user_email,type,title,doi
2824,article_3050,468,amy_nunn@brown.edu,Article,"Love, lust and the emotional context of multip...",
5136,article_5555,1671,gary_wessel@brown.edu,Article,The diversity of nanos expression in echinoder...,10.1111/ede.12197
5615,article_6276,1238,tim_kraska@brown.edu,Article,Towards a Benchmark for Interactive Data Explo...,
8408,paper_526,1416,andrew_laird@brown.edu,Paper,"The White Goddess in Mexico: Apuleius’ Latin, ...",
4163,article_4484,1711,richard_bennett@brown.edu,Article,Phenotypic plasticity regulates Candida albica...,
2410,article_2566,751,ani_eloyan@brown.edu,Article,Quantitative Intracerebral Hemorrhage Localiza...,10.1161/strokeaha.115.010369
1162,article_1249,292,christoph_rose-petruck@brown.edu,Article,X-ray focusing scheme with continuously variab...,10.1107/s1600577514020451
2281,article_2433,378,richard_freiman@brown.edu,Article,The developmental origins of the mammalian ova...,
871,article_943,403,suzanne_stewart-steinberg@brown.edu,Article,Reclamation,
1134,article_1220,417,paul_myoda@brown.edu,Article,"“Pablo Atchugarry: Adelman, Anuszkiewicz, Fein...",


In [135]:
# Rows of the consolidated frame whose non-null doi occurs more than once.
far_dedupe[ far_dedupe.duplicated(subset='doi', keep=False) & (far_dedupe.doi.notnull()) ].sort_values(by='doi')

Unnamed: 0,id,activity_report_id,user_email,type,title,doi
6006,article_6688,1530,pedro_gozalo@brown.edu,Article,Tube Feeding in US Nursing Home Residents With...,10.1001/jama.2016.9374
6041,article_6725,1535,vincent_mor@brown.edu,Article,Tube Feeding in US Nursing Home Residents With...,10.1001/jama.2016.9374
6040,article_6724,1535,vincent_mor@brown.edu,Article,Accountability of Hospitals for Medicare Benef...,10.1001/jamainternmed.2015.6508
2114,article_2253,10,momotazur_rahman@brown.edu,Article,Accountability of Hospitals for Medicare Benef...,10.1001/jamainternmed.2015.6508
6338,article_7035,1780,maricruz_rivera-hernandez@brown.edu,Article,Quality of Care for White and Hispanic Medicar...,10.1001/jamainternmed.2016.0267
...,...,...,...,...,...,...
6206,article_6897,1526,stephen_buka@brown.edu,Article,Associations of Mindfulness with Glucose Regul...,10.5993/ajhb.40.2.11
2602,article_2763,713,ethan_balk@brown.edu,Article,Leukotriene-Receptor Antagonists Versus Placeb...,10.7326/m15-1059
966,article_1042,559,christopher_schmid@brown.edu,Article,Leukotriene-Receptor Antagonists Versus Placeb...,10.7326/m15-1059
2620,article_2781,525,omar_galarraga@brown.edu,Article,The dollars and sense of economic incentives t...,10.7448/ias.18.1.20724


In [139]:
# Rows that share a DOI with another row, minus rows that share both DOI and
# title with another row: what remains are DOI matches whose title text differs.
print("Matched DOIs with different titles: {}".format(
    len(far_dedupe[ far_dedupe.duplicated(subset='doi', keep=False) & (far_dedupe.doi.notnull()) ]) -\
    len(far_dedupe[ far_dedupe.duplicated(subset=['doi','title'], keep=False) & (far_dedupe.doi.notnull()) ])
))

Matched DOIs with different titles: 54


In [181]:
# Rows where the same user reported the same title more than once.
far_dedupe[ far_dedupe.duplicated(subset=['user_email','title'], keep=False) ].sort_values(by=['user_email','title'])

Unnamed: 0,id,activity_report_id,user_email,type,title,doi
7914,paper_7,1289,adam_mccloskey@brown.edu,Paper,Estimation and Inference with a (Nearly) Singu...,
7915,paper_8,1289,adam_mccloskey@brown.edu,Paper,Estimation and Inference with a (Nearly) Singu...,
7916,paper_9,1289,adam_mccloskey@brown.edu,Paper,Estimation and Inference with a (Nearly) Singu...,
6920,chapter_66,758,adam_pautz@brown.edu,Chapter,Experiences are Representations: An Empirical ...,
7826,chapter_994,1371,adam_pautz@brown.edu,Chapter,Experiences are Representations: An Empirical ...,
...,...,...,...,...,...,...
4389,article_4721,1447,zhenchao_qian@brown.edu,Article,Unemployment and the Transition From Separatio...,10.1177/0192513x15600730
1628,article_1737,739,zhenchao_qian@brown.edu,Article,Wealth Inequality among New Immigrants,
4383,article_4715,1447,zhenchao_qian@brown.edu,Article,Wealth Inequality among New Immigrants,10.1177/0731121415589138
2650,article_2814,404,zhijin_wu@brown.edu,Article,Establishing Informative Prior for Gene Expres...,


In [189]:
# Strictest match: same user email, same title AND same activity report.
# keep=False keeps every member of each duplicate group.
far_match_etr = far_dedupe[ far_dedupe.duplicated(subset=['user_email','title', 'activity_report_id'], keep=False) ]
far_match_etr.type.value_counts()

Article     300
Paper       197
Patent       57
Abstract     37
Chapter      15
Book          8
Review        4
Name: type, dtype: int64

In [183]:
# Remove the email+title+report matches; the assert confirms the two pieces
# still add up to the full frame.
dd_1 = far_dedupe.drop(far_match_etr.index)
assert (len(dd_1) + len(far_match_etr) == len(far_dedupe))

In [184]:
# Among the remaining rows, those that share user email and title with another
# remaining row (the activity report differs, or they would be in far_match_etr).
far_match_et = dd_1[ dd_1.duplicated(subset=['user_email','title'], keep=False) ]
far_match_et.type.value_counts()

Article    218
Chapter     55
Book        28
Patent      13
Paper       12
Name: type, dtype: int64

In [188]:
# Rows that belong to neither match set. drop_duplicates() would keep the first
# row of every duplicated (user_email, title) group, so its result overlaps
# far_match_etr / far_match_et instead of being their complement.
far_unmatched = dd_1.drop(far_match_et.index)

In [178]:
# Rows that belong to neither match set. drop_duplicates() would keep the first
# row of every duplicated (user_email, title) group, so its result overlaps the
# match sets and the three-way partition asserted below could not hold.
far_unmatched = dd_1.drop(far_match_et.index)
assert (len(far_match_etr) + len(far_match_et) + len(far_unmatched) == len(far_dedupe))

In [180]:
# Display the unmatched records.
far_unmatched

Unnamed: 0,id,activity_report_id,user_email,type,title,doi
0,article_6,15,david_borton@brown.edu,Article,Modified toolbox for optogenetics in the nonhu...,10.1117/1.nph.2.3.031202
1,article_7,25,matthew_harrison@brown.edu,Article,Spatiotemporal Conditional Inference and Hypot...,10.1162/neco_a_00681
2,article_9,760,brandon_johnson@brown.edu,Article,Impact jetting as the origin of chondrules,10.1038/nature14105
3,article_10,760,brandon_johnson@brown.edu,Article,The fractured Moon: Production and saturation ...,10.1002/2015gl065022
4,article_11,25,matthew_harrison@brown.edu,Article,Ambiguity and nonidentifiability in the statis...,10.1073/pnas.1506400112
...,...,...,...,...,...,...
9684,abstract_346,1592,thomas_trikalinos@brown.edu,Abstract,On the opportunity cost of non-rigorous or non...,
9685,abstract_347,1592,thomas_trikalinos@brown.edu,Abstract,Evidence synthesis for diagnostic tests with p...,
9686,abstract_348,1592,thomas_trikalinos@brown.edu,Abstract,The cost-effectiveness of testing and treatmen...,
9687,abstract_349,1592,thomas_trikalinos@brown.edu,Abstract,Evidence synthesis using randomized and non-ra...,


In [16]:
def fuzzy_far(field, pool):
    """Join the close fuzzy matches of ``field`` within ``pool`` with '||'.

    ``process.extract`` is called with ``fuzz.partial_ratio`` as the scorer.
    From the candidates it returns, those scoring above 90 are kept, in the
    order returned. The result is an empty string when none qualifies.
    """
    candidates = process.extract(field, pool, scorer=fuzz.partial_ratio)
    close_matches = []
    for candidate in candidates:
        # candidate[0] is the matched choice, candidate[1] its score.
        if candidate[1] > 90:
            close_matches.append(candidate[0])
    return '||'.join(close_matches)

In [17]:
# Pool of distinct, normalized (stripped, lower-cased) FAR titles; non-string
# entries such as NaN are skipped.
# NOTE(review): the 'm' exclusion presumably drops a known placeholder title - confirm.
fuzzy_check = { e.strip().lower() for e in df_cites_far.title.to_list() if isinstance(e, str) and e != 'm'}
# Spot check of the matcher on one known title.
fuzzy_far('Revisitar el costumbrismo', fuzzy_check)

'revisitar el costumbrismo||revisitar el costumbrismo: cosmopolitismo, pedagogías y modernización en iberoamérica'

In [18]:
# One-column frame of the FAR titles with the missing ones removed.
df_far_titles = pd.DataFrame(df_cites_far.title)
df_far_titles.dropna(inplace=True)
df_far_titles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9574 entries, 0 to 9688
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   9574 non-null   object
dtypes: object(1)
memory usage: 149.6+ KB


In [19]:
# Fuzzy-match every title against the whole title pool. fuzzy_far() is called
# once per row, each call scanning fuzzy_check; the recorded run of this cell
# ended in a KeyboardInterrupt.
df_far_titles['matches'] = df_far_titles.title.apply(lambda x: fuzzy_far(x.lower().strip(), fuzzy_check))

KeyboardInterrupt: 

In [130]:
# fuzzy_far() takes a single query string, so it cannot be handed the whole
# title Series. Normalize the titles the same way as the pool, match each
# distinct title once, and map the results back onto the rows.
normalized_titles = df_far_titles.title.str.lower().str.strip()
title_matches = { t: fuzzy_far(t, fuzzy_check) for t in normalized_titles.unique() }
df_far_titles['matches'] = normalized_titles.map(title_matches)

KeyboardInterrupt: 

## VIVO Data
[back](#Contents)

In [7]:
# Read back the property list written by the acquire step, one URI per line.
with open('data/rab/query_properties.csv') as f:
    data = f.readlines()
    # skip header, strip trailing whitespace
    fin_cite_props = [ d.strip() for d in data[1:] ]

In [8]:
# Read back the citation export written by the acquire step, keeping each raw
# line (including its newline) as one list element.
with open('data/rab/query_citations.nt') as f:
    fin_rab_cites = f.readlines()

In [9]:
# Map each citation-ontology property URI to its short local name. 40 is the
# length of the 'http://vivo.brown.edu/ontology/citation#' prefix, so c[40:]
# is the part after the '#'.
cite_prop_map = { c: c[40:] for c in fin_cite_props if c.startswith('http://vivo.brown.edu/ontology/citation#') }
# Extra columns: 'rabid' is filled with the citation's own URI later on; label
# and mostSpecificType come from other vocabularies and get short names by hand.
cite_prop_map['rabid'] = 'rabid'
cite_prop_map['http://www.w3.org/2000/01/rdf-schema#label'] = 'label'
cite_prop_map['http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'] = 'type'
cite_prop_map.values()

dict_values(['date', 'volume', 'hasContributor', 'authorList', 'pmid', 'issue', 'doi', 'hasVenue', 'pages', 'pmcid', 'publishedIn', 'book', 'hasLocation', 'editorList', 'chapter', 'hasPublisher', 'isbn', 'url', 'hasConferenceLocation', 'conferenceDate', 'hasConference', 'issn', 'reviewOf', 'title', 'number', 'version', 'hasAssignee', 'hasCountry', 'hasAuthority', 'patentNumber', 'venueFor', 'rabid', 'label', 'type'])

In [10]:
# Record type with one field per short property name, in alphabetical order.
RABCitation = namedtuple('RABCitation', sorted(cite_prop_map.values()))

In [11]:
def clean_data_prop(oData):
    """Extract the bare value from the object part of an N-Triples statement.

    ``oData`` is the text that follows the predicate, for example
    ``<uri> .``, ``"text" .``, ``"text"@en .`` or ``"text"^^<datatype> .``,
    optionally followed by a newline.

    Returns the IRI without its angle brackets, or the literal's text without
    its quotes, language tag or datatype and with surrounding whitespace
    trimmed. Anything else (such as a blank node label) is returned with only
    the statement terminator removed. Escape sequences inside a literal are
    left exactly as written.
    """
    term = oData.strip()
    # Remove the statement-terminating '.' and the whitespace before it.
    if term.endswith('.'):
        term = term[:-1].rstrip()
    if term.startswith('<') and term.endswith('>'):
        return term[1:-1]
    if term.startswith('"'):
        # Language tags and datatype IRIs cannot contain a double quote, so
        # the last quote is the one that closes the literal. Slicing between
        # the quotes keeps punctuation that belongs to the value itself,
        # which a character-set strip() would remove.
        closing = term.rfind('"')
        text = term[1:closing] if closing > 0 else term[1:]
        return text.strip()
    return term

In [12]:
def parse_triple(rawRow):
    """Split one raw N-Triples line into a (subject, predicate, object) tuple.

    The line is cut at its first two spaces. Angle brackets are stripped from
    the subject and predicate, and the remainder is passed through
    clean_data_prop(). A line with fewer than three space-separated parts
    raises ValueError from the unpacking.
    """
    subject, predicate, remainder = rawRow.split(' ', 2)
    return (subject.strip('<>'), predicate.strip('<>'), clean_data_prop(remainder))

In [13]:
# Parse every raw line of the citation export into an (s, p, o) tuple.
cite_triples = [ parse_triple(raw_line) for raw_line in fin_rab_cites ]

# Show the first parsed triple as a sanity check.
cite_triples[0]

('http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b',
 'http://www.w3.org/2000/01/rdf-schema#label',
 'Erratum to: Global magnetic confinement for the 1.5D Vlasov-Maxwell system')

In [14]:
# Analyzing citations with more than 1 most specific type

# Triples that assign NoID as a most specific type.
no_ids = [ c for c in cite_triples
          if c[1] == 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'
          and c[2] == 'http://vivo.brown.edu/ontology/citation#NoID' ]
# Every most specific type recorded for each subject.
msts = defaultdict(set)
for c in cite_triples:
    if c[1] == 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType':
        msts[c[0]].add(c[2])
# Distinct type combinations among subjects that do not have exactly one type.
mlts = set()
for v in msts.values():
    if len(v) != 1:
        mlts.add(frozenset(v))

no_id = 'http://vivo.brown.edu/ontology/citation#NoID'
for m in mlts:
    if no_id in m:
        print("With NoID: ", [ a for a in m if a != no_id])
    else:
        # The label and the list are separate print arguments. The previous
        # `"...". sorted(...)` looked up a non-existent str attribute and
        # raised AttributeError as soon as this branch was reached.
        print("Redundant types: ", sorted(m))

With NoID:  ['http://vivo.brown.edu/ontology/citation#ConferencePaper']
With NoID:  ['http://vivo.brown.edu/ontology/citation#BookSection']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Review']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Abstract']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Article']
With NoID:  ['http://vivo.brown.edu/ontology/citation#WorkingPaper']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Citation']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Book']
With NoID:  ['http://vivo.brown.edu/ontology/citation#Patent']


In [15]:
def triple_match(triple, prop=None, obj=None):
    """Check a (subject, predicate, object) triple against optional filters.

    ``prop`` is compared with ``triple[1]`` and ``obj`` with ``triple[2]``.
    A filter that is None (or otherwise falsy) is ignored, so with no filters
    every triple matches. Returns True only if every supplied filter is equal
    to the corresponding element.
    """
    # Guard clauses: fail on the first supplied filter that does not match.
    if prop and triple[1] != prop:
        return False
    if obj and triple[2] != obj:
        return False
    return True

In [16]:
def filter_mst_no_id(triple):
    """Return False for a triple that assigns NoID as most specific type.

    Every other triple yields True, so the function can be used directly as
    a keep-predicate when filtering triples.
    """
    assigns_no_id = triple_match(
        triple,
        prop='http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
        obj='http://vivo.brown.edu/ontology/citation#NoID')
    return not assigns_no_id

# Inline checks: a most-specific-type triple is kept unless its object is NoID.
good_triple = ('foo', 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType', 'bar')
bad_triple = good_triple[:2] + ('http://vivo.brown.edu/ontology/citation#NoID',)
assert filter_mst_no_id(good_triple)
assert not filter_mst_no_id(bad_triple)

In [17]:
# Drop the triples that assign NoID as a most specific type; keep the rest.
strip_msts = [ t for t in cite_triples if filter_mst_no_id(t) ]

In [18]:
# Group the triples by subject into {subject: {short property name: value}},
# ignoring predicates that are not in cite_prop_map.
# NOTE(review): a later triple with the same subject and predicate overwrites
# the earlier value, so only one value per property survives - confirm that
# this is acceptable for repeatable properties.
cite_dicts = defaultdict(dict)
for t in strip_msts:
    if t[1] in cite_prop_map:
        cite_dicts[t[0]][cite_prop_map[t[1]]] = t[2]

In [19]:
# Template row: every short property name mapped to an empty string, so that
# each RABCitation gets a value for all of its fields.
empty_row = { cite_prop_map[p]: '' for p in cite_prop_map }
rab_rows = []
for c in cite_dicts:
    d = cite_dicts[c]
    # Record the citation's own URI; this also writes 'rabid' into cite_dicts.
    d['rabid'] = c
    row = empty_row.copy()
    row.update(d)
    rab_rows.append(RABCitation(**row))

print(rab_rows[0])

RABCitation(authorList='Nguyen, Toan T., Nguyen, Truyen V., Strauss, Walter A', book='', chapter='', conferenceDate='', date='2015-06-01', doi='10.3934/krm.2015.8.615', editorList='', hasAssignee='', hasAuthority='', hasConference='', hasConferenceLocation='', hasContributor='http://vivo.brown.edu/individual/wstrauss', hasCountry='', hasLocation='', hasPublisher='', hasVenue='http://vivo.brown.edu/individual/n6086eb8fe7824cad9423547d403a958d', isbn='', issn='', issue='3', label='Erratum to: Global magnetic confinement for the 1.5D Vlasov-Maxwell system', number='', pages='615-616', patentNumber='', pmcid='', pmid='', publishedIn='Kinetic and Related Models', rabid='http://vivo.brown.edu/individual/n5c6cae127059414ca258636cd3dc482b', reviewOf='', title='', type='http://vivo.brown.edu/ontology/citation#Article', url='', venueFor='', version='', volume='8')


In [20]:
# One row per citation, one column per RABCitation field.
df_cites_rab = pd.DataFrame(rab_rows)

# Reorder the columns for readability: identifiers first, then the common
# bibliographic fields, then everything not listed, and the has* links last.
cols = df_cites_rab.columns.tolist()
id_atts = [ 'rabid','type','label','doi','pmid','pmcid','isbn','issn' ]
common_atts = [ 'date','authorList','pages','issue','volume' ]
has_atts = ['hasContributor','hasVenue','hasConference','hasConferenceLocation',
            'hasCountry','hasLocation','hasPublisher','hasAssignee','hasAuthority']
grouped_atts = id_atts + common_atts + has_atts
cols = [ c for c in cols if c not in grouped_atts ]
cols = id_atts + common_atts + cols + has_atts
df_cites_rab = df_cites_rab[ cols ]
# Keep only the part of the type URI after the '#'.
df_cites_rab.type = df_cites_rab.type.str.rsplit('#').str.get(1)
# Empty strings (from the template row) become NaN.
df_cites_rab.replace(r'^$', np.nan, inplace=True, regex=True)
df_cites_rab.head()

Unnamed: 0,rabid,type,label,doi,pmid,pmcid,isbn,issn,date,authorList,...,version,hasContributor,hasVenue,hasConference,hasConferenceLocation,hasCountry,hasLocation,hasPublisher,hasAssignee,hasAuthority
0,http://vivo.brown.edu/individual/n5c6cae127059...,Article,Erratum to: Global magnetic confinement for th...,10.3934/krm.2015.8.615,,,,,2015-06-01,"Nguyen, Toan T., Nguyen, Truyen V., Strauss, W...",...,,http://vivo.brown.edu/individual/wstrauss,http://vivo.brown.edu/individual/n6086eb8fe782...,,,,,,,
1,http://vivo.brown.edu/individual/n52747,Article,Learning as a Task or a Virtue: U.S. and Chine...,10.1037/0012-1649.40.4.595,15238046.0,,,,2004-01-01,"Li, Jin",...,,http://vivo.brown.edu/individual/jili,http://vivo.brown.edu/individual/n60865,,,,,,,
2,http://vivo.brown.edu/individual/n8301,Article,Predicting discordance between self-reports of...,10.1007/s10461-012-0163-8,22323006.0,PMC3471653,,,2012-08-01,"Brown JL, Sales JM, DiClemente RJ, Salazar LF,...",...,,http://vivo.brown.edu/individual/lbrownmd,http://vivo.brown.edu/individual/n79279,,,,,,,
3,http://vivo.brown.edu/individual/n98528,Article,Effects of 12-O-tetradecanoylphorbol-13-acetat...,10.1002/mc.2940130304,7619217.0,,,,1995-07-01,"Sears WL, Goto-Mandeville R, Mirapuri M, Braun L",...,,http://vivo.brown.edu/individual/lbraun,http://vivo.brown.edu/individual/n82319,,,,,,,
4,http://vivo.brown.edu/individual/n52835,Article,"Daily co-occurrences of marijuana use, alcohol...",10.1016/j.drugalcdep.2014.09.265,,,,,2015-01-01,"Graves, Hannah, Hernandez, Lynn, Kahler, Chris...",...,,http://vivo.brown.edu/individual/lh15,http://vivo.brown.edu/individual/n48368,,,,,,,


In [21]:
# Per-column non-null counts and dtypes of the R@B citation frame.
df_cites_rab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49044 entries, 0 to 49043
Data columns (total 34 columns):
rabid                    49044 non-null object
type                     49044 non-null object
label                    49004 non-null object
doi                      43349 non-null object
pmid                     36381 non-null object
pmcid                    13399 non-null object
isbn                     683 non-null object
issn                     16 non-null object
date                     49034 non-null object
authorList               47600 non-null object
pages                    44188 non-null object
issue                    39882 non-null object
volume                   44166 non-null object
book                     744 non-null object
chapter                  148 non-null object
conferenceDate           36 non-null object
editorList               713 non-null object
number                   17 non-null object
patentNumber             9 non-null object
publishedIn        

## 3rd-party IDs

In [22]:
# Number of R@B citations per citation type.
df_cites_rab.type.value_counts()

Article            42700
Citation            3708
ConferencePaper      930
BookSection          774
Book                 520
Review               245
Abstract             128
WorkingPaper          30
Patent                 9
Name: type, dtype: int64

In [38]:
# R@B citations that have a PMID or a DOI, and the remainder with neither.
with_ids = len(df_cites_rab[ ((df_cites_rab.pmid.notnull()) | (df_cites_rab.doi.notnull())) ])
print("R@B Citations with DOIs or PMIDs: ", with_ids )
print("R@B Citations without: ", len(df_cites_rab) - with_ids)

Citations with DOIs or PMIDs:  46799
Citations without:  2245


In [31]:
# Type breakdown of the R@B citations that have neither a PMID nor a DOI.
df_cites_rab[ ((df_cites_rab.pmid.isna()) & (df_cites_rab.doi.isna())) ].type.value_counts()

BookSection        678
Article            644
Book               461
Abstract           118
Review             115
Citation           101
ConferencePaper     92
WorkingPaper        27
Patent               9
Name: type, dtype: int64

In [33]:
# Of those without PMID and DOI, the ones that at least carry an ISBN, by type.
df_cites_rab[ ((df_cites_rab.pmid.isna()) & (df_cites_rab.doi.isna())
               & (df_cites_rab.isbn.notnull())) ].type.value_counts()

Book           406
BookSection    201
Article          5
Name: type, dtype: int64

In [29]:
# Number of FAR publications per publication type.
df_cites_far.type.value_counts()

Article     6392
Paper       1371
Chapter     1012
Book         467
Abstract     314
Patent        96
Review        37
Name: type, dtype: int64

In [49]:
# FAR rows with a value in 'doi' or 'identifier', and the remainder with
# neither. Only those two columns are tested, so the label names them instead
# of repeating the "PMIDs" wording used for the R@B count.
# Note that this reuses (and overwrites) the with_ids name from the R@B count.
with_ids = len(df_cites_far[ df_cites_far.doi.notnull() | df_cites_far.identifier.notnull() ])
print("FAR Citations with DOIs or identifiers: ", with_ids )
print("FAR Citations without: ", len(df_cites_far) - with_ids)

FAR Citations with DOIs or PMIDs:  4932
FAR Citations without:  4757


In [52]:
# Type breakdown of the FAR rows that have a 'doi' or an 'identifier' value.
df_cites_far[ df_cites_far.doi.notnull() | df_cites_far.identifier.notnull() ].type.value_counts()

Article    3897
Chapter    1012
Book         23
Name: type, dtype: int64

In [69]:
# Same expression as the previous cell.
# NOTE(review): the two recorded outputs disagree on the Chapter count, so
# df_cites_far was in a different state between the runs - re-run from a fresh
# kernel before relying on either number.
df_cites_far[ df_cites_far.doi.notnull() | df_cites_far.identifier.notnull() ].type.value_counts()

Article    3897
Chapter     120
Book         23
Name: type, dtype: int64

In [72]:
# Declared identifier types of the FAR rows that carry an identifier.
df_cites_far[ df_cites_far.identifier.notnull() ].article_id_type_id.value_counts()

DOI    3897
Name: article_id_type_id, dtype: int64

In [94]:
# Lower-cased DOI -> DOI as written, so the two sources can be compared
# case-insensitively while the original spelling stays recoverable. When
# several spellings share a lower-cased form, the last one seen wins.
rab_doi_map = { d.lower(): d for d in df_cites_rab[df_cites_rab.doi.notnull()].doi }
# FAR keeps DOIs in two columns; both feed the same map.
far_doi_map = { d.lower(): d for d in df_cites_far[ df_cites_far.identifier.notnull() ].identifier }
far_doi_map.update(
    { d.lower(): d for d in df_cites_far[ df_cites_far.doi.notnull() ].doi } )

rab_dois = set(rab_doi_map.keys())
far_dois = set(far_doi_map.keys())

In [95]:
# Distinct (case-insensitive) DOIs per source and the size of their overlap.
print("RAB DOIs: ",len(rab_dois))
print("FAR DOIs: ",len(far_dois))
print("Shared DOIs: ", len(rab_dois & far_dois))

RAB DOIs:  41821
FAR DOIs:  3157
Shared DOIs:  1944


In [100]:
# DOIs found in FAR but not in R@B, translated back to the spelling stored in
# far_doi_map; the assert checks that nothing was lost in that translation.
only_in_far = { far_doi_map[d] for d  in far_dois - rab_dois }
assert len(only_in_far) == len(far_dois) - len(rab_dois & far_dois)

In [104]:
# FAR rows whose 'identifier' or 'doi' is one of the FAR-only DOIs.
# NOTE(review): only_in_far holds one spelling per lower-cased DOI, so a row
# whose DOI differs from it only by letter case would not be selected here.
df_far_dois = df_cites_far[ ((df_cites_far.identifier.isin(only_in_far)) | (df_cites_far.doi.isin(only_in_far)))]

In [120]:
# The 50 FAR-only identifiers that were reported most often.
df_far_dois.groupby('identifier').identifier.count().nlargest(50)

identifier
10.1007/jhep01(2016)006           6
10.1007/jhep01(2016)079           6
10.1007/jhep01(2016)096           6
10.1016/j.physletb.2015.10.067    6
10.1016/j.physletb.2015.11.042    6
10.1016/j.physletb.2015.12.017    6
10.1016/j.physletb.2015.12.020    6
10.1016/j.physletb.2015.12.039    6
10.1016/j.physletb.2016.01.010    6
10.1103/physrevd.93.012001        6
10.1103/physrevd.93.012003        6
10.1103/physrevlett.116.032301    6
10.1140/epjc/s10052-015-3853-3    6
10.1007/jhep01(2016)166           5
10.1007/jhep02(2016)122           5
10.1007/jhep02(2016)145           5
10.1007/jhep03(2016)125           5
10.1007/jhep04(2016)005           5
10.1007/jhep04(2016)010           5
10.1007/jhep04(2016)035           5
10.1007/jhep04(2016)073           5
10.1007/jhep06(2016)177           5
10.1007/jhep11(2016)056           5
10.1016/j.physletb.2016.01.056    5
10.1016/j.physletb.2016.02.002    5
10.1016/j.physletb.2016.02.047    5
10.1016/j.physletb.2016.03.039    5
10.1016/j.physlet