# FAR-VIVO Citation Data Analysis

In [1]:
import csv
import os

import requests
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz, process

from collections import namedtuple, defaultdict, Counter
from functools import reduce

vprod = %env VIVO_PRODUCTION
vstage = %env VIVO_STAGING
vuser = %env VIVO_USER
vpass = %env VIVO_PASSWORD

## Contents
* [Acquire](#Acquire)
* [Load FAR Data](#FAR-Publication-Data)
* [Load VIVO Data](#VIVO-Data)

# Acquire
[back](#Contents)

In [11]:
def get_citation_properties(endpoint):
    query = """
    SELECT DISTINCT ?prop
    WHERE {{
        ?cite a <http://vivo.brown.edu/ontology/citation#Citation> .
        ?cite ?prop ?o.
    }}
    """
    data = { 'email': vuser, 'password': vpass, 'query': query }
    headers = { 'Accept': 'text/csv', 'charset': 'utf-8' }
    resp = requests.post(endpoint, data=data, headers=headers)
    if resp.status_code == 200:
        return resp.text
    else:
        print(resp.text)
        return False

In [3]:
with open('data/rab/query_properties.csv','w+') as f:
    f.write(get_citation_properties(vstage))

In [4]:
def get_citation_data(endpoint):
    query = """
    DESCRIBE ?cite
    WHERE {{ ?cite a <http://vivo.brown.edu/ontology/citation#Citation> .}}
    """
    data = { 'email': vuser, 'password': vpass, 'query': query }
    headers = { 'Accept': 'text/plain', 'charset': 'utf-8' }
    resp = requests.post(endpoint, data=data, headers=headers)
    if resp.status_code == 200:
        return resp.text
    else:
        print(resp.text)
        return False

In [5]:
with open('data/rab/query_citations.nt', 'w+') as f:
    f.write(get_citation_data(vstage))

## FAR Publication Data
* [Deduplication](#Deduplicating-FAR-publications)
* [^-top](#Contents)

In [2]:
def wrap_far_row(row, dtype, idIdx):
    row[idIdx] = dtype + '_' + row[idIdx]
    row.append(dtype.capitalize())
    return row

In [3]:
def make_far_df(dtype, fname):
    with open(os.path.join('data/far/',fname)) as f:
        rdr = csv.reader(f, escapechar='\\')
        header = next(rdr)
        assert dtype not in header
        header.append(dtype)
        rows = [ wrap_far_row(r, dtype, header.index('id')) for r in rdr ]
    return pd.DataFrame(rows, columns=header)

In [4]:
far_files = [ ('article', 'articles.csv'), ('book', 'books.csv'),
             ('chapter', 'chapters.csv'), ('review', 'critical_reviews.csv'),
             ('paper', 'papers.csv'), ('patent', 'patents.csv'),
             ('abstract', 'ph_abstracts.csv') ]
dtypes = [ f[0] for f in far_files ]

df_cites_far = pd.concat(
    [ make_far_df(*f) for f in far_files ], axis=0, ignore_index=True, sort=False)
melted = pd.melt(df_cites_far, id_vars=['id'], value_vars=dtypes,
                var_name='drop_me', value_name='type')
df_cites_far = df_cites_far.join(
    melted.dropna().drop(columns='drop_me').set_index('id'), on='id')
df_cites_far.drop(columns=dtypes, inplace=True)
df_cites_far.replace(r'^(|N)$', np.nan, inplace=True, regex=True)
df_cites_far.head()

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,other,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type
0,article_6,15,PEER,10.1117/1.nph.2.3.031202,2016-01-07 16:51:12,2016-01-07 16:51:12,Modified toolbox for optogenetics in the nonhu...,Neurophotonics,3,2,...,,,,,,,,,,Article
1,article_7,25,PEER,10.1162/neco_a_00681,2016-01-07 17:07:06,2016-01-07 17:07:58,Spatiotemporal Conditional Inference and Hypot...,Neural Computation,1,27,...,,,,,,,,,,Article
2,article_9,760,PEER,10.1038/nature14105,2016-01-07 17:08:08,2016-01-07 17:08:08,Impact jetting as the origin of chondrules,Nature,7534,517,...,,,,,,,,,,Article
3,article_10,760,PEER,10.1002/2015gl065022,2016-01-07 17:08:19,2016-01-07 17:10:52,The fractured Moon: Production and saturation ...,Geophysical Research Letters,17,42,...,,,,,,,,,,Article
4,article_11,25,PEER,10.1073/pnas.1506400112,2016-01-07 17:08:37,2016-01-07 17:08:50,Ambiguity and nonidentifiability in the statis...,Proc Natl Acad Sci USA,20,112,...,,,,,,,,,,Article


In [5]:
df_cites_far.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9689 entries, 0 to 9688
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    9689 non-null   object 
 1   activity_report_id    9689 non-null   object 
 2   article_type_id       6392 non-null   object 
 3   identifier            3897 non-null   object 
 4   created_at            9689 non-null   object 
 5   updated_at            9689 non-null   object 
 6   title                 9574 non-null   object 
 7   journal               6349 non-null   object 
 8   number                3599 non-null   object 
 9   volume                4441 non-null   object 
 10  date                  5839 non-null   object 
 11  coauthors             7029 non-null   object 
 12  book_status_id        7388 non-null   object 
 13  article_id_type_id    5263 non-null   object 
 14  page_numbers          3620 non-null   object 
 15  book_type_id         

In [6]:
with open('data/far/activity_reports.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    header = next(rdr)
    rows = [ r for r in rdr ]
    far_reports = pd.DataFrame(rows, columns=header)
    
with open('data/far/users.csv') as f:
    rdr = csv.reader(f, escapechar='\\')
    header = next(rdr)
    rows = [ r for r in rdr ]
    far_users = pd.DataFrame(rows, columns=header)

far_ids = far_reports.merge(far_users, left_on='user_id', right_on='id', suffixes=('_report', '_user'))
keep=['id_report','email']
far_ids.drop(columns=[ c for c in far_ids.columns if c not in keep], inplace=True)
far_ids.rename(columns={'id_report': 'report_id', 'email': 'user_email'}, inplace=True)
far_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2386 entries, 0 to 2385
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   report_id   2386 non-null   object
 1   user_email  2386 non-null   object
dtypes: object(2)
memory usage: 55.9+ KB


In [7]:
assert len(df_cites_far[ df_cites_far.activity_report_id.isna() ]) == 0
df_cites_far = df_cites_far.merge(far_ids, how='left', left_on='activity_report_id', right_on='report_id')
df_cites_far.drop(columns=['report_id'], inplace=True)
df_cites_far.sample(5)

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
9621,abstract_282,1536,,,2017-02-23 23:02:58,2017-02-23 23:02:58,Recruiting teens for a tailored text message t...,,,,...,144rd Annual Scientific Meeting of the America...,,,,,,CO,2016-10-31,Abstract,deborah_pearlman@brown.edu
8057,paper_157,1422,,,2017-01-11 15:24:17,2017-01-11 15:24:17,How are SNAP Benefits Spent?,,,,...,Quantitative Marketing and Economics,2016-10-21,,,,,,,Paper,jesse_shapiro_1@brown.edu
3974,article_4290,1014,PEER,10.1515/crelle-2016-0028,2017-01-25 17:36:22,2017-01-25 17:36:22,Essential regularity of the model space for th...,Journal für die reine und angewandte Mathemati...,0.0,0.0,...,,,,,,,,,Article,georgios_daskalopoulos@brown.edu
7329,chapter_486,124,,,2016-02-03 14:12:01,2016-02-03 14:12:01,"Ellipsis in Categorial Grammar (note, this is ...",,,,...,,,,,,,,,Chapter,pauline_jacobson@brown.edu
9084,paper_1273,1007,,,2017-02-20 09:23:29,2017-02-20 09:23:29,Critique of Aryeh Kosman’s The Activity of Bei...,,,,...,"Eastern Division, American Philosophical Assoc...",2016-01-09,,,,,,,Paper,mary_louise_gill@brown.edu


### Deduplicating FAR publications
[top](#FAR-Publication-Data)

In [8]:
df_cites_far[ (df_cites_far.duplicated(subset='identifier', keep=False))
             & (df_cites_far.identifier.notnull())  ].sort_values(by='identifier')

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
6006,article_6688,1530,PEER,10.1001/jama.2016.9374,2017-02-23 16:06:44,2017-02-23 16:06:44,Tube Feeding in US Nursing Home Residents With...,JAMA,7,316,...,,,,,,,,,Article,pedro_gozalo@brown.edu
6041,article_6725,1535,PEER,10.1001/jama.2016.9374,2017-02-24 08:42:08,2017-02-24 08:42:08,Tube Feeding in US Nursing Home Residents With...,JAMA,7,316,...,,,,,,,,,Article,vincent_mor@brown.edu
6040,article_6724,1535,PEER,10.1001/jamainternmed.2015.6508,2017-02-24 08:42:06,2017-02-24 08:42:06,Accountability of Hospitals for Medicare Benef...,JAMA Internal Medicine,1,176,...,,,,,,,,,Article,vincent_mor@brown.edu
2114,article_2253,10,PEER,10.1001/jamainternmed.2015.6508,2016-02-02 09:13:11,2016-02-02 09:13:11,Accountability of Hospitals for Medicare Benef...,JAMA Intern Med,1,176,...,,,,,,,,,Article,momotazur_rahman@brown.edu
6338,article_7035,1780,PEER,10.1001/jamainternmed.2016.0267,2017-03-23 15:41:01,2017-03-23 15:41:01,Quality of Care for White and Hispanic Medicar...,JAMA Internal Medicine,6,176,...,,,,,,,,,Article,maricruz_rivera-hernandez@brown.edu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6206,article_6897,1526,PEER,10.5993/ajhb.40.2.11,2017-03-02 17:06:38,2017-03-02 17:06:38,Associations of Mindfulness with Glucose Regul...,American Journal of Health Behavior,2,40,...,,,,,,,,,Article,stephen_buka@brown.edu
966,article_1042,559,PEER,10.7326/m15-1059,2016-01-23 14:32:41,2016-01-23 14:32:41,Leukotriene-Receptor Antagonists Versus Placeb...,Ann Intern Med,10,163,...,,,,,,,,,Article,christopher_schmid@brown.edu
2602,article_2763,713,PEER,10.7326/m15-1059,2016-02-16 16:02:29,2016-02-17 09:48:01,Leukotriene-Receptor Antagonists Versus Placeb...,Ann Intern Med,10,163,...,,,,,,,,,Article,ethan_balk@brown.edu
2620,article_2781,525,PEER,10.7448/ias.18.1.20724,2016-02-17 14:49:10,2016-02-17 14:49:10,The dollars and sense of economic incentives t...,Journal of the International AIDS Society,1,18,...,,,,,,,,,Article,omar_galarraga@brown.edu


In [9]:
df_cites_far[ (df_cites_far.duplicated('doi', keep=False)) & (df_cites_far.doi.notnull()) ].sort_values(by='doi')

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
7422,chapter_583,1258,,,2017-01-10 22:54:29,2017-01-10 22:54:29,Wireless Neurotechnology for Neural Prostheses,,,,...,,,,,,,,,Chapter,david_borton@brown.edu
7861,chapter_1031,955,,,2017-03-21 16:33:32,2017-03-21 16:33:32,Wireless Neurotechnology for Neural Prostheses,,,,...,,,,,,,,,Chapter,arto_nurmikko@brown.edu
6862,chapter_6,286,,,2016-01-08 19:35:32,2016-01-08 21:01:04,On the Sample Complexity of Cancer Pathways Id...,,,,...,,,,,,,,,Chapter,eliezer_upfal@brown.edu
7312,chapter_469,435,,,2016-01-31 15:53:17,2016-01-31 15:53:17,On the Sample Complexity of Cancer Pathways Id...,,,,...,,,,,,,,,Chapter,benjamin_raphael@brown.edu
6919,chapter_65,694,,,2016-01-11 17:13:30,2016-01-11 17:13:30,Experimental Mechanics for Graduate Students,,,,...,,,,,,,,,Chapter,christian_franck@brown.edu
7651,chapter_815,1066,,,2017-01-30 10:45:18,2017-01-30 10:45:18,Experimental Mechanics for Graduate Students,,,,...,,,,,,,,,Chapter,christian_franck@brown.edu
7432,chapter_593,929,,,2017-01-11 14:11:29,2017-01-11 14:11:29,The Aurora and Borealis Stream Processing Engines,,,,...,,,,,,,,,Chapter,Stan_Zdonik@brown.edu
7798,chapter_966,1299,,,2017-02-09 17:01:27,2017-02-09 17:01:27,The Aurora and Borealis Stream Processing Engines,,,,...,,,,,,,,,Chapter,ugur_cetintemel@brown.edu
7323,chapter_480,508,,,2016-02-02 13:28:56,2016-02-02 13:28:56,ENGINEERING AND CLINICAL ASPECTS OF PHOTOPLETH...,,,,...,,,,,,,,,Chapter,domenico_pacifici@brown.edu
7776,chapter_943,1483,,,2017-02-02 11:05:57,2017-02-02 11:05:57,ENGINEERING AND CLINICAL ASPECTS OF PHOTOPLETH...,,,,...,,,,,,,,,Chapter,domenico_pacifici@brown.edu


In [10]:
df_cites_far[ df_cites_far.identifier.notnull() ].article_id_type_id.value_counts()

DOI    3897
Name: article_id_type_id, dtype: int64

In [11]:
print("Articles with PUBMED-type id: {}".format(
    len(df_cites_far[ df_cites_far.article_id_type_id == "PUBMED" ])))

Articles with PUBMED-type id: 0


In [12]:
print("Articles with OTHER-type id and not-null IDENTIFIER value: {}".format(
    len(df_cites_far[ (df_cites_far.article_id_type_id == "OTHER") & df_cites_far.identifier.notnull()])))

Articles with OTHER-type id and not-null IDENTIFIER value: 0


In [13]:
print("FAR publications with DOI: {}".format(
    len(df_cites_far[ df_cites_far.doi.notnull() ])))
df_cites_far[ df_cites_far.doi.notnull()].sample(5)

FAR publications with DOI: 143


Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
7798,chapter_966,1299,,,2017-02-09 17:01:27,2017-02-09 17:01:27,The Aurora and Borealis Stream Processing Engines,,,,...,,,,,,,,,Chapter,ugur_cetintemel@brown.edu
7312,chapter_469,435,,,2016-01-31 15:53:17,2016-01-31 15:53:17,On the Sample Complexity of Cancer Pathways Id...,,,,...,,,,,,,,,Chapter,benjamin_raphael@brown.edu
7776,chapter_943,1483,,,2017-02-02 11:05:57,2017-02-02 11:05:57,ENGINEERING AND CLINICAL ASPECTS OF PHOTOPLETH...,,,,...,,,,,,,,,Chapter,domenico_pacifici@brown.edu
6527,book_147,725,,,2016-01-27 14:25:18,2016-01-27 14:25:18,Why Philosophy Matters for the Study of Religi...,,,,...,,,,,,,,,Book,thomas_lewis@brown.edu
6958,chapter_104,473,,,2016-01-13 10:30:44,2016-01-13 10:30:44,Facilitating Career Advancement for Women in t...,,,,...,,,,,,,,,Chapter,meredith_hastings@brown.edu


In [14]:
print("FAR publications with IDENTIFIER: {}".format(
    len(df_cites_far[ df_cites_far.identifier.notnull() ])))
df_cites_far[ df_cites_far.identifier.notnull()].sample(5)

FAR publications with IDENTIFIER: 3897


Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
2537,article_2695,158,PEER,10.1016/s2214-109x(14)70381-x,2016-02-15 08:47:39,2016-02-15 08:47:39,Dietary quality among men and women in 187 cou...,The Lancet Global Health,3.0,3,...,,,,,,,,,Article,stephen_mcgarvey@brown.edu
5592,article_6253,1084,PEER,10.1016/j.physletb.2016.02.047,2017-02-16 20:06:00,2017-02-16 20:06:00,"Measurement of the ratio <mml:math altimg=""si1...",Physics Letters B,,756,...,,,,,,,,,Article,ulrich_heintz@brown.edu
5575,article_6233,1084,PEER,10.1016/j.physletb.2015.10.086,2017-02-16 20:04:54,2017-02-16 20:04:54,Precise measurement of the top quark mass in d...,Physics Letters B,,752,...,,,,,,,,,Article,ulrich_heintz@brown.edu
2524,article_2682,158,PEER,10.1371/journal.pone.0115628,2016-02-15 08:39:16,2016-02-15 08:39:16,Partition and Poliomyelitis: An Investigation ...,PLoS ONE,3.0,10,...,,,,,,,,,Article,stephen_mcgarvey@brown.edu
5110,article_5529,1666,PEER,10.1037/pro0000066,2017-02-10 19:06:45,2017-02-10 19:06:45,Dimensions of treatment quality most valued by...,Professional Psychology: Research and Practice,2.0,47,...,,,,,,,,,Article,anthony_spirito@brown.edu


In [15]:
id_cols = ['id', 'activity_report_id','user_email', 'type', 'title', 'identifier', 'doi']
far_dedupe = df_cites_far[ id_cols ].copy()
far_dedupe['doi'].fillna(far_dedupe['identifier'], inplace=True)
assert len(df_cites_far[df_cites_far.doi.notnull()]) + len(
    df_cites_far[df_cites_far.identifier.notnull()]) == len(far_dedupe[far_dedupe.doi.notnull()])
far_dedupe.drop(columns='identifier',inplace=True)
print("Total records: {}".format(len(far_dedupe)))
far_dedupe.sample(10)

Total records: 9689


Unnamed: 0,id,activity_report_id,user_email,type,title,doi
6995,chapter_141,538,jonathan_conant@brown.edu,Chapter,Romanness in the Age of Attila,
5894,article_6573,1591,brandon_marshall@brown.edu,Article,Effectiveness of peer-led interventions to inc...,10.1080/09540121.2017.1282105
6149,article_6839,1560,david_m_williams@brown.edu,Article,Self-efficacy versus perceived enjoyment as pr...,10.1080/08870446.2015.1111372
5499,article_6152,1084,ulrich_heintz@brown.edu,Article,Measurement of Long-Range Near-Side Two-Partic...,10.1103/physrevlett.116.172302
8258,paper_367,1094,shreyas_mandre@brown.edu,Paper,Vortex wake interactions and energy harvesting...,
899,article_973,131,e_katz@brown.edu,Article,"Anecdotal History: the New Yorker , Joseph Mi...",10.1093/alh/ajv031
8801,paper_976,1288,ruth_bahar@brown.edu,Paper,A HTM-based Mechanism for Error -Resilient and...,
4090,article_4409,1705,eli_adashi@brown.edu,Article,Access to Infertility Services in the United S...,
6462,book_75,161,kenneth_miller@brown.edu,Book,Biology by Miller & Levine (Texas Edition),
1892,article_2022,612,jung-eun_lee@brown.edu,Article,Seasonal decoupling between vegetation greenne...,


In [43]:
far_dedupe.isnull().sum(axis=0)

id                       0
activity_report_id       0
user_email               0
type                     0
title                  110
doi                   3850
dtype: int64

In [44]:
far_dedupe[ far_dedupe.title.isnull() ].sample(10)

Unnamed: 0,id,activity_report_id,user_email,type,title,doi
9365,patent_94,1760,christopher_moore@brown.edu,Patent,,
9350,patent_79,965,jeffrey_hoffstein@brown.edu,Patent,,
9286,patent_11,130,george_karniadakis@brown.edu,Patent,,
9331,patent_57,385,joseph_rovan@brown.edu,Patent,,
9366,patent_95,918,kyung-suk_kim@brown.edu,Patent,,
1523,article_1623,256,gang_xiao@brown.edu,Article,,
9292,patent_17,636,chun_lee@brown.edu,Patent,,
9288,patent_13,130,george_karniadakis@brown.edu,Patent,,
9306,patent_31,225,roberto_tamassia@brown.edu,Patent,,
9360,patent_89,1698,jeffrey_morgan@brown.edu,Patent,,


In [45]:
far_dedupe[ far_dedupe.title.isnull() ].type.value_counts()

Patent     96
Chapter     6
Article     5
Book        3
Name: type, dtype: int64

In [47]:
far_dedupe[ (far_dedupe.title.isnull()) & (far_dedupe.type != 'Patent') ]

Unnamed: 0,id,activity_report_id,user_email,type,title,doi
143,article_163,409,huajian_gao@brown.edu,Article,,
1523,article_1623,256,gang_xiao@brown.edu,Article,,
1611,article_1720,346,esther_whitfield@brown.edu,Article,,
2549,article_2708,339,mark_lurie@brown.edu,Article,,
2944,article_3172,129,nicolaos_kapouleas@brown.edu,Article,,
6508,book_127,351,ronald_martinez@brown.edu,Book,,
6575,book_199,289,susan_miller@brown.edu,Book,,
6613,book_239,671,tayhas_palmore@brown.edu,Book,,
6975,chapter_121,379,christopher_keith@brown.edu,Chapter,,
7353,chapter_512,354,tara_white@brown.edu,Chapter,,


In [48]:
far_dedupe.title.fillna('', inplace=True)

In [49]:
far_dedupe[ far_dedupe.duplicated(subset='doi', keep=False) & (far_dedupe.doi.notnull()) ].sort_values(by='doi')

Unnamed: 0,id,activity_report_id,user_email,type,title,doi
6041,article_6725,1535,vincent_mor@brown.edu,Article,Tube Feeding in US Nursing Home Residents With...,10.1001/jama.2016.9374
6006,article_6688,1530,pedro_gozalo@brown.edu,Article,Tube Feeding in US Nursing Home Residents With...,10.1001/jama.2016.9374
6040,article_6724,1535,vincent_mor@brown.edu,Article,Accountability of Hospitals for Medicare Benef...,10.1001/jamainternmed.2015.6508
2114,article_2253,10,momotazur_rahman@brown.edu,Article,Accountability of Hospitals for Medicare Benef...,10.1001/jamainternmed.2015.6508
4904,article_5320,1566,amal_trivedi@brown.edu,Article,Quality of Care for White and Hispanic Medicar...,10.1001/jamainternmed.2016.0267
...,...,...,...,...,...,...
2620,article_2781,525,omar_galarraga@brown.edu,Article,The dollars and sense of economic incentives t...,10.7448/ias.18.1.20724
6723,book_354,1405,jennifer_johnson1@brown.edu,Book,"The Battle for Algeria: Sovereignty, Health Ca...",10.9783/9780812292008
6609,book_235,6,jennifer_johnson1@brown.edu,Book,"The Battle for Algeria: Sovereignty, Health Ca...",10.9783/9780812292008
6436,book_49,349,seth_rockman@brown.edu,Book,Slavery's Capitalism: A New History of America...,10.9783/9780812293098


In [50]:
print("Matched DOIs with different titles: {}".format(
    len(far_dedupe[ far_dedupe.duplicated(subset='doi', keep=False) & (far_dedupe.doi.notnull()) ]) -\
    len(far_dedupe[ far_dedupe.duplicated(subset=['doi','title'], keep=False) & (far_dedupe.doi.notnull()) ])
))

Matched DOIs with different titles: 54


In [51]:
far_dedupe[ far_dedupe.duplicated(subset=['user_email','title'], keep=False) ].sort_values(by=['user_email','title'])

Unnamed: 0,id,activity_report_id,user_email,type,title,doi
6920,chapter_66,758,adam_pautz@brown.edu,Chapter,Experiences are Representations: An Empirical ...,
7826,chapter_994,1371,adam_pautz@brown.edu,Chapter,Experiences are Representations: An Empirical ...,
1803,article_1932,360,alberto_saal@brown.edu,Article,Two-component mantle melting-mixing model for ...,10.1016/j.gca.2015.10.033
3411,article_3686,1476,alberto_saal@brown.edu,Article,Two-component mantle melting-mixing model for ...,10.1016/j.gca.2015.10.033
14,article_21,535,alexander_gourevitch@brown.edu,Article,The Limits of a Basic Income: Means and Ends o...,10.1515/bis-2016-0008
...,...,...,...,...,...,...
4389,article_4721,1447,zhenchao_qian@brown.edu,Article,Unemployment and the Transition From Separatio...,10.1177/0192513x15600730
1628,article_1737,739,zhenchao_qian@brown.edu,Article,Wealth Inequality among New Immigrants,10.1177/0731121415589138
4383,article_4715,1447,zhenchao_qian@brown.edu,Article,Wealth Inequality among New Immigrants,10.1177/0731121415589138
2650,article_2814,404,zhijin_wu@brown.edu,Article,Establishing Informative Prior for Gene Expres...,10.1007/s12561-016-9172-x


In [52]:
far_match_etr = far_dedupe[ far_dedupe.duplicated(subset=['user_email','title', 'activity_report_id'], keep=False) ]
far_match_etr.type.value_counts()

Article    267
Patent      57
Chapter      9
Book         7
Review       4
Name: type, dtype: int64

In [53]:
far_match_etr[far_match_etr.type == 'Article'].sort_values(by=['user_email','title'])

Unnamed: 0,id,activity_report_id,user_email,type,title,doi
4674,article_5009,1068,amy_greenwald@brown.edu,Article,On revenue-maximizing walrasian equilibria for...,
419,article_468,214,andrea_simmons@brown.edu,Article,Explorations of the unique anuran ear: The con...,10.1121/2.0000132
424,article_473,214,andrea_simmons@brown.edu,Article,Explorations of the unique anuran ear: The con...,10.1121/1.4933440
1467,article_1562,435,benjamin_raphael@brown.edu,Article,On the Sample Complexity of Cancer Pathways Id...,10.1089/cmb.2015.0100
3854,article_4151,1521,christian_huber@brown.edu,Article,Bubble accumulation and its role in the evolut...,10.1038/nature17401
...,...,...,...,...,...,...
5572,article_6230,1084,ulrich_heintz@brown.edu,Article,Study ofBMeson Production inp+PbCollisions ats...,10.1103/physrevlett.116.032301
5497,article_6150,1084,ulrich_heintz@brown.edu,Article,Transverse momentum spectra of inclusive b jet...,10.1016/j.physletb.2016.01.010
5556,article_6214,1084,ulrich_heintz@brown.edu,Article,Transverse momentum spectra of inclusive b jet...,10.1016/j.physletb.2016.01.010
2125,article_2268,244,william_warren_jr@brown.edu,Article,The visual coupling between neighbors in a vir...,10.1167/15.12.747


In [54]:
far_match_etr[ far_match_etr.user_email == 'uriel_cohen_priva@brown.edu']

Unnamed: 0,id,activity_report_id,user_email,type,title,doi


In [55]:
far_match_etr.sort_values(by=['user_email','title'])

Unnamed: 0,id,activity_report_id,user_email,type,title,doi
4674,article_5009,1068,amy_greenwald@brown.edu,Article,On revenue-maximizing walrasian equilibria for...,
7737,chapter_904,1068,amy_greenwald@brown.edu,Chapter,On revenue-maximizing walrasian equilibria for...,
419,article_468,214,andrea_simmons@brown.edu,Article,Explorations of the unique anuran ear: The con...,10.1121/2.0000132
424,article_473,214,andrea_simmons@brown.edu,Article,Explorations of the unique anuran ear: The con...,10.1121/1.4933440
9370,patent_100,1456,anubhav_tripathi@brown.edu,Patent,,
...,...,...,...,...,...,...
5556,article_6214,1084,ulrich_heintz@brown.edu,Article,Transverse momentum spectra of inclusive b jet...,10.1016/j.physletb.2016.01.010
2125,article_2268,244,william_warren_jr@brown.edu,Article,The visual coupling between neighbors in a vir...,10.1167/15.12.747
2130,article_2273,244,william_warren_jr@brown.edu,Article,The visual coupling between neighbors in a vir...,10.1167/15.12.747
9283,patent_8,389,wolfgang_peti@brown.edu,Patent,,


In [56]:
dd_1 = far_dedupe.drop(far_match_etr.index)
assert (len(dd_1) + len(far_match_etr) == len(far_dedupe))

In [57]:
far_match_titles_different_reports = dd_1[ dd_1.duplicated(subset=['user_email','title'], keep=False) ]
far_match_titles_different_reports.type.value_counts()
assert (len(far_match_etr) + len(far_match_et) + len(far_unmatched) == len(far_dedupe))

NameError: name 'far_match_et' is not defined

In [58]:
dedupe_pabst = far_dedupe[ far_dedupe.type.isin(['Abstract','Paper'])]
print("# of Papers and Abstracts: {}".format(len(dedupe_pabst)))
far_dedupe.drop(dedupe_pabst.index, inplace=True)

# of Papers and Abstracts: 0


In [59]:
far_match_etr_2 = far_dedupe[ far_dedupe.duplicated(subset=['user_email','title', 'activity_report_id'], keep=False) ]
far_match_etr_2.type.value_counts()

Article    267
Patent      57
Chapter      9
Book         7
Review       4
Name: type, dtype: int64

In [60]:
far_match_etr_2[far_match_etr_2.type == 'Article'].sort_values(by=['user_email','title'])

Unnamed: 0,id,activity_report_id,user_email,type,title,doi
4674,article_5009,1068,amy_greenwald@brown.edu,Article,On revenue-maximizing walrasian equilibria for...,
419,article_468,214,andrea_simmons@brown.edu,Article,Explorations of the unique anuran ear: The con...,10.1121/2.0000132
424,article_473,214,andrea_simmons@brown.edu,Article,Explorations of the unique anuran ear: The con...,10.1121/1.4933440
1467,article_1562,435,benjamin_raphael@brown.edu,Article,On the Sample Complexity of Cancer Pathways Id...,10.1089/cmb.2015.0100
3854,article_4151,1521,christian_huber@brown.edu,Article,Bubble accumulation and its role in the evolut...,10.1038/nature17401
...,...,...,...,...,...,...
5572,article_6230,1084,ulrich_heintz@brown.edu,Article,Study ofBMeson Production inp+PbCollisions ats...,10.1103/physrevlett.116.032301
5497,article_6150,1084,ulrich_heintz@brown.edu,Article,Transverse momentum spectra of inclusive b jet...,10.1016/j.physletb.2016.01.010
5556,article_6214,1084,ulrich_heintz@brown.edu,Article,Transverse momentum spectra of inclusive b jet...,10.1016/j.physletb.2016.01.010
2125,article_2268,244,william_warren_jr@brown.edu,Article,The visual coupling between neighbors in a vir...,10.1167/15.12.747


In [61]:
len(far_dedupe[ far_dedupe.doi.notnull() ])

4154

In [62]:
#https://stackoverflow.com/questions/46391128/pandas-fillna-using-groupby
far_dedupe['doi'] = far_dedupe.groupby(['title'], sort=False)['doi'].apply(lambda x: x.ffill().bfill())
len(far_dedupe[ far_dedupe.doi.notnull() ])

4154

In [63]:
far_year_unique = far_dedupe.drop_duplicates(subset=['user_email','title','activity_report_id'])
far_year_unique.sort_values(by=['user_email','title'])

Unnamed: 0,id,activity_report_id,user_email,type,title,doi
724,article_792,260,Stan_Zdonik@brown.edu,Article,A Demonstration of the BigDAWG Polystore System,
723,article_791,260,Stan_Zdonik@brown.edu,Article,An Architecture for Compiling UDF-centric Work...,
3400,article_3674,929,Stan_Zdonik@brown.edu,Article,Integrating real-time and batch processing in ...,10.1109/hpec.2016.7761585
3402,article_3676,929,Stan_Zdonik@brown.edu,Article,Interactive Search and Exploration of Waveform...,10.1145/2882903.2899404
3401,article_3675,929,Stan_Zdonik@brown.edu,Article,Larger-than-memory data management on modern s...,10.1145/2933349.2933358
...,...,...,...,...,...,...
7341,chapter_499,404,zhijin_wu@brown.edu,Chapter,Experimental Design and Power Calculation for ...,
7840,chapter_1009,1563,zhijin_wu@brown.edu,Chapter,Experimental Design and Power Calculation for ...,
5836,article_6508,1563,zhijin_wu@brown.edu,Article,Genotype‐based gene signature of glioma risk i...,
5833,article_6505,1563,zhijin_wu@brown.edu,Article,Mitochondrial-Nuclear Interactions Mediate Sex...,10.1534/genetics.116.192328


In [64]:
ar_count = far_year_unique.groupby(['user_email','title','type'])['activity_report_id'].count()
ar_count[ ar_count > 1 ]

user_email                      title                                                                                                                                                           type   
adam_pautz@brown.edu            Experiences are Representations: An Empirical Argument                                                                                                          Chapter    2
alberto_saal@brown.edu          Two-component mantle melting-mixing model for the generation of mid-ocean ridge basalts: Implications for the volatile content of the Pacific upper mantle      Article    2
alexander_gourevitch@brown.edu  The Limits of a Basic Income: Means and Ends of Workplace Democracy                                                                                             Article    2
allan_bower@brown.edu           Thermodynamics, stress, and Stefan-Maxwell diffusion in solids: application to small-strain materials used in commercial lithium-ion batteries           

In [65]:
df_cites_far.iloc[ [2125, 2130, 1654, 4189] ]

Unnamed: 0,id,activity_report_id,article_type_id,identifier,created_at,updated_at,title,journal,number,volume,...,conference,paper_date,patent_status_id,patent_number,patent_title,patent_date,presentation_type_id,abstract_date,type,user_email
2125,article_2268,244,CONF,10.1167/15.12.747,2016-02-02 11:39:46,2016-02-02 11:39:46,The visual coupling between neighbors in a vir...,Journal of Vision,12.0,15,...,,,,,,,,,Article,william_warren_jr@brown.edu
2130,article_2273,244,PEER,,2016-02-02 13:21:19,2016-02-02 13:21:19,The visual coupling between neighbors in a vir...,Proceedings of the International Conference on...,,18,...,,,,,,,,,Article,william_warren_jr@brown.edu
1654,article_1767,315,PEER,10.1002/rcm.7414,2016-01-29 08:58:29,2016-01-29 08:58:29,Identification of double-bond positions in iso...,Rapid Commun. Mass Spectrom.,1.0,30,...,,,,,,,,,Article,yongsong_huang@brown.edu
4189,article_4510,1309,PEER,10.1002/rcm.7414,2017-01-28 16:19:43,2017-01-28 16:19:43,Identification of double-bond positions in iso...,Rapid Communications in Mass Spectrometry,1.0,30,...,,,,,,,,,Article,yongsong_huang@brown.edu


In [66]:
ar_count[ ar_count > 2 ]

Series([], Name: activity_report_id, dtype: int64)

In [86]:
def fuzzy_far(row):
    return process.extract(row.title, row.title_set, scorer=fuzz.partial_ratio)

# fuzzy_vec = np.vectorize(fuzzy_far)

In [92]:
fuzzy_check = defaultdict(set)
for i, row in df_cites_far.iterrows():
    fuzzy_check[row.user_email].add(str(row.title))
fuzzy_check['david_borton@brown.edu']

{'A brain–spine interface alleviating gait deficits after spinal cord injury in primates',
 'Advances in Retinal Prosthetic Research: A Systematic Review of Engineering and Clinical Characteristics of Current Prosthetic Initiatives',
 'Modified toolbox for optogenetics in the nonhuman primate',
 'Rewiring the Nervous System, Without Wires',
 'Wireless Neurotechnology for Neural Prostheses'}

In [93]:
faculty_titles = pd.Series(fuzzy_check)
df_faculty = far_dedupe.set_index('user_email')
df_faculty

Unnamed: 0_level_0,id,activity_report_id,type,title,doi
user_email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
david_borton@brown.edu,article_6,15,Article,Modified toolbox for optogenetics in the nonhu...,10.1117/1.nph.2.3.031202
matthew_harrison@brown.edu,article_7,25,Article,Spatiotemporal Conditional Inference and Hypot...,10.1162/neco_a_00681
brandon_johnson@brown.edu,article_9,760,Article,Impact jetting as the origin of chondrules,10.1038/nature14105
brandon_johnson@brown.edu,article_10,760,Article,The fractured Moon: Production and saturation ...,10.1002/2015gl065022
matthew_harrison@brown.edu,article_11,25,Article,Ambiguity and nonidentifiability in the statis...,10.1073/pnas.1506400112
...,...,...,...,...,...
anubhav_tripathi@brown.edu,patent_100,1456,Patent,,
anubhav_tripathi@brown.edu,patent_101,1456,Patent,,
anubhav_tripathi@brown.edu,patent_102,1456,Patent,,
anubhav_tripathi@brown.edu,patent_103,1456,Patent,,


In [94]:
df_faculty['title_set'] = faculty_titles
df_faculty.isnull().sum(axis=0)

id                       0
activity_report_id       0
type                     0
title                    0
doi                   3850
title_set                0
dtype: int64

In [95]:
df_faculty.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8004 entries, david_borton@brown.edu to anubhav_tripathi@brown.edu
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  8004 non-null   object
 1   activity_report_id  8004 non-null   object
 2   type                8004 non-null   object
 3   title               8004 non-null   object
 4   doi                 4154 non-null   object
 5   title_set           8004 non-null   object
dtypes: object(6)
memory usage: 437.7+ KB


In [96]:
df_faculty['scored_matches'] = df_faculty.apply(fuzzy_far, axis=1)





In [97]:
df_faculty.head()

Unnamed: 0_level_0,id,activity_report_id,type,title,doi,title_set,scored_matches
user_email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
david_borton@brown.edu,article_6,15,Article,Modified toolbox for optogenetics in the nonhu...,10.1117/1.nph.2.3.031202,"{Rewiring the Nervous System, Without Wires, A...",[(Modified toolbox for optogenetics in the non...
matthew_harrison@brown.edu,article_7,25,Article,Spatiotemporal Conditional Inference and Hypot...,10.1162/neco_a_00681,{Closed Loop Intracortical Brain Computer Inte...,[(Spatiotemporal Conditional Inference and Hyp...
brandon_johnson@brown.edu,article_9,760,Article,Impact jetting as the origin of chondrules,10.1038/nature14105,{The reduction of friction in long runout land...,"[(Impact jetting as the origin of chondrules, ..."
brandon_johnson@brown.edu,article_10,760,Article,The fractured Moon: Production and saturation ...,10.1002/2015gl065022,{The reduction of friction in long runout land...,[(The fractured Moon: Production and saturatio...
matthew_harrison@brown.edu,article_11,25,Article,Ambiguity and nonidentifiability in the statis...,10.1073/pnas.1506400112,{Closed Loop Intracortical Brain Computer Inte...,[(Ambiguity and nonidentifiability in the stat...


In [125]:
far_dedupe.groupby('user_email')['title'].transform(
    lambda x: process.extract(x, fuzzy_check[far_dedupe.loc[x.index].user_email.values[0]], scorer=fuzz.partial_ratio))

TypeError: expected string or bytes-like object

In [None]:
df_far_titles['matches'] = df_far_titles.title.apply(lambda x: fuzzy_far(x.lower().strip(), fuzzy_check))

In [None]:
df_far_titles['matches'] = fuzzy_far(df_far_titles.title, fuzzy_check)

## VIVO Data
[back](#Contents)

In [None]:
with open('data/rab/query_properties.csv') as f:
    data = f.readlines()
    # skip header, strip trailing whitespace
    fin_cite_props = [ d.strip() for d in data[1:] ]

In [None]:
with open('data/rab/query_citations.nt') as f:
    fin_rab_cites = f.readlines()

In [None]:
cite_prop_map = { c: c[40:] for c in fin_cite_props if c.startswith('http://vivo.brown.edu/ontology/citation#') }
cite_prop_map['rabid'] = 'rabid'
cite_prop_map['http://www.w3.org/2000/01/rdf-schema#label'] = 'label'
cite_prop_map['http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType'] = 'type'
cite_prop_map.values()

In [None]:
RABCitation = namedtuple('RABCitation', sorted(cite_prop_map.values()))

In [None]:
def clean_data_prop(oData):
    return oData.rsplit('"^^<http://www.w3.org/2001/XMLSchema#', maxsplit=1)[0].strip('\"\n .<>')

In [None]:
def parse_triple(rawRow):
    s,p,o = rawRow.split(' ',maxsplit=2)
    s = s.strip('<>')
    p = p.strip('<>')
    o = clean_data_prop(o)
    return (s,p,o)

In [None]:
cite_triples = []
for t in fin_rab_cites:
    cite_triples.append(parse_triple(t))

cite_triples[0]

In [None]:
# Analyzing citations with more than 1 most specific type

no_ids = [ c for c in cite_triples
          if c[1] == 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType' 
          and c[2] == 'http://vivo.brown.edu/ontology/citation#NoID' ]
msts = defaultdict(set)
for c in cite_triples:
    if c[1] == 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType':
        msts[c[0]].add(c[2])
mlts = set()
for m,v in msts.items():
    if len(v) != 1:
        mlts.add(frozenset(v))

no_id = 'http://vivo.brown.edu/ontology/citation#NoID'        
for m in mlts:
    if no_id in m:
        print("With NoID: ", [ a for a in m if a != no_id])
    else:
        print("Redundant types: ". sorted(list(m)))

In [None]:
def triple_match(triple, prop=None, obj=None):
    if prop and obj:
        return triple[1] == prop and triple[2] == obj
    if prop:
        return triple[1] == prop
    if obj:
        return triple[2] == obj
    return True

In [None]:
def filter_mst_no_id(triple):
    return not triple_match(triple,
                            'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
                            'http://vivo.brown.edu/ontology/citation#NoID')

good_triple = ('foo', 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType', 'bar')
bad_triple = ('foo', 'http://vitro.mannlib.cornell.edu/ns/vitro/0.7#mostSpecificType',
              'http://vivo.brown.edu/ontology/citation#NoID')
assert filter_mst_no_id(good_triple) == True
assert filter_mst_no_id(bad_triple) == False

In [None]:
strip_msts = [ t for t in cite_triples if filter_mst_no_id(t) ]

In [None]:
cite_dicts = defaultdict(dict)
for t in strip_msts:
    if t[1] in cite_prop_map:
        cite_dicts[t[0]][cite_prop_map[t[1]]] = t[2]

In [None]:
empty_row = { cite_prop_map[p]: '' for p in cite_prop_map }
rab_rows = []
for c in cite_dicts:
    d = cite_dicts[c]
    d['rabid'] = c
    row = empty_row.copy()
    row.update(d)
    rab_rows.append(RABCitation(**row))

print(rab_rows[0])

In [None]:
df_cites_rab = pd.DataFrame(rab_rows)

cols = df_cites_rab.columns.tolist()
id_atts = [ 'rabid','type','label','doi','pmid','pmcid','isbn','issn' ]
common_atts = [ 'date','authorList','pages','issue','volume' ]
has_atts = ['hasContributor','hasVenue','hasConference','hasConferenceLocation',
            'hasCountry','hasLocation','hasPublisher','hasAssignee','hasAuthority']
grouped_atts = id_atts + common_atts + has_atts
cols = [ c for c in cols if c not in grouped_atts ]
cols = id_atts + common_atts + cols + has_atts
df_cites_rab = df_cites_rab[ cols ]
df_cites_rab.type = df_cites_rab.type.str.rsplit('#').str.get(1)
df_cites_rab.replace(r'^$', np.nan, inplace=True, regex=True)
df_cites_rab.head()

In [None]:
df_cites_rab.info()

## 3rd-party IDs

In [None]:
df_cites_rab.type.value_counts()

In [None]:
with_ids = len(df_cites_rab[ ((df_cites_rab.pmid.notnull()) | (df_cites_rab.doi.notnull())) ])
print("R@B Citations with DOIs or PMIDs: ", with_ids )
print("R@B Citations without: ", len(df_cites_rab) - with_ids)

In [None]:
df_cites_rab[ ((df_cites_rab.pmid.isna()) & (df_cites_rab.doi.isna())) ].type.value_counts()

In [None]:
df_cites_rab[ ((df_cites_rab.pmid.isna()) & (df_cites_rab.doi.isna())
               & (df_cites_rab.isbn.notnull())) ].type.value_counts()

In [None]:
df_cites_far.type.value_counts()

In [None]:
with_ids = len(df_cites_far[ df_cites_far.doi.notnull() | df_cites_far.identifier.notnull() ])
print("FAR Citations with DOIs or PMIDs: ", with_ids )
print("FAR Citations without: ", len(df_cites_far) - with_ids)

In [None]:
df_cites_far[ df_cites_far.doi.notnull() | df_cites_far.identifier.notnull() ].type.value_counts()

In [None]:
df_cites_far[ df_cites_far.doi.notnull() | df_cites_far.identifier.notnull() ].type.value_counts()

In [None]:
df_cites_far[ df_cites_far.identifier.notnull() ].article_id_type_id.value_counts()

In [None]:
rab_doi_map = { d.lower(): d for d in df_cites_rab[df_cites_rab.doi.notnull()].doi }
far_doi_map = { d.lower(): d for d in df_cites_far[ df_cites_far.identifier.notnull() ].identifier }
far_doi_map.update(
    { d.lower(): d for d in df_cites_far[ df_cites_far.doi.notnull() ].doi } )

rab_dois = set(rab_doi_map.keys())
far_dois = set(far_doi_map.keys())

In [None]:
print("RAB DOIs: ",len(rab_dois))
print("FAR DOIs: ",len(far_dois))
print("Shared DOIs: ", len(rab_dois & far_dois))

In [None]:
only_in_far = { far_doi_map[d] for d  in far_dois - rab_dois }
assert len(only_in_far) == len(far_dois) - len(rab_dois & far_dois)

In [None]:
df_far_dois = df_cites_far[ ((df_cites_far.identifier.isin(only_in_far)) | (df_cites_far.doi.isin(only_in_far)))]

In [None]:
df_far_dois.groupby('identifier').identifier.count().nlargest(50)