In [1]:
import csv
import gzip
import html
import pandas as pd
import simplejson as json

from os.path import join

# Directory (relative to the working directory) holding the authorPub*.csv inputs.
DATA_DIR = 'raw_data/'
# Root of the MAG dump; 202009/mag/Papers.txt.gz is read from below this directory.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
MAG_DIR = '/home/qke100/ke-data/dataset-MAG/'
# Directory holding ORCID_2020_10_parsed.json.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
ORCID_DIR = '/home/qke100/ke-data/dataset-ORCID/'

## Validated papers by AFT users

In [2]:
# Print the first six lines (header plus five rows) of the raw author-publication CSV.
! head -6 raw_data/authorPub.csv

"pubid","pid","score_total","score_human","score_coa","score_1st","pmid","doi","s2id","citations"
"32","2","1","1","1","1","25671436","10.1371/journal.pone.0117057","9c9e744359358a927e8098416a38a9fe92cb751f","59"
"32","61537","1","0","1","1","25671436","10.1371/journal.pone.0117057","9c9e744359358a927e8098416a38a9fe92cb751f","59"
"32","513408","0.01","0","0.01","0.01","25671436","10.1371/journal.pone.0117057","9c9e744359358a927e8098416a38a9fe92cb751f","59"
"33","2","1","1","1","0.56","25619657","10.1016/j.neuron.2014.12.050","6ef6b76976fbd5befa34c78091d563c05ee209db","106"
"33","7402","1","1","1","0.01","25619657","10.1016/j.neuron.2014.12.050","6ef6b76976fbd5befa34c78091d563c05ee209db","106"


In [3]:
def clean_doi(x):
    """Normalize a raw DOI string into a bare, lower-case DOI.

    The input is stripped of surrounding whitespace, lower-cased and
    HTML-unescaped. At most one leading resolver URL (``doi.org`` or
    ``dx.doi.org``, http or https) or ``doi:`` label is then removed, and
    finally every ``//`` is collapsed to ``/``.

    Args:
        x: Raw DOI text, possibly written as a resolver URL.

    Returns:
        The cleaned DOI string; empty if ``x`` is empty or only whitespace.
    """
    xs = html.unescape(x.strip().lower())
    prefixes = (
        'https://doi.org/', 'http://doi.org/',
        'https://dx.doi.org/', 'http://dx.doi.org/',
        'doi:',
    )
    for s in prefixes:
        if xs.startswith(s):
            # lstrip() also covers the "doi: 10.x" spelling.
            xs = xs[len(s):].lstrip()
            break
    # The prefix must be gone before this step: collapsing "//" on a value
    # that still carries a URL scheme would turn "http://" into "http:/".
    return xs.replace('//', '/')

def get_val_paper_df(in_paths=('authorPub.csv', 'authorPub2.csv', 'authorPub3.csv'), data_dir=None):
    """Collect the human-validated papers from the author-publication CSVs.

    A row is kept when its fourth column (``score_human`` in the file header)
    is not ``'NULL'`` and equals 1. For every kept ``pubid`` the PubMed ID
    (column 7) and the cleaned DOI (column 8) are stored; ``'NULL'`` and
    ``'0'`` are mapped to the empty string. If a ``pubid`` is seen more than
    once, the last kept row wins.

    Args:
        in_paths: File names to read, in order, relative to ``data_dir``.
        data_dir: Directory holding the files; defaults to ``DATA_DIR``.

    Returns:
        A DataFrame with string columns ``pubid``, ``pmid`` and ``doi``,
        ordered by the integer value of ``pubid``.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    result = {}
    for in_path in in_paths:
        print(in_path)
        # The csv module asks for newline='' so that quoted fields containing
        # line breaks are parsed correctly.
        with open(join(data_dir, in_path), newline='') as fin:
            reader = csv.reader(fin, delimiter=',')
            next(reader, None)  # skip the header row
            for row in reader:
                if row[3] != 'NULL' and int(row[3]) == 1:
                    pubid, pmid, doi = row[0], row[6], row[7]
                    if pmid in ['NULL', '0']:
                        pmid = ''
                    doi = '' if doi in ['NULL', '0'] else clean_doi(doi)
                    result[pubid] = (pmid, doi)
    print(len(result))
    # pubid values are strings; sort them numerically, not lexicographically.
    rows = [[pubid] + list(result[pubid]) for pubid in sorted(result, key=int)]
    return pd.DataFrame(rows, columns=['pubid', 'pmid', 'doi'])

# Build the validated-paper table; later cells add columns to this frame in place.
val_paper_df = get_val_paper_df()
val_paper_df.shape

authorPub.csv
authorPub2.csv
authorPub3.csv
303901


(303901, 3)

In [4]:
# For each column, count the rows whose value is the empty string (used here as "missing").
[(c, (val_paper_df[c] == '').sum()) for c in val_paper_df.columns]

[('pubid', 0), ('pmid', 78612), ('doi', 53376)]

In [5]:
# Number of papers that have a PubMed ID but no DOI.
((val_paper_df.pmid != '') & (val_paper_df.doi == '')).sum()

33631

In [6]:
def add_doi_from_pmid():
    """Add a ``doi_pm`` column to ``val_paper_df``: the DOI looked up by PubMed ID.

    Reads ``pmid_doi.json`` from the working directory. Each line must be a
    JSON value that unpacks into ``(pmid, doi)``. DOIs are passed through
    ``clean_doi``. Rows whose ``pmid`` is not in the mapping get ``''``.

    Side effects:
        Prints the number of PubMed IDs loaded and mutates the module-level
        ``val_paper_df`` in place.
    """
    result = {}
    # Context manager so the handle is closed even if a line fails to parse.
    with open('pmid_doi.json') as fin:
        for line in fin:
            pmid, doi = json.loads(line)
            result[pmid] = clean_doi(doi)
    print(len(result))
    val_paper_df['doi_pm'] = val_paper_df.pmid.apply(lambda x: result.get(x, ''))

# Adds the 'doi_pm' column to val_paper_df in place; the shape confirms the extra column.
add_doi_from_pmid()
val_paper_df.shape

22782425


(303901, 4)

In [7]:
# Spot-check rows that have a PubMed ID and a CSV-provided DOI that differs from
# the PubMed-derived one. A fixed random_state keeps the displayed sample stable
# across Restart & Run All.
val_paper_df[
    (val_paper_df.pmid != '') & (val_paper_df.doi != '') & (val_paper_df.doi != val_paper_df.doi_pm)
].sample(10, random_state=0)

Unnamed: 0,pubid,pmid,doi,doi_pm
17000,320425,11204098,10.1037/0278-7393.27.1.202,
1925,6169,3419829,10.1111/j.1475-1313.1988.tb01083.x,
44901,1282858,24319930,10.1163/22134808-00002421,
288752,22342856,32334328,10.1016/j.plefa.2020.102093,
152707,5768469,26325876,10.1167/15.12.188,
162319,6400528,14600497,10.1097/01.wnr.0000097047.56589.a3,10.1097/00001756-200311140-00010
97794,3558374,11277771,10.5555/ol007052d,
148646,5535598,26325752,10.1167/15.12.64,
7112,70270,6363735,10.1001/jama.1984.03340320027021,
74301,2952745,16754761,10.1176/appi.ps.57.6.838,10.1176/ps.2006.57.6.838


In [8]:
def normalize_doi(pmid, doi, doi_pm):
    """Choose the DOI to use for a paper.

    The PubMed-derived DOI ``doi_pm`` is preferred whenever the paper has a
    PubMed ID and ``doi_pm`` is non-empty; in every other case the paper's
    own ``doi`` is returned (which may itself be empty).
    """
    no_pubmed_doi = pmid == '' or doi_pm == ''
    if no_pubmed_doi:
        return doi
    return doi_pm

def add_doi_norm():
    """Attach a ``doi_norm`` column to ``val_paper_df``.

    Each value is ``normalize_doi`` applied to that row's ``pmid``, ``doi``
    and ``doi_pm``. The module-level frame is modified in place.
    """
    columns = (val_paper_df.pmid, val_paper_df.doi, val_paper_df.doi_pm)
    doi_norm = []
    for pmid, doi, doi_pm in zip(*columns):
        doi_norm.append(normalize_doi(pmid, doi, doi_pm))
    val_paper_df['doi_norm'] = doi_norm

# Adds the 'doi_norm' column to val_paper_df in place; the shape confirms the extra column.
add_doi_norm()
val_paper_df.shape

(303901, 5)

In [9]:
# Papers that still have no DOI after combining the CSV and PubMed-derived values.
(val_paper_df.doi_norm == '').sum()

29573

## ORCID papers

In [10]:
# Load the previously saved people table; its 'orcid_norm' column is used below
# to select ORCID records.
people_df = pd.read_hdf('results/people_df.h5')
people_df.shape

(774733, 18)

In [11]:
# Display a single row to show the available columns.
people_df.head(1)

Unnamed: 0,pid,firstname,middlename,lastname,degrees,location,locid,majorarea,orcid,firstname_norm,middlename_norm,lastname_norm,orcid_norm,MAGInstitution,MAGInstitutionID,magaid_coauthorship,magaid_name_insti,magaid
0,1,Stephen,V.,David,Ph.D.,Oregon Health and Science University,226,"neuro,csd,bme",0000-0003-4135-3104,STEPHEN,V,DAVID,0000-0003-4135-3104,OREGON HEALTH & SCIENCE UNIVERSITY,165690674,2171827615,,2171827615


In [12]:
def load_orcid_pub():
    """Load the ORCID-listed works of everyone in ``people_df`` with an ORCID iD.

    Reads ``ORCID_2020_10_parsed.json`` from ``ORCID_DIR``. Each line must be
    a JSON value that unpacks into ``(orcid, name, pubs)``, where ``pubs`` is
    an iterable of dicts with a ``'title'`` key and an optional ``'doi'`` key.
    Only records whose ``orcid`` appears among the non-empty
    ``people_df.orcid_norm`` values are kept.

    Returns:
        A DataFrame with columns ``orcid``, ``title`` and ``doi``, one row
        per listed work. ``doi`` is lower-cased, or ``''`` when absent.

    Side effects:
        Prints the number of distinct non-empty ORCID iDs in ``people_df``.
    """
    orcids = set(e for e in people_df.orcid_norm if e != '')
    print(len(orcids))
    result = []
    # Context manager so the dump's file handle is always closed.
    with open(join(ORCID_DIR, 'ORCID_2020_10_parsed.json')) as fin:
        for line in fin:
            orcid, _name, pubs = json.loads(line)
            if orcid in orcids:
                for pub in pubs:
                    # `or ''` also covers an explicit null DOI, on which
                    # .lower() would otherwise raise AttributeError.
                    # NOTE(review): DOIs are only lower-cased here, not run
                    # through clean_doi - confirm that is intended.
                    doi = pub.get('doi') or ''
                    result.append([orcid, pub['title'], doi.lower()])
    return pd.DataFrame(result, columns=['orcid', 'title', 'doi'])

# One row per (ORCID iD, listed work); the printed number is the count of ORCID iDs searched for.
orcid_pub_df = load_orcid_pub()
orcid_pub_df.shape

1659


(57352, 3)

In [13]:
# Number of distinct ORCID iDs that contributed at least one work row.
orcid_pub_df.orcid.nunique()

1471

## Add MAG pubid

In [14]:
# Union of all non-empty DOIs from the validated papers and the ORCID works;
# this is the lookup set used when scanning the MAG papers file.
all_dois = set(e for e in val_paper_df.doi_norm if e != '') | set(e for e in orcid_pub_df.doi if e != '')
len(all_dois)

299807

In [15]:
def load_doi_mag_pubid(dois, in_path=None):
    """Map DOIs to MAG paper IDs by scanning the gzipped MAG papers file.

    Each line is split on tabs; the first field is taken as the MAG paper ID
    and the third field, lower-cased, as the DOI. If several lines carry the
    same DOI, the last one read wins.

    Args:
        dois: Container of lower-case DOIs to look for (a set is advisable,
            since membership is tested once per line).
        in_path: Path of the gzipped, tab-separated papers file. Defaults to
            ``202009/mag/Papers.txt.gz`` under ``MAG_DIR``.

    Returns:
        dict mapping each DOI that was found to its MAG paper ID (a string).
    """
    if in_path is None:
        in_path = join(MAG_DIR, '202009/mag/Papers.txt.gz')
    result = {}
    # Context manager so the gzip handle is closed even if a line is malformed.
    with gzip.open(in_path, 'rt') as fin:
        for line in fin:
            fields = line.rstrip('\n').split('\t')
            mag_pubid, doi = fields[0], fields[2].lower()
            if doi in dois:
                result[doi] = mag_pubid
    return result

# Scan the MAG papers file once for every DOI of interest; the length is the number of DOIs found.
doi_to_mag_pubid = load_doi_mag_pubid(all_dois)
len(doi_to_mag_pubid)

292585

For validated papers:

In [16]:
# Attach the MAG paper ID via the normalized DOI; '' marks papers with no MAG match.
val_paper_df['mag_pubid'] = val_paper_df.doi_norm.apply(lambda x: doi_to_mag_pubid.get(x, ''))
val_paper_df.shape

(303901, 6)

In [17]:
# Validated papers without a MAG paper ID (includes those that had no DOI at all).
(val_paper_df.mag_pubid == '').sum()

34352

In [18]:
# Persist the validated-paper table; mode='w' overwrites any existing file.
val_paper_df.to_hdf('results/validation_paper_aft.h5', key='df', mode='w')

ORCID:

In [19]:
# Attach the MAG paper ID via the lower-cased ORCID DOI; '' marks works with no MAG match.
orcid_pub_df['mag_pubid'] = orcid_pub_df.doi.apply(lambda x: doi_to_mag_pubid.get(x, ''))
orcid_pub_df.shape

(57352, 4)

In [20]:
# Persist the ORCID works table; mode='w' overwrites any existing file.
# The PyTables warning shown below is about the object-dtype columns being pickled.
orcid_pub_df.to_hdf('results/validation_paper_orcid.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['orcid', 'title', 'doi', 'mag_pubid'], dtype='object')]

  encoding=encoding,
