In [12]:
import csv
import gzip
import html
import pandas as pd
import simplejson as json

from os.path import join

# Root directory of the local MAG dump. This notebook reads
# 'authorid_to_paperid.json' and '202009/mag/Papers.txt.gz' relative to it.
MAG_DIR = '/home/qke100/ke-data/dataset-MAG/'

In [2]:
# Load the people table from HDF5 and display its (rows, columns) shape.
people_df = pd.read_hdf('results/people_df.h5')
people_df.shape

(774733, 18)

In [3]:
# Show the first row to inspect the available columns.
people_df.head(1)

Unnamed: 0,pid,firstname,middlename,lastname,degrees,location,locid,majorarea,orcid,firstname_norm,middlename_norm,lastname_norm,orcid_norm,MAGInstitution,MAGInstitutionID,magaid_coauthorship,magaid_name_insti,magaid
0,1,Stephen,V.,David,Ph.D.,Oregon Health and Science University,226,"neuro,csd,bme",0000-0003-4135-3104,STEPHEN,V,DAVID,0000-0003-4135-3104,OREGON HEALTH & SCIENCE UNIVERSITY,165690674,2171827615,,2171827615


## MAG authorship

In [4]:
def load_mag_author_pub(mag_aids=None):
    """Return a dict mapping MAG author ID -> list of MAG paper IDs.

    Reads ``authorid_to_paperid.json`` under ``MAG_DIR`` line by line; every
    line is parsed as a JSON pair ``[author_id, paper_ids]``.

    Parameters
    ----------
    mag_aids : collection of author IDs, or None
        Authors to keep. A list or tuple is converted to a set so membership
        tests are constant-time. ``None`` (the default) keeps every author in
        the file.

    Returns
    -------
    dict
        ``{author_id: paper_ids}`` for the selected authors that occur in the
        file. Requested authors that do not occur in the file are absent.
    """
    if mag_aids is not None:
        if isinstance(mag_aids, (list, tuple)):
            mag_aids = set(mag_aids)
        # Progress output: number of requested authors. Skipped when no filter
        # is given, because len(None) would raise TypeError.
        print(len(mag_aids))
    result = {}
    # The context manager guarantees the file is closed, even on a parse error.
    with open(join(MAG_DIR, 'authorid_to_paperid.json')) as fin:
        for line in fin:
            aid, pubs = json.loads(line)
            if mag_aids is None or aid in mag_aids:
                result[aid] = pubs
    return result

# Fetch the paper lists for every distinct non-empty MAG author ID in
# people_df, then show how many of those authors were found in the file.
mag_aid_to_pubs = load_mag_author_pub(set(e for e in people_df.magaid if e != ''))
len(mag_aid_to_pubs)

491678


491678

In [5]:
def write_authorship():
    """Write one (PID, MAGPaperID) row per person-paper pair to CSV.

    Iterates over ``people_df`` and, for each person whose ``magaid`` is not
    the empty string, emits one row per paper listed for that author in
    ``mag_aid_to_pubs``. People with an empty ``magaid`` are skipped.

    Output goes to ``dataset/authorship.csv`` with the header
    ``PID,MAGPaperID``.

    Raises
    ------
    KeyError
        If a non-empty ``magaid`` is missing from ``mag_aid_to_pubs``.
    """
    # newline='' is required by the csv module: the writer emits its own line
    # terminator, and newline translation would otherwise corrupt it on
    # platforms that rewrite '\n'.
    with open('dataset/authorship.csv', 'w', newline='') as fout:
        writer = csv.writer(fout, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['PID', 'MAGPaperID'])
        for pid, magaid in zip(people_df.pid, people_df.magaid):
            if magaid == '':
                continue
            writer.writerows([pid, paper_id] for paper_id in mag_aid_to_pubs[magaid])

# Materialize dataset/authorship.csv from people_df and mag_aid_to_pubs.
write_authorship()

## MAG paper metadata (paper ID, DOI, PMID)

In [6]:
# Read the authorship table back with every column as str, so paper IDs can
# be compared directly against the text fields parsed from Papers.txt.gz.
authorship_df = pd.read_csv('dataset/authorship.csv', dtype=str)
authorship_df.shape

(26378069, 2)

In [9]:
def get_paper_meta_df():
    """Build a (magpubid, doi) table for every paper listed in ``authorship_df``.

    Streams ``202009/mag/Papers.txt.gz`` under ``MAG_DIR`` as tab-separated
    text. For each line, the first field is taken as the paper ID and the
    third field, lowercased, as the DOI. Only papers whose ID appears in
    ``authorship_df.MAGPaperID`` are kept.

    Prints the number of distinct paper IDs being looked up.

    Returns
    -------
    pandas.DataFrame
        Columns ``magpubid`` and ``doi``, one row per matched paper. ``doi``
        is whatever the third field holds, including the empty string.
    """
    mag_pubids = set(authorship_df.MAGPaperID)
    print(len(mag_pubids))
    result = {}
    # The context manager closes the gzip handle even if a malformed line
    # raises part-way through the scan.
    with gzip.open(join(MAG_DIR, '202009/mag/Papers.txt.gz'), 'rt') as fin:
        for line in fin:
            fields = line.rstrip('\n').split('\t')
            pubid, doi = fields[0], fields[2].lower()
            if pubid in mag_pubids:
                result[pubid] = doi
    return pd.DataFrame(list(result.items()), columns=['magpubid', 'doi'])

# Build the paper table (one row per matched MAG paper) and show its shape.
mag_paper_meta_df = get_paper_meta_df()
mag_paper_meta_df.shape

16942415


(16942415, 2)

In [10]:
# Count the papers whose DOI is the empty string.
(mag_paper_meta_df.doi == '').sum()

3328480

In [13]:
def clean_doi(x):
    """Normalize a DOI string for matching.

    Steps, in order: strip surrounding whitespace, lowercase, decode HTML
    entities, drop one leading ``https://doi.org/`` or ``http://doi.org/``
    resolver prefix, then replace each non-overlapping ``//`` with ``/``.
    """
    resolver_prefixes = ('https://doi.org/', 'http://doi.org/')
    doi = html.unescape(x.strip().lower())
    # First matching prefix wins; '' means there is nothing to strip.
    matched = next((p for p in resolver_prefixes if doi.startswith(p)), '')
    return doi[len(matched):].replace('//', '/')

def add_pmid_from_doi():
    """Add a ``pmid`` column to ``mag_paper_meta_df`` by matching on DOI.

    Reads ``pmid_doi.json`` line by line; every line is parsed as a JSON pair
    ``[pmid, doi]``. Each DOI is normalized with ``clean_doi`` and kept only
    if it is non-empty and occurs among the non-empty DOIs of
    ``mag_paper_meta_df``. If several lines share a normalized DOI, the last
    one read wins.

    Prints the number of distinct non-empty DOIs in the table, then the number
    of DOIs for which a PMID was found.

    Side effects
    ------------
    Mutates the module-level ``mag_paper_meta_df`` in place: rows without a
    match (including rows with an empty DOI) get ``''`` in ``pmid``.
    """
    dois = set(e for e in mag_paper_meta_df.doi if e != '')
    print(len(dois))
    result = {}
    # The context manager guarantees the file is closed, even on a parse error.
    with open('pmid_doi.json') as fin:
        for line in fin:
            pmid, doi = json.loads(line)
            doi = clean_doi(doi)
            if doi != '' and doi in dois:
                result[doi] = pmid
    print(len(result))
    mag_paper_meta_df['pmid'] = mag_paper_meta_df.doi.apply(lambda x: result.get(x, ''))

# Attach the pmid column in place, then confirm the table gained a column.
add_pmid_from_doi()
mag_paper_meta_df.shape

13479211
5148898


(16942415, 3)

In [14]:
# Persist the paper table without the index, writing the columns under the
# headers MAGPaperID, DOI and PMID.
mag_paper_meta_df.to_csv('dataset/paper.csv', header=['MAGPaperID', 'DOI', 'PMID'], index=False)