In [1]:
import pandas as pd
import random
import simplejson as json
import unidecode

from itertools import combinations, product
from os.path import join

DATA_DIR = '/home/qke100/su-data/academic-tree/'

## Name variation

In [2]:
# Load the people table; the bare last expression displays its (rows, columns) shape.
people_df = pd.read_hdf('dataset/people_df.h5')
people_df.shape

(759676, 10)

In [3]:
people_df.sample(5)

Unnamed: 0,pid,firstname,middlename,lastname,degrees,majorarea,orcid,orcid_norm,firstname_norm,gender
569663,607467,Tom,,Widiger,,neuro,,,Tom,M
102489,114667,Julia,Q.,Davis,Ph.D.,neuro,,,Julia,F
467247,503836,Sied,W.,Janna,Ph.D.,biomech,,,Sied,M
280518,310960,Sylvia,,Linan-Thompson,,educ,,,Sylvia,F
74043,83631,Juan,,Bornman,Ph.D.,csd,,,Juan,M


In [4]:
people_df.dtypes

pid               object
firstname         object
middlename        object
lastname          object
degrees           object
majorarea         object
orcid             object
orcid_norm        object
firstname_norm    object
gender            object
dtype: object

In [5]:
def get_at_name_var(first, middle, last):
    """Build the spelling variations of one person's name.

    All three parts are upper-cased, periods are removed from `first` and
    `middle`, and a leading ``'SIR '`` is dropped from the first name.

    Returns a dict mapping a variation code to a tuple of name parts:

    * ``'full'``     -- ``(first, middle, last)``; always present.
    * ``'mid_init'`` -- middle name cut to its first character; only when the
      first name and the middle name are both longer than one character.
    * ``'no_mid'``   -- ``(first, last)``; only when the first name is longer
      than one character and the middle name is non-empty.

    When transliterating any part with ``unidecode`` changes it, the same
    entries are added again for the transliterated parts under the
    ``'full_norm'`` / ``'mid_init_norm'`` / ``'no_mid_norm'`` codes, subject to
    the same length rules.
    """
    def add_variants(out, codes, given, mid, family):
        full_code, init_code, bare_code = codes
        out[full_code] = (given, mid, family)
        if len(given) <= 1:
            # An initial-only (or empty) first name gets no shortened forms.
            return
        if len(mid) > 1:
            out[init_code] = (given, mid[:1], family)
        if mid:
            out[bare_code] = (given, family)

    given = first.upper().replace('.', '')
    if given.startswith('SIR '):
        given = given[4:]
    mid = middle.upper().replace('.', '')
    family = last.upper()

    variants = {}
    add_variants(variants, ('full', 'mid_init', 'no_mid'), given, mid, family)
    plain = tuple(unidecode.unidecode(part) for part in (given, mid, family))
    if plain != (given, mid, family):
        add_variants(variants, ('full_norm', 'mid_init_norm', 'no_mid_norm'), *plain)
    return variants

# Spot-check get_at_name_var on one example per case: middle initial, no middle
# name, multi-letter middle name, 'Sir' prefix, initial-only first name, and
# names containing accented characters.
print(
    get_at_name_var('Benjamin', 'Y.', 'Hayden'),
    get_at_name_var('Benjamin', '', 'Willmore'),
    get_at_name_var('John', 'HR', 'Maunsell'),
    get_at_name_var('Sir John', 'Carew', 'Eccles'),
    get_at_name_var('J.', 'David', 'Sweatt'),
    get_at_name_var('Jörn', '', 'Diedrichsen'),
    get_at_name_var('Rodolfo', 'R.', 'Llinás'),
    sep='\n')

{'full': ('BENJAMIN', 'Y', 'HAYDEN'), 'no_mid': ('BENJAMIN', 'HAYDEN')}
{'full': ('BENJAMIN', '', 'WILLMORE')}
{'full': ('JOHN', 'HR', 'MAUNSELL'), 'mid_init': ('JOHN', 'H', 'MAUNSELL'), 'no_mid': ('JOHN', 'MAUNSELL')}
{'full': ('JOHN', 'CAREW', 'ECCLES'), 'mid_init': ('JOHN', 'C', 'ECCLES'), 'no_mid': ('JOHN', 'ECCLES')}
{'full': ('J', 'DAVID', 'SWEATT')}
{'full': ('JÖRN', '', 'DIEDRICHSEN'), 'full_norm': ('JORN', '', 'DIEDRICHSEN')}
{'full': ('RODOLFO', 'R', 'LLINÁS'), 'no_mid': ('RODOLFO', 'LLINÁS'), 'full_norm': ('RODOLFO', 'R', 'LLINAS'), 'no_mid_norm': ('RODOLFO', 'LLINAS')}


In [7]:
# Map each pid to its name-variation dict. The first name is read from the
# `firstname_norm` column (not `firstname`); middle and last names are used as stored.
pid_to_name_var = {
    pid: get_at_name_var(first, middle, last)
    for first, middle, last, pid in zip(*[people_df[c] for c in ['firstname_norm', 'middlename', 
                                                                 'lastname','pid']])}
len(pid_to_name_var)

759676

## Combinations of name pairs

In [8]:
# Load the connection table (one row per cid linking pid1 and pid2) and show its shape.
conn_df = pd.read_hdf('dataset/connect_df.h5')
conn_df.shape

(722872, 6)

In [9]:
conn_df.sample(5)

Unnamed: 0,cid,pid1,pid2,relation,location,locid
712088,1695089,815066,815067,1,Université de Liège,3303
144745,311992,138189,58956,1,"State University of New York, Buffalo",285
336171,751328,374979,367843,1,University of Houston,384
534615,1234985,653500,648556,1,University of Louisiana at Lafayette,3783
624863,1468354,750148,750147,1,Harvard University,86


In [10]:
conn_df.dtypes

cid          int64
pid1        object
pid2        object
relation     int64
location    object
locid        int64
dtype: object

In [11]:
def get_name_pairs(connections=None, name_vars=None):
    """Expand every connection into all combinations of its two people's name variations.

    Parameters
    ----------
    connections : DataFrame, optional
        Must expose ``cid``, ``pid1`` and ``pid2`` columns. Defaults to the
        notebook-level ``conn_df``.
    name_vars : dict, optional
        Maps a pid to its ``{variation code: name tuple}`` dict. Defaults to
        the notebook-level ``pid_to_name_var``.

    Returns
    -------
    DataFrame
        One row per (connection, variation of person 1, variation of person 2)
        with columns ``cid, pid1, pid2, vid, name1, name2, name1VarCode,
        name2VarCode``. ``vid`` is ``'<cid>_<k>'`` where ``k`` counts the
        combinations of that connection starting from 1.

    Raises
    ------
    KeyError
        If a pid of a connection has no entry in `name_vars`.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if connections is None:
        connections = conn_df
    if name_vars is None:
        name_vars = pid_to_name_var

    col = ['cid', 'pid1', 'pid2', 'vid', 'name1', 'name2', 'name1VarCode', 'name2VarCode']
    rows = []
    for cid, pid1, pid2 in zip(connections.cid, connections.pid1, connections.pid2):
        names_1 = name_vars[pid1]
        names_2 = name_vars[pid2]
        # Dict order decides the sub id, so 'full' x 'full' is always '<cid>_1'.
        for sub_id, (k1, k2) in enumerate(product(names_1, names_2), 1):
            rows.append([cid, pid1, pid2, '%d_%d' % (cid, sub_id),
                         names_1[k1], names_2[k2], k1, k2])
    return pd.DataFrame(rows, columns=col)

# One row per (connection, name variation of pid1, name variation of pid2).
conn_name_df = get_name_pairs()
conn_name_df.shape

(1914449, 8)

In [12]:
conn_name_df[conn_name_df.cid == 2]

Unnamed: 0,cid,pid1,pid2,vid,name1,name2,name1VarCode,name2VarCode
0,2,2,3,2_1,"(BENJAMIN, Y, HAYDEN)","(JACK, L, GALLANT)",full,full
1,2,2,3,2_2,"(BENJAMIN, Y, HAYDEN)","(JACK, GALLANT)",full,no_mid
2,2,2,3,2_3,"(BENJAMIN, HAYDEN)","(JACK, L, GALLANT)",no_mid,full
3,2,2,3,2_4,"(BENJAMIN, HAYDEN)","(JACK, GALLANT)",no_mid,no_mid


## Match with MAG

Scan MAG papers and keep those in which both names of a pair appear in the author list, with `name1` listed before `name2`:

In [13]:
# Index the vids by (name1, name2). A name pair that maps to more than one vid
# is ambiguous; get_conn_authorship only keeps pairs with exactly one vid.
at_name_pair_to_vids = conn_name_df.groupby(['name1', 'name2'])['vid'].apply(list).to_dict()
len(at_name_pair_to_vids)

1904039

In [14]:
at_name_pair_to_vids[('BENJAMIN', 'Y', 'HAYDEN'), ('JACK', 'L', 'GALLANT')]

['2_1']

In [15]:
# Variation codes in lookup priority order: match_each_paper tries them in this
# order and stops at the first known pair for each pair of authors.
name_var_codes = ['full', 'full_norm', 'mid_init', 'mid_init_norm', 'no_mid', 'no_mid_norm']

def update_dict(data_dict, key, val_ele):
    """Append `val_ele` to the list stored under `key`, creating the list on first use.

    `data_dict` is modified in place; nothing is returned.
    """
    data_dict.setdefault(key, []).append(val_ele)

def get_mag_name_var(first, middle, last):
    """Build the spelling variations of one MAG author name.

    Periods are removed from `first` and `middle`; letter case is left as
    given and `last` is used unchanged.

    Returns a dict mapping a variation code to a tuple of name parts:
    ``'full'`` is always present; ``'mid_init'`` (middle name cut to one
    character) needs a first and a middle name longer than one character;
    ``'no_mid'`` (middle name dropped) needs a first name longer than one
    character and a non-empty middle name. If ``unidecode`` changes any part,
    the ``'*_norm'`` codes repeat the same entries for the transliterated
    parts under the same length rules.
    """
    first = first.replace('.', '')
    middle = middle.replace('.', '')

    # (codes, parts) for the spelling as given and, if different, its transliteration.
    spellings = [(('full', 'mid_init', 'no_mid'), (first, middle, last))]
    plain = tuple(unidecode.unidecode(part) for part in (first, middle, last))
    if plain != (first, middle, last):
        spellings.append((('full_norm', 'mid_init_norm', 'no_mid_norm'), plain))

    variants = {}
    for (full_code, init_code, bare_code), (given, mid, family) in spellings:
        variants[full_code] = (given, mid, family)
        if len(given) > 1:
            if len(mid) > 1:
                variants[init_code] = (given, mid[:1], family)
            if len(mid) > 0:
                variants[bare_code] = (given, family)
    return variants

def match_each_paper(mag_pid, mag_authors, match_result):
    """Record which known name pairs co-occur in the author list of one paper.

    `mag_authors` is iterated as records of
    ``(seq, aid, full_name, affil, (first, middle, last))``. Only the first
    record for each `seq` is used, because an author with several
    affiliations is listed once per affiliation.

    For every pair of distinct authors, taken in the order they appear in
    `mag_authors`, the name variations of both are tried in `name_var_codes`
    priority order. The first combination that is a key of
    `at_name_pair_to_vids` is stored in `match_result` (mutated in place) as
    ``(mag_pid, (seq_1, aid_1, code_1), (seq_2, aid_2, code_2))``.

    Only the orientation (earlier author, later author) is looked up; the
    reversed key is never tried.
    """
    seen_seqs = set()
    authors = []
    for seq, aid, _full_name, _affil, (first, middle, last) in mag_authors:
        if seq in seen_seqs:
            continue
        authors.append((seq, aid, get_mag_name_var(first, middle, last)))
        seen_seqs.add(seq)

    for (seq_1, aid_1, names_1), (seq_2, aid_2, names_2) in combinations(authors, 2):
        codes_1 = [code for code in name_var_codes if code in names_1]
        codes_2 = [code for code in name_var_codes if code in names_2]
        # First known combination in priority order, or None when there is none.
        hit = next(
            ((k1, k2) for k1, k2 in product(codes_1, codes_2)
             if (names_1[k1], names_2[k2]) in at_name_pair_to_vids),
            None)
        if hit is None:
            continue
        k1, k2 = hit
        update_dict(match_result, (names_1[k1], names_2[k2]),
                    (mag_pid, (seq_1, aid_1, k1), (seq_2, aid_2, k2)))

def match_mag(mag_inpath, sample_prob=None):
    """Scan a parsed MAG paper-author file and collect matches with known name pairs.

    Parameters
    ----------
    mag_inpath : str
        Path to a text file with one JSON document per line, each decoding to
        ``[mag_pid, mag_authors]``.
    sample_prob : float, optional
        When given, each line is processed only with this probability
        (drawn from the unseeded ``random`` module, so runs are not
        reproducible). ``None`` processes every line.

    Returns
    -------
    dict
        The mapping filled by `match_each_paper`: name pair -> list of matches.
    """
    result = {}
    # The context manager closes the input even if a line fails to parse.
    # NOTE(review): the file is read with the platform default encoding -- confirm
    # it matches the encoding the parsed MAG files were written with.
    with open(mag_inpath) as fin:
        for idx, line in enumerate(fin, 1):
            if idx % 1000000 == 0:
                # '\r' rewinds the cursor so the counter is overwritten in place.
                print('%d: %d' % (idx, len(result)), end='\r')
            if sample_prob is None or random.random() < sample_prob:
                mag_pid, mag_authors = json.loads(line)
                match_each_paper(mag_pid, mag_authors, result)
    print('')
    return result

In [16]:
# match journal papers
# NOTE(review): absolute, machine-specific input path; the scan reads the whole file.
match_result_jnl = match_mag('/home/qke100/ke-data/dataset-MAG/paper_author_Journal_parsed.txt')
len(match_result_jnl)

86000000: 241026


241608

In [17]:
# match conference papers
# NOTE(review): absolute, machine-specific input path; the scan reads the whole file.
match_result_conf = match_mag('/home/qke100/ke-data/dataset-MAG/paper_author_Conference_parsed.txt')
len(match_result_conf)

4000000: 19963


20839

In [18]:
match_result_jnl[('BENJAMIN', 'Y', 'HAYDEN'), ('JACK', 'L', 'GALLANT')]

[('1982956265', (2, '2082172561', 'full'), (4, '2110165986', 'full')),
 ('2028047007', (2, '2082172561', 'full'), (3, '2110165986', 'full')),
 ('2099224730', (1, '2082172561', 'full'), (2, '2110165986', 'full')),
 ('2114854701', (1, '2082172561', 'full'), (2, '2110165986', 'full')),
 ('2128412813', (1, '2082172561', 'full'), (2, '2110165986', 'full'))]

In [19]:
# Check that the (full, no_mid) variation pair of the same two people has no journal match.
(('BENJAMIN', 'Y', 'HAYDEN'), ('JACK', 'GALLANT')) not in match_result_jnl

True

In [20]:
# Check that the (no_mid, no_mid) variation pair of the same two people has no journal match.
(('BENJAMIN', 'HAYDEN'), ('JACK', 'GALLANT')) not in match_result_jnl

True

In [21]:
def get_conn_authorship():
    """Return the co-authored papers found for each connection name pair.

    Walks `conn_name_df` row by row. A row is used only when its
    ``(name1, name2)`` pair maps to exactly one vid in `at_name_pair_to_vids`.
    For such a row, every match stored under that pair in `match_result_jnl`
    and then `match_result_conf` produces one output row: the six connection
    columns followed by ``mag_pid`` and the ``(seq, aid, variation code)`` of
    each of the two matched authors.
    """
    base_cols = ['cid', 'pid1', 'pid2', 'vid', 'name1', 'name2']
    mag_cols = ['mag_pid','mag_seq_1','mag_aid_1','mag_name_var_1','mag_seq_2','mag_aid_2','mag_name_var_2']

    records = []
    for cid, pid1, pid2, vid, name1, name2 in zip(*[conn_name_df[c] for c in base_cols]):
        pair = (name1, name2)
        if len(at_name_pair_to_vids[pair]) != 1:
            # The same name pair belongs to several vids: skip it as ambiguous.
            continue
        prefix = [cid, pid1, pid2, vid, name1, name2]
        for source in (match_result_jnl, match_result_conf):
            for mag_pid, author_1, author_2 in source.get(pair, ()):
                records.append(prefix + [mag_pid, *author_1, *author_2])
    return pd.DataFrame(records, columns=base_cols + mag_cols)

# One row per (unambiguous name pair, matched MAG paper).
matched_authorship_df = get_conn_authorship()
matched_authorship_df.shape

(1172748, 13)

In [22]:
matched_authorship_df[matched_authorship_df.cid == 2]

Unnamed: 0,cid,pid1,pid2,vid,name1,name2,mag_pid,mag_seq_1,mag_aid_1,mag_name_var_1,mag_seq_2,mag_aid_2,mag_name_var_2
0,2,2,3,2_1,"(BENJAMIN, Y, HAYDEN)","(JACK, L, GALLANT)",1982956265,2,2082172561,full,4,2110165986,full
1,2,2,3,2_1,"(BENJAMIN, Y, HAYDEN)","(JACK, L, GALLANT)",2028047007,2,2082172561,full,3,2110165986,full
2,2,2,3,2_1,"(BENJAMIN, Y, HAYDEN)","(JACK, L, GALLANT)",2099224730,1,2082172561,full,2,2110165986,full
3,2,2,3,2_1,"(BENJAMIN, Y, HAYDEN)","(JACK, L, GALLANT)",2114854701,1,2082172561,full,2,2110165986,full
4,2,2,3,2_1,"(BENJAMIN, Y, HAYDEN)","(JACK, L, GALLANT)",2128412813,1,2082172561,full,2,2110165986,full


In [23]:
# Persist the matched co-authorship table; mode='w' replaces any existing file.
matched_authorship_df.to_hdf('dataset/connect_mag_coauthorship.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['pid1', 'pid2', 'vid', 'name1', 'name2', 'mag_pid', 'mag_aid_1',
       'mag_name_var_1', 'mag_aid_2', 'mag_name_var_2'],
      dtype='object')]

  encoding=encoding,


In [24]:
def get_people_id():
    """Count, for every pid, how often each MAG author id was matched to it.

    Reads `matched_authorship_df` row by row; each row contributes one count
    for ``(pid1, mag_aid_1)`` and one for ``(pid2, mag_aid_2)``.

    Returns a dict ``{pid: {mag_aid: number of matched rows}}``.
    """
    tally = {}
    needed = ['pid1', 'pid2', 'mag_aid_1', 'mag_aid_2']
    rows = zip(*[matched_authorship_df[c] for c in needed])
    for pid1, pid2, mag_aid_1, mag_aid_2 in rows:
        for person, author in ((pid1, mag_aid_1), (pid2, mag_aid_2)):
            per_person = tally.setdefault(person, {})
            per_person[author] = per_person.get(author, 0) + 1
    return tally

# pid -> {MAG author id: number of matched rows}; the length is the number of matched people.
at_pid_to_mag_aid = get_people_id()
len(at_pid_to_mag_aid)

259943

In [25]:
# pids that have name variations but do not occur in any matched co-authorship row.
unmatched_at_pid = set(pid_to_name_var.keys()) - set(at_pid_to_mag_aid.keys())
len(unmatched_at_pid)

499733

In [30]:
def write_match_result():
    """Write the pid -> MAG author id assignment and attach it to `people_df`.

    For each pid in `at_pid_to_mag_aid` (in increasing ``int(pid)`` order) the
    MAG author id with the highest count is chosen. A pid is written to
    ``dataset/at_pid_to_mag_aid.txt`` as ``pid<TAB>aid`` only when that
    maximum is held by a single author id; ties are left unassigned.

    Side effects: overwrites the output file, prints three summary lines, and
    sets ``people_df['magaid']`` in place (empty string for unassigned pids).
    """
    unique, cnt = 0, 0
    result = {}
    # The context manager closes the output file even if a pid fails int().
    with open('dataset/at_pid_to_mag_aid.txt', 'w') as fout:
        for pid in sorted(at_pid_to_mag_aid, key=int):
            aids = at_pid_to_mag_aid[pid]
            if len(aids) == 1:
                unique += 1
            mv = max(aids.values())
            mk = [k for k, v in aids.items() if v == mv]
            if len(mk) == 1:
                fout.write('%s\t%s\n' % (pid, mk[0]))
                cnt += 1
                result[pid] = mk[0]
    n = len(at_pid_to_mag_aid)
    print('Total matched:', n)
    # Report a share of 0.0 when nothing matched instead of dividing by zero.
    print('Uniquely matched:', unique, unique / n if n else 0.0)
    print('Unambiguously matched:', cnt, cnt / n if n else 0.0)
    people_df['magaid'] = people_df.pid.apply(lambda x: result.get(x, ''))

# Writes dataset/at_pid_to_mag_aid.txt and adds the `magaid` column to people_df in place.
write_match_result()

Total matched: 259943
Uniquely matched: 225914 0.8690905313857269
Unambiguously matched: 255111 0.9814113094024459


In [31]:
people_df.sample(5)

Unnamed: 0,pid,firstname,middlename,lastname,degrees,majorarea,orcid,orcid_norm,firstname_norm,gender,magaid
471735,508334,Kevin,M.,Gribbins,Ph.D.,anatomy,,,Kevin,M,1977651546.0
521305,558085,Wenbin,,Yu,Ph.D.,etree,,,Wenbin,M,2152160864.0
133800,150444,Deborah,A.,Polvani,Ph.D.,chemistry,,,Deborah,F,
171428,198303,Marion,E.,Wittmann,Ph.D.,evol,,,Marion,F,2147329782.0
128013,144215,Gian,F.,Sacco,Ph.D.,physics,,,Gian,M,


In [32]:
# NOTE(review): this overwrites the same 'dataset/people_df.h5' that the notebook
# loads at the top, so a re-run starts from a table that already has `magaid`.
people_df.to_hdf('dataset/people_df.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['pid', 'firstname', 'middlename', 'lastname', 'degrees', 'majorarea',
       'orcid', 'orcid_norm', 'firstname_norm', 'gender', 'magaid'],
      dtype='object')]

  encoding=encoding,


In [34]:
# Export a CSV of selected columns. The header renames `firstname_norm` to
# `firstname` and `orcid_norm` to `orcid`; the raw `firstname` and `orcid`
# columns are not exported.
people_df[
    ['pid', 'firstname_norm', 'middlename', 'lastname', 'degrees', 'majorarea', 'orcid_norm', 'gender', 'magaid']
].to_csv('dataset/people.csv', index=False, 
         header=['pid','firstname','middlename','lastname','degrees','majorarea','orcid','gender','magaid'])