In [1]:
import csv
import html
import pandas as pd
import re
import string
import unidecode

from os.path import join

DATA_DIR = 'raw_data/'

In [2]:
! head -6 raw_data/people.csv

"pid","firstname","middlename","lastname","degrees","location","locid","area","majorarea","award","hindex","orcid_id","s2id","homepage","addedby","dateadded","modby"
"1","Stephen","V.","David","Ph.D.","Oregon Health and Science University","226","auditory system, attention","neuro,csd,bme","","30","0000-0003-4135-3104","2321659","http://hearingbrain.org/","david","2005-01-14 15:50:37","david"
"3","Jack","L.","Gallant","","University of California, Berkeley","312","Systems","neuro,psych","","46","NULL","40373111","http://gallantlab.org","david","2005-01-14 15:51:51","david"
"667145","Christian","T.","Totten","Ph.D.","University of Florida, Gainesville","380","Environmental Engineering","etree,environment","NULL","2","NULL","35259558","","pq","2016-07-23 11:14:13","pq"
"5","Kendrick","Norris","Kay","Ph.D.","University of Minnesota, Twin Cities","406","visual system, fMRI, computational methods, neural network models","neuro","","24","","1912661","http://cvnlab.net","david","2005-01-1

In [3]:
def load_author_df():
    """"""
    result = []
    with open(join(DATA_DIR, 'people.csv')) as fin:
        reader = csv.reader(fin, delimiter=',')
        next(reader, None)
        for row in reader:
            result.append(list(row[:7]) + [row[8]] + [row[11]])
    col = ['pid', 'firstname', 'middlename', 'lastname', 'degrees', 'location', 'locid', 'majorarea', 'orcid']
    df = pd.DataFrame(sorted(result, key=lambda x: int(x[0])), columns=col)
    for c in col[1:]:
        df[c] = df[c].apply(lambda x: '' if x == 'NULL' else x)
    return df

people_df = load_author_df()
people_df.shape

(778367, 9)

In [4]:
people_df.head()

Unnamed: 0,pid,firstname,middlename,lastname,degrees,location,locid,majorarea,orcid
0,1,Stephen,V.,David,Ph.D.,Oregon Health and Science University,226,"neuro,csd,bme",0000-0003-4135-3104
1,2,Benjamin,Y.,Hayden,,"University of Minnesota, Twin Cities",406,neuro,0000-0002-7678-4281
2,3,Jack,L.,Gallant,,"University of California, Berkeley",312,"neuro,psych",
3,4,Benjamin,,Willmore,Ph.D.,University of Oxford,228,neuro,
4,5,Kendrick,Norris,Kay,Ph.D.,"University of Minnesota, Twin Cities",406,neuro,


In [5]:
people_df.pid.nunique()

778367

In [6]:
people_df.pid.value_counts()

603621    1
825408    1
343844    1
58824     1
4609      1
         ..
200159    1
106807    1
174614    1
414197    1
530467    1
Name: pid, Length: 778367, dtype: int64

In [7]:
[(c, people_df[c].isna().sum()) for c in people_df.columns]

[('pid', 0),
 ('firstname', 0),
 ('middlename', 0),
 ('lastname', 0),
 ('degrees', 0),
 ('location', 0),
 ('locid', 0),
 ('majorarea', 0),
 ('orcid', 0)]

## Filter PIDs

In [8]:
[(c, (people_df[c] == '').sum()) for c in people_df.columns]

[('pid', 0),
 ('firstname', 567),
 ('middlename', 364406),
 ('lastname', 1110),
 ('degrees', 319825),
 ('location', 11482),
 ('locid', 13),
 ('majorarea', 0),
 ('orcid', 776686)]

### Without first or last name

In [9]:
pid_wo_first = set(people_df[people_df.firstname == '']['pid'])
len(pid_wo_first)

567

In [10]:
pid_wo_last = set(people_df[people_df.lastname == '']['pid'])
len(pid_wo_last)

1110

In [11]:
ds_people = people_df[~people_df.pid.isin(pid_wo_first | pid_wo_last)].copy().reset_index(drop=True)
ds_people.shape

(776975, 9)

### Same name and affiliation with multiple PIDs

In [12]:
inst_to_pids = ds_people.groupby(
        ['firstname', 'middlename', 'lastname', 'location', 'majorarea'])['pid'].apply(set).to_dict()
len(inst_to_pids)

776110

In [13]:
inst_with_mul_pids = [k for k, v in inst_to_pids.items() if len(v) > 1]
len(inst_with_mul_pids)

824

In [14]:
inst_with_mul_pids[1]

('A',
 'Peter',
 'Klimley',
 'University of California, Davis and San Diego State University',
 'physics')

In [15]:
ds_people[
    (ds_people.firstname == 'A') & (ds_people.middlename == 'Peter') & (ds_people.lastname == 'Klimley')]

Unnamed: 0,pid,firstname,middlename,lastname,degrees,location,locid,majorarea,orcid
595864,635445,A,Peter,Klimley,,"University of California, Davis and San Diego ...",22646,physics,
595865,635446,A,Peter,Klimley,,"University of California, Davis and San Diego ...",22646,physics,


In [16]:
mul_pids = frozenset().union(*[inst_to_pids[k] for k in inst_with_mul_pids])
len(mul_pids)

1689

In [17]:
ds_people = ds_people[~ds_people.pid.isin(mul_pids)].copy().reset_index(drop=True)
ds_people.shape

(775286, 9)

### Last name with one character

In [18]:
ds_people = ds_people[ds_people.lastname.str.len() > 1].copy().reset_index(drop=True)
ds_people.shape

(775118, 9)

## Process name

### First name

In [19]:
def norm_firstname(x):
    """"""
    x = re.sub(r'[\000-\010]|[\013-\014]|[\016-\037]', '', x).replace('\\t', '').strip()
    if x[0] == '(' and x[-1] == ')':
        return x[1:-1]
    x = x.rstrip('|').strip()
    x = re.sub(r'\([^)]*\)', '', x).lstrip('(')
    if ',' in x:
        x = x.split(',')[0]
    x = x.replace('<q>', '"').replace('</q>', '"')
    x = x.replace('<i>', '').replace('</i>', '')
    x = re.sub(r'"[^)]*"', '', x)
    x = html.unescape(x).replace('.', ' ')
    x = ' '.join(x.split()).strip()
    x = unidecode.unidecode(x).upper()
    if x != 'SIR' and x.startswith('SIR '):
        return x[4:]
    return x

print(
    norm_firstname('Juyun \\t'),
    norm_firstname('(Max)'),
    norm_firstname('Benjamin'),
    norm_firstname('James, Jr.'),
    norm_firstname('Cabrera,'),
    norm_firstname('(Nicholas'),
    norm_firstname('Mansi "Indian Princess"'),
    norm_firstname('John ("Jack")'),
    norm_firstname('&#34945;'),
    norm_firstname('James <q>Brad</q>'),
    norm_firstname('Franz <i>Ludwig</i>'),
    norm_firstname('\x03\x14\x15\x13\x0b\x14\x04Edward'),
    norm_firstname(','),
    sep='\n'
)

JUYUN
Max
BENJAMIN
JAMES
CABRERA
NICHOLAS
MANSI
JOHN
YUAN 
JAMES
FRANZ LUDWIG
EDWARD



In [20]:
ds_people['firstname_norm'] = ds_people.firstname.apply(norm_firstname)
ds_people.shape

(775118, 10)

In [21]:
set(x for e in ds_people.firstname_norm for x in e.lower() if x not in set(string.ascii_lowercase))

{' ',
 "'",
 '(',
 ')',
 '-',
 '/',
 '0',
 '1',
 '2',
 '5',
 '8',
 ':',
 '>',
 '?',
 '[',
 ']',
 '|'}

In [22]:
def print_dirty_firstname():
    """"""
    result = []
    char = set(['(', ')', '/', '0', '1', '2', '5', '8', ':', '>', '?', '[', '\\', ']', '|'])
    for pid, f, fn in zip(ds_people.pid, ds_people.firstname, ds_people.firstname_norm):
        if fn == '' or len(set(fn) & char) > 0:
            print(pid, f, fn)
            result.append(pid)
    return result

dirty_fn_pid = print_dirty_firstname()

5358 Mani/Madhuri MANI/MADHURI
5360 Jenny/Yun-Chia JENNY/YUN-CHIA
19835 Hyun Ju [Helena] HYUN JU [HELENA]
58000 ¹Ü 1U
58150 »Æ >>AE
58151 Â½ A 1/2 
58152 »ª >>A
90933 Pumpki [Lei] PUMPKI [LEI]
196545 Catherine|Garth CATHERINE|GARTH
220009 Richard|Rinpoche RICHARD|RINPOCHE
229826 Laurel|Murakawa-Leopard LAUREL|MURAKAWA-LEOPARD
252888 Jonathan|Buell JONATHAN|BUELL
252889 Janelle|Merchant JANELLE|MERCHANT
252896 Kristie|Salinas KRISTIE|SALINAS
252897 Andrew|Centner ANDREW|CENTNER
252898 Matthew|Lamar MATTHEW|LAMAR
252899 Jeff|Cerone JEFF|CERONE
255271 Anne|Brack ANNE|BRACK
255281 Jennifer|Clark JENNIFER|CLARK
262891 Lindsay|Knickman LINDSAY|KNICKMAN
262892 Gabrielle|Knickman GABRIELLE|KNICKMAN
262893 Kevin|Schulte KEVIN|SCHULTE
282180 Michele|Counts MICHELE|COUNTS
282192 Georgia|Kimball GEORGIA|KIMBALL
282199 Sarah|Pettengill SARAH|PETTENGILL
282200 Ashley|Meyers ASHLEY|MEYERS
282202 Michelle|Harms MICHELLE|HARMS
282203 Steven|Schmitz STEVEN|SCHMITZ
282204 Rebekah|Kiene REBEKAH|KIENE
2822

In [23]:
ds_people = ds_people[~ds_people.pid.isin(dirty_fn_pid)].copy().reset_index(drop=True)
ds_people.shape

(775017, 10)

### Middle name

In [24]:
def norm_middlename(x):
    """"""
    if x in ['', '(', '1st', '2nd', '/', '$', '`', "'", '&', '(.']:
        return ''
    if x[0] == '(' and x[-1] == ')':
        x = x[1:-1]
    elif x[0] == "'" and x[-1] == "'":
        x = x[1:-1]
    elif x[0] == '[' and x[-1] == ']':
        x = x[1:-1]
    if ',' in x:
        x = x.split(',')[0]
    x = x.replace('_', ' ')
    x = x.replace('<q>', '').replace('</q>', '')
    x = x.replace('<i>', '').replace('</i>', '')
    x = unidecode.unidecode(html.unescape(x)).upper()
    x = re.sub(r'\([^)]*\)', '', x)
    x = re.sub(r'\"[^)]*\"', '', x)
    x = x.replace('.-', '-').replace('.', ' ')
    x = ' '.join(x.split()).lstrip('(').rstrip("'").strip()
    return x

print(
    norm_middlename('[MacFarland]'),
    norm_middlename("Rene'"),
    norm_middlename('<q>Shona</q>'),
    norm_middlename('Li,'),
    norm_middlename('('),
    norm_middlename('Fonseca'),
    norm_middlename('M. A.'),
    norm_middlename('L.U.'),
    norm_middlename('José'),
    norm_middlename('L. (Molly)'),
    norm_middlename('L. "Lee"'),
    norm_middlename('R. (Rip)'),
    norm_middlename('(Kâmil Uğurbil)'),
    norm_middlename('(林節玄)'),
    norm_middlename('(T.J.)'),
    norm_middlename('Thomas P.-N'),
    norm_middlename('Mat&#283;j'),
    sep='\n'
)

MACFARLAND
RENE
SHONA
LI

FONSECA
M A
L U
JOSE
L
L
R
KAMIL UGURBIL
LIN JIE XUAN
T J
THOMAS P-N
MATEJ


In [25]:
ds_people['middlename_norm'] = ds_people.middlename.apply(norm_middlename)
ds_people.shape

(775017, 11)

In [26]:
set(x for e in ds_people.middlename_norm for x in e.lower() if x not in set(string.ascii_lowercase))

{' ', '&', "'", '-', '/', '1', '3', '4', '7', '9', '<', '?', '\\'}

In [27]:
def print_dirty_middlename():
    """"""
    result = []
    char = set(
        ['$', '&', '(', ',', '/', '1', '2', '3', '4', '7', '9', '<', '>', '?', '[', '\\', ']', '_', '`'])
    for pid, m, mn in zip(ds_people.pid, ds_people.middlename, ds_people.middlename_norm):
        if len(set(mn) & char) > 0:
            print(pid, m, mn, sep=' ---> ')
            result.append(pid)
    return result

dirty_mn_pid = print_dirty_middlename()
len(dirty_mn_pid)

57998 ---> ¹ú ---> 1U
57999 ---> Ì« ---> I<<
58149 ---> Ë¾ ---> E 3/4
74675 ---> X/ ---> X/
76348 ---> ? ---> ?
80184 ---> O\\'neal ---> O\\'NEAL
664979 ---> s/o ---> S/O
693244 ---> (1973-....) ---> 1973-
725710 ---> F/Ferreira ---> F/FERREIRA
761071 ---> James/J. ---> JAMES/J
825930 ---> A&M ---> A&M


11

In [28]:
ds_people = ds_people[~ds_people.pid.isin(dirty_mn_pid)].copy().reset_index(drop=True)
ds_people.shape

(775006, 11)

### Last name

In [29]:
def norm_lastname(x):
    """"""
    if x in ['Ph.D.', 'Jr.', 'M.D.']:
        return ''
    if x[0] == '(' and x[-1] == ')':
        x = x[1:-1]
    if ',' in x:
        x = x.split(',')[0]
    x = html.unescape(x).upper()
    x = re.sub(r'\([^)]*\)', '', x).lstrip('(')
    x = ' '.join(x.split()).strip()
    x = unidecode.unidecode(x).strip('.').replace('.', '')
    return x

print(
    norm_lastname('Kujala (neé Saarela)'),
    norm_lastname('Wang (&#29579;)'),
    sep='\n'
)

KUJALA
WANG


In [30]:
ds_people['lastname_norm'] = ds_people.lastname.apply(norm_lastname)
ds_people.shape

(775006, 12)

In [31]:
set(x for e in ds_people.lastname_norm for x in e.lower() if x not in set(string.ascii_lowercase))

{'\x1a',
 ' ',
 '"',
 '&',
 "'",
 '(',
 ')',
 '+',
 '-',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 ':',
 ';',
 '<',
 '>',
 '?',
 '[',
 '\\',
 ']',
 '_',
 '`'}

In [32]:
def print_dirty_lastname():
    """"""
    result = []
    char = set([
        '"', '&', '(', ')', '+', '.', '/', '0', '1', '2', '3', '4', '5', '6', ':', ';', '<', '>',
        '?', '[', '\\', ']', '_', '`'])
    for pid, l, ln in zip(ds_people.pid, ds_people.lastname, ds_people.lastname_norm):
        if len(set(ln) & char) > 0:
            print(pid, l, ln, sep=' ---> ')
            result.append(pid)
    return result

dirty_ln_pid = print_dirty_lastname()
len(dirty_ln_pid)

7067 ---> [Holley] Miner ---> [HOLLEY] MINER
7071 ---> [Himmelheber] Brady ---> [HIMMELHEBER] BRADY
18265 ---> O\\'Brien ---> O\\'BRIEN
23940 ---> Collins 3rd ---> COLLINS 3RD
28922 ---> O"Riordan ---> O"RIORDAN
37799 ---> IÃ±iguez ---> IA+-IGUEZ
55980 ---> La_Marca ---> LA_MARCA
58153 ---> »ª ---> >>a
58154 ---> ´« ---> '<<
59447 ---> test5 ---> TEST5
65966 ---> LÃ¼tcke ---> LA 1/4 TCKE
83688 ---> Lajiness-O\\'Neill ---> LAJINESS-O\\'NEILL
100078 ---> D\\'Souza ---> D\\'SOUZA
101439 ---> Kate) ---> KATE)
102493 ---> O\\'Boyle ---> O\\'BOYLE
103415 ---> Vander_Zwan-Butler ---> VANDER_ZWAN-BUTLER
154801 ---> Reed2 ---> REED2
168078 ---> Ka`opua ---> KA`OPUA
185579 ---> French\tLiang Gao ---> FRENCH\TLIANG GAO
272251 ---> Hammon: Vicki ---> HAMMON: VICKI
286004 ---> K.) Yeo ---> K) YEO
292546 ---> Fisher/Gibson ---> FISHER/GIBSON
335679 ---> Hutchens: Jane ---> HUTCHENS: JANE
349715 ---> Peres: Deborah ---> PERES: DEBORAH
415728 ---> Zahedi" ---> ZAHEDI"
430584 ---> Fang" ---> FANG"
4622

63

In [33]:
ds_people = ds_people[
    (~ds_people.pid.isin(dirty_ln_pid)) & (ds_people.lastname_norm.str.len() > 1)
].copy().reset_index(drop=True)
ds_people.shape

(774733, 12)

## ORCID

In [34]:
orcid_dict = {
    '000-0003-2721-3770': '0000-0003-2721-3770',
    '0000000246091337': '0000-0002-4609-1337',
    '0000-00019801-2963': '0000-0001-9801-2963',
    '00000003-2983-6330': '0000-0003-2983-6330',
    '0000000348394400': '0000-0003-4839-4400',
    '0000–0002–8587–4115': '0000-0002-8587-4115',
    '0000000167270935': '0000-0001-6727-0935',
}

def clean_orcid(x):
    """"""
    if x == '':
        return x
    xs = x.replace('\u200b', '').strip(' ",“”')
    for s in ['https://orcid.org/', 'http://orcid.org/', 'orcid.org/']:
        if xs.startswith(s):
            xs = xs[len(s):].strip()
            break
    parts = xs.split('-')
    if len(xs) == 19 and len(parts) == 4 and all(len(e) == 4 for e in parts):
        return xs
    if xs in orcid_dict:
        return orcid_dict[xs]
    print('=====', xs, '=====')
    return ''

ds_people['orcid_norm'] = ds_people.orcid.apply(clean_orcid)

===== AGVG75 =====
===== A-8474-2013 =====
===== J-5755-2014 =====
===== https://www.ncbi.nlm.nih.gov/sites/myncbi/fletcher.white.1/bibliography/40234758/public/?sort=date&direction=descending =====
===== Katherine M. Mathis =====
===== <div itemscope itemtype="https://schema.org/Person"><a itemprop="sameAs" content="https://orcid.org/0000-0002-9254-5360" href="https://orcid.org/0000-0002-9254-5360" target="orcid.widget" rel="noopener noreferrer" style="vertical-align:top;"><img src="htt =====
===== SANTIAGOID =====
===== 0002-4068-1168 =====
===== B-5365-2009 =====
===== 0000-0001-8507-256 =====
===== C-4348-2013 =====
===== my-orcid =====
===== 0000-0001-6779-247 =====
===== martipab =====


In [35]:
(ds_people.orcid != '').sum(), (ds_people.orcid_norm != '').sum()

(1675, 1661)

In [36]:
ds_people.to_hdf('results/people_df.h5', key='df', mode='w')