In [1]:
import html
import pandas as pd
import re
import unidecode

from os.path import join

MAG_DIR = '/home/qke100/ke-data/dataset-MAG/202009/'

In [2]:
people_df = pd.read_hdf('dataset/people_df.h5')
people_df.shape

(774733, 13)

In [3]:
people_df.head(1)

Unnamed: 0,pid,firstname,middlename,lastname,degrees,location,locid,majorarea,orcid,firstname_norm,middlename_norm,lastname_norm,orcid_norm
0,1,Stephen,V.,David,Ph.D.,Oregon Health and Science University,226,"neuro,csd,bme",0000-0003-4135-3104,STEPHEN,V,DAVID,0000-0003-4135-3104


In [4]:
mentorship_df = pd.read_hdf('dataset/connect_df.h5')
mentorship_df.shape

(743176, 10)

In [5]:
mentorship_df.head(1)

Unnamed: 0,cid,pid1,pid2,relation,location,locid,startdate,stopdate,startyear,stopyear
0,2,2,3,1,"University of California, Berkeley",312,2000-00-00,2005-00-00,2000,2005


In [6]:
def collect_location():
    """"""
    loc_freq = people_df.location.str.upper().value_counts().to_dict()
    print(len(loc_freq))
    loc_freq.update(
        {k: v for k, v in mentorship_df.location.str.upper().value_counts().items() if k not in loc_freq})
    del loc_freq['']
    return pd.DataFrame(sorted(loc_freq.items(), key=lambda x: x[1], reverse=True), columns=['location', 'freq']) 

loc_freq_df = collect_location()
loc_freq_df.shape

24608


(27180, 2)

In [7]:
loc_freq_df.head()

Unnamed: 0,location,freq
0,"UNIVERSITY OF CALIFORNIA, BERKELEY",12372
1,"UNIVERSITY OF CALIFORNIA, LOS ANGELES",10382
2,"UNIVERSITY OF WISCONSIN, MADISON",10026
3,"UNIVERSITY OF MINNESOTA, TWIN CITIES",9504
4,"UNIVERSITY OF MICHIGAN, ANN ARBOR",9300


In [8]:
mag_affil_df = pd.read_csv(join(MAG_DIR, 'mag/Affiliations.txt'), 
                           sep='\t', 
                           names=['mag_affil_id', 'mag_norm_name', 'mag_dis_name'], 
                           usecols=[0, 2, 3],
                           dtype=str)
mag_affil_df.shape

(25795, 3)

In [9]:
mag_affil_df.sample(5)

Unnamed: 0,mag_affil_id,mag_norm_name,mag_dis_name
11099,1309204697,swedish maritime administration,Swedish Maritime Administration
9682,2802463004,side out foundation,Side-Out Foundation
18009,2802927161,donka hospital,Donka Hospital
3259,2799370498,american college of education,American College of Education
17268,2799458587,burnley college,Burnley College


In [10]:
mag_dis_name_id = {a.upper(): b for a, b in zip(mag_affil_df.mag_dis_name, mag_affil_df.mag_affil_id)}
len(mag_dis_name_id)

25794

In [11]:
mag_norm_name_id = {a.upper(): b for a, b in zip(mag_affil_df.mag_norm_name, mag_affil_df.mag_affil_id)}
len(mag_norm_name_id)

25750

In [12]:
loc_alias = {
    'UNIVERSITY OF MINNESOTA, TWIN CITIES': 'UNIVERSITY OF MINNESOTA',
    'UNIVERSITY OF MICHIGAN, ANN ARBOR': 'UNIVERSITY OF MICHIGAN',
    'UNIVERSITY OF WASHINGTON, SEATTLE': 'UNIVERSITY OF WASHINGTON',
    'UNIVERSITY OF FLORIDA, GAINESVILLE': 'UNIVERSITY OF FLORIDA',
    'OHIO STATE UNIVERSITY, COLUMBUS': 'OHIO STATE UNIVERSITY',
    'INDIANA UNIVERSITY, BLOOMINGTON': 'INDIANA UNIVERSITY',
    'RUTGERS UNIVERSITY, NEW BRUNSWICK': 'RUTGERS UNIVERSITY',
    'UNIVERSITY OF BRITISH COLUMBIA, VANCOUVER': 'UNIVERSITY OF BRITISH COLUMBIA',
    'TEACHERS COLLEGE, COLUMBIA UNIVERSITY': 'COLUMBIA UNIVERSITY',
    'STATE UNIVERSITY OF NEW YORK, BUFFALO': 'UNIVERSITY AT BUFFALO',
    'WASHINGTON UNIVERSITY, SAINT LOUIS': 'WASHINGTON UNIVERSITY IN ST LOUIS',
    'VIRGINIA POLYTECHNIC INSTITUTE AND STATE UNIVERSITY': 'VIRGINIA TECH',
    'OKLAHOMA STATE UNIVERSITY': 'OKLAHOMA STATE UNIVERSITY–STILLWATER',
    'UNIVERSITY OF MISSOURI - COLUMBIA': 'UNIVERSITY OF MISSOURI',
    "UNIVERSITY OF HAWAI'I AT MANOA": 'UNIVERSITY OF HAWAII AT MANOA',
    'STATE UNIVERSITY OF NEW YORK, ALBANY': 'UNIVERSITY AT ALBANY, SUNY',
    'STATE UNIVERSITY OF NEW YORK AT BINGHAMTON': 'BINGHAMTON UNIVERSITY',
    'UNIVERSITY OF COLORADO ANSCHUTZ MEDICAL CAMPUS, DENVER': 'ANSCHUTZ MEDICAL CAMPUS',
    'UNIVERSITE LAVAL (CANADA)': 'LAVAL UNIVERSITY',
    'HARVARD MEDICAL SCHOOL': 'HARVARD UNIVERSITY',
    'UNIVERSITEIT ANTWERPEN (BELGIUM)': 'UNIVERSITY OF ANTWERP',
    'SEMMELWEIS EGYETEM': 'SEMMELWEIS UNIVERSITY',
    'RUTGERS THE STATE UNIVERSITY OF NEW JERSEY - NEWARK': 'RUTGERS UNIVERSITY',
    'UNIVERSITY OF MISSOURI - SAINT LOUIS': 'UNIVERSITY OF MISSOURI–ST. LOUIS',
    'TOULOUSE 3': 'PAUL SABATIER UNIVERSITY',
    'NEW SCHOOL UNIVERSITY': 'THE NEW SCHOOL',
    'PARIS 4': 'PARIS-SORBONNE UNIVERSITY',
    'UNIVERSIDAD POLITECNICA DE VALENCIA (SPAIN)': 'POLYTECHNIC UNIVERSITY OF VALENCIA',
    'HEBREW UNIVERSITY, JERUSALEM': 'HEBREW UNIVERSITY OF JERUSALEM',
    'UNIVERSITE LYON 1': 'CLAUDE BERNARD UNIVERSITY LYON 1',
    'VRIJE UNIVERSITEIT AMSTERDAM': 'VU UNIVERSITY AMSTERDAM',
    'UNIVERSITÉ DE BORDEAUX': 'UNIVERSITY OF BORDEAUX',
    'UNIVERSITÄT ZÜRICH': 'UNIVERSITY OF ZURICH',
    'INSTITUTE OF TRANSPERSONAL PSYCHOLOGY': 'SOFIA UNIVERSITY',
    'GEORG-AUGUST-UNIVERSITÄT GÖTTINGEN': 'UNIVERSITY OF GÖTTINGEN',
    'UNIVERSIDAD DE CASTILLA - LA MANCHA (SPAIN)': 'UNIVERSITY OF CASTILLA–LA MANCHA',
    'KAROLINSKA INSTITUTE': 'KAROLINSKA INSTITUTET',
    'UNIVERSITAT DE BARCELONA': 'UNIVERSITY OF BARCELONA',
    'EBERHARD KARLS UNIVERSITÄT TÜBINGEN': 'UNIVERSITY OF TÜBINGEN',
    'CNRS': 'CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE',
    'WEILL CORNELL MEDICAL COLLEGE': 'CORNELL UNIVERSITY',
    'UNIVERSITY OF MARYLAND SCHOOL OF MEDICINE': 'UNIVERSITY OF MARYLAND, BALTIMORE',
    'UPMC UNIV PARIS 6': 'PIERRE-AND-MARIE-CURIE UNIVERSITY',
    'UNIVERSITY OF GEORGIA, ATHENS': 'UNIVERSITY OF GEORGIA',
    'UNIVERSITY OF PUERTO RICO, RIO PIEDRAS (PUERTO RICO)': 'UNIVERSITY OF PUERTO RICO, RÍO PIEDRAS',
    "QUEEN'S UNIVERSITY, CANADA": "QUEEN'S UNIVERSITY",
    'THE UNIVERSITY OF WESTERN ONTARIO (CANADA)': 'UNIVERSITY OF WESTERN ONTARIO',
    'THE UNIVERSITY OF TEXAS SCHOOL OF PUBLIC HEALTH': 'UNIVERSITY OF TEXAS HEALTH SCIENCE CENTER AT HOUSTON',
    'THE UNIVERSITY OF TEXAS GRADUATE SCHOOL OF BIOMEDICAL SCIENCES AT HOUSTON': 'UNIVERSITY OF TEXAS HEALTH SCIENCE CENTER AT HOUSTON',
    'RUTGERS THE STATE UNIVERSITY OF NEW JERSEY, GRADUATE SCHOOL OF APPLIED AND PROFESSIONAL PSYCHOLOGY': 'RUTGERS UNIVERSITY',
    'UNIVERSITY OF MISSOURI-ROLLA': 'UNIVERSITY OF MISSOURI SYSTEM',
    'LUDWIG-MAXIMILIANS-UNIVERSITÄT MÜNCHEN': 'LUDWIG MAXIMILIAN UNIVERSITY OF MUNICH',
    'UNIVERSITY OF NORTH TEXAS HEALTH SCIENCE CENTER AT FORT WORTH': 'UNIVERSITY OF NORTH TEXAS HEALTH SCIENCE CENTER',
    'SACKLER SCHOOL OF GRADUATE BIOMEDICAL SCIENCES (TUFTS UNIVERSITY)': 'TUFTS UNIVERSITY',
    'PACIFIC GRADUATE SCHOOL OF PSYCHOLOGY': 'PALO ALTO UNIVERSITY',
    'THE UNIVERSITY OF THE ROCKIES': 'UNIVERSITY OF THE ROCKIES',
    "ST. JOHN'S UNIVERSITY (NEW YORK), SCHOOL OF EDUCATION AND HUMAN SERVICES": "ST. JOHN'S UNIVERSITY",
    'RHEINISCH-WESTFAELISCHE TECHNISCHE HOCHSCHULE': 'RWTH AACHEN UNIVERSITY',
    'FRIEDRICH-WILHELMS-UNIVERSITÄT ZU BERLIN': 'HUMBOLDT UNIVERSITY OF BERLIN',
    'TECHNISCHE UNIVERSITÄT DRESDEN': 'DRESDEN UNIVERSITY OF TECHNOLOGY',
}

def get_at_loc_var(x):
    """"""
    result = set([
        x,
        x.replace(', ', '-'),
        x.replace(', ', ' AT ').replace('-', ' '),
        x.replace(' & ', '&'),
        x.replace(', ', ' '),
        ' '.join(e for e in x.split() if e != 'THE').replace(' - ', '–'),
        x.replace(' AT ', ' '),
        x.replace(' AND ', ' & '),
        re.sub(r'\([^)]*\)', '', x).strip().replace('UNIVERSIDAD DE ', 'UNIVERSITY OF ').replace('UNIVERSIDADE DE ', 'UNIVERSITY OF '),
        unidecode.unidecode(x),
        x.split(',')[0],
        x.replace('-', '–'),
        x.replace(',', ' '),
        x.replace(' & ', ' AND '),
        x.replace(' - ', ' '),
        x.replace(' AND ', ' '),
        x.replace('IIT ', 'INDIAN INSTITUTE OF TECHNOLOGY ')
    ])
    return sorted(result, key=lambda x: len(x), reverse=True)

def match_with_mag(at_loc):
    """"""
    x = html.unescape(at_loc).strip().rstrip('.')
    if x in loc_alias:
        xa = loc_alias[x]
        if xa in mag_dis_name_id:
            return mag_dis_name_id[xa], xa
        if xa in mag_norm_name_id:
            return mag_norm_name_id[xa], xa
    for e in get_at_loc_var(x):
        if e in mag_dis_name_id:
            return mag_dis_name_id[e], e
        if e in mag_norm_name_id:
            return mag_norm_name_id[e], e
    for kw in [' SCHOOL ', ' COLLEGE ']:
        if kw in x:
            for e in get_at_loc_var(x[:x.index(kw)]):
                if e in mag_dis_name_id:
                    return mag_dis_name_id[e], e
                if e in mag_norm_name_id:
                    return mag_norm_name_id[e], e
    try:
        e = re.findall('\((.*?)\)', x)[0]
        if e in mag_dis_name_id:
            return mag_dis_name_id[e], e
        if e in mag_norm_name_id:
            return mag_norm_name_id[e], e
    except:
        pass
    if ',' in x:
        e = x.split(',')[1]
        if e in mag_dis_name_id:
            return mag_dis_name_id[e], e
        if e in mag_norm_name_id:
            return mag_norm_name_id[e], e
    return '', ''

def add_mag_affil():
    """"""
    a, b = [], []
    for loc in loc_freq_df.location:
        x, y = match_with_mag(loc)
        a.append(x)
        b.append(y)
    loc_freq_df['mag_affil_id'] = a
    loc_freq_df['mag_affil'] = b

add_mag_affil()
loc_freq_df.shape

(27180, 4)

In [13]:
(loc_freq_df.mag_affil_id == '').sum()

20099

In [14]:
loc_freq_df[loc_freq_df.mag_affil_id == ''].freq.sum()

66953

In [15]:
loc_freq_df[loc_freq_df.mag_affil_id == ''].head(10)

Unnamed: 0,location,freq,mag_affil_id,mag_affil
354,UNIVERSITÉ PARIS 8,262,,
360,THE UNION INSTITUTE,254,,
375,UNIVERSITY OF ILLINOIS,230,,
403,UNIVERSIDAD POLITECNICA DE CARTAGENA (SPAIN),197,,
422,POLYTECHNIC UNIVERSITY,182,,
423,PARIS 3,181,,
435,UNIVERSITÉ PARIS 5,175,,
448,UNIVERSITY PARIS 11,168,,
456,UNIVERSITY LILLE 1,164,,
459,MEDICAL COLLEGE OF GEORGIA,163,,


In [16]:
[k for k in mag_dis_name_id if 'Dresden University of Technology'.upper() in k]

['DRESDEN UNIVERSITY OF TECHNOLOGY']

In [18]:
loc_freq_df.to_hdf('dataset/locations_df.h5', key='df', mode='w')

In [19]:
def update_mag_affil_col():
    """"""
    loc_to_mag_affil = dict(zip(loc_freq_df.location, loc_freq_df.mag_affil))
    loc_to_mag_affil_id = dict(zip(loc_freq_df.location, loc_freq_df.mag_affil_id))
    people_df['MAGInstitution'] = people_df.location.apply(lambda x: loc_to_mag_affil.get(x.upper(), ''))
    people_df['MAGInstitutionID'] = people_df.location.apply(lambda x: loc_to_mag_affil_id.get(x.upper(), ''))
    mentorship_df['MAGInstitution'] = mentorship_df.location.apply(lambda x: loc_to_mag_affil.get(x.upper(), ''))
    mentorship_df['MAGInstitutionID'] = mentorship_df.location.apply(lambda x: loc_to_mag_affil_id.get(x.upper(), ''))

update_mag_affil_col()

In [22]:
people_df.to_hdf('dataset/people_df.h5', key='df', mode='w')

In [23]:
mentorship_df.to_hdf('dataset/connect_df.h5', key='df', mode='w')