## This file merges the cleaned UC, Texas, Michigan, and Illinois salary datasets, assigns unique person IDs across the merged data, and standardizes fields / departments.

In [1]:
import pandas as pd
import numpy as np
import os

# Work from the project folder so the relative data paths below resolve.
os.chdir('/Users/apple/Desktop/research_fellow_documents/merge_sum_salary/')

# Read the four per-system salary files; the paths are listed in the same
# order as the names they are unpacked into.
uc, texas, michigan, illinois = map(pd.read_csv, (
    'salary_data/uc_salary.csv',
    'salary_data/texas_salary.csv',
    'salary_data/michigan_salary.csv',
    'salary_data/illinois_salary.csv',
))

In [2]:
# Total number of rows across the four input tables (displayed as cell output).
sum(len(frame) for frame in (illinois, texas, michigan, uc))

664921

In [3]:
# Remove the columns that are not carried into the merged dataset.
texas = texas.drop(['rid'], axis=1)
michigan = michigan.drop(['rid', 'gross_notnull', 'title_prof'], axis=1)
illinois = illinois.drop(['rid'], axis=1)
uc = uc.drop(['yrid'], axis=1)

# standardize names
# Illinois and Texas already have a single `name` column: upper-case it.
for frame in (illinois, texas):
    frame['name'] = frame['name'].str.upper()

# UC and Michigan have separate name parts: build `name` as
# first_name + ' ' + last_name.
# NOTE(review): these two are not upper-cased here, unlike Illinois/Texas —
# confirm the source files are already upper case.
for frame in (uc, michigan):
    frame['name'] = frame['first_name'] + ' ' + frame['last_name']

# standardize ranks
# Any Texas title containing "Assistant" or "ASST" (case-insensitive) is
# ranked 'Assistant'; all other rows keep their existing rank.
is_assistant = texas['title'].str.contains(
    'Assistant|ASST', case=False, na=False, regex=True
)
texas['rank'] = np.where(is_assistant, 'Assistant', texas['rank'])

In [4]:
# Label every row with the university system its table came from.
system_labels = (
    (uc, 'UC'),
    (michigan, 'Michigan'),
    (illinois, 'Illinois'),
    (texas, 'Texas'),
)
for frame, label in system_labels:
    frame['univ_sys'] = label

### Create the prof_type column in the UC data by recoding cto

In [5]:
# UC `cto` values grouped by the prof_type they are recoded to.
_cto_by_prof_type = {
    'Professorial': [
        'Professorial-Tenure',
        'Professorial-Non-Tenure',
        'Professor in Residence',
        'Professorial-Recall',
        'Acting Professor-Senate',
        'Acting Professor-Non-Senate',
        'Miscellaneous Titles-Single Titles',
        'Other Research',
    ],
    'Clinical': [
        'Health Sciences Clinical Professor',
        'Professor of Clinical',
        'Clinical Professor - Volunteer',
        'Clin Prof-Dentistry-50%+/Tenure',
    ],
    'Adjunct': ['Adjunct Professor'],
    'Visiting': ['Visiting Professor'],
    'Emeritus': ['Professorial-Emeritus'],
}

# Flatten the grouped table into a direct cto -> prof_type lookup.
uc_maps = {
    cto: prof_type
    for prof_type, cto_values in _cto_by_prof_type.items()
    for cto in cto_values
}

# cto values that are not in the lookup become NaN in prof_type.
uc['prof_type'] = uc['cto'].map(uc_maps)

Create unique person IDs for the merged dataset

In [6]:
# Stack the four systems into one table with a fresh 0..n-1 index.
frames = [uc, michigan, texas, illinois]
df = pd.concat(frames, ignore_index=True)

# Replace uid with one integer per distinct (uid, univ_sys) pair, so equal
# uid values coming from different systems get different IDs.
# NOTE(review): rows with a null uid fall outside every group and do not get
# a regular group number — confirm uid is never missing.
df['uid'] = df.groupby(['uid', 'univ_sys']).ngroup()

# Number of distinct IDs per university system (displayed as cell output).
df.drop_duplicates(subset=['uid'])['univ_sys'].value_counts()

Texas       101505
UC           37918
Illinois     23359
Michigan     12607
Name: univ_sys, dtype: int64

### Make department to field mapping consistent across universities

1. Hand-check the department-to-field matching using dept2field.xlsx.

In [150]:
# Count the rows for every observed (department, field, cip_code) combination
# and export the table to Excel.
group_keys = ['department', 'field', 'cip_code']
combo_counts = df.groupby(group_keys)['uid'].count()
dept2field = combo_counts.reset_index()
dept2field.to_excel('cip_check/dept2field.xlsx', index=False)

2. Create two levels of field matches: field_macro using the 2-digit CIP code and field_micro using the 4-digit CIP code.

In [7]:
# Load the checked department -> field sheet.
dm = pd.read_excel('cip_check/dept2field_check.xlsx')

# A non-null field_check overrides the original field.
has_correction = dm['field_check'].notnull()
dm['field_new'] = np.where(has_correction, dm['field_check'], dm['field'])

# field -> cip_code lookup, using the first code listed for each field.
codes = dm[['field', 'cip_code']].drop_duplicates(['field'])
codes_dict = dict(zip(codes.field, codes.cip_code))

# Look up the code of the (possibly corrected) field and keep it as text.
# NOTE(review): if cip_code was read as a number, the text form may also lose
# trailing zeros (e.g. 45.10 -> '45.1') — confirm against the sheet.
dm['cip_code_new'] = dm['field_new'].map(codes_dict).astype('str')

# The part before the '.' is the macro code. Where it is only one character
# long, prefix the full code with '0' so the macro part has two characters.
integer_part = dm['cip_code_new'].str.split('.').str[0]
dm['cip_code_macro'] = integer_part
needs_zero = integer_part.str.len() == 1
print(len(dm[needs_zero]))
dm['cip_code_new'] = np.where(
    needs_zero, '0' + dm['cip_code_new'], dm['cip_code_new']
)

# Recompute the macro code from the padded values.
dm['cip_code_macro'] = dm['cip_code_new'].str.split('.').str[0]

182


In [8]:
# Load the CIP code list, which provides a title (CIPTitle) for each CIPCode.
cip = pd.read_csv('cip_check/CIPCode2020.csv')

# Remove every '=' and '"' character from the codes (presumably an
# Excel-style ="01.0101" wrapper — confirm against the file).
# A character class is used instead of the former '=|"|' pattern, whose
# trailing '|' added an empty alternative that matched at every position.
cip['CIPCode'] = cip['CIPCode'].str.replace(r'[="]', '', regex=True)

# CIPCode -> CIPTitle lookup.
cip_zip = cip[['CIPCode', 'CIPTitle']]
dc = dict(zip(cip_zip.CIPCode, cip_zip.CIPTitle))

# field_macro: title of the macro code (the part before the '.').
dm['field_macro'] = dm['cip_code_macro'].map(dc)

# field_micro: title of the first five characters of the code ("XX.XX").
dm['cip_code_micro'] = dm['cip_code_new'].str[0:5]
dm['field_micro'] = dm['cip_code_micro'].map(dc)

# The full code is kept under the name cip_code_raw.
dm.rename(columns={'cip_code_new': 'cip_code_raw'}, inplace=True)

In [9]:
# Columns of the department mapping that are attached to the salary rows.
keep = [
    'department',
    'cip_code_macro', 'cip_code_micro', 'cip_code_raw',
    'field_macro', 'field_micro',
]

# Keep exactly one mapping row per department (the first one listed), so the
# left merge below cannot add rows.
dm2 = dm[keep].drop_duplicates().drop_duplicates(['department'])

# Row counts before and after the merge are printed for comparison.
print(len(df))
salary_without_cip = df.drop(columns=['cip_code'])
df2 = salary_without_cip.merge(dm2, on=['department'], how='left')
print(len(df2))

# Final column selection and order.
output_columns = [
    'university', 'uid', 'yr', 'first_name', 'last_name', 'department',
    'field', 'field_macro', 'field_micro', 'title',
    'rank', 'gross_pay', 'regular_pay', 'overtime_pay', 'other_pay',
    'suspicious', 'gross_pay_sum', 'regular_pay_sum', 'name', 'univ_sys',
    'prof_type', 'fte', 'pay_term', 'gen_fund', 'college', 'division',
    'gen_fund_sum', 'college_code', 'department_code', 'emp_no',
    'multiple_jobs', 'fte_total', 'tenure', 'emplclass', 'cip_code_macro',
    'cip_code_micro', 'cip_code_raw', 'title_code', 'series', 'cto',
    'cto_code',
]
df2 = df2[output_columns]

664921
664921


In [None]:
# standardize names across universities

# Upper-case the existing first/last name columns for every system.
df2['first_name'] = df2['first_name'].str.upper()
df2['last_name'] = df2['last_name'].str.upper()

# Split by university system; only the Illinois subset is modified below.
dft = df2[df2['univ_sys'] == 'Texas']
# Explicit copy: assigning columns on a plain boolean-mask slice of df2 is
# chained assignment and raises SettingWithCopyWarning.
dfi = df2[df2['univ_sys'] == 'Illinois'].copy()
dfc = df2[df2['univ_sys'] == 'UC']
dfm = df2[df2['univ_sys'] == 'Michigan']

# Derive the Illinois name parts from `name`: the last name is the text
# before the first ', ', and the first name is the first space-separated
# token after the last ', ' (presumably names look like "LAST, FIRST MIDDLE"
# — confirm against the data).
name_parts = dfi['name'].str.split(', ')
dfi['first_name'] = name_parts.str[-1].str.split(' ').str[0]
dfi['last_name'] = name_parts.str[0]

# Re-assemble the four systems, sort, and write the merged salary file.
df2 = pd.concat([dft, dfi, dfc, dfm], ignore_index=True).sort_values(
    ['univ_sys', 'university', 'uid', 'last_name', 'first_name', 'yr']
)
df2.to_csv('salary_data/salary_all.csv', index=False)