In [163]:
import os
import warnings
warnings.simplefilter('ignore')

import pandas as pd
#pd.options.display.float_format = "{:,.2f}".format

import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

## Data Preprocessing - UC System Salary
### This notebook assigns a unique person ID to each individual in the UC salary data, based on their department, title series, salary, and year.

**Input: uc_salary (uc.dta)**

**Output: uc_salary_new.csv, with extra column "uid" for each unique individual.**

Deal with 3 cases:

**Case 1**: handle (university, name) pairs that show up in more than one department in the same year. These could be the same person affiliated with multiple departments, or different people with the same name in different departments. <br>
&nbsp;&nbsp;       - If these rows have the same (yr, yrid), then they must be the same person.<br>
&nbsp;&nbsp;       - If not, then they are not the same person.<br>
&nbsp;&nbsp;       - We assume that the same unique person must 1) have the same set of fields, which does not change over time, and 2) have the same payroll series over time, i.e. the same person must have the same "series" across years. This allows us to match names *across years*.

**Case 2**: for names not in **Case 1**, find the rows whose name shows up multiple times in the same department in the same year. This includes cases where no department information is available. Because the same name shows up more than once in the same year and the (yrid, yr) values differ, they must be different individuals. 
Then we make the same assumption as in **Case 1** to match names *across years*.

**Case 3**: all the remaining names. This is easy since each university-name pair is associated with only one observation each year.

## Case 1

In [128]:
# Work from the folder that holds the raw Stata export.
os.chdir('/Users/apple/Dropbox/web_scrapping_UC/temp/')

# Load the raw salary table, give the pay columns their final names, order the
# rows, and split off the first token of the first name. The untouched name is
# kept as "first_name_raw"; the shortened one takes over as "first_name".
uc_salary = (
    pd.read_stata('uc.dta')
    .rename(columns={'salary': 'regular_pay', 'salary_total': 'gross_pay'})
    .sort_values(['university', 'first_name', 'last_name', 'yr', 'title'])
    .assign(
        first_name_short=lambda frame: frame['first_name'].str.split(' ').str[0].str.strip()
    )
    .rename(columns={'first_name': 'first_name_raw', 'first_name_short': 'first_name'})
)

#uc_salary = uc_salary.reset_index().rename(columns = {'index':'rid'})
#uc_salary['uid'] = uc_salary.groupby(['university','first_name','last_name']).ngroup()

In [129]:
# Expose the current index labels as an explicit row id ("rid") so every row can
# be traced through the merges below.
uc_salary = uc_salary.reset_index().rename(columns = {'index':'rid'})

# Case 1: find (university, first_name, last_name, yr) combinations that appear
# with more than one distinct department. nunique() does not count missing
# departments.
tt2 = uc_salary.groupby(['university','first_name','last_name','yr'])['department'].nunique().reset_index()
tt2_find = tt2[tt2['department'] > 1][['university','first_name','last_name','yr']]

# Keep only the salary rows that belong to those name-year combinations.
uc_salary_b = uc_salary.merge(tt2_find, on = ['university','first_name','last_name','yr'], how = 'right')

# Step 1: rows that share the same (yr, yrid) are treated as the same person.
group_vars = ['yr','yrid']

# drop_duplicates keeps the first row of each (yr, yrid) pair and reset_index()
# turns that row's index label into a column; after the merge it is renamed to
# "uid" and serves as a provisional, per-year person id.
uc_salary_b2 = uc_salary_b.merge(uc_salary_b[group_vars].drop_duplicates(group_vars).reset_index(), on = group_vars)
uc_salary_b2 = uc_salary_b2.rename(columns = {'index':'uid'})

# Step 2: link provisional ids across years. For every uid, collect the distinct
# 'field' and 'series' values seen on its rows and attach them to each row.
uc_salary_b3 = uc_salary_b2.merge(uc_salary_b2.groupby(['uid']).agg({'field':'unique','series':'unique'}).reset_index().rename(columns={'field':'u_fields','series':'u_series'}), on = ['uid'], how = 'left')

# Turn the arrays of unique values into strings so they can be used as merge keys.
# NOTE(review): the string keeps the order of first appearance, so the same set
# of values seen in a different order produces a different key - confirm this is
# acceptable.
uc_salary_b3['u_series'] = uc_salary_b3['u_series'].apply(lambda x: x.tolist()).astype('str')
uc_salary_b3['u_fields'] = uc_salary_b3['u_fields'].apply(lambda x: x.tolist()).astype('str')
keys = ['university','first_name','last_name','u_fields','u_series']
# One representative row per provisional id.
uc_salary_b4 = uc_salary_b3.drop_duplicates(['uid'])

# Provisional ids that agree on university, name, field list and series list get
# the same "uid2" (again the index label of the first matching row).
uc_salary_b5 = uc_salary_b4.merge(uc_salary_b4[keys].drop_duplicates(keys).reset_index(), on=keys).rename(columns={'index':'uid2'})

# Attach uid2 to every Case 1 row through its provisional uid.
uc_salary_b6 = uc_salary_b3.merge(uc_salary_b5[['uid','uid2']], on = ['uid'], how='left')

## Case 2 and 3

In [130]:
# Rows that were not handled in Case 1 (their rid does not appear in uc_salary_b6).
uc_salary_a = uc_salary[~uc_salary['rid'].isin(uc_salary_b6.rid)]

# Case 2 candidates (a1): a (university, name, yr) combination that occurs on
# more than one row. Case 3 candidates (a2): combinations that occur exactly once.
uc_salary_a1 = uc_salary_a[uc_salary_a.duplicated(['university','last_name','first_name','yr'],keep=False)]
uc_salary_a2 = uc_salary_a.drop_duplicates(['university','last_name','first_name','yr'],keep=False)

# Expose a1's index labels as a "uid" column.
uc_salary_a1 = uc_salary_a1.reset_index().rename(columns = {'index':'uid'})
# a3: single-occurrence rows whose (university, name) also appears among the
# repeated names in a1; they are added to the Case 2 pool with an empty uid.
uc_salary_a3 = uc_salary_a2.merge(uc_salary_a1[['university','first_name','last_name']].drop_duplicates(), on = ['university','first_name','last_name'])
uc_salary_a3['uid'] = None

uc_salary_a4 = pd.concat([uc_salary_a1, uc_salary_a3], ignore_index = True)

#if uc_salary_a4 has two different uids then it's IMPOSSIBLE that the two rows are the same person

# Within the Case 2 pool, rows sharing university, name, series and department
# get the same "uid2" (the index label of the first row with that combination).
gps = ['university','first_name','last_name','series','department']
uc_salary_a5 = uc_salary_a4.merge(uc_salary_a4[gps].drop_duplicates(gps).reset_index(), on = gps).rename(columns={'index':'uid2'})
# The frame is written to disk and immediately read back, so uc_salary_a5 ends up
# with whatever dtypes read_csv infers.
# NOTE(review): presumably done to normalise dtypes before the later concat; the
# absolute Desktop path makes this cell machine-specific - confirm it is needed.
uc_salary_a5.to_csv('/Users/apple/Desktop/uc_salary_a5.csv', index=False)
uc_salary_a5 = pd.read_csv('/Users/apple/Desktop/uc_salary_a5.csv')

# Case 3: for single-occurrence rows the id depends only on (university, name).
# gps is deliberately re-bound to the shorter key list here.
gps = ['university','first_name','last_name']
uc_salary_a2 = uc_salary_a2.merge(uc_salary_a2[gps].drop_duplicates(gps).reset_index(), on=gps).rename(columns = {'index':'uid2'})

## Merge 3 cases together

In [131]:
# Tag the ids of each case with its own letter so that equal numbers coming from
# different cases can never be mistaken for the same person.
for case_letter, case_frame in (('a', uc_salary_a5), ('b', uc_salary_b6), ('c', uc_salary_a2)):
    case_frame['uid2'] = case_letter + case_frame['uid2'].astype('str')

# Stack the three cases and order the rows by their original row id.
uc_salary_new = pd.concat(
    [uc_salary_a2, uc_salary_b6, uc_salary_a5],
    ignore_index=True,
).sort_values(['rid'])

# A row id can show up in more than one case; separate those rows from the rest.
rid_is_repeated = uc_salary_new.duplicated(['rid'], keep=False)
uc_salary_new_a = uc_salary_new[rid_is_repeated]
uc_salary_new_b = uc_salary_new[~rid_is_repeated]

# For a repeated row id keep a single copy: after the descending sort that is the
# one with the largest uid2 string, so 'c...' wins over 'b...' and 'a...'.
# NOTE(review): confirm that preferring the 'c' id over the 'a' id is intended.
uc_salary_new_a = (
    uc_salary_new_a
    .sort_values(['rid', 'uid2'], ascending=False)
    .drop_duplicates(['rid'])
)

# Re-assemble, drop the helper row id and number the people consecutively.
uc_salary_new = pd.concat(
    [uc_salary_new_a, uc_salary_new_b],
    ignore_index=True,
).drop(columns=['rid'])
uc_salary_new['uid'] = uc_salary_new.groupby(['uid2']).ngroup()
uc_salary_new = uc_salary_new.sort_values(['uid']).drop(columns=['uid2'])


In [132]:
# Continue under the original name. This only rebinds the name: uc_salary and
# uc_salary_new now refer to the same DataFrame object, so in-place changes to
# one are visible through the other.
uc_salary = uc_salary_new

Mark an individual as suspicious if, within the same year, their `uid` is associated with more than one distinct `cto_code` or more than one distinct `rank`.

In [133]:
# --- Flag suspicious individuals -------------------------------------------
# Rows whose (uid, yr) pair occurs more than once.
dups = uc_salary[uc_salary.duplicated(['uid','yr'], keep=False)].sort_values(['uid','yr'])
# Number of distinct cto_code / rank values per person-year among those rows.
m1 = dups.groupby(['uid','yr']).agg({'cto_code': 'nunique', 'rank': 'nunique'}).reset_index()

# A uid is suspicious if any of its person-years carries more than one distinct
# cto_code or more than one distinct rank.
a1 = set(m1[m1['cto_code'] > 1]['uid'])
a2 = set(m1[m1['rank'] > 1]['uid'])

uc_salary['suspicious'] = uc_salary['uid'].isin(a1 | a2)

# --- Aggregate pay per person-year -----------------------------------------
df = uc_salary
# Attach the number of distinct gross_pay values per (uid, yr). Because both
# sides carry a 'gross_pay' column, the merge yields 'gross_pay_x' (the original
# value) and 'gross_pay_y' (the distinct-value count).
df = df.merge(df.groupby(['uid','yr'])['gross_pay'].nunique().reset_index(), on=['uid','yr'])

# Person-years with several distinct pay values are summed; person-years with a
# single distinct value are averaged, which returns that value for gross pay.
# NOTE(review): person-years whose gross_pay is entirely missing have a count of
# 0, fall into neither subset, and are dropped by the inner merge further down -
# confirm this is intended.
u1 = df[df['gross_pay_y'] > 1]
u2 = df[df['gross_pay_y'] == 1]

# Select the pay columns with a list. The former tuple-style selection
# (['gross_pay_x','regular_pay'] without the inner brackets) was deprecated in
# pandas 1.0 and raises an error in pandas 2.x.
pay_columns = ['gross_pay_x', 'regular_pay']
m1 = u1.groupby(['uid','yr'])[pay_columns].sum().reset_index()
m2 = u2.groupby(['uid','yr'])[pay_columns].mean().reset_index()
m3 = pd.concat([m1, m2], ignore_index=True)
m3.columns = ['uid','yr','gross_pay_sum','regular_pay_sum']

# Bring the person-year totals back onto every row and restore the original
# gross_pay column name.
df = df.merge(m3, on=['uid','yr'])
df = df.drop(columns=['gross_pay_y']).rename(columns={'gross_pay_x': 'gross_pay'})

## If person i shares the same (first_name, last_name) as person j in university k and payment series r, but person i's middle initial is missing, we assume i and j are the same person.

In [331]:
# Switch to the folder containing the salary file that already carries a uid.
os.chdir('/Users/apple/Dropbox/web_scrapping_UC/merge_sum_salary/salary_data')

# Load the file and keep the default index as an explicit row id ("rid").
uc = pd.read_csv('uc_salary.csv').reset_index().rename(columns={'index': 'rid'})

# Split first_name on spaces once; the first token goes to first_name_fp and the
# last token goes to first_name_sp.
first_name_tokens = uc['first_name'].str.split(' ')
uc['first_name_fp'] = first_name_tokens.str[0].str.strip()
uc['first_name_sp'] = first_name_tokens.str[-1].str.strip()

In [333]:
# When the first and the last token of first_name are identical (e.g. the name
# consists of a single token) there is no separate second part, so blank it out.
# Rows with a blank second part are the ones later treated as "missing middle
# initial".
second_part_absent = uc['first_name_fp'].eq(uc['first_name_sp'])
uc.loc[second_part_absent, 'first_name_sp'] = None

In [334]:
# Flag rows without a second name part.
uc['missing_mi'] = uc['first_name_sp'].isna()

# Per (first token, last name, university, series): how many rows lack the second
# part, and how many distinct uids the group currently contains.
person_keys = ['first_name_fp', 'last_name', 'university', 'series']
mis = (
    uc.groupby(person_keys)
    .agg({'missing_mi': 'sum', 'uid': 'nunique'})
    .reset_index()
)

In [335]:
# Key that defines "the same person" for this step: first token of the first
# name, last name, university and payroll series.
merge_keys = ['first_name_fp', 'last_name', 'university', 'series']

# Groups worth merging: at least one row lacks the second name part AND the group
# is currently split over more than one uid.
mis2 = mis[(mis['missing_mi'] >= 1) & (mis['uid'] > 1)]
mis3 = mis2[merge_keys]

# The helper flag is no longer needed; errors='ignore' keeps the cell re-runnable
# after the column has already been dropped.
uc = uc.drop(columns=['missing_mi'], errors='ignore')

# All rows that belong to a group selected above.
uc2 = uc.merge(mis3, on=merge_keys)

# One new id per group. ngroup() numbers groups from 0, while rows outside these
# groups reuse their existing uid as new_uid later in the notebook; starting the
# numbering past the largest existing uid keeps a merged group from sharing an id
# with an unrelated person.
uid_offset = uc['uid'].max() + 1
uc2['new_uid'] = uc2.groupby(merge_keys).ngroup() + uid_offset

In [338]:
# Rows that carry a department.
has_department = uc2['department'].notna()

# Number of distinct departments seen for every new_uid (missing ones ignored).
id2dept = (
    uc2.loc[has_department]
    .groupby(['new_uid'])['department']
    .nunique()
    .reset_index()
)

# new_uids that are tied to exactly one department.
id2dept_unique = set(id2dept.loc[id2dept['department'].eq(1), 'new_uid'])

# One (new_uid, department) row for each of those ids.
match = (
    uc2.loc[has_department & uc2['new_uid'].isin(id2dept_unique)]
    .drop_duplicates(['new_uid', 'department'])
)

In [339]:
# Lookup from new_uid to its single known department.
match2 = dict(zip(match['new_uid'], match['department']))

# Split the merged groups into ids with exactly one known department (uc2a) and
# all others (uc2b).
has_unique_dept = uc2['new_uid'].isin(id2dept_unique)
# Take an explicit copy: uc2a is modified right below, and assigning into a
# filtered slice is chained assignment (SettingWithCopyWarning, hidden here by the
# global warnings filter).
uc2a = uc2[has_unique_dept].copy()
uc2b = uc2[~has_unique_dept]

# Give every row of such an id that one department, which also fills rows whose
# department was missing.
uc2a['department'] = uc2a['new_uid'].map(match2)

uc3 = pd.concat([uc2a, uc2b], ignore_index = True)

In [340]:
# Rows that were not part of any merged group keep their existing uid as new_uid.
# .copy() makes uc4 an independent frame, so the column assignment below is not a
# chained assignment on a slice of uc.
uc4 = uc[~uc['rid'].isin(uc3.rid)].copy()
uc4['new_uid'] = uc4['uid']

# Full data set again: merged groups first, untouched rows after.
uc5 = pd.concat([uc3, uc4], ignore_index= True)

In [345]:
# Distinct (department, cip_code, field) combinations taken from rows that have a
# cip_code.
tf = uc5.loc[uc5['cip_code'].notna(), ['department', 'cip_code', 'field']].drop_duplicates()

# Department-level lookups; if a department occurs with several values, the last
# one listed in tf is the one kept by dict().
dept2code = dict(zip(tf['department'], tf['cip_code']))
dept2field = dict(zip(tf['department'], tf['field']))

# Re-derive cip_code and field for every row from its department. Rows whose
# department is not a key of the lookup end up with a missing value.
for target_column, lookup in (('cip_code', dept2code), ('field', dept2field)):
    uc5[target_column] = uc5['department'].map(lookup)

# Remove the helper name columns and write the result.
uc5 = uc5.drop(columns=['first_name_fp', 'first_name_sp'])
uc5.to_csv('/Users/apple/Dropbox/web_scrapping_UC/merge_sum_salary/salary_data/uc_salary_updated.csv', index=False)