In [None]:
import numpy as np 
import pandas as pd 
import pylab as plt
import matplotlib as mpl
import nltk

The first thing we will do after loading the data is to create two columns where we will start 
the process of normalizing names and job titles.  

In [None]:
# low_memory=False has pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
salaries_csv = "../input/Salaries.csv"
df = pd.read_csv(salaries_csv, low_memory=False)

Here's a simple example of why we need to do this. It is completely plausible that all of 
the following records belong to the same person.  However there are 3 different strings for 
EmployeeName and two for JobTitle.  

In [None]:
# Three spellings that plausibly belong to one and the same employee
zula_variants = ['ZULA JONES', 'Zula Jones', 'Zula M Jones']
df.loc[df.EmployeeName.isin(zula_variants)]

In 2011, all caps seemed to be enforced for data entry. After 2011 mixed case went into use. 
Furthermore in 2013 middle initials were added to names.  Job titles were shifted in 2012 to 
contain more abbreviations and fewer words.  Other titles were shifted completely in this 
transform.  For example 'mental health rehabilitation worker' becomes 'patient care assistant'.

In [None]:
# For each year, print the first few name/title pairs in alphabetical name order
years = pd.unique(df.Year)
for y in years:
    print("YEAR: ",y)
    tempdf = df[df.Year == y].sort_values('EmployeeName')
    first_rows = tempdf[['EmployeeName','JobTitle']].head()
    print(first_rows)
    print('\n')

First steps in regularization: force everything to lower case, remove any extra spaces, and 
replace all '.' in names with spaces.  Additionally, replace all '.' in job titles with spaces and 
make sure there is a space after commas.

In [None]:
# Regularize names and titles: lower case, '.' -> space, whitespace collapsed.
# regex=False is passed explicitly: '.' is a regex wildcard, and the default of
# the `regex` argument differs between pandas versions, so a literal match must
# not be left to the default.
df['ename'] = (
    df.EmployeeName
    .str.replace('.', ' ', regex=False)
    .str.lower()
    .str.split()      # split/join collapses runs of whitespace to single spaces
    .str.join(' ')
)
df['jtitle'] = (
    df.JobTitle
    .str.replace('.', ' ', regex=False)
    .str.replace(',', ', ', regex=False)   # guarantee a space after every comma
    .str.lower()
    .str.split()
    .str.join(' ')
)

Job Titles
----------

If we look at the regularized JobTitle over the years perhaps we can figure out which ones are 
obvious matches.  Once this is done we can check out things like which job titles are gaining
or losing employees (see 'library page' versus 'public service trainee'), which titles are added or 
removed ('chief investment officer' anyone? and that title makes a lot of money), which job 
titles are just plain weird (what's a 'wharfinger' anyone?) etc.

In [None]:
# Rows: regularized job title; columns: year; values: number of records
# (0 where a title does not occur in a year).
jydf = df.groupby(['jtitle','Year']).size().unstack().fillna(0)
# Smallest yearly count per title; 0 means the title is missing in at least one year.
jydf['min_counts'] = jydf.min(axis = 1)
# reindex instead of .loc: the list mixes roman and arabic spellings on purpose,
# and .loc raises KeyError as soon as one of the labels is not in the index.
# reindex returns an all-NaN row for an absent title instead.
jydf.reindex(['library page', 'public service trainee', 'chief investment officer', 'wharfinger i','wharfinger 1','wharfinger ii','wharfinger 2'])

In [None]:
# Every title whose text contains the substring 'lib'
has_lib = jydf.index.str.contains('lib', regex=False)
jydf.loc[has_lib]

Taking a small gossipy side trip into the 'chief investment officer', first we check for 'investment'
in the title to see if there have been any title changes.

In [None]:
# Records whose regularized title mentions 'investment'
mentions_investment = df.jtitle.str.contains('investment')
df.loc[mentions_investment]

No title changes that involve investments, albeit another name to investigate. 
When we check to see if the 'chief investment officer' 
held other city jobs before coming on part-time in 2014 for $339653.70, we find that there is 
only one person with coaker in their name in the data.

In [None]:
# Records whose regularized name contains 'coaker'
is_coaker = df.ename.str.contains('coaker')
df.loc[is_coaker]

In looking into Mr. Coaker, we noticed that Mr. Shaw's title contains 'ret' which often means
retired (and if this is the case it would be well paid).  However it seems that in shifting 
his title in 2012, they may have abbreviated a different word because he is 'manager 8' in 
both 2011 and 2014.

In [None]:
# Names starting with 'robert', then at most three arbitrary characters, then 'shaw'
is_robert_shaw = df.ename.str.contains(r'^robert.?.?.?shaw')
df.loc[is_robert_shaw]

Continuing on with job titles, most of the job titles were changed between 2011 and 2012.  If we 
just look at 3 years instead of 4, excluding a single year, we see that most titles stayed in use
when 2011 is excluded.

In [None]:
# Leave out one year at a time and count the titles used in every remaining
# year versus the titles missing from at least one of them.
years = [2011,2012,2013,2014]
print("Excluded Year\t No. of Titles in use all other years\t No. of Titles not always in use")
for y in years:
    jy = (
        df[df.Year != y]
        .groupby(['jtitle','Year'])
        .size()
        .unstack()
        .fillna(0)
    )
    jy['min_counts'] = jy.min(axis = 1)
    always_used = (jy.min_counts > 0).sum()
    not_always_used = (jy.min_counts == 0).sum()
    print("%9d\t %19d\t %36d"%(y, always_used, not_always_used))

The changes between 2011 and 2012 reflect not just a change in case, but also various abbreviations, concatenations, 
and substitutions.

In [None]:
# A few employees whose title text changes from year to year
example_names = ['aaron craig','carolina reyes ouk','zakhary mallett','ziran zhang']
(
    df.loc[df.ename.isin(example_names), ['ename','jtitle','Year']]
    .sort_values(['ename','Year'])
)

For the further regularization of job titles, we will try to do two things. First we replace roman 
numerals with arabic numerals when reasonable (including i and ii when they stand alone as a word). This will allow us to 
potentially assign a level to various jobs.  Second we replace a few of the 
easily identifiable abbreviations/misspellings introduced in 2012.  We will neither strive to nor
manage completeness in the second.  Without the use of something like fuzzywuzzy to help score
500+ potential valid title matches, finding the majority is not going to happen. Other issues 
not addressed include dropped or added words or phrases and unclear abbreviations.

In [None]:
# 'sergeant' titles that are absent from at least one year
is_sergeant = jydf.index.str.contains('sergeant', regex=False)
missing_some_year = (jydf.min_counts == 0).to_numpy()
jydf.loc[is_sergeant & missing_some_year]

In [None]:
#Replacement dictionaries for roman numerals
#Both use the nested {column: {pattern: replacement}} form and target the
#'jtitle' column; the keys are regular expressions.
#rdict: numerals inside a title, delimited by spaces or followed by a comma,
#plus a few patterns (' iii', ' iv', ' xiv', ' xxii') that are not anchored on
#the right and therefore match wherever that text occurs.
rdict = {'jtitle': {r' iii': r' 3',
                    r' ii ': r' 2 ',
                    r' i ': r' 1 ',
                    r' ii, ': r' 2, ',
                    r' i, ': r' 1, ',
                    r' iv, ': r' 4, ',
                    r' v, ': r' 5, ',
                    r' vi, ': r' 6, ',
                    r' vii, ': r' 7, ',
                    r' viii, ': r' 8, ',
                     r' v ': r' 5 ',
                    r' vi ': r' 6 ',
                    r' vii ': r' 7 ',
                    r' viii ': r' 8 ',
                    r' iv': r' 4',
                    r' xiv': r' 14',
                    r' xxii': r' 22'}}
#rdict2: numerals i through xviii at the very end of a title ('$' anchor).
rdict2 = {'jtitle':{r' i$': r' 1',
                    r' ii$': r' 2',
                    r' iii$': r' 3',
                    r' iv$': r' 4',
                   r' v$': r' 5',
                   r' vi$': r' 6',
                   r' vii$': r' 7',
                   r' viii$': r' 8',
                   r' ix$': r' 9',
                   r' x$': r' 10',
                   r' xi$': r' 11',
                   r' xii$': r' 12',
                   r' xiii$': r' 13',
                   r' xiv$': r' 14',
                   r' xv$': r' 15',
                   r' xvi$': r' 16',
                   r' xvii$': r' 17',
                   r' xviii$': r' 18'}}

# Apply the in-title numerals first, then the end-of-title numerals, and
# finally tidy any ' , ' into ', '. Each step returns a new frame.
ndf = (
    df.replace(rdict, regex=True)
    .replace(rdict2, regex=True)
    .replace({'jtitle':{r' , ': r', '}}, regex=True)
)

In [None]:
#Visual check line: distinct titles that start with 'supv'
starts_with_supv = ndf.jtitle.str.contains('^supv')
ndf.loc[starts_with_supv, 'jtitle'].unique()

In [None]:
#Replacement dictionary for abbr. and misspellings
#Nested {column: {pattern: replacement}} form; it is passed to replace with
#regex=True below, so every key is a regular expression matched within 'jtitle'.
#Keys without a leading space or a '$' anchor (e.g. 'asst', 'engr', 'wrk')
#match wherever that text occurs, including inside longer words.
#NOTE(review): 'asst ' looks redundant next to the broader 'asst' entry, and
#'emergencycy' looks like cleanup for a doubled expansion -- confirm before pruning.
adict = {'jtitle': {
                   r'asst': r'assistant',
                   r'dir ': r'director ',
                   r' sprv ': r' supervisor ',
                   r' sprv$': r' supervisor',
                   r'sprv1': r'supervisor 1',
                   r'qualitytech': r'quality technician',
                   r'maint ': r'maintenance ',
                   r'asst ': r'assistant ',
                   r'emerg ': r'emergency ',
                   r'emergencycy': r'emergency',
                   r'engr': r'engineer',
                   r'coord ': r'coordinator ',
                   r'coord$': r'coordinator',
                   r' spec ': r' specialist ',
                   r' spec$': r' specialist',
                   r' emp ': r' employee ',
                   r' repr$': r' representative', 
                   r' repres$': r' representative',
                   r' representat$': r' representative', 
                   r' - municipal transportation agency': r', mta',
                   r'safetycomm': r'safety communications',
                   r'trnst': r'transit',
                   r'wrk': r'worker',
                   r'elig ': r'eligibility '}}

#ndf2 is a new frame; ndf (numerals converted only) is left unchanged.
ndf2 = ndf.replace(adict, regex=True, inplace=False)

#Unfortunately there are enough ambiguous abbreviations that we either need to switch to 
#trying to use nltk more or we need to convert specific jobs... 


In [None]:
# Title-by-year record counts after numeral AND abbreviation replacement,
# plus the smallest yearly count per title.
jydf4 = (
    ndf2.groupby(['jtitle','Year'])
    .size()
    .unstack()
    .fillna(0)
)
jydf4 = jydf4.assign(min_counts=jydf4.min(axis = 1))

In [None]:
# Title-by-year record counts after numeral replacement only,
# plus the smallest yearly count per title.
jydf3 = (
    ndf.groupby(['jtitle','Year'])
    .size()
    .unstack()
    .fillna(0)
)
jydf3 = jydf3.assign(min_counts=jydf3.min(axis = 1))

A quick check shows that converting roman numerals to arabic numerals increased the number of job titles in use
all four years by around 100 titles and adding in the above abbreviations added another 45 titles 
in continuous use.

In [None]:
# Shape of the "used in every year" subset: original titles, after numeral
# replacement, and after abbreviation replacement.
for title_counts in (jydf, jydf3, jydf4):
    used_every_year = title_counts[title_counts.min_counts > 0]
    print(used_every_year.shape)

In [None]:
#Replacement dictionary for titles
#Whole-title substitutions, applied as regular expressions to 'jtitle'.
title_map = {
    'water quality technician 3': 'water quality tech 3',
    'water construction and maintenance superintendent': 'water const&main supt',
    'track maintenance superintendent, municipal railway': 'track maintenance supt, muni railway',
}
title_dict = {'jtitle': title_map}

ndf3 = ndf2.replace(title_dict, regex=True)



In [None]:
# Title-by-year record counts after the whole-title substitutions,
# plus the smallest yearly count per title.
jydf5 = (
    ndf3.groupby(['jtitle','Year'])
    .size()
    .unstack()
    .fillna(0)
)
jydf5 = jydf5.assign(min_counts=jydf5.min(axis = 1))

In [None]:
# Shape of the subset of titles that occur in every year
used_every_year = jydf5[jydf5.min_counts > 0]
print(used_every_year.shape)

The following is a quick side trip into how you could go about matching up titles by looking for 
enames with different titles in 2011 and 2012.

In [None]:
# Name, title and year for the 2011 and 2012 records only
in_2011_or_2012 = ndf2.Year.isin([2011,2012])
gad = ndf2.loc[in_2011_or_2012, ['ename','jtitle','Year']]
print(gad.head())

In [None]:
# Candidate names: exactly two records in gad, and each of that name's
# (name, title) pairs occurs only once.
rows_per_name = gad.groupby('ename').size()
two_row_names = rows_per_name[rows_per_name == 2].index
pair_counts = (
    gad[gad.ename.isin(two_row_names)]
    .groupby(['ename','jtitle'])
    .size()
)
single_pairs = pair_counts[pair_counts == 1]
cand = single_pairs.unstack().index

In [None]:
# Keep only the records of the candidate names
is_candidate = gad.ename.isin(cand)
matchers = gad[is_candidate]

In [None]:
# Put each name's records next to each other, in year order
matchers.sort_values(by=['ename','Year'])

Employee Name
------------

In 2013 middle initials were added to many names. In order to match names between 2012 and 2013, 
we need columns for the start of the name and the end of the name.  If the name happens to end 
with 'jr', 'ii', or 'iii', we include the last two words at the end of the name.


In [None]:
def find_name_end(name, suffixes=('jr', 'ii', 'iii')):
    """Return the trailing part of a whitespace-separated name.

    Normally this is the last word. When the last word is one of ``suffixes``
    and the name has more than two words, the last two words are returned
    joined by a single space (e.g. 'smith jr'), so the surname is kept.

    Parameters
    ----------
    name : str
        Name to inspect; it is compared as given, without case folding.
    suffixes : iterable of str, optional
        Trailing words that are not a surname on their own.

    Returns
    -------
    str
        The name ending, or '' when ``name`` contains no words at all.
    """
    words = name.split()
    if not words:
        # Empty or whitespace-only name: there is no last word to index.
        return ''
    last = words[-1]
    # Require more than two words so that a two-word name such as 'smith jr'
    # still yields just its last word.
    if last in suffixes and len(words) > 2:
        return ' '.join(words[-2:])
    return last

In [None]:
# First word and trailing part of each regularized name.
# Both columns are derived from ndf3's own 'ename' column, so the assignment
# does not depend on a different frame sharing ndf3's index.
ndf3['ename_start'] = ndf3.ename.apply(lambda x: x.split()[0])
ndf3['ename_end'] = ndf3.ename.apply(find_name_end)

Next to see how much trouble we are going to have with the matching, let's pull out some special
cases.  Specifically, cases where the same ename start and ename end appear multiple times in the
same year.

In [None]:
# Records per (name start, name end) in each year; 0 where a combination does
# not occur in a year.
three_pint = ndf3.groupby(['ename_start','ename_end','Year'])
replicates = three_pint.size().unstack().fillna(0)
# Combinations that occur more than once within some year
repeated_in_a_year = replicates.max(axis = 1) > 1
replicates[repeated_in_a_year]

A question we'd like to be able to consider is of these 3205 possible name matches how many 
of these combinations represent multiple people who happen to have the same name match versus 
how many represent the same person being listed for multiple jobs.  Both seem to happen.

In [None]:
# All 2014 records for the zenaida/cajilig combination, ordered by title
cajilig_2014 = three_pint.get_group(('zenaida','cajilig',2014))
cajilig_2014.sort_values('jtitle')

In [None]:
# All 2014 records for the yu/huang combination, ordered by title
huang_2014 = three_pint.get_group(('yu','huang',2014))
huang_2014.sort_values('jtitle')

In [None]:
# Name combination with the most records in 2011. idxmax returns the index
# label (the name pair); argmax would only return its integer position.
replicates[2011].idxmax()

In [None]:
# Count the records for every name ending that finishes in ' ii'
ends_with_ii = ndf3.ename_end.str.contains(r' ii$')
(
    ndf3.loc[ends_with_ii, ['ename','ename_start','ename_end']]
    .groupby(['ename_end','ename_start'])
    .size()
)

In [None]:
# Walk the name combinations that have more than four records and print the
# 51st through 89th of them for inspection.
tp = ndf3.groupby(['ename_start','ename_end'])
count = 0
for name, g in tp:
    if len(g) <= 4:
        continue
    count += 1
    if 50 < count < 90:
        print(name)
        print(g[['ename','jtitle','Year']])
        print(' ')

In [None]:
# Name combinations that never occur more than once within a year
never_repeated = replicates.max(axis = 1) < 2
replicates[never_repeated]

In [None]:
# Never more than one record in a year, and four records in total
never_repeated = replicates.max(axis = 1) < 2
four_in_total = replicates.sum(axis = 1) == 4
replicates[never_repeated & four_in_total]

pd.pivot_table(ndf3, values='Year', index=['ename_start','ename_end'], columns=['BasePay'])

In [None]:
# Latest year in which each name combination appears
by_name = ndf3.groupby(['ename_start','ename_end'])
by_name.apply(lambda grp: grp.Year.max())

In [None]:
def playa(g):
    """Return the mean BasePay for each Year found in the frame ``g``."""
    def mean_base_pay(rows):
        return rows.BasePay.mean()

    per_year = g.groupby('Year')
    return per_year.apply(mean_base_pay)
# Mean BasePay per year for every name combination
name_groups = ndf3.groupby(['ename_start','ename_end'])
name_groups.apply(playa)


In [None]:


# Records whose regularized title contains 'librarian 3'
is_librarian_3 = ndf3.jtitle.str.contains('librarian 3')
lib3 = ndf3[is_librarian_3]

In [None]:
# Summary statistics for the 2014 subset of lib3
lib3_2014 = lib3.loc[lib3.Year == 2014]
lib3_2014.describe()

In [None]:
# Pay columns for lib3, ordered so one person's yearly records sit together
pay_columns = ['EmployeeName','BasePay','TotalPay','Year']
chared = lib3.sort_values(['ename_end','ename_start','Year'])[pay_columns]

In [None]:
# Display the (rows, columns) shape of chared
chared.shape

In [None]:
# Display chared itself
chared

In [None]:
# Print chared in chunks of 60 rows, each preceded by its starting offset.
# Positional slicing is used; chared's index labels come from the frame read
# from the CSV without an index column, so positions and labels pick the same rows.
for i in range(0,70,60):
    print(i)
    print(chared.iloc[i:i+60])

In [None]:
# Distinct name endings and distinct full names that occur in lib3
checkers = lib3['ename_end'].unique()
check2 = lib3['ename'].unique()
check2



In [None]:
# For each name ending seen in lib3, print every record (any title) of the
# lib3 names that share that ending.
shown_columns = ['EmployeeName','JobTitle','BasePay','OtherPay','Year','Status']
sort_keys = ['ename_end','ename_start','Year']
is_lib3_name = ndf3.ename.isin(check2)
for n in checkers:
    tempt = ndf3[(ndf3.ename_end == n) & is_lib3_name]
    print (tempt.sort_values(sort_keys)[shown_columns])

In [None]:
# Records with first name 'camille' and a name ending containing 'arr'
first_is_camille = ndf3.ename_start == 'camille'
end_has_arr = ndf3.ename_end.str.contains('arr')
ndf3[first_is_camille & end_has_arr]

In [None]:
# Records with first name 'richard' and name ending 'le'
first_is_richard = ndf3.ename_start == 'richard'
end_is_le = ndf3.ename_end == 'le'
ndf3[first_is_richard & end_is_le]

In [None]:
# Display lib3 itself
lib3

In [None]:
# Box plots of the three pay columns of lib3, grouped by year
pay_by_year = lib3[['BasePay','TotalPay','TotalPayBenefits','Year']]
pay_by_year.boxplot(by='Year')

In [None]:
# Summary statistics for lib3 across all years
lib3.describe()