In [69]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import gender_guesser.detector as gender
# this allows plots to appear directly in the notebook
%matplotlib inline
plt.rcParams['figure.figsize'] = (21,15)
import matplotlib.patches as mpatches
from easymoney.money import EasyPeasy
from economics import CPI
from economics import Inflation

In [70]:
dbname = 'federal_employment'
username = 'stephaniesherman'
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username)

In [71]:
# query:
name_query = """
SELECT employee_id, employee_name 
FROM public.employee_info 
WHERE country = '001' and month = '12' and employee_name != 'NAME WITHHELD BY OPM' and employee_name != 'NAME WITHHELD BY AGENCY' 
and employee_name != 'NAME UNKNOWN'
;
"""
names = pd.read_sql_query(name_query,con)
names.head()

Unnamed: 0,employee_id,employee_name
0,5525563,"GRAYBILL,KRISTIN R."
1,5525582,"TALFORD,ROBERT H."
2,5527405,"JORQUERA,MARIO ENRIQUE"
3,5533994,"COBBS,GLADYS M."
4,5534678,"STRAUSS,LINDA HYMAN"


In [72]:
names.shape

(5195789, 2)

In [73]:
names.drop_duplicates('employee_id', inplace = True)

In [74]:
###data cleaning
names['employee_name']= names['employee_name'].str.replace(',', ' ')
names['employee_name']= names['employee_name'].str.replace('JR', '')
names['employee_name']= names['employee_name'].str.replace('JR.', '')
names['employee_name']= names['employee_name'].str.replace('III', '')
names['employee_name']= names['employee_name'].str.replace('II', '')

In [75]:
parts_of_name=names['employee_name'].str.split(' ').tolist()
split_names=pd.DataFrame.from_records(parts_of_name)

In [76]:
###remove . in middle initial 
#split_names[1]=split_names[1].str.replace('.','')

In [77]:
d = gender.Detector(case_sensitive=False)

In [78]:
first_names = []
for n in split_names.index:
    if split_names[1][n] is None:
        first_names.append(None)
    elif len(split_names[1][n]) > 1:
        first_names.append(split_names[1][n])
    else: first_names.append(split_names[2][n])

In [79]:
split_names.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,GRAYBILL,KRISTIN,R.,,,,,,,,,,,,,,,
1,TALFORD,ROBERT,H.,,,,,,,,,,,,,,,
2,JORQUERA,MARIO,ENRIQUE,,,,,,,,,,,,,,,
3,COBBS,GLADYS,M.,,,,,,,,,,,,,,,
4,STRAUSS,LINDA,HYMAN,,,,,,,,,,,,,,,
5,SHOVEN,HEATHER,A.,,,,,,,,,,,,,,,
6,SCULLY,CAROLYN,,,,,,,,,,,,,,,,
7,MCCLINTOCK,JAMES,T.,,,,,,,,,,,,,,,
8,JONES,MONAJOI,J.,,,,,,,,,,,,,,,
9,WAGNER,JANICE,K.,,,,,,,,,,,,,,,


In [80]:
gender = []
for fn in first_names:
    if fn is None:
        gender.append(None)
    else:
        gender.append(d.get_gender(fn))

In [81]:
names['gender'] = gender

In [82]:
names['gender'].value_counts()

male             256443
female           197585
unknown          146158
mostly_female     20626
mostly_male        9811
andy               3481
Name: gender, dtype: int64

In [84]:
names[names['gender']=='unknown'].tail(100)

Unnamed: 0,employee_id,employee_name,gender
5190123,005483301,NTIM KWAME SAFO,unknown
5190133,005488427,CORTELL RANON DOV,unknown
5190136,005489927,MELENDEZ KASIMMA P,unknown
5190197,005520576,TOBY PROMISE GEOFFREY,unknown
5190245,005560436,MYRIE KENUTE A,unknown
5190296,005600189,ANDERSON ANIUTEA F,unknown
5190308,005608818,AYANA RIBKA ASEFA,unknown
5190342,005639721,JACKSON SETETRA LYNETTE,unknown
5190375,005659113,PAPPU SUGUNA,unknown
5190388,005675431,COHEN-MEGORI SEGAL,unknown


In [62]:
names.to_csv('/Users/stephaniesherman/Dropbox/insight_data_science_program/opm_federal_employment_data/fedscope_buzzfeed/gender.csv',index = False)

In [None]:
##query was this
##COPY public.gender (employee_id, employee_name, gender)
##FROM '/Users/stephaniesherman/Dropbox/insight_data_science_program/opm_federal_employment_data/fedscope_buzzfeed/gender.csv' DELIMITER ',' CSV HEADER;

In [91]:
#########gives all the males and females I have identified

# query:
gen_query = """
SELECT employee_info.*, gender.gender
FROM public.employee_info
LEFT JOIN gender ON employee_info.employee_id = gender.employee_id
WHERE gender.gender = 'female' or gender.gender = 'mostly_female' or gender.gender = 'mostly_male' or gender.gender = 'male';
"""

gend = pd.read_sql_query(gen_query,con)

In [92]:
gend['gender'] = gend['gender'].str.replace('mostly_','')

In [93]:
gend['education_level']=pd.to_numeric(gend['education_level'],errors = coerce)
gend=gend[gend['education_level']!=99.0]

In [101]:
##get only full-time employees
gend = gend[gend['nsftp']=='1.0']

In [103]:
gender_dict = dict({'female':0,'male':1})

In [104]:
gend['gender']=gend['gender'].map(gender_dict)

In [105]:
los_num=dict({'< 1': .5, '1-2': 1.5,'3-4':3.5, '5-9': 7,'10-14':12, '15-19': 17, '20-24':22,'25-29': 27,'30-34': 32,'35+': 35})
gend['length_of_service'] = gend['length_of_service'].map(los_num)

In [107]:
gend['supervisory_status'].value_counts()

8    3574331
2     388964
1     332509
5     143400
3      98193
4      41635
6      17126
7      10847
Name: supervisory_status, dtype: int64

In [108]:
gend[gend['supervisory_status']=='8'].head()

Unnamed: 0,id,employee_id,employee_name,year,month,day,date,agency,sub_agency,state,...,pay_grade,length_of_service,occupation,occupational_cat,adjusted_basic_pay,supervisory_status,type_of_appointment,work_schedule,nsftp,gender
1,80535533,769609,"COLE,ROSALYN",2000,12,13,2000-12-13,VA,TA,39,...,13,17.0,671,A,62343.0,8,10,F,1.0,0
2,80536388,800306,"BOZAKIS,DIANNE K",2000,12,13,2000-12-13,VA,TA,12,...,2,7.0,610,P,42816.0,8,38,F,1.0,0
4,80536519,804466,"LANGFORD,MARVIN E",2000,12,13,2000-12-13,VA,TA,12,...,12,22.0,603,A,65179.0,8,38,F,1.0,1
5,80536531,804917,"MCGUFFIN,DALE R",2000,12,13,2000-12-13,VA,TA,16,...,2,7.0,3566,B,20494.0,8,38,F,1.0,1
6,80536643,810306,"GIROUX,ELIZABETH L",2000,12,13,2000-12-13,VA,TA,36,...,6,12.0,620,T,31760.0,8,38,F,1.0,0


In [109]:
####data cleaning
sup = dict({'8': 0, '5':0, '4': 0,'1':1, '2': 1, '6': 1, '7':1})
gend['supervisory_status']=gend['supervisory_status'].map(sup)

In [120]:
###calculate inflation How many US $ would I need in 2011 to pay for what cost $5 in 2007
ep = EasyPeasy()

In [124]:
gend = gend.dropna()

In [125]:
gend.shape

(4284304, 25)

In [126]:
pay_inflation = map(lambda x,y: ep.normalize(x, region='USA',from_year = y, to_year = 'latest'),gend['adjusted_basic_pay'],gend['year'])

In [127]:
gend['pay_inflation'] = pay_inflation

In [128]:
gend.to_csv('/Users/stephaniesherman/Dropbox/insight_data_science_program/opm_federal_employment_data/fedscope_buzzfeed/gender_across_time_inflation.csv', index = False)
