In [1]:
import pandas as pd
import numpy as np
import os
import copy
import scipy.stats as stats
import math
from scipy.optimize import minimize
from scipy.optimize import Bounds
from functools import partial
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the husband-wife frame from a local pickle.
# Pickles can execute arbitrary code on load: only read files from a trusted source.
df_husb_wife = pd.read_pickle(filepath_or_buffer="df_husb_wife.pkl")

In [3]:
# Scratch filter on the state code: an empty membership list matches no rows,
# so this displays only the column headers of df_husb_wife.
df_husb_wife[df_husb_wife["state_per_fv_h"].isin([])]

Unnamed: 0,FI_per_fv_h,b1q2_per_fv_h,quarter_per_fv_h,visit_per_fv_h,b1q3_per_fv_h,state_per_fv_h,b1q4_per_fv_h,nss_region_per_fv_h,b1q5_per_fv_h,b1q6_per_fv_h,...,grad_h,grad_w,dip_h,dip_w,hs_h,hs_w,north,ratio,weight_h,state_name


In [4]:
# Label each row by the (EMP_h, EMP_w) pair: the first letter refers to EMP_h, the second
# to EMP_w (Y when the flag equals 1, N when it equals 0). Rows matching no pair keep None.
df_husb_wife.loc[:, "EMP_type"] = None
for (emp_h, emp_w), emp_label in {(1, 1): "YY", (1, 0): "YN", (0, 1): "NY", (0, 0): "NN"}.items():
    both_match = (df_husb_wife["EMP_h"] == emp_h) & (df_husb_wife["EMP_w"] == emp_w)
    df_husb_wife.loc[both_match, "EMP_type"] = emp_label


The goal of this exercise is to measure the gender wage gap. What I need to do next is, given a skill level, identify the difference in wages between men and women. To keep matters from exploding, I will keep two skill levels -- high and low. These should be defined by, I think, college education? Or high school education? Or diploma? What I want is for one of these skill levels to get you blue-collar jobs and the other white-collar jobs.

Defining white/blue collar as equivalent to having a "regular/salaried wage" doesn't give a clean cut. Maybe instead see who earns more than the median wage in each category.

The thing that works is the 90th percentile. It gives me a "clean" cut around having a diploma. Could keep college also. (I can imagine Rasmus saying this cut is arbitrary.)

Next, I need to see the following things: 

1. What proportion of married men and women are below/above college educated? 
2. What does the distribution of HHs look like across education and employment status? 
3. What is the "truncated" wage distribution like across these skills? 
    * How do I check this truncated thing? People in my dataset are recorded earning 0 wages and working positive hours. Even if I drop 0 wages, I am still left with people who are recorded earning 3-4 rs a day. That certainly doesn't feel possible. What do I do about the truncation point?? READING ON HOW TO GET ORIGINAL DISTRIBUTION FROM TRUNCATED DISTRIBUTION.
        * **Now, I went through a few hoops and this feels like something I can't do anything about. I _could_ condition on working at least 5 hours or something but that is definitely arbitrary.** 
        * **Also, hourly wages is better to use than daily wage.**


In [5]:
# Share of rows in each EMP_type category.
df_husb_wife.loc[:, "EMP_type"].value_counts(normalize=True)

EMP_type
YN    0.785507
YY    0.164712
NN    0.039238
NY    0.010543
Name: proportion, dtype: float64

In [6]:
# Total number of rows in df_husb_wife; used below as the denominator for shares.
nrows = len(df_husb_wife)

#### How do I come up with a "high skill" vs. "low skill"

What I want is for `P("high wage"|skill)` to exhibit a "sharp" jump when going from `skill = low` to `skill = high`.

In [35]:

# Flag "high wage" earners for each spouse: wc_<s> is 1.0 when the hourly wage is at or
# above the HIGH_WAGE_QUANTILE quantile of that spouse's hourly-wage column, 0.0 when it
# is below, and NaN when the wage is missing (both comparisons are False for NaN).
# The column starts as float NaN so it has a numeric dtype, and the cutoff is computed
# once per spouse instead of once per comparison.
HIGH_WAGE_QUANTILE = 0.9
for spouse in ("h", "w"):
    wage = df_husb_wife[f"hourlywage_{spouse}"]
    cutoff = wage.quantile(HIGH_WAGE_QUANTILE)
    df_husb_wife[f"wc_{spouse}"] = np.nan
    df_husb_wife.loc[wage < cutoff, f"wc_{spouse}"] = 0.0
    df_husb_wife.loc[wage >= cutoff, f"wc_{spouse}"] = 1.0

In [36]:
# Mean of the wc_h flag within each value of col_h.
df_husb_wife.groupby("col_h")[["wc_h"]].mean()

Unnamed: 0_level_0,wc_h
col_h,Unnamed: 1_level_1
0,0.045739
1,0.285125


In [37]:
# Mean of the wc_w flag within each value of col_w.
df_husb_wife.groupby("col_w")[["wc_w"]].mean()

Unnamed: 0_level_0,wc_w
col_w,Unnamed: 1_level_1
0,0.027897
1,0.313416


In [10]:
################ SO THAT's HOW I COME UP WITH THE HIGH SKILL LOW SKILL DISTINCTION ###################

In [11]:
# 1. What proportion of married men and women are below/above college educated?
# Counts of each (col_h, col_w) pair, expressed as a share of all rows.
df_husb_wife[["col_h", "col_w"]].value_counts().sort_index().div(nrows)

col_h  col_w
0      0        0.694385
       1        0.069356
1      0        0.101990
       1        0.134269
Name: count, dtype: float64

In [12]:
# 2. What does the distribution of HHs look like across education and employment status?
# Counts of EMP_type within each (col_w, col_h) cell, expressed as a share of all rows.
(
    df_husb_wife[["col_w", "col_h", "EMP_type"]]
    .groupby(["col_w", "col_h"])
    .value_counts()
    .div(nrows)
)

col_w  col_h  EMP_type
0      0      YN          0.555023
              YY          0.106254
              NN          0.025882
              NY          0.007226
       1      YN          0.086769
              YY          0.009210
              NN          0.005538
              NY          0.000474
1      0      YN          0.051439
              YY          0.014659
              NN          0.001866
              NY          0.001392
       1      YN          0.092277
              YY          0.034589
              NN          0.005952
              NY          0.001451
Name: count, dtype: float64

In [13]:
# 3. What does the wage gap look like across these thresholds for men and women?
# Median hourly wage of each spouse within every (col_h, col_w) cell.
df_husb_wife.groupby(["col_h", "col_w"])[["hourlywage_h", "hourlywage_w"]].median()

Unnamed: 0_level_0,Unnamed: 1_level_0,hourlywage_h,hourlywage_w
col_h,col_w,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,47.169811,25.0
0,1,62.5,53.571429
1,0,80.357143,35.714286
1,1,111.607143,100.0


The wife-to-husband ratio of median hourly wages is around 0.5 for couples where neither spouse is college educated (25.0 / 47.2), and around 0.9 for couples where both are (100.0 / 111.6).

In [14]:
# Mean hourly wage of each spouse within every (col_h, col_w) cell.
df_husb_wife.groupby(["col_h", "col_w"])[["hourlywage_h", "hourlywage_w"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,hourlywage_h,hourlywage_w
col_h,col_w,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,59.059896,29.537793
0,1,76.749055,72.445285
1,0,97.522107,55.271724
1,1,133.281124,116.780458


The wife-to-husband ratio of mean hourly wages is around 0.5 for couples where neither spouse is college educated (29.5 / 59.1), and around 0.88 for couples where both are (116.8 / 133.3).

Issues that are bothering me:  

1. The biggest issue is that women seem to be accepting very low wages which essentially means that the accepted wage distribution will look exactly like the offer distribution.
2. Stigma Costs: I will probably have to put down a separate distribution for each of the education levels (college vs no college).
    1. I will not do this. I will assume the same stigma costs across education. This is because I cannot identify the stigma cost parameter across education levels.

Next Steps: 
1. Estimate Offer Distribution for men and women across education types: College educated vs Not College Educated
    1. Done.
2. I'll have to look at how stigma cost varies across these education types
    1. Let this be.
3. Establish clearly the fact that the gender wage gap doesn't look very different across the North/South
    1. I have to plot the difference of the density functions of gender wage ratios in North v South.
    2. In presentations, I will have to be parsimonious.


### Hours worked & Job Types
I use this from TUS to maintain consistency

In [15]:
# FT/PT split of FT_h among rows with positive weeklyhrs_h.
df_husb_wife.loc[df_husb_wife["weeklyhrs_h"].astype(float) > 0, "FT_h"].value_counts(normalize=True)

FT_h
FT    0.940029
PT    0.059971
Name: proportion, dtype: float64

MAKE SURE TO RESET EMPLOYMENT VARIABLE (THAT IS CALCULATED ON A LAST YEAR BASIS) AND WEEKLY HRS, WAGES, THAT USE WEEKLY BASIS.


In [282]:
# FT/PT split of FT_w among rows with positive weeklyhrs_w.
df_husb_wife.loc[df_husb_wife["weeklyhrs_w"].astype(float) > 0, "FT_w"].value_counts(normalize=True)

FT_w
FT    0.708494
PT    0.291506
Name: proportion, dtype: float64

In [283]:
# FT/PT split of FT_w among rows where EMP_w equals 1.
df_husb_wife.loc[df_husb_wife["EMP_w"] == 1, "FT_w"].value_counts(normalize=True)

FT_w
FT    0.703785
PT    0.296215
Name: proportion, dtype: float64

In [287]:
# Median wage_h by FT_h for rows whose b5pt1q3_per_fv_h code is "31".
# NOTE(review): "31" is presumably the regular salaried/wage status code — confirm against the codebook.
(
    df_husb_wife
    .loc[df_husb_wife["b5pt1q3_per_fv_h"] == "31", ["FT_h", "wage_h"]]
    .groupby("FT_h")
    .median()
)

Unnamed: 0_level_0,wage_h
FT_h,Unnamed: 1_level_1
FT,15000.0
PT,10000.0


There is a decent chunk of women that are part time. I don't want to complicate the model too much at this point so I will stick with one job type for now

### Estimating the Offer Distribution

### This is being done in Julia

In [746]:
# Build an income column for each spouse from the reported wage and its pay frequency:
# wages with frequency "m" are used as-is, wages with frequency "w" are multiplied by
# WEEKS_PER_MONTH, and rows with any other frequency are NaN.
# The result is a float column (the previous None-initialised version produced an
# object-dtype column, which makes downstream aggregates object-typed too).
# NOTE(review): 4 weeks per month understates a calendar month (52 / 12 is about 4.33);
# the factor is kept at 4 to match the figures already reported — confirm the convention.
WEEKS_PER_MONTH = 4
for spouse in ("h", "w"):
    wage = df_husb_wife[f"wage_{spouse}"].astype(float)
    # Unmapped frequencies become NaN, so the product is NaN for those rows.
    to_monthly = df_husb_wife[f"wageFreq_{spouse}"].map({"m": 1, "w": WEEKS_PER_MONTH}).astype(float)
    df_husb_wife[f"income_{spouse}"] = wage * to_monthly



In [791]:
# TODO: There is a little bit of an issue here: The EMP variable is calculated using yearly status
# b6q5_per_fv_h is weekly status code. These may differ (although the # of times this happens is small)

# Positive hourly wages, sorted ascending, then grouped by (north, college) for each spouse.
df_h = (
    df_husb_wife
    .loc[df_husb_wife["hourlywage_h"].astype(float) > 0, ["col_h", "north", "hourlywage_h"]]
    .sort_values("hourlywage_h")
    .groupby(["north", "col_h"])
)
df_w = (
    df_husb_wife
    .loc[df_husb_wife["hourlywage_w"].astype(float) > 0, ["col_w", "north", "hourlywage_w"]]
    .sort_values("hourlywage_w")
    .groupby(["north", "col_w"])
)


In [792]:
# Mean positive hourly wage (husbands) per (north, col_h) group.
df_h.agg("mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,hourlywage_h
north,col_h,Unnamed: 2_level_1
0,0,64.23913
0,1,128.058484
1,0,56.505687
1,1,115.438885


In [793]:
# Standard deviation of positive hourly wage (husbands) per (north, col_h) group.
df_h.agg("std")

Unnamed: 0_level_0,Unnamed: 1_level_0,hourlywage_h
north,col_h,Unnamed: 2_level_1
0,0,41.816217
0,1,90.558929
1,0,40.384927
1,1,81.255471


In [794]:
# Mean positive hourly wage (wives) per (north, col_w) group.
df_w.agg("mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,hourlywage_w
north,col_w,Unnamed: 2_level_1
0,0,36.061029
0,1,105.73804
1,0,32.397402
1,1,100.899476


In [795]:
# Standard deviation of positive hourly wage (wives) per (north, col_w) group.
df_w.agg("std")

Unnamed: 0_level_0,Unnamed: 1_level_0,hourlywage_w
north,col_w,Unnamed: 2_level_1
0,0,32.442157
0,1,80.38553
1,0,35.45549
1,1,81.488282


In [796]:
# Third row (zero-based position 2) of each (north, col_h) group. The frame was sorted
# ascending by hourlywage_h before grouping, so this is the third-lowest positive wage.
df_h.nth(2)

Unnamed: 0,col_h,north,hourlywage_h
6519,0,1,3.571429
32400,0,0,4.464286
8837,1,0,8.928571
14348,1,1,9.183673


In [799]:
# Third row (zero-based position 2) of each (north, col_w) group. The frame was sorted
# ascending by hourlywage_w before grouping, so this is the third-lowest positive wage.
df_w.nth(2)

Unnamed: 0,col_w,north,hourlywage_w
9861,0,0,2.97619
24505,0,1,3.571429
41165,1,1,7.407407
22537,1,0,8.928571


### Is there a material difference in hourly wages in the North v South?

In [809]:

# Two-sample t-tests (scipy's ttest_ind with its default settings) of hourly wages for
# north == 1 versus north == 0, run separately for each gender x college cell and
# restricted to positive hourly wages. Each line prints the (statistic, p-value) pair.
df_h = df_husb_wife.loc[df_husb_wife["hourlywage_h"].astype(float) > 0, ["col_h", "north", "hourlywage_h"]]
df_w = df_husb_wife.loc[df_husb_wife["hourlywage_w"].astype(float) > 0, ["col_w", "north", "hourlywage_w"]]

for label, frame, spouse, college in (
    ("Males College", df_h, "h", 1),
    ("Females College", df_w, "w", 1),
    ("Males No College", df_h, "h", 0),
    ("Females No College", df_w, "w", 0),
):
    in_cell = frame[f"col_{spouse}"] == college
    wages = frame[f"hourlywage_{spouse}"].astype(float)
    tstat, pval = stats.ttest_ind(
        wages[in_cell & (frame["north"] == 1)],
        wages[in_cell & (frame["north"] == 0)],
    )
    print(f'{label}: {tstat, pval}')



Males College: (-5.050404469987602, 4.5743898808144617e-07)
Females College: (-0.9946728983046915, 0.3201044381905187)
Males No College: (-11.756063146526945, 8.944624576009158e-32)
Females No College: (-2.595734608003948, 0.009494940678674946)


### Employment Gap: North v South

In [738]:
# Region flag built from state_per_fv_h: 1 for the NORTH codes, 0 for the SOUTH codes;
# rows whose state code is in neither list stay missing.
# NORTH: Gujarat, Rajasthan, Uttar Pradesh, Uttarakhand, Madhya Pradesh, Delhi, Chhattisgarh, Punjab, and Haryana
# SOUTH: Kerala, Tamil Nadu, Andhra Pradesh, Telangana, Karnataka, and Maharashtra
df_husb_wife["north"] = pd.Series(None)
for region_flag, state_codes in (
    (1, ["05", "03", "06", "07", "08", "09", "22", "23", "24"]),
    (0, ["27", "28", "29", "32", "33", "36"]),
):
    df_husb_wife.loc[df_husb_wife["state_per_fv_h"].isin(state_codes), "north"] = region_flag


In [739]:
# Mean of EMP_w within each value of north.
df_husb_wife.groupby("north")[["EMP_w"]].mean()

Unnamed: 0_level_0,EMP_w
north,Unnamed: 1_level_1
0,0.237781
1,0.133604


In [12]:
# Mean of EMP_h within each value of north.
df_husb_wife.groupby("north")[["EMP_h"]].mean()

Unnamed: 0_level_0,EMP_h
north,Unnamed: 1_level_1
0,0.95924
1,0.935021


### Prop of People: North v South

In [737]:
# Share of rows with north == 1 versus north == 0 (rows with missing north are excluded).
df_husb_wife.loc[:, "north"].value_counts(normalize=True)

north
1    0.507727
0    0.492273
Name: proportion, dtype: float64

### Distribution across Education: North v South

In [811]:
# Display the total row count of df_husb_wife (nrows is set from its shape earlier in the notebook).
nrows

33768

In [823]:
# Counts of each (col_h, col_w) pair within north, as a share of all rows with a non-missing north.
(
    df_husb_wife[["col_h", "col_w", "north"]]
    .groupby("north")
    .value_counts()
    .div(df_husb_wife["north"].count())
)

north  col_h  col_w
0      0      0        0.341807
       1      1        0.069108
              0        0.042545
       0      1        0.038813
1      0      0        0.347998
       1      1        0.070601
              0        0.052468
       0      1        0.036661
Name: count, dtype: float64

In [41]:
# Distribution of (col_h, col_w) pairs among rows with north == 1 (shares sum to 1).
(
    df_husb_wife
    .loc[df_husb_wife["north"] == 1, ["col_h", "col_w"]]
    .value_counts()
    .pipe(lambda counts: counts / counts.sum())
)

col_h  col_w
0      0        0.686839
1      1        0.136844
       0        0.103870
0      1        0.072447
Name: count, dtype: float64

In [43]:
# Distribution of (col_h, col_w) pairs among rows with north == 0 (shares sum to 1).
(
    df_husb_wife
    .loc[df_husb_wife["north"] == 0, ["col_h", "col_w"]]
    .value_counts()
    .pipe(lambda counts: counts / counts.sum())
)

col_h  col_w
0      0        0.694345
1      1        0.140385
       0        0.086425
0      1        0.078844
Name: count, dtype: float64

In [14]:
# Mean of col_h within each value of north.
df_husb_wife.groupby("north")[["col_h"]].mean()

Unnamed: 0_level_0,col_h
north,Unnamed: 1_level_1
0,0.226811
1,0.240714


In [15]:
# Mean of col_w within each value of north.
df_husb_wife.groupby("north")[["col_w"]].mean()

Unnamed: 0_level_0,col_w
north,Unnamed: 1_level_1
0,0.219229
1,0.209291


In [815]:
# Row counts for each value of col_h.
df_husb_wife.loc[:, "col_h"].value_counts()

col_h
0    25790
1     7978
Name: count, dtype: int64

In [816]:
# Row counts for each value of col_w.
df_husb_wife.loc[:, "col_w"].value_counts()

col_w
0    26892
1     6876
Name: count, dtype: int64

In [822]:
# Shape of the north column. The recorded output of this cell is a shape tuple, and
# showing the shape avoids dumping the whole column into the notebook.
df_husb_wife["north"].shape

(33768,)

In [812]:
# Scratch check: total of eight hand-entered counts.
# NOTE(review): these look like the per-cell counts behind the North/South education shares — confirm.
sum((7785, 1574, 969, 884, 7926, 1608, 1195, 835))

22776

### Gender Wage gap: North vs South

In [159]:
# Wife-to-husband hourly wage ratio, restricted to rows where hourlywage_h is positive
# (so the denominator is never zero). The ratio is NaN where hourlywage_w is missing.
# .copy() makes df_2 an independent frame, so adding the column does not write into a
# slice of df_husb_wife (avoids pandas' SettingWithCopyWarning).
df_2 = df_husb_wife[df_husb_wife["hourlywage_h"] > 0].copy()
df_2["ratio"] = df_2["hourlywage_w"] / df_2["hourlywage_h"]

In [360]:
# Mean wage ratio within each (north, col_h, col_w) cell; all four columns are cast to float first.
(
    df_2
    .loc[:, ["ratio", "col_h", "col_w", "north"]]
    .astype(float)
    .groupby(["north", "col_h", "col_w"])
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ratio
north,col_h,col_w,Unnamed: 3_level_1
0.0,0.0,0.0,0.707178
0.0,0.0,1.0,0.951173
0.0,1.0,0.0,0.61878
0.0,1.0,1.0,0.954345
1.0,0.0,0.0,0.620496
1.0,0.0,1.0,1.097606
1.0,1.0,0.0,0.792876
1.0,1.0,1.0,0.869392


In [740]:
# Mean of income_h within each (north, col_h) cell.
df_husb_wife.groupby(["north", "col_h"])[["income_h"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,income_h
north,col_h,Unnamed: 2_level_1
0,0,13913.650448
0,1,28397.269056
1,0,12577.339887
1,1,24680.241525


In [732]:
# Ratio of mean income_w to mean income_h within each (north, college) cell.
# The group means are computed here so the cell does not depend on whichever df_h / df_w
# another cell last assigned (those names are also bound to hourly-wage frames in this
# notebook, which have no income columns).
mean_income_w = (
    df_husb_wife["income_w"].astype(float)
    .groupby([df_husb_wife["north"], df_husb_wife["col_w"]])
    .mean()
)
mean_income_h = (
    df_husb_wife["income_h"].astype(float)
    .groupby([df_husb_wife["north"], df_husb_wife["col_h"]])
    .mean()
)
mean_income_w / mean_income_h

north  col_w
0      0        0.401854
       1         0.77018
1      0        0.374736
       1        0.804683
dtype: object

In [741]:
# Standard deviation of income_h within each (north, col_h) cell.
df_husb_wife.groupby(["north", "col_h"])[["income_h"]].std()

Unnamed: 0_level_0,Unnamed: 1_level_0,income_h
north,col_h,Unnamed: 2_level_1
0,0,9834.845402
0,1,20935.346299
1,0,9763.352269
1,1,18970.51474


In [705]:
# Mean income per (north, college) cell for each spouse.
# Note: this rebinds df_h / df_w, which other cells use for hourly-wage frames.
df_h = df_husb_wife.groupby(["north", "col_h"])[["income_h"]].mean()
df_w = df_husb_wife.groupby(["north", "col_w"])[["income_w"]].mean()


In [706]:
# Display df_h; the recorded output is the table of mean income_h per (north, col_h) cell.
df_h

Unnamed: 0_level_0,Unnamed: 1_level_0,income_h
north,col_h,Unnamed: 2_level_1
0,0,13913.650448
0,1,28397.269056
1,0,12577.339887
1,1,24680.241525


In [742]:
# Display df_w; the recorded output is the table of mean income_w per (north, col_w) cell.
df_w

Unnamed: 0_level_0,Unnamed: 1_level_0,income_w
north,col_w,Unnamed: 2_level_1
0,0,5591.250536
0,1,21871.001362
1,0,4713.187382
1,1,19859.760915


In [743]:
# Standard deviation of income_w within each (north, col_w) cell.
df_husb_wife.groupby(["north", "col_w"])[["income_w"]].std()

Unnamed: 0_level_0,Unnamed: 1_level_0,income_w
north,col_w,Unnamed: 2_level_1
0,0,6472.269682
0,1,18038.154431
1,0,6162.532393
1,1,17652.286679


In [7]:
# Row counts per state code among rows flagged north == 1, ordered by state code.
df_husb_wife.loc[df_husb_wife["north"] == 1, "state_per_fv_h"].value_counts().sort_index()

state_per_fv_h
03    1273
06     994
07     648
08    1638
09    2614
23    1731
24    1413
Name: count, dtype: int64

In [8]:
# Independent deep copy of the frame, so edits to df2 leave df_husb_wife untouched.
df2 = df_husb_wife.copy(deep=True)

In [9]:
# Flag state codes '22' and '05' as north == 1 on the copy.
# NOTE(review): the cells visible after this one keep reading df_husb_wife, not df2 —
# confirm which frame the later North/South tables are meant to use.
needs_north_flag = df2["state_per_fv_h"].isin(['22', '05'])
df2.loc[needs_north_flag, 'north'] = 1

In [10]:

# Positive hourly wages, sorted ascending, then grouped by (north, college) for each spouse.
df_h = (
    df_husb_wife
    .loc[df_husb_wife["hourlywage_h"].astype(float) > 0, ["col_h", "north", "hourlywage_h"]]
    .sort_values("hourlywage_h")
    .groupby(["north", "col_h"])
)
df_w = (
    df_husb_wife
    .loc[df_husb_wife["hourlywage_w"].astype(float) > 0, ["col_w", "north", "hourlywage_w"]]
    .sort_values("hourlywage_w")
    .groupby(["north", "col_w"])
)


In [25]:
# Number of rows flagged north == 1 (nn) and north == 0 (ns); used as denominators below.
nn = int((df_husb_wife["north"] == 1).sum())
ns = int((df_husb_wife["north"] == 0).sum())

In [26]:
# EMP_type counts among north == 1 rows, as a share of those rows.
df_husb_wife.loc[df_husb_wife["north"] == 1, "EMP_type"].value_counts().div(nn)

EMP_type
YN    0.814761
YY    0.120260
NN    0.057123
NY    0.007856
Name: count, dtype: float64

In [27]:
# EMP_type counts among north == 0 rows, as a share of those rows.
df_husb_wife.loc[df_husb_wife["north"] == 0, "EMP_type"].value_counts().div(ns)

EMP_type
YN    0.733857
YY    0.225384
NN    0.028362
NY    0.012397
Name: count, dtype: float64

In [18]:
# EMP_type counts as a share of all rows. The original line ended in a dangling "/"
# (a syntax error); the recorded output matches division by the total row count.
df_husb_wife[["EMP_type"]].value_counts() / len(df_husb_wife)

EMP_type
YN          0.785507
YY          0.164712
NN          0.039238
NY          0.010543
Name: count, dtype: float64