In [83]:
import pandas as pd
import numpy as np
import os
import copy
import scipy.stats as stats
import math
from scipy.optimize import minimize
from scipy.optimize import Bounds


In [93]:
################# GLOBAL VARIABLES ######################

# Definitions from PLFS 2018-19 Annual Report, Concepts and Definitions (2.38.1)
# Problematic codes for self employed criteria: `21` - worked in household enterprises (self-employed) as helper
SELF_EMP_CODES = ["11", "12", "21", "61", "62",]
REG_EMP_CODES = ["31", "71", "72"]
CASUAL_EMP_CODES = ["41", "42", "51"]

NOT_REG_CODES = SELF_EMP_CODES + CASUAL_EMP_CODES

EMP_CODES = SELF_EMP_CODES + REG_EMP_CODES + CASUAL_EMP_CODES
UNEMP_CODES = ["81", "82"]
LF_CODES = EMP_CODES + UNEMP_CODES
NOT_IN_LF_CODES = [str(x) for x in list(range(91,100))]

In [85]:
# Read Main Data File
df_per_fv = pd.read_stata("../../data/raw/plfs/plfs_2018_19/PerV1_2018-19.dta")
df_per_rv = pd.read_stata("../../data/raw/plfs/plfs_2018_19/PerRV_2018-19.dta") 
df_hh_fv = pd.read_stata('../../data/raw/plfs/plfs_2018_19/HHV1_2018-19.dta')
df_hh_rv = pd.read_stata('../../data/raw/plfs/plfs_2018_19/HHRV_2018-19.dta')


In [86]:
df_per_rv.shape

(533264, 104)

### On whether to use Repeat Visit along with the First Visit Data (even if to use it as a pooled cross-section) 

There are two ways in which employment status is determined: 1) What you did in the last 365 days. 2) What you did in the last week. Repeat visit does not have information about the first one. Only the Weekly status. Using either has its pros and cons. While the first may be more accurate picture of participation while the second has more data. I am sticking with First Visit only (for simplicity)

In [88]:
# This block makes a csv file which stores the variable name-label pairs. 
df_per_fv_layout = pd.read_excel("../../data/raw/plfs/plfs_2018_19/Data_LayoutPLFS.xlsx", header=38, usecols=[1], nrows=129)
df_hh_fv_layout = pd.read_excel("../../data/raw/plfs/plfs_2018_19/Data_LayoutPLFS.xlsx", header=2, usecols=[1], nrows=32)
# df_hh_rv_layout = copy.deepcopy(df_hh_fv_layout) 
# df_per_rv_layout = pd.read_excel("../../data/raw/plfs/plfs_2018_19/Data_LayoutPLFS.xlsx", header=171, usecols=[1], nrows=104)

pd.DataFrame({'varName': df_per_fv.columns, 'varLabel': df_per_fv_layout["Full Name"]}).to_csv("../../data/proc/ColNameLabelPerV1_2018_19.csv", index=False)
pd.DataFrame({'varName': df_hh_fv.columns, 'varLabel': df_hh_fv_layout["Full Name"]}).to_csv("../../data/proc/ColNameLabelHHV1_2018_19.csv", index=False)
# pd.DataFrame({'varName': df_hh_rv.columns, 'varLabel': df_hh_rv_layout["Full Name"]}).to_csv("../../data/proc/ColNameLabelHHRV_2018_19.csv", index=False)
# pd.DataFrame({'varName': df_per_rv.columns, 'varLabel': df_per_rv_layout["Full Name"]}).to_csv("../../data/proc/ColNameLabelPerRV_2018_19.csv", index=False)


In [89]:
# Merge HH and Per dataset
# generate merge keys for both hh and per datasets and merge
df_per_fv.loc[:,'HHID'] = df_per_fv.quarter_per_fv + df_per_fv.visit_per_fv + df_per_fv.fsu_per_fv \
                    + df_per_fv.b1q13_per_fv + df_per_fv.b1q14_per_fv + df_per_fv.b1q15_per_fv

df_hh_fv.loc[:,'HHID'] = df_hh_fv.qtr_hh_rv + df_hh_fv.visit_hh_rv + df_hh_fv.b1q1_hh_rv \
                    + df_hh_fv.b1q13_hh_rv + df_hh_fv.b1q14_hh_rv + df_hh_fv.b1q15_hh_rv 


df_temp = pd.merge(left=df_per_fv, right=df_hh_fv, on='HHID', how="inner")


  df_per_fv.loc[:,'HHID'] = df_per_fv.quarter_per_fv + df_per_fv.visit_per_fv + df_per_fv.fsu_per_fv \


In [90]:
# Just so that I don't have to merge again and again
df_merged = copy.deepcopy(df_temp)

In [91]:
df_merged.shape

(420757, 162)

# Create variables first -- then subset!! Otherwise I'll run into problems with selection 

In [92]:
######### Employment Status 
## This is calculated using UPSS (Usual Principal or Subsidiary Status) definition. See Afridi et al. (2022) Appendix B

df_merged.loc[:,"lfp_ps"] = df_merged.b5pt1q3_per_fv.apply(lambda x: 1 if x in LF_CODES else 0)
df_merged["lfp_ss"] = df_merged.b5pt2q3_per_fv.apply(lambda x: 1 if x in LF_CODES else 0)
df_merged["lfp_ps_ss"] = df_merged['lfp_ps'] + df_merged['lfp_ss']

df_merged.loc[:,"emp_ps"] = df_merged.b5pt1q3_per_fv.apply(lambda x: 1 if x in EMP_CODES else 0)
df_merged["emp_ss"] = df_merged.b5pt2q3_per_fv.apply(lambda x: 1 if x in EMP_CODES else 0)
df_merged["emp_ps_ss"] = df_merged['emp_ps'] + df_merged['emp_ss']
/
df_merged.loc[:,'EMP'] = 0
df_merged.loc[df_merged['emp_ps_ss'] > 0, 'EMP'] = 1

df_merged.loc[:,"LFP"] = 0
df_merged.loc[df_merged["lfp_ps_ss"] > 0,"LFP"] = 1

In [10]:
####### Make wage variable.
# These will exist only for those that are employed so I will subset.
# DONT SUBSET ANY MORE THAN YOU HAVE TO!!!!!!
# df_merged = df_merged[df_merged['EMP'] == 1]
df_merged.loc[:, "wage"] = 0
# Wage column names
wage_cols = ['b6q9_per_fv', 'b6q10_per_fv', \
             'b6q9_3pt1_Act1_per_fv', 'b6q9_3pt1_Act2_per_fv', 'b6q9_3pt2_Act1_per_fv', \
             'b6q9_3pt2_Act2_per_fv', 'b6q9_3pt3_Act1_per_fv', 'b6q9_3pt3_Act2_per_fv', \
             'b6q9_3pt4_Act1_per_fv', 'b6q9_3pt4_Act2_per_fv', 'b6q9_3pt5_Act1_per_fv', \
             'b6q9_3pt5_Act2', 'b6q9_Act2_3pt6', 'b6q9_3pt6_Act1', 'b6q9_Act1_3pt7', \
             'b6q9_Act2_3pt7']
# these objects are strings so need to be converted to float.
for col in wage_cols:
    df_merged.loc[:,col] = df_merged.loc[:,col].astype(float)


# Some codes have salaries given for last 30 days.
# so extracting them here:
df_merged.loc[df_merged['b6q5_per_fv'].isin(['31', '71', '72']),'wage'] = df_merged.loc[df_merged['b6q5_per_fv'].isin(['31', '71', '72']),'b6q9_per_fv']
df_merged.loc[df_merged['b6q5_per_fv'].isin(['11', '12', '21', '61', '62']),'wage'] = df_merged.loc[df_merged['b6q5_per_fv'].isin(['11', '12', '21', '61', '62']),'b6q10_per_fv']

# Next, get wages for other codes (from their weekly activities)
# that have wages by day.
df_merged.loc[df_merged['b6q5_per_fv'].isin(['41', '42', '51']),'wage'] = \
                                        df_merged[df_merged['b6q5_per_fv'].isin(['41', '42', '51'])]\
                                               [['b6q9_3pt1_Act1_per_fv', 'b6q9_3pt1_Act2_per_fv', 'b6q9_3pt2_Act1_per_fv', \
                                                 'b6q9_3pt2_Act2_per_fv', 'b6q9_3pt3_Act1_per_fv', 'b6q9_3pt3_Act2_per_fv', \
                                                 'b6q9_3pt4_Act1_per_fv', 'b6q9_3pt4_Act2_per_fv', 'b6q9_3pt5_Act1_per_fv', \
                                                 'b6q9_3pt5_Act2', 'b6q9_Act2_3pt6', 'b6q9_3pt6_Act1', 'b6q9_Act1_3pt7', \
                                                 'b6q9_Act2_3pt7']].sum(axis=1)


# Changing `wage` type from object to float.
df_merged.loc[:,'wage'] = df_merged.loc[:,'wage'].astype(float)



In [11]:

# wageFreq is "monthly" or "weekly": Will need this for computing daily wage or hourly wage
df_merged.loc[:,'wageFreq'] = pd.Series(None)
df_merged.loc[df_merged['b6q5_per_fv'].isin(['11', '12', '21', '61', '62', '31', '71', '72']),'wageFreq'] = "m"
df_merged.loc[df_merged['b6q5_per_fv'].isin(['41', '42', '51']),'wageFreq'] = "w"


In [12]:
## Getting Hourly wage

df_merged.loc[:,"weeklyhrs"] = 0
## Get Hours Worked.
time_cols = ['b6q6_3pt1_Act1_per_fv', 'b6q6_3pt1_Act2_per_fv', 'b6q6_3pt2_Act1_per_fv', \
             'b6q6_3pt2_Act2_per_fv', 'b6q6_3pt3_Act1_per_fv', 'b6q6_3pt3_Act2_per_fv', \
             'b6q6_3pt4_Act1_per_fv', 'b6q6_3pt4_Act2_per_fv', 'b6q6_3pt5_Act1_per_fv', \
             'b6q6_3pt5_Act2', 'b6q6_3pt6_Act1', 'b6q6_3pt6_Act2', 'b6q6_3pt7_Act1', \
             'b6q6_3pt7_Act2']
for col in time_cols:
    df_merged.loc[:,col] = df_merged.loc[:,col].astype(float)
df_merged.loc[df_merged["EMP"] == 1,'weeklyhrs'] = df_merged.loc[df_merged["EMP"] == 1,time_cols].sum(axis=1)
## End Hours Worked
# What will be the denominator?
df_merged["total_hrs"] = df_merged["weeklyhrs"]
df_merged.loc[df_merged["wageFreq"] == "m", "total_hrs"] = 4*df_merged.loc[df_merged["wageFreq"] == "m","weeklyhrs"]
# Get hourly wage
df_merged.loc[:,'hourlywage'] = pd.Series(None)
df_merged.loc[df_merged['weeklyhrs'] > 0, 'hourlywage'] = df_merged.loc[df_merged['weeklyhrs'] > 0,'wage']/df_merged.loc[df_merged['weeklyhrs'] > 0,'total_hrs']



In [13]:
# Full Time Status might be useful.
df_merged.loc[:,'FT'] = pd.Series(None)
df_merged.loc[df_merged['weeklyhrs'] >= 40,'FT'] = "FT"
df_merged.loc[df_merged['weeklyhrs'] < 40 ,'FT'] = "PT"


In [14]:
df_merged[df_merged['FT'] == "FT"]["hourlywage"].astype(float).describe()

count    113250.000000
mean         56.103604
std         102.160203
min         -89.285714
25%          26.785714
50%          40.178571
75%          66.964286
max       26785.714286
Name: hourlywage, dtype: float64

In [15]:
# Get Daily Wage
df_merged.loc[:,'dailywage'] = pd.Series(None)
df_merged.loc[df_merged["wageFreq"] == "m","dailywage"] = df_merged.loc[df_merged["wageFreq"] == "m","wage"]/30
df_merged.loc[df_merged["wageFreq"] == "w","dailywage"] = df_merged.loc[df_merged["wageFreq"] == "w","wage"]/7


In [16]:
df_merged[df_merged['b6q5_per_fv'].isin(['31', '71', '72'])]["dailywage"].astype(float).describe()

count    43197.000000
mean       566.686866
std        552.535926
min          0.000000
25%        250.000000
50%        400.000000
75%        733.333333
max      40000.000000
Name: dailywage, dtype: float64

In [17]:
df_merged[df_merged['b6q5_per_fv'].isin(['11', '12', '21', '61', '62','41', '42', '51'])]["dailywage"].astype(float).describe()

count     97273.000000
mean        280.085356
std         735.975878
min        -666.666667
25%         100.000000
50%         233.333333
75%         357.142857
max      200000.000000
Name: dailywage, dtype: float64

In [18]:
# Subsetting: 
# 1. (Currently) married people. 
df_merged = df_merged[df_merged["b4q7_per_fv"] == "2"]
# 2. Urban.
df_merged = df_merged[df_merged["b1q3_per_fv"] == "2"]
# 3. Drop 3rd gender
df_merged = df_merged[df_merged["b4q5_per_fv"] != "3"]
# 4. Then make a dataframe with husbands and wives matched. Then subset women with age 15-49. 

In [19]:
# Now, I create a dataframe with husbands and wives. 
# First, remove all entries that do not have relation to head of HH as: 1,2,3 or 4. These relations are the only ones where I can identify a marriage. 
df_merged = df_merged[df_merged["b4q4_per_fv"].isin(["1","2","3","4"])]

# Next, need to label each person in a HH as husband/wife. 
df_merged.loc[:,"spouse"] = "husband"
df_merged.loc[df_merged["b4q5_per_fv"] == "2","spouse"] = "wife"

In [20]:
df_merged[["b4q4_per_fv", "spouse"]].value_counts()

b4q4_per_fv  spouse 
1            husband    34593
2            wife       33692
4            wife        8192
3            husband     7859
1            wife        1140
3            wife         838
4            husband      314
2            husband      174
Name: count, dtype: int64

In [22]:
# Make two separate dataframes for husbands and wives, merge each husband with his wife.
# 
# In each HH, there is a possibility that you have two husbands and two wives. 
# Separate the husbands and wives. 
df_husb = df_merged[df_merged["spouse"] == "husband"]
df_wife = df_merged[df_merged["spouse"] == "wife"]

# # Now, define a mapping -- 1<->2, 3<->4 within each hh has to be mapped.
# # The relationship key will be kept the same as b4q4_per_fv for husbands. Relationship key 
# # for wives will be according to the merge map. 
df_husb.loc[:,"rel_key"] = None
df_wife.loc[:,"rel_key"] = None
df_husb.loc[:,"rel_key"] = df_husb.loc[:,"b4q4_per_fv"]
merge_map = {"1":"2", "2":"1", "3":"4", "4":"3"}
df_wife.loc[:,"rel_key"] = df_wife.loc[:,"b4q4_per_fv"].apply(lambda x: merge_map[x])
# # df_husb["rel_key"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_husb.loc[:,"rel_key"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wife.loc[:,"rel_key"] = None


In [23]:
df_husb.shape

(42940, 179)

In [81]:
df_wife.shape

(43862, 179)

In [25]:
# Update column names so that husb. cols have _husb, wives, _wife
df_husb.columns = [col+"_h" for col in df_husb.columns]
df_wife.columns = [col+"_w" for col in df_wife.columns]


In [26]:
df_husb_wife = pd.merge(left=df_husb, right=df_wife, left_on=["HHID_h", "rel_key_h"], right_on=["HHID_w", "rel_key_w"], how="outer", indicator=True)

In [27]:
df_husb_wife["_merge"].value_counts()

_merge
both          43825
right_only     2329
left_only      1524
Name: count, dtype: int64

In [28]:
df_husb_wife = df_husb_wife[df_husb_wife["_merge"] == "both"]

In [29]:
# Now, df_husb_wife is my base dataframe. First I will subset using wife's age: 
df_husb_wife = df_husb_wife[(df_husb_wife["b4q6_per_fv_w"] >= 15) & (df_husb_wife["b4q6_per_fv_w"] <= 49)]


In [78]:
df_husb_wife = df_husb_wife[df_husb_wife["b4q8_per_fv_h"] != ""]
df_husb_wife = df_husb_wife[df_husb_wife["b4q8_per_fv_w"] != ""]

df_husb_wife.loc[:,"hs_h"] = 0
# For those who don't have a "" in their education field, check if edu >= 11
df_husb_wife.loc[df_husb_wife["b4q8_per_fv_h"].astype(float) >= 10 ,"hs_h"] = 1

df_husb_wife.loc[:,"hs_w"] = 0
# For those who don't have a "" in their education field, check if edu >= 11
df_husb_wife.loc[df_husb_wife["b4q8_per_fv_w"].astype(float) >= 10 ,"hs_w"] = 1


df_husb_wife.loc[:,"dip_h"] = 0
# For those who don't have a "" in their education field, check if edu >= 11
df_husb_wife.loc[df_husb_wife["b4q8_per_fv_h"].astype(float) >= 11 ,"dip_h"] = 1

df_husb_wife.loc[:,"dip_w"] = 0
df_husb_wife.loc[df_husb_wife["b4q8_per_fv_w"].astype(float) >= 11 ,"dip_w"] = 1

df_husb_wife.loc[:,"col_h"] = 0
df_husb_wife.loc[df_husb_wife["b4q8_per_fv_h"].astype(float) >= 12,"col_h"] = 1

df_husb_wife.loc[:,"col_w"] = 0
df_husb_wife.loc[df_husb_wife["b4q8_per_fv_w"].astype(float) >= 12,"col_w"] = 1

df_husb_wife.loc[:,"grad_h"] = 0
df_husb_wife.loc[df_husb_wife["b4q8_per_fv_h"].astype(float) >= 13,"grad_h"] = 1

df_husb_wife.loc[:,"grad_w"] = 0
df_husb_wife.loc[df_husb_wife["b4q8_per_fv_w"].astype(float) >= 13,"grad_w"] = 1

In [76]:
df_husb_wife.to_pickle("df_husb_wife.pkl")