In [91]:
import pandas as pd
import numpy as np
import os
import copy


In [92]:
################# GLOBAL VARIABLES ######################

# Definitions from PLFS 2018-19 Annual Report, Concepts and Definitions (2.38.1)
# Problematic codes for self employed criteria: `21` - worked in household enterprises (self-employed) as helper
SELF_EMP_CODES = ["11", "12", "21"]
REG_EMP_CODES = ["31"]
CASUAL_EMP_CODES = ["41", "42", "51", "61", "62", "71", "72"]

NOT_REG_CODES = SELF_EMP_CODES + CASUAL_EMP_CODES

EMP_CODES = SELF_EMP_CODES + REG_EMP_CODES + CASUAL_EMP_CODES
UNEMP_CODES = ["81", "82"]
LF_CODES = EMP_CODES + UNEMP_CODES
NOT_IN_LF_CODES = [str(x) for x in list(range(91,100))]

In [93]:
# Read Main Data File
df_per_fv = pd.read_stata("../../data/raw/plfs/plfs_2018_19/PerV1_2018-19.dta")
df_per_rv = pd.read_stata("../../data/raw/plfs/plfs_2018_19/PerRV_2018-19.dta") 
df_hh_fv = pd.read_stata('../../data/raw/plfs/plfs_2018_19/HHV1_2018-19.dta')
df_hh_rv = pd.read_stata('../../data/raw/plfs/plfs_2018_19/HHRV_2018-19.dta')


In [94]:
df_per_rv.shape

(533264, 104)

### On whether to use Repeat Visit along with the First Visit Data (even if to use it as a pooled cross-section) 

There are two ways in which employment status is determined: 1) What you did in the last 365 days. 2) What you did in the last week. Repeat visit does not have information about the first one. Only the Weekly status. Using either has its pros and cons. While the first may be more accurate picture of participation while the second has more data. I am sticking with First Visit only (for simplicity)

In [95]:
# This block makes a csv file which stores the variable name-label pairs. 
df_per_fv_layout = pd.read_excel("../../data/raw/plfs/plfs_2018_19/Data_LayoutPLFS.xlsx", header=38, usecols=[1], nrows=129)
df_hh_fv_layout = pd.read_excel("../../data/raw/plfs/plfs_2018_19/Data_LayoutPLFS.xlsx", header=2, usecols=[1], nrows=32)
# df_hh_rv_layout = copy.deepcopy(df_hh_fv_layout) 
# df_per_rv_layout = pd.read_excel("../../data/raw/plfs/plfs_2018_19/Data_LayoutPLFS.xlsx", header=171, usecols=[1], nrows=104)

pd.DataFrame({'varName': df_per_fv.columns, 'varLabel': df_per_fv_layout["Full Name"]}).to_csv("../../data/proc/ColNameLabelPerV1_2018_19.csv", index=False)
pd.DataFrame({'varName': df_hh_fv.columns, 'varLabel': df_hh_fv_layout["Full Name"]}).to_csv("../../data/proc/ColNameLabelHHV1_2018_19.csv", index=False)
# pd.DataFrame({'varName': df_hh_rv.columns, 'varLabel': df_hh_rv_layout["Full Name"]}).to_csv("../../data/proc/ColNameLabelHHRV_2018_19.csv", index=False)
# pd.DataFrame({'varName': df_per_rv.columns, 'varLabel': df_per_rv_layout["Full Name"]}).to_csv("../../data/proc/ColNameLabelPerRV_2018_19.csv", index=False)


In [96]:
# Merge HH and Per dataset
# generate merge keys for both hh and per datasets and merge
df_per_fv.loc[:,'HHID'] = df_per_fv.quarter_per_fv + df_per_fv.visit_per_fv + df_per_fv.fsu_per_fv \
                    + df_per_fv.b1q13_per_fv + df_per_fv.b1q14_per_fv + df_per_fv.b1q15_per_fv

df_hh_fv.loc[:,'HHID'] = df_hh_fv.qtr_hh_rv + df_hh_fv.visit_hh_rv + df_hh_fv.b1q1_hh_rv \
                    + df_hh_fv.b1q13_hh_rv + df_hh_fv.b1q14_hh_rv + df_hh_fv.b1q15_hh_rv 


df_temp = pd.merge(left=df_per_fv, right=df_hh_fv, on='HHID', how="inner")


  df_per_fv.loc[:,'HHID'] = df_per_fv.quarter_per_fv + df_per_fv.visit_per_fv + df_per_fv.fsu_per_fv \


In [97]:
# Just so that I don't have to merge again and again
df_merged = copy.deepcopy(df_temp)

In [98]:
df_merged.shape

(420757, 162)

In [99]:
# Subsetting: 
# 1. (Currently) married people. 
df_merged = df_merged[df_merged["b4q7_per_fv"] == "2"]
# 2. Urban.
df_merged = df_merged[df_merged["b1q3_per_fv"] == "2"]
# 3. Drop 3rd gender
df_merged = df_merged[df_merged["b4q5_per_fv"] != "3"]
# 4. Then make a dataframe with husbands and wives matched. Then subset women with age 15-49. 

In [100]:
df_merged.shape

(89513, 162)

In [101]:
######### Employment Status 
## This is calculated using UPSS (Usual Principal or Subsidiary Status) definition. See Afridi et al. (2022) Appendix B

df_merged.loc[:,"lfp_ps"] = df_merged.b5pt1q3_per_fv.apply(lambda x: 1 if x in LF_CODES else 0)
df_merged["lfp_ss"] = df_merged.b5pt2q3_per_fv.apply(lambda x: 1 if x in LF_CODES else 0)
df_merged["lfp_ps_ss"] = df_merged['lfp_ps'] + df_merged['lfp_ss']

df_merged.loc[:,"emp_ps"] = df_merged.b5pt1q3_per_fv.apply(lambda x: 1 if x in EMP_CODES else 0)
df_merged["emp_ss"] = df_merged.b5pt2q3_per_fv.apply(lambda x: 1 if x in EMP_CODES else 0)
df_merged["emp_ps_ss"] = df_merged['emp_ps'] + df_merged['emp_ss']
/
df_merged.loc[:,'EMP'] = 0
df_merged.loc[df_merged['emp_ps'] > 0, 'EMP'] = 1

df_merged.loc[:,"LFP"] = 0
df_merged.loc[df_merged["lfp_ps_ss"] > 0,"LFP"] = 1

In [102]:
df_merged[df_merged["EMP"] == 1].shape

(44824, 170)

In [103]:
df_merged['b6q5_per_fv'].astype(float).value_counts().sort_index()

b6q5_per_fv
11.0    14941
12.0     1734
21.0     1905
31.0    19960
41.0       27
42.0       32
51.0     4875
61.0       89
62.0      436
71.0       59
72.0      315
81.0     1593
82.0      351
91.0      409
92.0    32114
93.0     3894
94.0     5052
95.0      611
97.0     1018
98.0       98
Name: count, dtype: int64

In [111]:
####### Get hourly wages.
# These will exist only for those that are employed so I will subset.
# DONT SUBSET ANY MORE THAN YOU HAVE TO!!!!!!
# df_merged = df_merged[df_merged['EMP'] == 1]
df_merged.loc[:,'hourlywage'] = pd.Series(None)
# Wage column names
wage_cols = ['b6q9_per_fv', 'b6q10_per_fv', \
             'b6q9_3pt1_Act1_per_fv', 'b6q9_3pt1_Act2_per_fv', 'b6q9_3pt2_Act1_per_fv', \
             'b6q9_3pt2_Act2_per_fv', 'b6q9_3pt3_Act1_per_fv', 'b6q9_3pt3_Act2_per_fv', \
             'b6q9_3pt4_Act1_per_fv', 'b6q9_3pt4_Act2_per_fv', 'b6q9_3pt5_Act1_per_fv', \
             'b6q9_3pt5_Act2', 'b6q9_Act2_3pt6', 'b6q9_3pt6_Act1', 'b6q9_Act1_3pt7', \
             'b6q9_Act2_3pt7']
# these objects are strings so need to be converted to float.
for col in wage_cols:
    df_merged.loc[:,col] = df_merged.loc[:,col].astype(float)


# Some codes have salaries given for last 30 days.
# so extracting them here:
df_merged.loc[df_merged['b6q5_per_fv'].isin(['31', '71', '72']),'wage'] = df_merged.loc[df_merged['b6q5_per_fv'].isin(['31', '71', '72']),'b6q9_per_fv']
df_merged.loc[df_merged['b6q5_per_fv'].isin(['11', '12', '21', '61', '62']),'wage'] = df_merged.loc[df_merged['b6q5_per_fv'].isin(['11', '12', '21', '61', '62']),'b6q10_per_fv']

# Next, get wages for other codes (from their weekly activities)
# that have wages by day.
df_merged.loc[df_merged['b6q5_per_fv'].isin(['41', '42', '51']),'wage'] = \
                                        df_merged[df_merged['b6q5_per_fv'].isin(['41', '42', '51'])]\
                                               [['b6q9_3pt1_Act1_per_fv', 'b6q9_3pt1_Act2_per_fv', 'b6q9_3pt2_Act1_per_fv', \
                                                 'b6q9_3pt2_Act2_per_fv', 'b6q9_3pt3_Act1_per_fv', 'b6q9_3pt3_Act2_per_fv', \
                                                 'b6q9_3pt4_Act1_per_fv', 'b6q9_3pt4_Act2_per_fv', 'b6q9_3pt5_Act1_per_fv', \
                                                 'b6q9_3pt5_Act2', 'b6q9_Act2_3pt6', 'b6q9_3pt6_Act1', 'b6q9_Act1_3pt7', \
                                                 'b6q9_Act2_3pt7']].sum(axis=1)


# Changing `wage` type from object to float.
df_merged.loc[:,'wage'] = df_merged.loc[:,'wage'].astype(float)

# wageFreq is "monthly" or "weekly"
df_merged.loc[:,'wageFreq'] = pd.Series(None)
df_merged.loc[df_merged['b6q5_per_fv'].isin(['11', '12', '21', '61', '62', '31', '71', '72']),'wageFreq'] = "m"
df_merged.loc[df_merged['b6q5_per_fv'].isin(['41', '42', '51']),'wageFreq'] = "w"

## End Getting Wages

## Get Hours Worked.
time_cols = ['b6q6_3pt1_Act1_per_fv', 'b6q6_3pt1_Act2_per_fv', 'b6q6_3pt2_Act1_per_fv', \
             'b6q6_3pt2_Act2_per_fv', 'b6q6_3pt3_Act1_per_fv', 'b6q6_3pt3_Act2_per_fv', \
             'b6q6_3pt4_Act1_per_fv', 'b6q6_3pt4_Act2_per_fv', 'b6q6_3pt5_Act1_per_fv', \
             'b6q6_3pt5_Act2', 'b6q6_3pt6_Act1', 'b6q6_3pt6_Act2', 'b6q6_3pt7_Act1', \
             'b6q6_3pt7_Act2']
for col in time_cols:
    df_merged.loc[:,col] = df_merged.loc[:,col].astype(float)
df_merged.loc[:,'weeklyhrs'] = df_merged.loc[:,time_cols].sum(axis=1)
## End Hours Worked

df_merged = df_merged[(df_merged['weeklyhrs'] > 0)]
df_merged = df_merged[df_merged['wage'] > 0]
# 144 hrs time endowment. Drop any with more than that.
# df_merged = df_merged[df_merged['weeklyhrs'] <= 144]
# Can remove lower end?  Those working very very few hrs.
# df_merged = df_merged[df_merged['weeklyhrs'] > df_merged['weeklyhrs'].quantile(0.001)]
df_merged["factor"] = df_merged["weeklyhrs"]
df_merged.loc[df_merged["wageFreq"] == "m", "factor"] = 4*df_merged.loc[df_merged["wageFreq"] == "m","weeklyhrs"]
df_merged['hourlywage'] = df_merged['wage']/df_merged['factor']


df_merged.loc[:,'FT'] = pd.Series(None)
df_merged.loc[df_merged['weeklyhrs'] >= 40,'FT'] = "FT"
df_merged.loc[df_merged['weeklyhrs'] < 40 ,'FT'] = "PT"


In [134]:
### Let's try getting daily wage
df_merged.loc[:,'dailywage'] = pd.Series(None)
df_merged.loc[df_merged["wageFreq"] == "m","dailywage"] = df_merged.loc[df_merged["wageFreq"] == "m","wage"]/30
df_merged.loc[df_merged["wageFreq"] == "w","dailywage"] = df_merged.loc[df_merged["wageFreq"] == "w","wage"]/7

In [141]:
df_merged[df_merged['b6q5_per_fv'].isin(['31', '71', '72'])]["dailywage"].astype(float).describe()

count    19698.000000
mean       694.139779
std        602.690928
min          4.100000
25%        283.333333
50%        500.000000
75%       1000.000000
max      10666.666667
Name: dailywage, dtype: float64

In [148]:
df_merged[df_merged['b6q5_per_fv'].isin(['11', '12', '21', '61', '62','41', '42', '51'])]["dailywage"].astype(float).describe()

count     21337.000000
mean        477.599550
std        1428.411814
min           3.333333
25%         250.000000
50%         350.000000
75%         566.666667
max      200000.000000
Name: dailywage, dtype: float64

In [113]:
df_merged = df_merged[df_merged["weeklyhrs"] < df_merged["weeklyhrs"].astype(float).quantile(0.9996)]

In [136]:
df_merged["weeklyhrs"].astype(float).describe()

count    19698.000000
mean        57.650878
std          9.967689
min          7.000000
25%         56.000000
50%         56.000000
75%         62.000000
max        107.000000
Name: weeklyhrs, dtype: float64

In [115]:
df_merged['hourlywage'].astype(float).describe()

count    41903.000000
mean        78.765028
std        157.227044
min          0.446429
25%         35.714286
50%         53.571429
75%         94.339623
max      26785.714286
Name: hourlywage, dtype: float64

In [190]:
# Just so that I don't have to merge again and again
df_merged = copy.deepcopy(df_temp)

# Subsetting: 
# 1. (Currently) married people. 
df_merged = df_merged[df_merged["b4q7_per_fv"] == "2"]
# 2. Urban.
df_merged = df_merged[df_merged["b1q3_per_fv"] == "2"]
# 3. Drop 3rd gender
df_merged = df_merged[df_merged["b4q5_per_fv"] != "3"]
# 4. Then make a dataframe with husbands and wives matched. Then subset women with age 15-49. 

In [191]:
######### Employment Status 
## This is calculated using UPSS (Usual Principal or Subsidiary Status) definition. See Afridi et al. (2022) Appendix B

df_merged.loc[:,"lfp_ps"] = df_merged.b5pt1q3_per_fv.apply(lambda x: 1 if x in LF_CODES else 0)
df_merged["lfp_ss"] = df_merged.b5pt2q3_per_fv.apply(lambda x: 1 if x in LF_CODES else 0)
df_merged["lfp_ps_ss"] = df_merged['lfp_ps'] + df_merged['lfp_ss']

df_merged.loc[:,"emp_ps"] = df_merged.b5pt1q3_per_fv.apply(lambda x: 1 if x in EMP_CODES else 0)
df_merged["emp_ss"] = df_merged.b5pt2q3_per_fv.apply(lambda x: 1 if x in EMP_CODES else 0)
df_merged["emp_ps_ss"] = df_merged['emp_ps'] + df_merged['emp_ss']

df_merged.loc[:,'EMP'] = 0
df_merged.loc[df_merged['emp_ps'] > 0, 'EMP'] = 1

df_merged.loc[:,"LFP"] = 0
df_merged.loc[df_merged["lfp_ps_ss"] > 0,"LFP"] = 1

In [192]:
# In this block, I create a dataframe with husbands and wives. 
# First, remove all entries that do not have relation to head of HH as: 1,2,3 or 4. These relations are the only ones where I can identify a marriage. 
df_merged = df_merged[df_merged["b4q4_per_fv"].isin(["1","2","3","4"])]

# Next, need to label each person in a HH as husband/wife. 
df_merged.loc[:,"spouse"] = "husband"
df_merged.loc[df_merged["b4q5_per_fv"] == "2","spouse"] = "wife"

In [193]:
df_merged[["b4q4_per_fv", "spouse"]].value_counts()

b4q4_per_fv  spouse 
1            husband    34593
2            wife       33692
4            wife        8192
3            husband     7859
1            wife        1140
3            wife         838
4            husband      314
2            husband      174
Name: count, dtype: int64

Why are husbands and wives not equal? 
I was subsetting by EMployment status. Many  more husbands were employed but not so many wives.

In [194]:
# Make two separate dataframes for husbands and wives, merge each husband with his wife.
# 
# In each HH, there is a possibility that you have two husbands and two wives. 
# Separate the husbands and wives. 
df_husb = df_merged[df_merged["spouse"] == "husband"]
df_wife = df_merged[df_merged["spouse"] == "wife"]

# # Now, define a mapping -- 1<->2, 3<->4 within each hh has to be mapped.
# # The relationship key will be kept the same as b4q4_per_fv for husbands. Relationship key 
# # for wives will be according to the merge map. 
df_husb.loc[:,"rel_key"] = pd.Series(None)
df_wife.loc[:,"rel_key"] = pd.Series(None)
df_husb.loc[:,"rel_key"] = df_husb.loc[:,"b4q4_per_fv"]
merge_map = {"1":"2", "2":"1", "3":"4", "4":"3"}
df_wife.loc[:,"rel_key"] = df_wife.loc[:,"b4q4_per_fv"].apply(lambda x: merge_map[x])
# # df_husb["rel_key"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_husb.loc[:,"rel_key"] = pd.Series(None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wife.loc[:,"rel_key"] = pd.Series(None)


In [195]:
df_husb.shape

(42940, 172)

In [196]:
df_wife.shape

(43862, 172)

In [197]:
# Update column names so that husb. cols have _husb, wives, _wife
df_husb.columns = [col+"_h" for col in df_husb.columns]
df_wife.columns = [col+"_w" for col in df_wife.columns]


In [198]:
df_husb_wife = pd.merge(left=df_husb, right=df_wife, left_on=["HHID_h", "rel_key_h"], right_on=["HHID_w", "rel_key_w"], how="outer", indicator=True)
# df_husb_wife = df_husb_wife[df_husb_wife["_merge"] == "both"]
# df_husb_wife.drop(columns=["_merge"], inplace=True)

In [199]:
df_husb_wife["_merge"].value_counts()

_merge
both          43825
right_only     2329
left_only      1524
Name: count, dtype: int64

In [200]:
df_husb_wife = df_husb_wife[df_husb_wife["_merge"] == "both"]
# Now subset based on wife's age: I need women with age in 15-49
df_husb_wife = df_husb_wife[(df_husb_wife["b4q6_per_fv_w"] >= 15) & (df_husb_wife["b4q6_per_fv_w"] <= 49)]


In [209]:
# Create the table for LFP status
df_husb_wife.loc[:,"EMP_type"] = pd.Series(None)
df_husb_wife.loc[(df_husb_wife["EMP_h"] == 1) & (df_husb_wife["EMP_w"] == 1),"EMP_type"] = "YY"
df_husb_wife.loc[(df_husb_wife["EMP_h"] == 1) & (df_husb_wife["EMP_w"] == 0),"EMP_type"] = "YN"
df_husb_wife.loc[(df_husb_wife["EMP_h"] == 0) & (df_husb_wife["EMP_w"] == 1),"EMP_type"] = "NY"
df_husb_wife.loc[(df_husb_wife["EMP_h"] == 0) & (df_husb_wife["EMP_w"] == 0),"EMP_type"] = "NN"


In [211]:
df_husb_wife["EMP_type"].value_counts(normalize=True)

EMP_type
YN    0.785408
YY    0.164659
NN    0.039396
NY    0.010537
Name: proportion, dtype: float64

In [44]:
# Remove entries that don't satisfy the mapping as above
qtr_month_map = {"Q5": ["7","8","9"], "Q6": ["10","11","12"], "Q7": ["1", "2", "3"], "Q8": ["4", "5", "6"]}
correct_qtr_month = []
for qtr, months in qtr_month_map.items():
    for month in months:
        correct_qtr_month.append(qtr+month)

df_merged['qtr_month'] = df_merged['qtr_hh_rv'] + df_merged['b1q9_hh_rv']
df_merged['correct_qtr_month'] = df_merged['qtr_month'].apply(lambda x: 1 if x in correct_qtr_month else 0)
df_merged = df_merged[df_merged['correct_qtr_month'] == 1]

In [45]:
df_merged[['qtr_hh_rv','b1q9_hh_rv']].value_counts().sort_index()

qtr_hh_rv  b1q9_hh_rv
Q5         7             40749
           8             35746
           9             29892
Q6         10            37904
           11            36243
           12            30697
Q7         1             39117
           2             35621
           3             30166
Q8         4             29047
           5             39392
           6             34415
Name: count, dtype: int64

In [11]:
##### Weights 
df_merged['weight'] = df_merged['MULT_per_fv']/2
df_merged.loc[df_merged['NSS_per_fv'] == df_merged['NSC_per_fv'], 'weight'] = df_merged.loc[df_merged['NSS_per_fv'] == df_merged['NSC_per_fv'],'MULT_per_fv']

In [12]:
# Drop 3rd gender
df_merged = df_merged[df_merged['b4q5_per_fv'] != '3']
# df_copy = df_merged[(df_merged['b4q6_per_fv'] <= 45) & (df_merged['b4q6_per_fv'] >= 20) & (df_merged.b1q3_per_fv == '2')]

# I only need these for married people living between the age of 15-65
df_merged = df_merged[(df_merged['b4q6_per_fv'] <= 65) & (df_merged['b4q6_per_fv'] >= 15) & (df_merged["b4q7_per_fv"] == "2")]
# df_merged.groupby(['b4q5_per_fv']).apply(lambda x: np.average(x.EMP, weights=x.weight))


In [13]:
# df_copy.groupby(['b4q5_per_fv']).apply(lambda x: np.average(x.LFP, weights=x.weight))
# df_merged[['b4q5_per_fv', 'LFP']].groupby(['b4q5_per_fv']).mean()

NameError: name 'df_copy' is not defined

In [None]:
df_merged[['b6q9_per_fv', 'b6q10_per_fv', \
             'b6q9_3pt1_Act1_per_fv', 'b6q9_3pt1_Act2_per_fv', 'b6q9_3pt2_Act1_per_fv', \
             'b6q9_3pt2_Act2_per_fv', 'b6q9_3pt3_Act1_per_fv', 'b6q9_3pt3_Act2_per_fv', \
             'b6q9_3pt4_Act1_per_fv', 'b6q9_3pt4_Act2_per_fv', 'b6q9_3pt5_Act1_per_fv', \
             'b6q9_3pt5_Act2', 'b6q9_Act2_3pt6', 'b6q9_3pt6_Act1', 'b6q9_Act1_3pt7', \
             'b6q9_Act2_3pt7']].count()

In [13]:
## Get Hourly Wages
# These will exist only for those that are employed so I will subset.
df_merged = df_merged[df_merged['EMP'] == 1]
df_merged.loc[:,'hourlywage'] = pd.Series(None)

## First Getting Wages.
df_merged.loc[:,'wage'] = pd.Series(None)
# Wage column names
wage_cols = ['b6q9_per_fv', 'b6q10_per_fv', \
             'b6q9_3pt1_Act1_per_fv', 'b6q9_3pt1_Act2_per_fv', 'b6q9_3pt2_Act1_per_fv', \
             'b6q9_3pt2_Act2_per_fv', 'b6q9_3pt3_Act1_per_fv', 'b6q9_3pt3_Act2_per_fv', \
             'b6q9_3pt4_Act1_per_fv', 'b6q9_3pt4_Act2_per_fv', 'b6q9_3pt5_Act1_per_fv', \
             'b6q9_3pt5_Act2', 'b6q9_Act2_3pt6', 'b6q9_3pt6_Act1', 'b6q9_Act1_3pt7', \
             'b6q9_Act2_3pt7']
# these objects are strings so need to be converted to float.
for col in wage_cols:
    df_merged.loc[:,col] = df_merged.loc[:,col].astype(float)

# Some codes have salaries given for last 30 days.
# so extracting them here:
df_merged.loc[df_merged['b6q5_per_fv'].isin(['31', '71', '72']),'wage'] = df_merged.loc[df_merged['b6q5_per_fv'].isin(['31', '71', '72']),'b6q9_per_fv']
df_merged.loc[df_merged['b6q5_per_fv'].isin(['11', '12', '21', '61', '62']),'wage'] = df_merged.loc[df_merged['b6q5_per_fv'].isin(['11', '12', '21', '61', '62']),'b6q10_per_fv']

# Next, get wages for other codes (from their weekly activities)
# that have wages by day.
df_merged.loc[df_merged['b6q5_per_fv'].isin(['41', '42', '51']),'wage'] = \
                                        df_merged[df_merged['b6q5_per_fv'].isin(['41', '42', '51'])]\
                                               [['b6q9_3pt1_Act1_per_fv', 'b6q9_3pt1_Act2_per_fv', 'b6q9_3pt2_Act1_per_fv', \
                                                 'b6q9_3pt2_Act2_per_fv', 'b6q9_3pt3_Act1_per_fv', 'b6q9_3pt3_Act2_per_fv', \
                                                 'b6q9_3pt4_Act1_per_fv', 'b6q9_3pt4_Act2_per_fv', 'b6q9_3pt5_Act1_per_fv', \
                                                 'b6q9_3pt5_Act2', 'b6q9_Act2_3pt6', 'b6q9_3pt6_Act1', 'b6q9_Act1_3pt7', \
                                                 'b6q9_Act2_3pt7']].sum(axis=1)

# Changing `wage` type from object to float.
df_merged.loc[:,'wage'] = df_merged.loc[:,'wage'].astype(float)

# wageFreq is "monthly" or "weekly"
df_merged.loc[:,'wageFreq'] = pd.Series(None)
df_merged.loc[df_merged['b6q5_per_fv'].isin(['11', '12', '21', '61', '62', '31', '71', '72']),'wageFreq'] = "m"
df_merged.loc[df_merged['b6q5_per_fv'].isin(['41', '42', '51']),'wageFreq'] = "w"

## End Getting Wages

## Get Hours Worked.
time_cols = ['b6q6_3pt1_Act1_per_fv', 'b6q6_3pt1_Act2_per_fv', 'b6q6_3pt2_Act1_per_fv', \
             'b6q6_3pt2_Act2_per_fv', 'b6q6_3pt3_Act1_per_fv', 'b6q6_3pt3_Act2_per_fv', \
             'b6q6_3pt4_Act1_per_fv', 'b6q6_3pt4_Act2_per_fv', 'b6q6_3pt5_Act1_per_fv', \
             'b6q6_3pt5_Act2', 'b6q6_3pt6_Act1', 'b6q6_3pt6_Act2', 'b6q6_3pt7_Act1', \
             'b6q6_3pt7_Act2']
for col in time_cols:
    df_merged.loc[:,col] = df_merged.loc[:,col].astype(float)
df_merged.loc[:,'weeklyhrs'] = df_merged.loc[:,time_cols].sum(axis=1)
## End Hours Worked

df_merged = df_merged[(df_merged['weeklyhrs'] > 0)]

# 144 hrs time endowment. Drop any with more than that.
# df_merged = df_merged[df_merged['weeklyhrs'] <= 144]
# Can remove lower end?  Those working very very few hrs.
# df_merged = df_merged[df_merged['weeklyhrs'] > df_merged['weeklyhrs'].quantile(0.001)]
df_merged["factor"] = df_merged["weeklyhrs"]
df_merged.loc[df_merged["wageFreq"] == "m", "factor"] = 4*df_merged.loc[df_merged["wageFreq"] == "m","weeklyhrs"]
df_merged['hourlywage'] = df_merged['wage']/df_merged['factor']
# df_merged['hourlywage'][df_merged['wageFreq'] == "w"] = df_merged['wage'][df_merged['wageFreq'] == "w"]/df_merged['weeklyhrs'][df_merged['wageFreq'] == "w"]
# df_merged['hourlywage'][df_merged['wageFreq'] == "m"] = df_merged['wage'][df_merged['wageFreq'] == "m"]/(4.34*df_merged['weeklyhrs'][df_merged['wageFreq'] == "m"])
# ## Part Time or Full Time?
df_merged.loc[:,'FT'] = pd.Series(None)
df_merged.loc[df_merged['weeklyhrs'] >= 40,'FT'] = "FT"
df_merged.loc[df_merged['weeklyhrs'] < 40 ,'FT'] = "PT"
# # df_merged['hourlywage'][df_merged['wageFreq'] == "w"] = df_merged['hourlywage'][df_merged['wageFreq'] == "w"]['wage']
# ## End Part Time or Full Time
df_merged['hourlywage'] = df_merged['hourlywage'].astype(float)
df_merged = df_merged[(df_merged['hourlywage'] < df_merged['hourlywage'].quantile(0.999))]
# df_merged['hourlywage'] = sum(df_merged['hourlywage']*df_merged['per_weight_ann'])/sum(df_merged['per_weight_ann'])

In [21]:
df_merged[["b4q5_per_fv", "weeklyhrs", "weight"]].groupby(["b4q5_per_fv"]).apply(lambda x: np.average(x.weeklyhrs, weights=x.weight))

b4q5_per_fv
1    52.813778
2    42.837328
dtype: float64

In [457]:
# Do Self Employed Workers have wages? 
# Check for 21: 
df_merged.loc[df_merged["b5pt1q3_per_fv"] == "12",'hourlywage'].describe()

count    1899.000000
mean       92.854907
std        60.956827
min         0.000000
25%        52.083333
50%        76.754386
75%       117.953193
max       409.836066
Name: hourlywage, dtype: float64

In [589]:
df_merged = df_merged[df_merged["hourlywage"] > 0]
# df_merged = df_merged[df_merged["b5pt1q3_per_fv"].isin(REG_EMP_CODES+CASUAL_EMP_CODES)]
df_merged[df_merged["FT"] == "FT"].groupby(['b4q5_per_fv']).apply(lambda x: np.average(x.hourlywage, weights=x.weight)) 

b4q5_per_fv
1    56.718866
2    45.385898
dtype: float64

In [None]:
df_merged[df_merged['b4q5_per_fv'] == '2'][['hourlywage']]

In [340]:
# Create age categories
bins = [15, 25, 30, 36, 42, 50]
labels = ['15-25', '26-30', '31-36', '37-42', '43-49']

# Create a new column with the age groups
df_merged['age_group'] = pd.cut(df_merged['b4q6_per_fv'], bins=bins, labels=labels, right=False)


In [341]:
df_merged["b4q6_per_fv"][df_merged["age_group"].isna()]

Series([], Name: b4q6_per_fv, dtype: int16)

In [342]:
# Education: Keep only that which is not empty
df_merged = df_merged[df_merged["b4q8_per_fv"] != ""]
# Convert string to float for easy transformations: 
df_merged.loc[:,"b4q8_per_fv"] = df_merged.loc[:,"b4q8_per_fv"].astype(float)
df_merged["b4q8_per_fv"].unique()

array([1.0, 8.0, 7.0, 6.0, 12.0, 10.0, 13.0, 11.0, 5.0, 4.0, 2.0, 3.0],
      dtype=object)

In [343]:
# Assign labels to years of eduction
df_merged.loc[:,"educ"] = pd.Series(None)
df_merged.loc[df_merged["b4q8_per_fv"] <= 4, "educ"] = "UN_EDU"
df_merged.loc[(df_merged["b4q8_per_fv"] <= 8) & (df_merged["b4q8_per_fv"] > 4), "educ"] = "LE_HS"
df_merged.loc[df_merged["b4q8_per_fv"] == 10, "educ"] = "HS"
df_merged.loc[df_merged["b4q8_per_fv"] == 11, "educ"] = "SM_COL"
df_merged.loc[df_merged["b4q8_per_fv"] > 11, "educ"] = "COL_AM"

In [344]:
df_merged.loc[:,"sector"] = pd.Series(None)
df_merged.loc[(df_merged["b5pt1q3_per_fv"].isin(NOT_REG_CODES)), "sector"] = "informal"
df_merged.loc[(df_merged["b5pt1q3_per_fv"].isin(REG_EMP_CODES)), "sector"] = "formal"

In [345]:
df_merged[df_merged["sector"].isna()]["b5pt1q3_per_fv"].unique()

array([], dtype=object)

In [346]:
df_export = df_merged[['b4q5_per_fv', "hourlywage", "age_group", "educ", "state_per_fv", "b1q3_per_fv", "sector", "FT"]]

In [347]:
# Getting dummies for all categorical variables 
df_export.to_stata("../../data/proc/wage_reg.dta")
