### In this File

Having calculated time spent by each primID in each activity, I merge this dataframe with the dataframe containing characteristics for these primIDs.

In [69]:
import pandas as pd
import os
from datetime import datetime
import copy

In [70]:
################# GLOBAL VARIABLES ######################

# Activity-status code groups, following the PLFS 2018-19 Annual Report,
# Concepts and Definitions (2.38.1).
# Caveat: code "21" (helper in a household enterprise) is a questionable fit
# for the self-employed group.
SELF_EMP_CODES = ["11", "12", "21"]
REG_EMP_CODES = ["31"]
CASUAL_EMP_CODES = ["41", "42", "51", "61", "62", "71", "72"]

# Employed, but not in the regular-employment group
NOT_REG_CODES = [*SELF_EMP_CODES, *CASUAL_EMP_CODES]

# Employed = self-employed + regular + casual; labour force adds the unemployed
EMP_CODES = [*SELF_EMP_CODES, *REG_EMP_CODES, *CASUAL_EMP_CODES]
UNEMP_CODES = ["81", "82"]
LF_CODES = [*EMP_CODES, *UNEMP_CODES]
# Codes "91" through "99"
NOT_IN_LF_CODES = [str(code) for code in range(91, 100)]

# Single-digit code -> broad time-use category
# NOTE(review): presumably keyed by the 1-digit activity division — confirm.
CODE_MAP = {
    "1": "work",
    "2": "work",
    "3": "home",
    "4": "home",
    "5": "home",
    "6": "leisure",
    "7": "leisure",
    "8": "leisure",
    "9": "self",
}

In [71]:
# Load the two processed frames that are joined on "Common-ID" in the next step.
# NOTE(review): judging by the column order of the merged frame, df_L1L2 carries the
# person-level columns and df_L3 the household-level ones — confirm.
# NOTE(review): read_pickle can execute code embedded in the file; only load
# locally generated, trusted pickles.
df_L1L2 = pd.read_pickle("../../proc/df_L1L2.pkl")
df_L3 = pd.read_pickle("../../proc/df_L3.pkl")

In [72]:
# Common-IDs whose Response Code in L1 was != 1 (non-response households).
dropids = pd.read_pickle("../../proc/NonResponseCommonIDs.pkl")
# These IDs are removed from df_L3 right below, and from df_time once it is loaded further down.

In [73]:
# Discard df_L3 rows belonging to non-response Common-IDs.
is_nonresponse = df_L3["Common-ID"].isin(dropids)
df_L3 = df_L3.loc[~is_nonresponse]

In [74]:
# Household/individual characteristics. An outer join with an indicator column
# is used so that Common-IDs present on only one side can be inspected.
df_chars = df_L1L2.merge(df_L3, on="Common-ID", how="outer", indicator=True)

In [75]:
df_chars._merge.value_counts()

_merge
both          455448
right_only         1
left_only          0
Name: count, dtype: int64

In [76]:
df_chars[df_chars["_merge"] == "right_only"]

item,Sector,District,State,north,Common-ID,Person serial no.,Relation to head,Gender,Age,marital status,...,"imputed value of usual consumption in a month from wages in kind, free collection, gifts, etc (C )",expenditure on purchase of household durable during last 365 days (D),usual monthly consumer expenditure E: [A+B+C+(D/12)],Primary source of energey for cooking,Primary source of energey for lighting,Type of washing of clothes,Type of sweeping of floor,Type of structure of the dwelling unit,member of age 5 years and above needing special care but no care giver is available,_merge
455448,,,,,16792106201910932103012092106030,,,,,,...,,400,0 021,12,2,,,,,right_only


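The single `right_only` row has a malformed `Common-ID` (it lacks the `TUS` prefix that the other IDs carry) and no individual-level characteristics, so it is dropped in the next cell.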

In [77]:
# Keep only Common-IDs found in both inputs, then discard the merge indicator.
# Filtering and dropping in one chain yields a fresh frame; the earlier version
# called drop(..., inplace=True) on a filtered slice, which is the pattern behind
# pandas' SettingWithCopyWarning.
df_chars = df_chars.loc[df_chars["_merge"] == "both"].drop(columns=["_merge"])

Now, I need to map husbands to wives. So first, subset by marital status

In [78]:
# Husbands can only be paired with wives among married respondents.
# NOTE(review): assumes marital status code "2" means currently married — confirm against the codebook.
df_chars = df_chars.loc[df_chars["marital status"].eq("2")]

In [79]:
df_chars["Relation to head"].value_counts()

Relation to head
1    98554
2    92320
4    19251
3    19004
7     2349
8     2021
6      315
9      141
Name: count, dtype: int64

Within each Household, I have to map a husband to a wife. All I have is `relation to head`. So I subset those that are the simplest to map to each other: `self -1, spouse of head -2, married child -3, spouse of married child -4`

In [80]:
# Keep the relations that are simplest to pair up within a household:
# '1' head, '2' spouse of head, '3' married child, '4' spouse of married child.
in_simple_couple = df_chars["Relation to head"].isin(['1','2','3','4'])
df_chars = df_chars.loc[in_simple_couple]

In [81]:
# Label every remaining respondent as "wife" (Gender == "2") or "husband" (anything else).
# Built with assign() so a new frame is produced instead of writing into the
# filtered slice that df_chars is at this point.
# NOTE(review): any Gender value other than "2" — including missing or third-gender
# codes, if the survey has them — ends up as "husband"; confirm this is intended.
df_chars = df_chars.assign(
    spouse=(df_chars["Gender"] == "2").map({True: "wife", False: "husband"})
)

In [82]:
df_chars[['Relation to head', 'spouse']].value_counts().sort_index()

Relation to head  spouse 
1                 husband    93941
                  wife        4613
2                 husband      668
                  wife       91652
3                 husband    17493
                  wife        1511
4                 husband      523
                  wife       18728
Name: count, dtype: int64

There are 4.6k women HoH but only 668 husbands. It looks like the remaining 4k husbands are living away. I need only HHs where the man and the wife are living together. So I will subset those

In [83]:
# 0/1 indicator columns derived from the "spouse" label; they are summed per
# household afterwards to count wives and husbands.
# assign() returns a new frame, avoiding .loc writes into a filtered slice.
df_chars = df_chars.assign(
    wife=(df_chars["spouse"] == "wife").astype("int64"),
    husb=(df_chars["spouse"] == "husband").astype("int64"),
)

In [84]:
# Number of wives and husbands per household (Common-ID), from the 0/1 flags.
df_Nhusb_Nwife = df_chars.groupby("Common-ID")[["wife", "husb"]].sum()


In [85]:
# (households with as many wives as husbands, households where the counts differ)
counts_match = df_Nhusb_Nwife["wife"] == df_Nhusb_Nwife["husb"]
counts_match.sum(), (~counts_match).sum()

(94760, 9511)

In [86]:
# Common-IDs of households with equally many wives and husbands.
# NOTE(review): equal counts do not by themselves guarantee that the people are
# married to each other (e.g. a head and a daughter-in-law, each with an absent
# spouse, would also give 1 == 1) — confirm this proxy is acceptable.
equalHusbWifeids = df_Nhusb_Nwife.loc[
    lambda counts: counts["wife"].eq(counts["husb"])
].index

In [87]:
# Keep only respondents from households with matching wife/husband counts.
df_chars = df_chars.loc[lambda d: d["Common-ID"].isin(equalHusbWifeids)]

In [88]:
# Replace the unwieldy survey column labels with short snake_case names.
rename_cols = {
    'usual principal activity: status (code)': 'principal_activity_status_code',
    'industry of work: 2-digit of NIC 2008': 'industry_nic2_code',
    'Land possessed as on date of survey(code)': 'land_possessed_survey_date_code',
    'usual consumer expenditure in a month for household purposes out of purchase (A)': 'cons_exp_purchase_monthly',
    'imputed value of usual consumption in a month from home grown stock (B)': 'imputed_homegrown_consumption_monthly',
    'imputed value of usual consumption in a month from wages in kind, free collection, gifts, etc (C )': 'imputed_in_kind_consumption_monthly',
    'expenditure on purchase of household durable during last 365 days (D)': 'exp_durable_purchase_annual',
    'usual monthly consumer expenditure E: [A+B+C+(D/12)]': 'cons_exp_total_monthly',
    'Primary source of energey for cooking': 'primary_cooking_energy',
    'Primary source of energey for lighting': 'primary_lighting_energy',
    'Type of washing of clothes': 'clothes_washing_type',
    'Type of sweeping of floor': 'floor_sweeping_type',
    'Type of structure of the dwelling unit': 'dwelling_structure_type',
    'member of age 5 years and above needing special care but no care giver is available': 'member_5plus_needing_care_no_caregiver',
}

# Reassign instead of renaming in place: df_chars is a filtered slice here.
df_chars = df_chars.rename(columns=rename_cols)

# rename() silently skips labels it cannot find (e.g. a typo in a key above), so
# verify that every short name is now present. The check is safe to re-run.
missing_cols = set(rename_cols.values()) - set(df_chars.columns)
assert not missing_cols, f"columns not renamed: {sorted(missing_cols)}"

In [116]:
df_chars.columns

Index(['Sector', 'District', 'State', 'north', 'Common-ID',
       'Person serial no.', 'Relation to head', 'Gender', 'Age',
       'marital status', 'highest level of education',
       'principal_activity_status_code', 'industry_nic2_code', 'primID',
       'Household size', 'religion', 'Social group ',
       'land_possessed_survey_date_code', 'cons_exp_purchase_monthly',
       'imputed_homegrown_consumption_monthly',
       'imputed_in_kind_consumption_monthly', 'exp_durable_purchase_annual',
       'cons_exp_total_monthly', 'primary_cooking_energy',
       'primary_lighting_energy', 'clothes_washing_type',
       'floor_sweeping_type', 'dwelling_structure_type',
       'member_5plus_needing_care_no_caregiver', 'spouse', 'wife', 'husb'],
      dtype='object', name='item')

In [89]:
# Read the time-use frame. As the column listing further down shows, it provides
# time_spent, primID, activity and TotalTime.
df_time = pd.read_pickle("../../proc/df_timeSpent.pkl")


In [90]:
# The household key (Common-ID) is the person key (primID) minus its last three characters.
df_time["Common-ID"] = df_time["primID"].str[:-3]

In [91]:
# Keep only rows whose Common-ID has exactly 32 characters once spaces are removed.
common_id_len = df_time["Common-ID"].str.replace(" ", "", regex=False).str.len()
df_time = df_time.loc[common_id_len == 32]

In [92]:
df_time.shape

(1676736, 5)

In [93]:
# Remove households flagged as non-response (Response_Code != 1), then report the new size.
df_time = df_time[~df_time["Common-ID"].isin(dropids)]
df_time.shape

(1566196, 5)

In [94]:
df_chars.columns

Index(['Sector', 'District', 'State', 'north', 'Common-ID',
       'Person serial no.', 'Relation to head', 'Gender', 'Age',
       'marital status', 'highest level of education',
       'principal_activity_status_code', 'industry_nic2_code', 'primID',
       'Household size', 'religion', 'Social group ',
       'land_possessed_survey_date_code', 'cons_exp_purchase_monthly',
       'imputed_homegrown_consumption_monthly',
       'imputed_in_kind_consumption_monthly', 'exp_durable_purchase_annual',
       'cons_exp_total_monthly', 'primary_cooking_energy',
       'primary_lighting_energy', 'clothes_washing_type',
       'floor_sweeping_type', 'dwelling_structure_type',
       'member_5plus_needing_care_no_caregiver', 'spouse', 'wife', 'husb'],
      dtype='object', name='item')

In [95]:
df_time.columns

Index(['time_spent', 'primID', 'activity', 'TotalTime', 'Common-ID'], dtype='object')

In [96]:
# Drop Common-ID from the time frame: df_chars already carries it, and the two
# frames are merged on primID next, so keeping both would yield duplicate
# Common-ID_x / Common-ID_y columns.
# Reassigned rather than dropped in place because df_time is a filtered slice here.
df_time = df_time.drop(columns=["Common-ID"])

In [97]:
# Attach the time-use rows to each person's characteristics. The outer join with
# an indicator keeps unmatched primIDs visible for the check that follows.
# NOTE(review): primID is presumably unique in df_chars; passing
# validate="one_to_many" would enforce that — confirm before adding.
df_merged = df_chars.merge(df_time, on="primID", how="outer", indicator=True)

In [98]:
df_merged["_merge"].value_counts()

_merge
both          849520
right_only    716676
left_only        586
Name: count, dtype: int64

Given the subsets I have already done, it makes sense that there will be a lot of `_merge == right_only`. A few of them are `left_only` - guess they will be the ones with some bad Common-IDs

In [99]:
df_merged[df_merged['_merge'] == "left_only"].head()

Unnamed: 0,Sector,District,State,north,Common-ID,Person serial no.,Relation to head,Gender,Age,marital status,...,floor_sweeping_type,dwelling_structure_type,member_5plus_needing_care_no_caregiver,spouse,wife,husb,time_spent,activity,TotalTime,_merge
476,1,8,1,,TUS12469106201910130803342012002,4,4,2,33,2,...,2,3,,wife,1.0,0.0,,,,left_only
477,1,8,1,,TUS12469106201910130803342012002,7,3,1,29,2,...,2,3,,husband,0.0,1.0,,,,left_only
478,1,8,1,,TUS12469106201910130803342012002,8,4,2,27,2,...,2,3,,wife,1.0,0.0,,,,left_only
5983,2,10,1,,TUS22509106201920131003011012001,1,1,1,36,2,...,2,3,,husband,0.0,1.0,,,,left_only
6040,2,10,1,,TUS22509106201920131003011012012,4,4,2,32,2,...,2,3,2.0,wife,1.0,0.0,,,,left_only


These rows look fine, so I am not sure what the problem is. I will have to drop them; there are not that many, so that should be okay.

In [100]:
# Keep only people that have both characteristics and time-use rows, then discard
# the merge indicator. Done as one chain so that a new frame is returned rather
# than a filtered slice being modified in place (SettingWithCopyWarning pattern).
df_merged = df_merged.loc[df_merged["_merge"] == "both"].drop(columns=["_merge"])

In [101]:
df_merged.shape

(849520, 35)

In [102]:
# Both urban/rural. Earlier version of the paper had only urban
# df_merged = df_merged[df_merged["Sector"] == "2"]

In [104]:
# Dummy variables: College ("col"), Working, Adivasi (work resumed here on 25/07/2025).
# All three are built as vectorised comparisons inside one assign(), which returns
# a new frame instead of writing into the filtered slice and avoids a row-wise apply.
# NOTE(review): assumes education codes >= 11 mean college-level schooling and that
# Social group code "1" identifies Adivasi households — confirm against the codebook.
education_level = df_merged["highest level of education"].astype(float)

df_merged = df_merged.assign(
    col=(education_level >= 11).astype("int64"),
    working=df_merged["principal_activity_status_code"].isin(EMP_CODES).astype("int64"),
    Adivasi=(df_merged["Social group "] == "1").astype("int64"),
)

In [105]:
# Age is needed as a number for the age-based filtering below.
df_merged = df_merged.astype({"Age": float})

In [106]:
# Find households (Common-ID) in which the youngest remaining spouse is under 18.
df_minage = df_merged.groupby("Common-ID")[["Age"]].min()
has_minor_spouse = df_minage["Age"].astype(float).lt(18)
df_minage_lt18_ids = df_minage.loc[has_minor_spouse].index
df_minage_lt18_ids.shape

(53,)

In [107]:
# Drop every row of the households identified as having a spouse under 18.
df_merged = df_merged.loc[lambda d: ~d["Common-ID"].isin(df_minage_lt18_ids)]

 Now I calculate moments for: 
 1. The entire dataset
 2. Working men vs Working women
 3. Working men and Working women in North v South. 
 
 BEFORE I DO THAT, I need to note one thing about the men's and women's dataframes: For each woman, I will have 4 rows for all her activities, and 4 rows for her husband's activities. That means each woman is represented in 16 rows. Would it be a problem? I can do a check, where I disentangle the two frames and calculate moments that way too. \[My guess is that this won't make much of a difference\] -- **Note (07/25)**: Past me is wrong - the time entries will not be repeated.

In [108]:
# 1. The entire dataset
# Share of TotalTime that each activity takes. Added via assign() because
# df_merged is a filtered slice at this point; plain column assignment is what
# raised the SettingWithCopyWarning recorded under this cell.
# NOTE(review): in the recorded run the mean is NaN for activity "self",
# presumably because TotalTime is undefined for those rows — confirm.
df_merged = df_merged.assign(prop_day=df_merged["time_spent"] / df_merged["TotalTime"])

df_merged.groupby(["husb", "activity"])[["prop_day"]].mean()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged["prop_day"] = df_merged["time_spent"]/df_merged["TotalTime"]


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_day
husb,activity,Unnamed: 2_level_1
0.0,home,0.527062
0.0,leisure,0.352471
0.0,self,
0.0,work,0.120458
1.0,home,0.063612
1.0,leisure,0.389711
1.0,self,
1.0,work,0.546667


In [109]:
# Mean time spent per activity, wives (husb == 0) versus husbands (husb == 1).
df_merged.groupby(["husb", "activity"])[["time_spent"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent
husb,activity,Unnamed: 2_level_1
0.0,home,6.453082
0.0,leisure,4.301844
0.0,self,11.681024
0.0,work,1.563823
1.0,home,0.747537
1.0,leisure,4.527045
1.0,self,11.95294
1.0,work,6.772252


In [110]:
# 2. Working men vs Working women
is_working = df_merged["working"].eq(1)
df_merged.loc[is_working].groupby(["husb", "activity"])[["time_spent"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent
husb,activity,Unnamed: 2_level_1
0.0,home,4.68945
0.0,leisure,3.244216
0.0,self,11.012754
0.0,work,5.053579
1.0,home,0.722413
1.0,leisure,4.114857
1.0,self,11.756088
1.0,work,7.406642


In [114]:
# Working Adivasi men vs women, split by the `north` indicator
working_adivasi = df_merged["working"].eq(1) & df_merged["Adivasi"].eq(1)
df_merged.loc[working_adivasi].groupby(["north", "husb", "activity"])[["time_spent"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_spent
north,husb,activity,Unnamed: 3_level_1
0,0.0,home,4.532074
0,0.0,leisure,3.047512
0,0.0,self,11.072842
0,0.0,work,5.347572
0,1.0,home,0.611272
0,1.0,leisure,4.284259
0,1.0,self,11.804348
0,1.0,work,7.300121
1,0.0,home,4.789057
1,0.0,leisure,2.688491


In [111]:
#3. Working men vs Working women, North v South
employed_rows = df_merged.loc[df_merged["working"].eq(1)]
employed_rows.groupby(["north", "husb", "activity"])[["time_spent"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,time_spent
north,husb,activity,Unnamed: 3_level_1
0,0.0,home,4.572074
0,0.0,leisure,3.366532
0,0.0,self,10.875015
0,0.0,work,5.18638
0,1.0,home,0.617446
0,1.0,leisure,4.365628
0,1.0,self,11.536657
0,1.0,work,7.480269
1,0.0,home,4.700596
1,0.0,leisure,3.09969


In [112]:
#4. Working Adivasi men vs Working Adivasi women
is_working_adivasi = df_merged["working"].eq(1) & df_merged["Adivasi"].eq(1)
df_merged.loc[is_working_adivasi].groupby(["husb", "activity"])[["time_spent"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent
husb,activity,Unnamed: 2_level_1
0.0,home,4.715865
0.0,leisure,3.082923
0.0,self,11.316608
0.0,work,4.884605
1.0,home,0.815961
1.0,leisure,4.0942
1.0,self,12.106794
1.0,work,6.983045


In [113]:
# Non-working Adivasi men vs non-working Adivasi women
nonworking_adivasi = df_merged["working"].eq(0) & df_merged["Adivasi"].eq(1)
df_merged.loc[nonworking_adivasi].groupby(["husb", "activity"])[["time_spent"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent
husb,activity,Unnamed: 2_level_1
0.0,home,6.913399
0.0,leisure,3.97533
0.0,self,12.056364
0.0,work,1.054907
1.0,home,1.098203
1.0,leisure,7.508451
1.0,self,14.089217
1.0,work,1.304129


So it looks like Adivasis are, on average, very similar to non-Adivasis. I should check whether this holds up in a regression. The regression will be at the HH level, where the outcome compares women and men on the average time spent in home production relative to home production plus leisure, i.e. the ratio of the woman's home-production share to the man's:

$\frac{\text{HP}_w/(\text{HP}_w+\text{L}_w)}{\text{HP}_m/(\text{HP}_m+\text{L}_m)}$

Should leisure include Self-Care and Maint.?

In [280]:
# Household-level dataset: household demographics now; the men's and women's
# average hours per activity are merged in further down.
# Start from df_chars and strip every person-level column.
person_level_cols = [
    'Person serial no.', 'Relation to head', 'Gender', 'Age',
    'marital status', 'highest level of education',
    'principal_activity_status_code', 'industry_nic2_code', 'primID',
    'spouse', 'wife', 'husb',
]
df_hh = df_chars.drop(columns=person_level_cols)
df_hh.columns

Index(['Sector', 'District', 'State', 'north', 'Common-ID', 'Household size',
       'religion', 'Social group ', 'land_possessed_survey_date_code',
       'cons_exp_purchase_monthly', 'imputed_homegrown_consumption_monthly',
       'imputed_in_kind_consumption_monthly', 'exp_durable_purchase_annual',
       'cons_exp_total_monthly', 'primary_cooking_energy',
       'primary_lighting_energy', 'clothes_washing_type',
       'floor_sweeping_type', 'dwelling_structure_type',
       'member_5plus_needing_care_no_caregiver'],
      dtype='object', name='item')

In [281]:
# One row per household: keep the first row seen for each Common-ID.
# NOTE(review): assumes the remaining columns are identical across a household's members — confirm.
df_hh = df_hh.drop_duplicates(subset=["Common-ID"])

In [282]:
df_hh.shape

(94760, 20)

In [283]:
df_hh.land_possessed_survey_date_code.unique()

array(['05', '08', '06', '07', '04', '03', '12', '02', '01', '11', '',
       '10', '99'], dtype=object)

In [284]:
# Women: mean time_spent per household and activity, taken over all rows flagged wife == 1.
wife_rows = df_merged.loc[df_merged["wife"].eq(1)]
df_wHH_time = wife_rows.groupby(["Common-ID", "activity"])[["time_spent"]].mean()

In [285]:
df_wHH_time.head(8)

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent
Common-ID,activity,Unnamed: 2_level_1
TUS10001106201913310301382332001,home,5.75
TUS10001106201913310301382332001,leisure,2.125
TUS10001106201913310301382332001,self,12.125
TUS10001106201913310301382332001,work,4.0
TUS10001106201913310301382332003,home,3.75
TUS10001106201913310301382332003,leisure,7.0
TUS10001106201913310301382332003,self,13.25
TUS10001106201913310301382332003,work,0.0


In [286]:
# Turn the (Common-ID, activity) index back into ordinary columns for merging.
df_wHH_time = df_wHH_time.reset_index()

In [287]:
# Men: mean time_spent per household and activity, taken over all rows flagged husb == 1.
husband_rows = df_merged.loc[df_merged["husb"].eq(1)]
df_mHH_time = husband_rows.groupby(["Common-ID", "activity"])[["time_spent"]].mean()

In [288]:
df_mHH_time.head(8)

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent
Common-ID,activity,Unnamed: 2_level_1
TUS10001106201913310301382332001,home,0.5
TUS10001106201913310301382332001,leisure,3.375
TUS10001106201913310301382332001,self,12.625
TUS10001106201913310301382332001,work,7.5
TUS10001106201913310301382332003,home,0.0
TUS10001106201913310301382332003,leisure,3.25
TUS10001106201913310301382332003,self,13.75
TUS10001106201913310301382332003,work,7.0


In [289]:
# Turn the (Common-ID, activity) index back into ordinary columns for merging.
df_mHH_time = df_mHH_time.reset_index()

In [290]:
df_wHH_time.shape

(378328, 3)

In [291]:
df_mHH_time.shape

(378160, 3)

In [292]:
# Put men's (left, suffix _x) and women's (right, suffix _y) household means side
# by side; the indicator column shows household/activity pairs present on one side only.
df_HH_time = df_mHH_time.merge(
    df_wHH_time,
    on=["Common-ID", "activity"],
    how="outer",
    indicator=True,
)

In [293]:
df_HH_time["_merge"].value_counts()

_merge
both          377780
right_only       548
left_only        380
Name: count, dtype: int64

In [294]:
# Keep household/activity pairs observed for both spouses, discard the indicator,
# and give the merge-suffixed columns meaningful names (_x was the men's frame,
# _y the women's). Written as one chain so nothing is modified in place on a
# filtered slice (SettingWithCopyWarning pattern).
df_HH_time = (
    df_HH_time.loc[df_HH_time["_merge"] == "both"]
    .drop(columns=["_merge"])
    .rename(columns={"time_spent_x": "time_spent_m", "time_spent_y": "time_spent_w"})
)

In [295]:
# Reshape long -> wide: one row per household, one column per (measure, activity) pair.
df_HH_time = df_HH_time.pivot(
    index="Common-ID",
    columns="activity",
    values=["time_spent_m", "time_spent_w"],
)
# Collapse the two-level column index into flat names such as "time_spent_m_home".
df_HH_time.columns = ["_".join(level_pair) for level_pair in df_HH_time.columns]


In [296]:
df_HH_time.head()

Unnamed: 0_level_0,time_spent_m_home,time_spent_m_leisure,time_spent_m_self,time_spent_m_work,time_spent_w_home,time_spent_w_leisure,time_spent_w_self,time_spent_w_work
Common-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TUS10001106201913310301382332001,0.5,3.375,12.625,7.5,5.75,2.125,12.125,4.0
TUS10001106201913310301382332003,0.0,3.25,13.75,7.0,3.75,7.0,13.25,0.0
TUS10001106201913310301382332007,2.375,4.125,12.75,4.75,4.125,3.125,14.25,2.5
TUS10001106201913310301382332009,0.0,4.25,11.25,8.5,5.0,4.0,15.0,0.0
TUS10001106201913310301382332011,0.0,4.5,13.5,6.0,4.5,2.75,12.75,4.0


In [297]:
# Make Common-ID an ordinary column again so it can serve as the merge key.
df_HH_time = df_HH_time.reset_index()

In [298]:
# Attach the wide time-use columns to the household characteristics; the indicator
# column exposes households that have no time-use summary.
df_hh = df_hh.merge(df_HH_time, on="Common-ID", how="outer", indicator=True)

In [299]:
df_hh["_merge"].value_counts()

_merge
both          94445
left_only       315
right_only        0
Name: count, dtype: int64

In [300]:
df_hh.head()

Unnamed: 0,Sector,District,State,north,Common-ID,Household size,religion,Social group,land_possessed_survey_date_code,cons_exp_purchase_monthly,...,member_5plus_needing_care_no_caregiver,time_spent_m_home,time_spent_m_leisure,time_spent_m_self,time_spent_m_work,time_spent_w_home,time_spent_w_leisure,time_spent_w_self,time_spent_w_work,_merge
0,1,19,1,,TUS10202106201910111901311011001,4,1,9,5,8500,...,2,2.166667,1.333333,9.5,11.0,6.0,2.0,9.5,6.5,both
1,1,19,1,,TUS10202106201910111901311011002,5,1,9,5,7000,...,2,2.25,5.916667,11.833333,4.0,9.0,5.166667,9.833333,0.0,both
2,1,19,1,,TUS10202106201910111901311011003,4,4,1,8,12000,...,2,1.5,4.333333,11.166667,7.0,8.583333,3.166667,9.25,3.0,both
3,1,19,1,,TUS10202106201910111901311011004,5,1,9,6,12000,...,2,1.5,1.666667,9.833333,11.0,10.416667,2.166667,10.916667,0.5,both
4,1,19,1,,TUS10202106201910111901311011005,5,1,9,5,10000,...,1,1.0,3.666667,13.333333,6.0,4.416667,4.5,9.583333,5.5,both


In [301]:
# Keep households that have both characteristics and time-use summaries, then
# discard the merge indicator. A single chain returns a new frame instead of
# dropping a column in place on a filtered slice (SettingWithCopyWarning pattern).
df_hh = df_hh.loc[df_hh["_merge"] == "both"].drop(columns=["_merge"])

In [302]:
# Persist the household-level regression dataset.
# With default arguments to_csv also writes the row index as a leading unnamed column.
# NOTE(review): string codes with leading zeros (e.g. '05' in
# land_possessed_survey_date_code) will presumably be parsed as integers when this
# CSV is read back unless dtype=str is passed — confirm in the downstream reader.
df_hh.to_csv("../../proc/TimeReg.csv")