### In this File

Having calculated time spent by each primID in each activity, I merge this dataframe with the dataframe containing characteristics for these primIDs.

In [2]:
import pandas as pd
import os
from datetime import datetime
import copy

In [3]:
################# GLOBAL VARIABLES ######################

# Status-code groupings follow the PLFS 2018-19 Annual Report,
# Concepts and Definitions (2.38.1).
# Caveat on the self-employed group: code `21` - worked in household
# enterprises (self-employed) as helper - is a problematic fit for it.
SELF_EMP_CODES = ["11", "12", "21"]
REG_EMP_CODES = ["31"]
CASUAL_EMP_CODES = ["41", "42", "51", "61", "62", "71", "72"]

# Self-employed and casual codes together, i.e. EMP_CODES minus REG_EMP_CODES.
NOT_REG_CODES = [*SELF_EMP_CODES, *CASUAL_EMP_CODES]

EMP_CODES = [*SELF_EMP_CODES, *REG_EMP_CODES, *CASUAL_EMP_CODES]
UNEMP_CODES = ["81", "82"]
LF_CODES = [*EMP_CODES, *UNEMP_CODES]
# String codes "91" through "99".
NOT_IN_LF_CODES = [f"{code}" for code in range(91, 100)]

# Collapses the keys "1".."9" into four activity labels.
# NOTE(review): the keys look like leading digits of an activity code - confirm.
CODE_MAP = {
    "1": "work", "2": "work",
    "3": "home", "4": "home", "5": "home",
    "6": "leisure", "7": "leisure", "8": "leisure",
    "9": "self",
}

In [4]:
# Read data files
# df_chars and df_time are merged on 'primID' further down in this notebook.
# NOTE(review): read_pickle can execute arbitrary code while unpickling, so
# both .pkl files must come from a trusted source.
df_chars = pd.read_pickle("df_L1L2.pkl")
df_time = pd.read_pickle("df_timeSpent.pkl")

In [5]:
df_chars.columns

Index(['Schedule ID', 'FSU Serial No.', 'Schedule', 'survey year', 'Sector',
       'NSS-Region', 'District', 'Stratum', 'Sub-Stratum', 'Sub-Round',
       'FOD Sub-Region', 'Sample hhld. No.', 'Level ', 'Filler_x',
       'Informant Sl.No.', 'Response Code', 'Survey Code',
       'Substitution Code/ Casualty code', 'Filler_x', 'Date of Survey',
       'Date of Despatch', 'Time to canvass(minutes)',
       'No. of investigators(FI/ASO) in the team', 'Remarks in block 7',
       'Remarks in block 8', 'Remarks elsewhere in Sch.',
       'Remarks elsewhere in Sch.', 'Blank_x', 'NSC_x', 'MULT_x', 'State',
       'north', 'Common-ID', 'Level', 'Filler_y', 'Person serial no.',
       'Relation to head', 'Gender', 'Age', 'marital status',
       'highest level of education', 'usual principal activity: status (code)',
       'industry of work: 2-digit of NIC 2008', 'Blank_y', 'NSC_y', 'MULT_y',
       'primID'],
      dtype='object', name='item')

In [6]:
# Columns of df_chars that are not used later in this notebook (survey
# bookkeeping, remarks, fillers and the `_x`-suffixed NSC / MULT columns).
# A label listed once removes every column carrying that label, so the
# repeated 'Filler_x' / 'Remarks elsewhere in Sch.' columns all go.
unused_chars_cols = [
    'Schedule ID', 'FSU Serial No.', 'Schedule', 'survey year',
    'NSS-Region', 'Stratum', 'Sub-Stratum', 'Sub-Round', 'FOD Sub-Region',
    'Sample hhld. No.', 'Level ', 'Filler_x', 'Date of Survey',
    'Date of Despatch', 'Time to canvass(minutes)',
    'No. of investigators(FI/ASO) in the team', 'Remarks in block 7',
    'Remarks in block 8', 'Remarks elsewhere in Sch.',
    'Blank_x', 'NSC_x', 'MULT_x', 'Level', 'Filler_y',
    'Blank_y', 'Informant Sl.No.', 'Response Code', 'Survey Code',
    'Substitution Code/ Casualty code',
]
# Rebind instead of mutating with inplace=True: the cell then produces a new
# frame from its input rather than silently altering the loaded one.
df_chars = df_chars.drop(columns=unused_chars_cols)

In [7]:
# Strip the merge suffix from the two `_y` columns that are kept.
# Rebinding (no inplace=True) keeps the cell free of in-place mutation.
df_chars = df_chars.rename(columns={'NSC_y': 'NSC', 'MULT_y': 'MULT'})

In [8]:
df_time.columns

Index(['time_spent', 'primID', 'activity', 'TotalTime'], dtype='object')

In [9]:
df_chars.columns

Index(['Sector', 'District', 'State', 'north', 'Common-ID',
       'Person serial no.', 'Relation to head', 'Gender', 'Age',
       'marital status', 'highest level of education',
       'usual principal activity: status (code)',
       'industry of work: 2-digit of NIC 2008', 'NSC', 'MULT', 'primID'],
      dtype='object', name='item')

In [10]:
# Outer join of person characteristics with time-use rows on 'primID'.
# indicator=True adds a '_merge' column (both / left_only / right_only)
# that the following cells use to inspect unmatched rows.
df_merged = df_chars.merge(df_time, how="outer", on='primID', indicator=True)

In [11]:
df_merged["_merge"].value_counts()

_merge
both          1781196
left_only       73445
right_only          0
Name: count, dtype: int64

In [12]:
# Checking the 'left_only' rows (rows from df_chars with no match in df_time):
# how many have Age of 6 or below (the filter is `<= 6`)? Turns out, most of them do.
df_merged[(df_merged["_merge"] == "left_only") & (df_merged['Age'].astype(float) <= 6)].shape

(71542, 20)

In [13]:
# Remaining 'left_only' rows: those with Age above 6.
df_merged[(df_merged["_merge"] == "left_only") & (df_merged['Age'].astype(float) > 6)].shape

(1903, 20)

In [14]:
# Keep only rows present in both frames, then discard the merge indicator.
# The drop is chained onto the filter so it runs on the freshly filtered
# frame instead of mutating, in place, a slice of the previous df_merged.
df_merged = df_merged[df_merged['_merge'] == 'both'].drop(columns=['_merge'])

In [15]:
# Subsetting married people in urban areas
# (both 'marital status' and 'Sector' are compared against the string code "2").
is_married = df_merged["marital status"] == "2"
is_urban = df_merged["Sector"] == "2"
df_merged = df_merged[is_married & is_urban]

In [16]:
df_merged.shape

(406444, 19)

In [17]:
df_merged["Relation to head"].value_counts()

Relation to head
1    172288
2    159876
3     33424
4     31628
8      4500
7      3916
9       476
6       336
Name: count, dtype: int64

In [18]:
# Keep the relation-to-head codes that are paired up by merge_map later in
# this notebook. .copy() gives an independent frame, so adding a column
# below does not write into a slice of the previous df_merged.
kept_relations = ['1', '2', '3', '4', '7', '8']
df_merged = df_merged[df_merged['Relation to head'].isin(kept_relations)].copy()

# Label by Gender: code "2" -> "wife", every other value -> "husband".
df_merged['spouse'] = df_merged['Gender'].eq("2").map({True: "wife", False: "husband"})

In [19]:
df_merged[['Relation to head', 'spouse']].value_counts().sort_index()

Relation to head  spouse 
1                 husband    165320
                  wife         6968
2                 husband      1628
                  wife       158248
3                 husband     30320
                  wife         3104
4                 husband      1000
                  wife        30628
7                 husband      1796
                  wife         2120
8                 husband      2356
                  wife         2144
Name: count, dtype: int64

In [20]:
# One frame per spouse label. .copy() detaches both frames from df_merged;
# without it, the column assignments in the next cell write into slices and
# pandas emits SettingWithCopyWarning.
df_husb = df_merged[df_merged['spouse'] == 'husband'].copy()
df_wife = df_merged[df_merged['spouse'] == 'wife'].copy()

In [21]:

# Pair spouses within a household through a shared `rel_key`:
# relation codes 1<->2, 3<->4, 7<->7 and 8<->8 are matched to each other.
# Husbands keep their own 'Relation to head' code as the key (presumably the
# b4q4_per_fv item of the raw schedule - confirm); wives get the code of the
# relation they are paired with, looked up in merge_map.
merge_map = {"1": "2", "2": "1", "3": "4", "4": "3", "7": "7", "8": "8"}

# assign() returns new frames, so nothing is written into a slice of
# df_merged (the cause of the SettingWithCopyWarning this cell emitted).
df_husb = df_husb.assign(rel_key=df_husb["Relation to head"])
df_wife = df_wife.assign(rel_key=df_wife["Relation to head"].map(merge_map))

# A code missing from merge_map becomes NaN under .map(); fail loudly
# instead, as the original per-row dictionary lookup did.
assert df_wife["rel_key"].notna().all(), "unmapped 'Relation to head' code among wives"


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_husb.loc[:,"rel_key"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wife.loc[:,"rel_key"] = None


In [22]:
# Suffix every column so the two frames stay distinguishable once merged:
# husband columns get "_h", wife columns get "_w".
df_husb.columns = [name + "_h" for name in list(df_husb)]
df_wife.columns = [name + "_w" for name in list(df_wife)]


In [23]:
# Pair husband rows with wife rows that share the same household
# ('Common-ID') and the same rel_key; indicator=True adds a '_merge' column.
# Both inputs hold one row per person per activity, so a matched couple
# appears once for every (husband row, wife row) combination.
# NOTE(review): if a household contains several couples with the same rel_key,
# their rows would be cross-matched with each other as well - confirm.
df_husb_wife = pd.merge(left=df_husb, right=df_wife, left_on=["Common-ID_h", "rel_key_h"], right_on=["Common-ID_w", "rel_key_w"], how="outer", indicator=True)

In [24]:
df_husb_wife["_merge"].value_counts()

_merge
both          792736
right_only     11380
left_only      10888
Name: count, dtype: int64

In [25]:
# Keep only matched couples, then discard the merge indicator.
# Chaining the drop avoids mutating a filtered slice in place.
df_husb_wife = df_husb_wife[df_husb_wife["_merge"] == "both"].drop(columns=['_merge'])

In [26]:
# With the husband-wife dataframe in hand, keep the rows whose wife is
# aged 15 to 49 (both ends included).
wife_age = df_husb_wife['Age_w'].astype(float)
df_husb_wife = df_husb_wife[wife_age.between(15, 49)]

In [27]:
df_husb_wife.columns

Index(['Sector_h', 'District_h', 'State_h', 'north_h', 'Common-ID_h',
       'Person serial no._h', 'Relation to head_h', 'Gender_h', 'Age_h',
       'marital status_h', 'highest level of education_h',
       'usual principal activity: status (code)_h',
       'industry of work: 2-digit of NIC 2008_h', 'NSC_h', 'MULT_h',
       'primID_h', 'time_spent_h', 'activity_h', 'TotalTime_h', 'spouse_h',
       'rel_key_h', 'Sector_w', 'District_w', 'State_w', 'north_w',
       'Common-ID_w', 'Person serial no._w', 'Relation to head_w', 'Gender_w',
       'Age_w', 'marital status_w', 'highest level of education_w',
       'usual principal activity: status (code)_w',
       'industry of work: 2-digit of NIC 2008_w', 'NSC_w', 'MULT_w',
       'primID_w', 'time_spent_w', 'activity_w', 'TotalTime_w', 'spouse_w',
       'rel_key_w'],
      dtype='object')

In [43]:
df_husb_wife["highest level of education_h"].value_counts().sort_index()

highest level of education_h
01     52704
02     27632
03     56384
04     93984
05    103776
06     83856
07      8368
08      9168
10     10560
11    114384
12     44992
Name: count, dtype: int64

In [44]:
# Generate college dummy variables: 1 when the education code is >= 12, else 0
# (a missing code compares False and therefore yields 0).
# NOTE(review): with this cut-off only code 12 is flagged; code 11, the
# largest category in the value counts above, is not. Confirm against the
# survey codebook that 12 is the intended college threshold.
husb_college = df_husb_wife["highest level of education_h"].astype(float) >= 12
wife_college = df_husb_wife["highest level of education_w"].astype(float) >= 12

# assign() builds a new frame, so the dummies are not written into the
# filtered slice that df_husb_wife currently is.
df_husb_wife = df_husb_wife.assign(
    col_h=husb_college.astype(int),
    col_w=wife_college.astype(int),
)


 Now I calculate moments for: 
 1. The entire dataset
 2. Working men vs Working women
 3. Working men and Working women in North v South. 
 
 BEFORE I DO THAT, I need to note one thing about the men's and women's dataframes: For each woman, I will have 4 rows for all her activities, and 4 rows for her husband's activities. That means each woman is represented in 16 rows. Would it be a problem? I can do a check, where I disentangle the two frames and calculate moments that way too. \[My guess is that this won't make much of a difference\]

In [28]:
# 1. The entire dataset
# prop_day_* = time spent on the row's activity divided by the person's TotalTime.
# NOTE(review): the 'self' mean prints as NaN in the saved output - check
# TotalTime for zero / missing values on those rows.
df_husb_wife["prop_day_h"] = df_husb_wife["time_spent_h"].div(df_husb_wife["TotalTime_h"])
df_husb_wife["prop_day_w"] = df_husb_wife["time_spent_w"].div(df_husb_wife["TotalTime_w"])

# Husbands: mean share per activity.
df_husb_wife.groupby("activity_h")[["prop_day_h"]].mean()


Unnamed: 0_level_0,prop_day_h
activity_h,Unnamed: 1_level_1
home,0.054616
leisure,0.335565
self,
work,0.609819


In [29]:
# Wives: mean share of the day per activity.
df_husb_wife.groupby("activity_w")[["prop_day_w"]].mean()

Unnamed: 0_level_0,prop_day_w
activity_w,Unnamed: 1_level_1
home,0.552811
leisure,0.362055
self,
work,0.085134


In [30]:
# 2. Working men vs Working women
df_husb_wife.loc[:,"working_h"] = 0
df_husb_wife.loc[df_husb_wife["usual principal activity: status (code)_h"].isin(EMP_CODES),"working_h"] = 1

df_husb_wife.loc[:,"working_w"] = 0
df_husb_wife.loc[df_husb_wife["usual principal activity: status (code)_w"].isin(EMP_CODES),"working_w"] = 1


In [31]:
# Husbands: mean share of the day per activity, split by the working_h flag.
df_husb_wife.groupby(["working_h", "activity_h"])[["prop_day_h"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_day_h
working_h,activity_h,Unnamed: 2_level_1
0,home,0.119806
0,leisure,0.795375
0,self,
0,work,0.084819
1,home,0.052292
1,leisure,0.319173
1,self,
1,work,0.628535


In [32]:
# Husbands: mean time_spent_h per activity, split by the working_h flag.
df_husb_wife.groupby(["working_h", "activity_h"])[["time_spent_h"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent_h
working_h,activity_h,Unnamed: 2_level_1
0,home,1.325242
0,leisure,8.50841
0,self,13.145133
0,work,1.021216
1,home,0.641987
1,leisure,3.938864
1,self,11.380026
1,work,8.039124


In [50]:
# Husbands: mean time_spent_h per activity, split by the college dummy col_h.
df_husb_wife.groupby(["col_h", "activity_h"])[["time_spent_h"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent_h
col_h,activity_h,Unnamed: 2_level_1
0,home,0.654423
0,leisure,4.046977
0,self,11.453893
0,work,7.844707
1,home,0.804795
1,leisure,4.716779
1,self,11.280317
1,work,7.198109


In [33]:
# Wives: mean share of the day per activity, split by the working_w flag.
df_husb_wife.groupby(["working_w", "activity_w"])[["prop_day_w"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_day_w
working_w,activity_w,Unnamed: 2_level_1
0,home,0.595931
0,leisure,0.385719
0,self,
0,work,0.01835
1,home,0.349531
1,leisure,0.250497
1,self,
1,work,0.399972


In [34]:
# Wives: mean time_spent_w per activity, split by the working_w flag.
df_husb_wife.groupby(["working_w", "activity_w"])[["time_spent_w"]].mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent_w
working_w,activity_w,Unnamed: 2_level_1
0,home,7.330628
0,leisure,4.777924
0,self,11.65654
0,work,0.234908
1,home,4.634118
1,leisure,3.302231
1,self,10.638017
1,work,5.425634


In [52]:
# College vs Non-college, restricted to wives with working_w == 1:
# mean time_spent_w per activity within each col_w group.
working_wives = df_husb_wife.loc[df_husb_wife["working_w"] == 1]
working_wives.groupby(["col_w", "activity_w"])[["time_spent_w"]].mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent_w
col_w,activity_w,Unnamed: 2_level_1
0,home,4.698559
0,leisure,3.250043
0,self,10.640694
0,work,5.410703
1,home,4.190844
1,leisure,3.661217
1,self,10.6196
1,work,5.528339


In [53]:
# Same college split, for wives with working_w == 0.
nonworking_wives = df_husb_wife.loc[df_husb_wife["working_w"] == 0]
nonworking_wives.groupby(["col_w", "activity_w"])[["time_spent_w"]].mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent_w
col_w,activity_w,Unnamed: 2_level_1
0,home,7.34221
0,leisure,4.760518
0,self,11.656973
0,work,0.240298
1,home,7.106367
1,leisure,5.11495
1,self,11.648142
1,work,0.130541


In [35]:
#3.  North v South: Working men v Working women

# Working husbands: mean share of the day per activity within each north_h group.
df_husb_wife.loc[df_husb_wife["working_h"] == 1].groupby(['north_h', 'activity_h'])[["prop_day_h"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_day_h
north_h,activity_h,Unnamed: 2_level_1
0,home,0.047162
0,leisure,0.322195
0,self,0.922551
0,work,0.630643
1,home,0.039105
1,leisure,0.326082
1,self,0.946878
1,work,0.634813


In [37]:
# Working husbands: mean time_spent_h per activity within each north_h group.
df_husb_wife.loc[df_husb_wife["working_h"] == 1].groupby(['north_h', 'activity_h'])[["time_spent_h"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent_h
north_h,activity_h,Unnamed: 2_level_1
0,home,0.581275
0,leisure,4.008945
0,self,11.215828
0,work,8.193953
1,home,0.476465
1,leisure,4.039651
1,self,11.381377
1,work,8.102506


In [39]:
# Non-working husbands: mean time_spent_h per activity within each north_h group.
df_husb_wife.loc[df_husb_wife["working_h"] == 0].groupby(['north_h', 'activity_h'])[["time_spent_h"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent_h
north_h,activity_h,Unnamed: 2_level_1
0,home,1.293249
0,leisure,8.781435
0,self,13.134177
0,work,0.791139
1,home,0.88807
1,leisure,8.91386
1,self,13.077895
1,work,1.120175


In [42]:
# Working husbands: standard deviation of prop_day_h per activity within each north_h group.
df_husb_wife.loc[df_husb_wife["working_h"] == 1].groupby(['north_h', 'activity_h'])[["prop_day_h"]].std()

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_day_h
north_h,activity_h,Unnamed: 2_level_1
0,home,0.091897
0,leisure,0.216728
0,self,0.386315
0,work,0.246749
1,home,0.086273
1,leisure,0.216269
1,self,0.507704
1,work,0.240865


In [38]:
# Working wives: mean share of the day per activity within each north_w group.
df_husb_wife.loc[df_husb_wife["working_w"] == 1].groupby(['north_w', 'activity_w'])[["prop_day_w"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_day_w
north_w,activity_w,Unnamed: 2_level_1
0,home,0.329049
0,leisure,0.250778
0,self,0.809591
0,work,0.420172
1,home,0.355575
1,leisure,0.261242
1,self,0.840451
1,work,0.383183


In [40]:
# Working wives: mean time_spent_w per activity within each north_w group.
df_husb_wife.loc[df_husb_wife["working_w"] == 1].groupby(['north_w', 'activity_w'])[["time_spent_w"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent_w
north_w,activity_w,Unnamed: 2_level_1
0,home,4.398236
0,leisure,3.333361
0,self,10.517897
0,work,5.750506
1,home,4.702661
1,leisure,3.421862
1,self,10.702237
1,work,5.17324


In [44]:
# Non-working wives: mean share of the day per activity within each north_w group.
df_husb_wife.loc[df_husb_wife["working_w"] == 0].groupby(['north_w', 'activity_w'])[["prop_day_w"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_day_w
north_w,activity_w,Unnamed: 2_level_1
0,home,0.583286
0,leisure,0.400915
0,self,
0,work,0.015799
1,home,0.581337
1,leisure,0.399278
1,self,0.9949
1,work,0.019384


In [41]:
# Non-working wives: mean time_spent_w per activity within each north_w group.
df_husb_wife.loc[df_husb_wife["working_w"] == 0].groupby(['north_w', 'activity_w'])[["time_spent_w"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent_w
north_w,activity_w,Unnamed: 2_level_1
0,home,7.216073
0,leisure,4.967065
0,self,11.610999
0,work,0.205863
1,home,7.175212
1,leisure,4.984284
1,self,11.595702
1,work,0.244803


In [45]:
# Non-working husbands: standard deviation of prop_day_h per activity within each north_h group.
df_husb_wife.loc[df_husb_wife["working_h"] == 0].groupby(['north_h', 'activity_h'])[["prop_day_h"]].std()

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_day_h
north_h,activity_h,Unnamed: 2_level_1
0,home,0.175469
0,leisure,0.232482
0,self,
0,work,0.173116
1,home,0.139366
1,leisure,0.237019
1,self,1.310001
1,work,0.209629


In [94]:
# Working wives: standard deviation of prop_day_w per activity within each north_w group.
df_husb_wife.loc[df_husb_wife["working_w"] == 1].groupby(['north_w', 'activity_w'])[["prop_day_w"]].std()

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_day_w
north_w,activity_w,Unnamed: 2_level_1
0,home,0.172077
0,leisure,0.166643
0,self,0.253295
0,work,0.244546
1,home,0.187768
1,leisure,0.173236
1,self,0.307457
1,work,0.235459


In [45]:
# Working wives: mean hours and mean share of the day per activity.
df_husb_wife.loc[df_husb_wife["working_w"] == 1].groupby('activity_w')[["time_spent_w", "prop_day_w"]].mean()

Unnamed: 0_level_0,time_spent_w,prop_day_w
activity_w,Unnamed: 1_level_1,Unnamed: 2_level_1
home,4.634118,0.349531
leisure,3.302231,0.250497
self,10.638017,
work,5.425634,0.399972


In [46]:
# Working husbands: mean hours and mean share of the day per activity.
df_husb_wife.loc[df_husb_wife["working_h"] == 1].groupby('activity_h')[["time_spent_h", "prop_day_h"]].mean()

Unnamed: 0_level_0,time_spent_h,prop_day_h
activity_h,Unnamed: 1_level_1,Unnamed: 2_level_1
home,0.641987,0.052292
leisure,3.938864,0.319173
self,11.380026,
work,8.039124,0.628535


In [47]:
# What is the average time spent by a working person, on work. 
# Weighted average of men and women's time spent at work: 
# Proportion men and women: 
df_husb_wife["spouse_h"].value_counts()

spouse_h
husband    605808
Name: count, dtype: int64

In [48]:
df_husb_wife["spouse_w"].value_counts()

spouse_w
wife    605808
Name: count, dtype: int64

In [53]:
df_husb_wife["primID_h"].nunique()

36273

In [54]:
df_husb_wife["primID_w"].nunique()

36308

In [55]:
# Row count expected if each of the 36273 distinct husbands (the nunique()
# result above) appeared in exactly 16 rows of df_husb_wife.
16*36273

580368

In [56]:
df_husb_wife[df_husb_wife["spouse_w"] == "wife"][["primID_w", "activity_w"]]

Unnamed: 0,primID_w,activity_w
16,TUS20124106201920110702012011005002,leisure
17,TUS20124106201920110702012011005002,self
18,TUS20124106201920110702012011005002,work
19,TUS20124106201920110702012011005002,home
20,TUS20124106201920110702012011005002,leisure
...,...,...
803619,TUS26082106201923612304043361014002,home
803620,TUS26082106201923612304043361014002,leisure
803621,TUS26082106201923612304043361014002,self
803622,TUS26082106201923612304043361014002,work


#### Trying to separate the two dataframes and seeing if results remain the same

In [57]:
# Split the couple-level frame back into one frame per spouse, keeping only
# the person-level columns (derived columns such as prop_day_*, working_*
# and col_* are deliberately left out and recomputed below where needed).
person_cols = [
    'Sector', 'District', 'State', 'north', 'Common-ID',
    'Person serial no.', 'Relation to head', 'Gender', 'Age',
    'marital status', 'highest level of education',
    'usual principal activity: status (code)',
    'industry of work: 2-digit of NIC 2008', 'NSC', 'MULT',
    'primID', 'time_spent', 'activity', 'TotalTime', 'spouse',
    'rel_key',
]

# .copy() makes both frames independent of df_husb_wife, so the column
# assignments in the following cells do not write into a slice of it
# (which is what triggers SettingWithCopyWarning).
df_husb = df_husb_wife[[f"{col}_h" for col in person_cols]].copy()
df_wife = df_husb_wife[[f"{col}_w" for col in person_cols]].copy()

In [59]:
# IDact = primID concatenated with the activity label; it identifies one
# (person, activity) pair and is used as the de-duplication key.
# assign() returns new frames instead of writing into slices of df_husb_wife.
df_husb = df_husb.assign(IDact=df_husb['primID_h'] + df_husb['activity_h'])
df_wife = df_wife.assign(IDact=df_wife['primID_w'] + df_wife['activity_w'])

In [60]:
# Collapse the repeated rows produced by the couple-level merge: keep the
# first row for every distinct IDact value.
df_husb = df_husb.drop_duplicates(subset=['IDact'], keep='first')
df_wife = df_wife.drop_duplicates(subset=['IDact'], keep='first')

In [62]:
# 1. The entire dataset
# Same ratio as on the couple-level frame (time_spent / TotalTime),
# recomputed on the de-duplicated per-spouse frames.
df_husb.loc[:, "prop_day_h"] = df_husb["time_spent_h"].div(df_husb["TotalTime_h"])
df_wife.loc[:, "prop_day_w"] = df_wife["time_spent_w"].div(df_wife["TotalTime_w"])

# Husbands: mean share per activity; compare with the same table computed
# earlier from df_husb_wife.
df_husb.groupby('activity_h')[['prop_day_h']].mean()


Unnamed: 0_level_0,prop_day_h
activity_h,Unnamed: 1_level_1
home,0.055053
leisure,0.33475
self,
work,0.610197


In [63]:
# Wives (de-duplicated frame): mean share of the day per activity.
df_wife.groupby('activity_w')[['prop_day_w']].mean()


Unnamed: 0_level_0,prop_day_w
activity_w,Unnamed: 1_level_1
home,0.55288
leisure,0.360651
self,
work,0.086468


In [64]:
# 2. Working men vs Working women
# working_* is 1 when the usual principal activity status code is in
# EMP_CODES and 0 otherwise.
husb_employed = df_husb["usual principal activity: status (code)_h"].isin(EMP_CODES)

# The wife mask is built from df_wife itself. It was previously taken from
# df_husb_wife, which only worked because pandas aligned the longer boolean
# Series to df_wife's index; any index change would have broken or skewed it.
wife_employed = df_wife["usual principal activity: status (code)_w"].isin(EMP_CODES)

# assign() returns new frames, avoiding the SettingWithCopyWarning that the
# two-step `.loc` initialise-then-overwrite pattern produced.
df_husb = df_husb.assign(working_h=husb_employed.astype(int))
df_wife = df_wife.assign(working_w=wife_employed.astype(int))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_husb.loc[:,"working_h"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wife.loc[:,"working_w"] = 0


In [66]:
# Husbands (de-duplicated frame): mean share of the day and mean hours per
# activity, split by the working_h flag.
df_husb.groupby(["working_h", "activity_h"])[["prop_day_h", "time_spent_h"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_day_h,time_spent_h
working_h,activity_h,Unnamed: 2_level_1,Unnamed: 3_level_1
0,home,0.125132,1.388587
0,leisure,0.794025,8.442517
0,self,,13.196697
0,work,0.080843,0.972199
1,home,0.052669,0.64682
1,leisure,0.319128,3.937148
1,self,,11.378586
1,work,0.628203,8.037446


In [67]:
# Wives (de-duplicated frame): mean share of the day and mean hours per
# activity, split by the working_w flag.
df_wife.groupby(["working_w", "activity_w"])[["prop_day_w", "time_spent_w"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_day_w,time_spent_w
working_w,activity_w,Unnamed: 2_level_1,Unnamed: 3_level_1
0,home,0.596739,7.341304
0,leisure,0.384786,4.765316
0,self,,11.656702
0,work,0.018475,0.236679
1,home,0.350579,4.649611
1,leisure,0.249327,3.288865
1,self,,10.632967
1,work,0.400093,5.428557


### Conditional on working, what proportion of time does an average person spend?

In [91]:
# Restrict each frame to people flagged as working.
working_husb = df_husb[df_husb['working_h'] == 1]
working_wife = df_wife[df_wife['working_w'] == 1]

# Mean share of the day on the 'work' rows, separately for husbands and wives.
h_h = working_husb.loc[working_husb['activity_h'] == 'work', 'prop_day_h'].mean()
h_w = working_wife.loc[working_wife['activity_w'] == 'work', 'prop_day_w'].mean()

# Taking Weighted average: the weights are the numbers of distinct working
# husbands and working wives.
N_h = working_husb['primID_h'].nunique()
N_w = working_wife['primID_w'].nunique()
H = (N_h*h_h + N_w*h_w)/(N_h+N_w)

In [92]:
H

0.5926845434561677