In [1]:
import pandas as pd
import os
from datetime import datetime
import copy

In [2]:
################# GLOBAL VARIABLES ######################

# Definitions from PLFS 2018-19 Annual Report, Concepts and Definitions (2.38.1)
# Problematic codes for self employed criteria: `21` - worked in household enterprises (self-employed) as helper
SELF_EMP_CODES = ["11", "12", "21"]
REG_EMP_CODES = ["31"]
CASUAL_EMP_CODES = ["41", "42", "51", "61", "62", "71", "72"]

NOT_REG_CODES = SELF_EMP_CODES + CASUAL_EMP_CODES

EMP_CODES = SELF_EMP_CODES + REG_EMP_CODES + CASUAL_EMP_CODES
UNEMP_CODES = ["81", "82"]
LF_CODES = EMP_CODES + UNEMP_CODES
NOT_IN_LF_CODES = [str(x) for x in list(range(91,100))]

CODE_MAP = {"1": "work", "2": "work", "3": "home", "4": "home", "5": "home", "6": "leisure", "7": "leisure", "8": "leisure", "9": "self"}

In [3]:
df_test = pd.read_pickle("df_merged.pkl")
DF_GENDER = pd.read_pickle("DF_GENDER.pkl")
DF_WEIGHT = pd.read_pickle("DF_WEIGHT.pkl")

In [4]:
# Just in case, GENDER and WEIGHT don't have unique primID 
DF_GENDER.drop_duplicates(subset=["primID"], inplace=True)
DF_WEIGHT.drop_duplicates(subset=["primID"], inplace=True)

In [5]:
# Working? 
DF_WORKING = df_test[["primID", "usual principal activity: status (code)"]]
DF_WORKING.loc[:,"working"] = DF_WORKING.loc[:,"usual principal activity: status (code)"].apply(lambda x: 1 if x in EMP_CODES else 0)
# DF_WORKING.drop_duplicates(subset=["primID"], inplace=True)
# DF_WORKING.drop(columns=["usual principal activity: status (code)"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DF_WORKING.loc[:,"working"] = DF_WORKING.loc[:,"usual principal activity: status (code)"].apply(lambda x: 1 if x in EMP_CODES else 0)


In [6]:
DF_WORKING[DF_WORKING["working"] == 0]["usual principal activity: status (code)"].unique()

array(['93', '91', '92', '94', '97', '95', '81'], dtype=object)

In [7]:
# Age: Bishnu: 15-65
df_test = df_test[(df_test["Age"].astype(float) <= 65) & (df_test["Age"].astype(float) >= 15)]
# Marital
df_test = df_test[df_test["marital status"] == "2"]
# Major activity
df_test["major_code"] = df_test["3-didit activity code"].apply(lambda x: x[0])


In [8]:
# Handle multiple: Split time between these activities - this will get Total Time to be 24 hours. 
# Create id + time_slot: Then each id and time slot that has multiple entries, give equally to each. That's df_factor.
df_test["time_slot"] = df_test["time from"].apply(lambda x: x.strftime("%H:%M")) + df_test["time to"].apply(lambda x: x.strftime("%H:%M"))
df_test["id_time_slot"] = df_test["primID"]+df_test["time_slot"]
# no of time each time slot appears with a primID is the factor applied to time spent 
df_factor = df_test["id_time_slot"].value_counts()
df_factor = pd.DataFrame(df_factor)
df_factor.reset_index(inplace=True)
df_factor.columns=["id_time_slot", "factor"]
df_test_factor = pd.merge(df_test, df_factor, on="id_time_slot", how="inner")
df_test_factor["time_spent"] = df_test_factor["time_spent"]/df_test_factor["factor"]

In [9]:
# Calculating time spent in each activity: Need to take out other columns
df_sub = df_test_factor[["primID", "major_code", "time_spent"]]
df_sub.loc[:,'activity'] = df_sub.loc[:,'major_code'].apply(lambda x: CODE_MAP[x])
df_sub.drop(columns=['major_code'], inplace=True)
# Now for each person, calculate time spent on each activity
df_sub_agg = df_sub.groupby(["primID", "activity"]).sum("time_spent")
df_sub_agg.reset_index(inplace=True)
# Because final dataset I need has to have gender and activity code and time, 
df_actgen = pd.merge(DF_GENDER, df_sub_agg, on="primID", how="inner")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:,'activity'] = df_sub.loc[:,'major_code'].apply(lambda x: CODE_MAP[x])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.drop(columns=['major_code'], inplace=True)


In [10]:
# Get Total Time removing "self"
df_TotalTime = df_actgen[df_actgen["activity"]== "self"][["primID", "time_spent"]]
df_TotalTime["TotalTime"] = 24 - df_TotalTime["time_spent"]
df_TotalTime.drop(columns="time_spent", inplace=True)
df_actgen = pd.merge(df_actgen, df_TotalTime, on="primID", how="inner")


In [11]:
df_actgen.head(15)

Unnamed: 0,primID,Gender,activity,time_spent,TotalTime
0,TUS10202106201910111901311011001001,1,home,2.166667,14.5
1,TUS10202106201910111901311011001001,1,leisure,1.333333,14.5
2,TUS10202106201910111901311011001001,1,self,9.5,14.5
3,TUS10202106201910111901311011001001,1,work,11.0,14.5
4,TUS10202106201910111901311011001002,2,home,6.0,14.5
5,TUS10202106201910111901311011001002,2,leisure,2.0,14.5
6,TUS10202106201910111901311011001002,2,self,9.5,14.5
7,TUS10202106201910111901311011001002,2,work,6.5,14.5
8,TUS10202106201910111901311011002001,1,home,2.25,12.166667
9,TUS10202106201910111901311011002001,1,leisure,5.916667,12.166667


In [12]:
# Before I get to calculating proportions, I need to make sure that if some activity
# is missing for some person, make sure it has 0. 
# For each primID, I will do a cross prod with all activities
# Then for those people that don't any such activity in a day, 
# an NA will show up - will replace NAs with 0.
primids = pd.DataFrame(df_actgen["primID"].unique())
primids.columns=["primID"]

import itertools
primids_act = pd.DataFrame(
            list(itertools.product(primids["primID"], ["leisure", "self", "work", "home"])), 
            columns=["primID", "activity"])

primids_act = pd.merge(primids_act, DF_GENDER, on="primID", how="inner")
primids_act = pd.merge(primids_act, df_TotalTime, on="primID", how="inner")
primids_act["ID_act"] = primids_act["primID"]+primids_act["activity"]

df_actgen["ID_act"] = df_actgen["primID"]+df_actgen["activity"]

df_actgen = pd.merge(df_actgen, primids_act, on="ID_act", how="right") 
df_actgen.drop(columns=["primID_x", "activity_x", "ID_act", "Gender_x", "TotalTime_x"], inplace=True)

df_actgen.columns = ["time_spent", "primID", "activity", "gender", "TotalTime"]
df_actgen = df_actgen[["primID", "gender", "activity", "time_spent", "TotalTime"]]
df_actgen.fillna(0, inplace=True)


In [13]:
# Merge with Working Status. 
df_working = pd.merge(df_actgen, DF_WORKING, on="primID", how="inner")

In [14]:
# Drop self
df_working = df_working[df_working["activity"] != "self"]
# Calculate prop time
df_working["propTime"] = df_working["time_spent"]/df_working["TotalTime"]


In [15]:
df_working.head()

Unnamed: 0,primID,gender,activity,time_spent,TotalTime,usual principal activity: status (code),working,propTime
0,TUS10202106201910111901311011001001,1,leisure,1.333333,14.5,31,1,0.091954
1,TUS10202106201910111901311011001001,1,leisure,1.333333,14.5,31,1,0.091954
2,TUS10202106201910111901311011001001,1,leisure,1.333333,14.5,31,1,0.091954
3,TUS10202106201910111901311011001001,1,leisure,1.333333,14.5,31,1,0.091954
4,TUS10202106201910111901311011001001,1,leisure,1.333333,14.5,31,1,0.091954


In [16]:
df_working[df_working["activity"] == "work"][["gender", "time_spent", "working"]].groupby(["gender", "working"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent
gender,working,Unnamed: 2_level_1
1,0,0.779853
1,1,7.178618
2,0,0.578792
2,1,4.825073
3,0,0.300708
3,1,7.158796


In [17]:
df_working[df_working["working"] == 1][["activity", "propTime"]].groupby(["activity"]).mean()

Unnamed: 0_level_0,propTime
activity,Unnamed: 1_level_1
home,0.143743
leisure,0.33319
work,0.523067


In [18]:
df_working[["gender", "activity", "working", "propTime"]].groupby(["gender", "working", "activity"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,propTime
gender,working,activity,Unnamed: 3_level_1
1,0,home,0.133729
1,0,leisure,0.798298
1,0,work,0.067973
1,1,home,0.068504
1,1,leisure,0.356202
1,1,work,0.575294
2,0,home,0.578327
2,0,leisure,0.375779
2,0,work,0.045894
2,1,home,0.373369


In [19]:
# Here I want to do two things: 
# 1. First, calculate time spent by only working individuals on working using TUS. 
# 2. Second, replace time spent by only working individuals from PLFS, and then calculate their prop. of day.

df_working = pd.merge(df_actgen, DF_WORKING, on="primID", how="inner")
df_working = df_working[df_working["working"] == 1]
df_working.drop(columns=["working"], inplace=True)
df_w

NameError: name 'df_w' is not defined