### In this file 
From L5 data, I create how much time each primID spends in each activity

In [1]:
import pandas as pd
import os
from datetime import datetime
import copy

In [2]:
################# GLOBAL VARIABLES ######################

# Definitions from PLFS 2018-19 Annual Report, Concepts and Definitions (2.38.1)
# Problematic codes for self employed criteria: `21` - worked in household enterprises (self-employed) as helper
SELF_EMP_CODES = ["11", "12", "21"]
REG_EMP_CODES = ["31"]
CASUAL_EMP_CODES = ["41", "42", "51", "61", "62", "71", "72"]

NOT_REG_CODES = SELF_EMP_CODES + CASUAL_EMP_CODES

EMP_CODES = SELF_EMP_CODES + REG_EMP_CODES + CASUAL_EMP_CODES
UNEMP_CODES = ["81", "82"]
LF_CODES = EMP_CODES + UNEMP_CODES
NOT_IN_LF_CODES = [str(x) for x in list(range(91,100))]

CODE_MAP = {"1": "work", "2": "work", "3": "home", "4": "home", "5": "home", "6": "leisure", "7": "leisure", "8": "leisure", "9": "self"}

In [3]:
df_L5 = pd.read_pickle("df_L5.pkl")

In [4]:
df_L5.columns

Index(['Common-ID', 'Level', 'Filler', 'Serial no.of member', 'age',
       'srl. No of activity', 'time from', 'time to',
       'whether performed multiple activity in the time slot',
       'whether simultaneous activity', 'whether a major activity',
       '3-didit activity code', 'where the activity was performed',
       'unpaid/paid status of activity', 'enterprise type', 'Blank', 'NSC',
       'MULT', 'primID'],
      dtype='object', name='item')

In [5]:
df_L5 = df_L5[df_L5["3-didit activity code"].apply(lambda x: len(x) > 0)] # There is one entry that is empty.
df_L5["major_code"] = df_L5["3-didit activity code"].apply(lambda x: str(x)[0])

In [6]:
df_L5 = df_L5[df_L5.loc[:,"time from"].isna() != True]

In [7]:
df_L5.loc[:,'time from'] = df_L5.loc[:,"time from"].apply(lambda x: datetime.strptime(x, '%H:%M'))
df_L5.loc[:,'time to'] = df_L5.loc[:,"time to"].apply(lambda x: datetime.strptime(x, '%H:%M'))
df_L5.loc[:,'time_spent'] = (df_L5.loc[:,'time to']-df_L5.loc[:,'time from']).apply(lambda x: x.total_seconds()/3600)
df_L5.loc[:,'time_spent'] = df_L5.loc[:,'time_spent'].apply(lambda x: x+24.0 if x < 0 else x)

In [8]:
# Handle multiple: Split time between these activities - this will get Total Time to be 24 hours. 
# Create id + time_slot: Then each id and time slot that has multiple entries, give equally to each. That's df_factor.
df_L5.loc[:,"time_slot"] = df_L5.loc[:,"time from"].apply(lambda x: x.strftime("%H:%M")) + df_L5.loc[:,"time to"].apply(lambda x: x.strftime("%H:%M"))
df_L5.loc[:,"id_time_slot"] = df_L5["primID"]+df_L5["time_slot"]
# no of time each time slot appears with a primID is the factor applied to time spent 
df_factor = df_L5["id_time_slot"].value_counts()
df_factor = pd.DataFrame(df_factor)
df_factor.reset_index(inplace=True)
df_factor.columns=["id_time_slot", "factor"]
df_L5 = pd.merge(df_L5, df_factor, on="id_time_slot", how="inner")
df_L5.loc[:,"time_spent"] = df_L5["time_spent"]/df_L5["factor"]

In [9]:
df_L5[["primID", "major_code", "time_spent"]].head(15)

Unnamed: 0,primID,major_code,time_spent
0,TUS10202106201910111901311011001001,9,2.0
1,TUS10202106201910111901311011001001,9,0.166667
2,TUS10202106201910111901311011001001,9,0.166667
3,TUS10202106201910111901311011001001,7,0.166667
4,TUS10202106201910111901311011001001,9,0.25
5,TUS10202106201910111901311011001001,8,0.25
6,TUS10202106201910111901311011001001,4,1.0
7,TUS10202106201910111901311011001001,1,0.5
8,TUS10202106201910111901311011001001,1,4.5
9,TUS10202106201910111901311011001001,1,1.0


In [10]:
# Calculating time spent in each activity: Need to take out other columns
df_sub = df_L5[["primID", "major_code", "time_spent"]]
df_sub.loc[:,'activity'] = df_sub.loc[:,'major_code'].apply(lambda x: CODE_MAP[x])
df_sub.drop(columns=['major_code'], inplace=True)
# Now for each person, calculate time spent on each activity
df_sub_agg = df_sub.groupby(["primID", "activity"]).sum("time_spent")
df_sub_agg.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.loc[:,'activity'] = df_sub.loc[:,'major_code'].apply(lambda x: CODE_MAP[x])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub.drop(columns=['major_code'], inplace=True)


In [11]:
df_sub_agg.head(10)

Unnamed: 0,primID,activity,time_spent
0,TUS10001106201913310301382332001001,home,0.75
1,TUS10001106201913310301382332001001,leisure,2.75
2,TUS10001106201913310301382332001001,self,12.5
3,TUS10001106201913310301382332001001,work,8.0
4,TUS10001106201913310301382332001002,home,3.75
5,TUS10001106201913310301382332001002,leisure,3.25
6,TUS10001106201913310301382332001002,self,12.0
7,TUS10001106201913310301382332001002,work,5.0
8,TUS10001106201913310301382332001003,home,0.25
9,TUS10001106201913310301382332001003,leisure,4.0


In [12]:
# I need to make sure that if some activity
# is missing for some person, make sure it has 0. 
# For each primID, I will do a cross prod with all activities
# Then for those people that don't any such activity in a day,
# an NA will show up - will replace NAs with 0.
primids = pd.DataFrame(df_sub_agg["primID"].unique())
primids.columns=["primID"]

import itertools
primids_act = pd.DataFrame(
            list(itertools.product(primids["primID"], ["leisure", "self", "work", "home"])), 
            columns=["primID", "activity"])
primids_act["ID_act"] = primids_act["primID"]+primids_act["activity"]

df_sub_agg["ID_act"] = df_sub_agg["primID"]+df_sub_agg["activity"]
df_sub_agg = pd.merge(df_sub_agg, primids_act, on="ID_act", how="right") 
df_sub_agg.drop(columns=["activity_x", "primID_x", "ID_act"], inplace=True)
df_sub_agg.rename(columns={"primID_y": "primID", "activity_y": "activity"}, inplace=True)
df_sub_agg.fillna(0, inplace=True)
# df_sub_agg.head(25)

In [13]:
# Get Total Time net of Self
df_TotalTime = df_sub_agg[df_sub_agg["activity"]== "self"][["primID", "time_spent"]]
df_TotalTime["TotalTime"] = 24 - df_TotalTime["time_spent"]
df_TotalTime.drop(columns="time_spent", inplace=True)

In [14]:
df_TotalTime.head(10)

Unnamed: 0,primID,TotalTime
1,TUS10001106201913310301382332001001,11.5
5,TUS10001106201913310301382332001002,12.0
9,TUS10001106201913310301382332001003,11.25
13,TUS10001106201913310301382332001004,11.75
17,TUS10001106201913310301382332002001,9.5
21,TUS10001106201913310301382332003001,10.25
25,TUS10001106201913310301382332003002,10.75
29,TUS10001106201913310301382332003003,10.75
33,TUS10001106201913310301382332003004,8.25
37,TUS10001106201913310301382332004001,11.0


In [15]:
# merge total time available df with data frame with time in each activity
df_sub_agg = pd.merge(df_sub_agg, df_TotalTime, on="primID", how="outer",indicator=True)

In [16]:
df_sub_agg["_merge"].value_counts()

_merge
both          1781196
left_only           0
right_only          0
Name: count, dtype: int64

In [17]:
df_sub_agg.drop(columns=["_merge"], inplace=True)

In [18]:
df_sub_agg.fillna(0,inplace=True)

In [19]:
# Final Output: 
df_sub_agg.to_pickle("df_timeSpent.pkl")

In [20]:
############################## END ########################################