In [None]:
import os
from timeit import default_timer as timer
import pandas as pd
from datetime import datetime, timedelta, date
from glob import glob
from pathlib import Path
import pandas as pd
import numpy as np

In [None]:
# --- Configuration: data location and identifiers -------------------------
path_to_data = '/scratch/spf248/covid/data'
source = 'cuebiq'
country = 'ID'
admin_id = 'ADM4_PCODE'

# Hours of the day counted as "day"; every other hour of 0-23 is "night".
day = [*range(8, 20)]
night = [hour for hour in range(24) if hour not in day]

# Names of the raw input folders with their last two characters removed
# (presumably a suffix after a YYYYMMDD date -- TODO confirm), sorted.
inputs_folders = sorted(
    folder.rsplit('/', 1)[-1][:-2]
    for folder in glob(os.path.join(path_to_data, source, 's3', country, '*'))
)

# First and last folder dates, re-formatted as MM/DD/YYYY strings.
start_date, end_date = (
    pd.to_datetime(stamp, format='%Y%m%d').strftime('%m/%d/%Y')
    for stamp in (inputs_folders[0], inputs_folders[-1])
)
print('Start:', start_date)
print('End:', end_date)

# Compute time spent at home

In [None]:
def get_coords(df):
    """Return a copy of ``df`` with the ``point`` column split into coordinates.

    Each value of ``df['point']`` is indexed with the keys ``'latitude'`` and
    ``'longitude'``; the results become columns of the same names and the
    ``point`` column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``point`` column whose values support lookup by
        ``'latitude'`` and ``'longitude'`` (e.g. dicts).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    points = df['point']
    # `assign` builds a new frame instead of writing columns into the caller's.
    with_coords = df.assign(
        latitude=points.apply(lambda p: p['latitude']),
        longitude=points.apply(lambda p: p['longitude']),
    )
    # Keyword form: the positional `axis` argument of DataFrame.drop was
    # deprecated in pandas 1.3 and removed in pandas 2.0.
    return with_coords.drop(columns='point')

In [None]:
print('Import')
start = timer()
# Every entry under .../pings_geocoded/ is treated as a directory of parquet files.
data_dirs = glob(os.path.join(path_to_data, source, 'processed', country, 'pings_geocoded', '*'))
ping_frames = []
for data_dir in data_dirs:
    for parquet_file in Path(data_dir).glob('*.parquet'):
        ping_frames.append(pd.read_parquet(parquet_file))
# Stack all files into a single frame (row labels are kept as read).
pings_geocoded = pd.concat(ping_frames)
print("Done in", round(timer()-start), "sec")

In [None]:
print('Import')
start = timer()
# Directory holding the primary-home parquet files.
data_dir = Path(path_to_data) / source / 'processed' / country / 'primary_home'
home_frames = [pd.read_parquet(parquet_file) for parquet_file in data_dir.glob('*.parquet')]
# Stack all files into a single frame (row labels are kept as read).
primary_home = pd.concat(home_frames)
print("Done in", round(timer()-start), "sec")

In [None]:
print('Process')
start = timer()
# Split primary_home's 'point' column into 'latitude' / 'longitude' columns.
primary_home=get_coords(primary_home)
# Drop columns not used below. The keyword form is required: the positional
# `axis` argument of DataFrame.drop was deprecated in pandas 1.3 and removed
# in pandas 2.0, where `drop([...], 1, ...)` raises TypeError.
pings_geocoded.drop(columns=['accuracy',admin_id],inplace=True)
# Order pings by time within each cuebiq_id and give rows a fresh 0..n-1 index.
pings_geocoded.sort_values(by=['cuebiq_id','time'],inplace=True)
pings_geocoded.reset_index(drop=True,inplace=True)
# Hour of day (0-23) of each ping, taken from the 'time' column as stored.
pings_geocoded['hour']=pings_geocoded.time.dt.hour
print("Done in", round(timer()-start), "sec")

In [None]:
print('Find home pings')
start = timer()
# Columns that must match between a ping and a primary-home record.
home_keys=['cuebiq_id','latitude','longitude']
# Pings classified as PERSONAL_AREA whose (cuebiq_id, latitude, longitude)
# equals a primary_home record. The original row labels of pings_geocoded are
# carried through the merge in an explicit helper column and restored as the
# index afterwards. The previous form, merge(on=..., right_index=True), relied
# on an undocumented side effect to keep the left index and raises MergeError
# in current pandas ("on" cannot be combined with left_index/right_index).
pings_home=(
    pings_geocoded[pings_geocoded['classification_type']=='PERSONAL_AREA']
    .rename_axis('_ping_row')
    .reset_index()
    .merge(primary_home.drop(columns='perfect_match'),on=home_keys)
    .set_index('_ping_row')
    .rename_axis(None)
)
# Flag matched pings: 1 if the ping is at the primary home, else 0.
pings_geocoded['home']=0
pings_geocoded.loc[pings_home.index,'home']=1
# Keyword form of drop: the positional `axis` argument was removed in pandas 2.0.
pings_geocoded.drop(columns=['latitude','longitude','classification_type'],inplace=True)
print("Done in", round(timer()-start), "sec")

In [None]:
print('Time spent')
start = timer()
# 'end' = time of the next row with the same cuebiq_id (NaT for the last row
# of each cuebiq_id). Relies on rows being ordered by time within cuebiq_id.
pings_geocoded['end']=pings_geocoded.groupby('cuebiq_id')['time'].shift(-1)
# Removes rows without a following ping.
# NOTE(review): this also removes rows with a missing value in ANY other
# column; confirm that is intended, otherwise restrict with subset=['end'].
pings_geocoded.dropna(inplace=True)
# Whole seconds elapsed until the next ping. Floor division by one second
# gives the full duration as an integer; `.dt.seconds` would return only the
# seconds component (0-86399) and silently truncate gaps of a day or more.
pings_geocoded['t_pings']=pings_geocoded['end'].subtract(pings_geocoded['time'])//pd.Timedelta(seconds=1)
# Seconds attributed to home: the interval length where home == 1, else 0.
pings_geocoded['t_home']=pings_geocoded.home.multiply(pings_geocoded.t_pings)
# Keyword form of drop: the positional `axis` argument was removed in pandas 2.0.
pings_geocoded.drop(columns='end',inplace=True)
print("Done in", round(timer()-start), "sec")

In [None]:
print('Aggregate')
start = timer()
# Per (calendar day, cuebiq_id): number and total duration of home pings and
# of all pings, restricted to [start_date, end_date], plus the home shares.
duration_date_id = (
    pings_geocoded
    .groupby([pd.Grouper(key='time', freq='D'), 'cuebiq_id'])
    .agg(
        n_home=('home', 'sum'),
        t_home=('t_home', 'sum'),
        n_pings=('t_pings', 'count'),
        t_pings=('t_pings', 'sum'),
    )
    .reset_index()
    .rename(columns={'time': 'date'})
    .loc[lambda d: d['date'].between(pd.to_datetime(start_date), pd.to_datetime(end_date))]
    .dropna()
    .reset_index(drop=True)
    .assign(
        pct_n=lambda d: d['n_home'] / d['n_pings'],
        pct_t=lambda d: d['t_home'] / d['t_pings'],
    )
)
print("Done in", round(timer()-start),"sec")

In [None]:
print('Aggregate Daytime')
start = timer()
# Same aggregation as the all-day table, using only pings whose 'hour' is
# one of the hours listed in `day`.
duration_daytime_id = (
    pings_geocoded
    .loc[pings_geocoded['hour'].isin(day)]
    .groupby([pd.Grouper(key='time', freq='D'), 'cuebiq_id'])
    .agg(
        n_home=('home', 'sum'),
        t_home=('t_home', 'sum'),
        n_pings=('t_pings', 'count'),
        t_pings=('t_pings', 'sum'),
    )
    .reset_index()
    .rename(columns={'time': 'date'})
    .loc[lambda d: d['date'].between(pd.to_datetime(start_date), pd.to_datetime(end_date))]
    .dropna()
    .reset_index(drop=True)
    .assign(
        pct_n=lambda d: d['n_home'] / d['n_pings'],
        pct_t=lambda d: d['t_home'] / d['t_pings'],
    )
)
print("Done in", round(timer()-start), "sec")

In [None]:
print('Aggregate Nighttime')
start = timer()
# Same aggregation as the all-day table, using only pings whose 'hour' is
# one of the hours listed in `night`.
duration_nighttime_id = (
    pings_geocoded
    .loc[pings_geocoded['hour'].isin(night)]
    .groupby([pd.Grouper(key='time', freq='D'), 'cuebiq_id'])
    .agg(
        n_home=('home', 'sum'),
        t_home=('t_home', 'sum'),
        n_pings=('t_pings', 'count'),
        t_pings=('t_pings', 'sum'),
    )
    .reset_index()
    .rename(columns={'time': 'date'})
    .loc[lambda d: d['date'].between(pd.to_datetime(start_date), pd.to_datetime(end_date))]
    .dropna()
    .reset_index(drop=True)
    .assign(
        pct_n=lambda d: d['n_home'] / d['n_pings'],
        pct_t=lambda d: d['t_home'] / d['t_pings'],
    )
)
print("Done in", round(timer()-start), "sec")

In [None]:
print('Save')
start = timer()
# Write each aggregated table as CSV (row index included) into the
# processed-data directory of this source/country.
out_dir = os.path.join(path_to_data, source, 'processed', country)
for csv_name, table in (
    ('duration_date_id.csv', duration_date_id),
    ('duration_daytime_id.csv', duration_daytime_id),
    ('duration_nighttime_id.csv', duration_nighttime_id),
):
    table.to_csv(os.path.join(out_dir, csv_name))
print("Done in", round(timer()-start), "sec")

# Figures