In [1]:
import os
import pyspark.sql.functions as F
from pyspark.sql.functions import col, lit, lag
from pyspark.sql.window import Window
from pyspark.sql.types import LongType 

In [2]:
# --- Run configuration -------------------------------------------------------
# Root folder of the results tree read by the loading cell.
results_path = "/results/"

# Study period bounds, compared against the pings' `time` column.
start_date = '2020-01-01'
end_date = '2020-04-16'

# Data provider and country; both are path components under `results_path`.
source = 'cuebiq'
country = 'ID'

# Name of the administrative-unit id column.
admin_id = 'ADM4_PCODE'

# Hour-of-day bounds of the daytime window (start inclusive, end exclusive).
start_hour_day = 8
end_hour_day = 20

# Compute time spent at home

In [4]:
# Folder holding the processed parquet datasets for this source/country.
processed_dir = os.path.join(results_path, source, 'processed', country)

# Geocoded pings, minus the two columns this analysis does not need.
pings_geocoded = (
    spark.read
    .parquet(os.path.join(processed_dir, 'pings_geocoded'))
    .drop('accuracy', 'ADM4_PCODE')
)

# Primary home per cuebiq_id: latitude/longitude are pulled out of `point`, and
# a constant "1" marker column is added so a left join can tell matches apart.
primary_home = (
    spark.read
    .parquet(os.path.join(processed_dir, 'primary_home'))
    .select(
        col("cuebiq_id"),
        col("point.latitude").alias("latitude"),
        col("point.longitude").alias("longitude"),
        lit("1").alias("home_location"),
    )
)


In [5]:
# Keep only pings whose `time` lies between start_date and end_date (both bounds
# inclusive as written).
# NOTE(review): if `time` is a timestamp, the date-only end_date string compares
# as midnight, so pings later on end_date would be excluded — confirm intended.
in_study_period = (pings_geocoded.time >= start_date) & (pings_geocoded.time <= end_date)
pings_geocoded = pings_geocoded.filter(in_study_period)

In [6]:
# Flag each ping as home (1) or not (0). The left join keeps every ping; the
# marker column is null wherever no primary_home row has the same cuebiq_id and
# exactly the same latitude/longitude.
home_flag = F.when(primary_home.home_location.isNull(), 0).otherwise(1).alias("home")
pings_home = (
    pings_geocoded
    .join(primary_home, on=['cuebiq_id', 'latitude', 'longitude'], how="left")
    .select("cuebiq_id", "latitude", "longitude", "time", home_flag)
)

In [7]:
print('Aggregate')

# Per-device window ordered by ping time, used to reach the previous ping.
w = Window().partitionBy("cuebiq_id").orderBy("time")

# Duration = this ping's time minus the same device's previous ping time, both
# cast to long (seconds, assuming `time` is a timestamp — TODO confirm).
# The first ping of each device has no predecessor, so its duration is null and
# the trailing dropna() removes that row.
previous_time = lag("time", 1).over(w)
elapsed = col("time").cast(LongType()) - previous_time.cast(LongType())
duration_df = pings_home.select(
    "cuebiq_id",
    elapsed.alias("duration"),
    "home",
    F.date_format('time', 'yyyy-MM-dd').alias("date"),
    F.hour("time").alias("hour"),
).dropna()

# Per (date, device): total duration and ping count over home pings only.
home_df = (
    duration_df
    .where(duration_df.home == 1)
    .groupby("date", "cuebiq_id")
    .agg(F.sum("duration").alias("t_home"), F.count(lit(1)).alias("n_home"))
)

# Per (date, device): total duration and ping count over all pings.
all_df = (
    duration_df
    .groupby("date", "cuebiq_id")
    .agg(F.sum("duration").alias("t_pings"), F.count(lit(1)).alias("n_pings"))
)

# Share of pings (pct_n) and of elapsed time (pct_t) recorded at home.
# The join uses the default (inner) type, so (date, device) pairs without any
# home ping are absent from the result rather than reported as 0.
duration_date_id = (
    home_df
    .join(all_df, on=['cuebiq_id', "date"])
    .withColumn("pct_n", col("n_home") / col("n_pings"))
    .withColumn("pct_t", col("t_home") / col("t_pings"))
)

In [8]:
print('Aggregate Daytime')

# Daytime rows: ping hour in [start_hour_day, end_hour_day). The bounds come
# from the configuration cell so that changing them there actually takes effect.
is_daytime = (col("hour") >= start_hour_day) & (col("hour") < end_hour_day)
day_df = duration_df.filter(is_daytime)

# Per (date, device): total duration and ping count over daytime home pings.
home_df = (
    day_df
    .where(col("home") == 1)
    .groupby("date", "cuebiq_id")
    .agg(F.sum("duration").alias("t_home"), F.count(lit(1)).alias("n_home"))
    .dropna()
)

# Per (date, device): total duration and ping count over all daytime pings.
all_df = (
    day_df
    .groupby("date", "cuebiq_id")
    .agg(F.sum("duration").alias("t_pings"), F.count(lit(1)).alias("n_pings"))
)

# Share of daytime pings (pct_n) and daytime elapsed time (pct_t) at home.
# Default (inner) join: (date, device) pairs with no daytime home ping are
# absent from the result.
duration_daytime_id = (
    home_df
    .join(all_df, on=['cuebiq_id', "date"])
    .withColumn("pct_n", col("n_home") / col("n_pings"))
    .withColumn("pct_t", col("t_home") / col("t_pings"))
)


In [9]:
print('Aggregate Nighttime')

# Nighttime rows: the complement of the daytime window, i.e. ping hour at or
# after end_hour_day, or before start_hour_day. The bounds come from the
# configuration cell so that day and night always partition the 24 hours.
is_nighttime = (col("hour") >= end_hour_day) | (col("hour") < start_hour_day)
night_df = duration_df.filter(is_nighttime)

# Per (date, device): total duration and ping count over nighttime home pings.
home_df = (
    night_df
    .where(col("home") == 1)
    .groupby("date", "cuebiq_id")
    .agg(F.sum("duration").alias("t_home"), F.count(lit(1)).alias("n_home"))
    .dropna()
)

# Per (date, device): total duration and ping count over all nighttime pings.
all_df = (
    night_df
    .groupby("date", "cuebiq_id")
    .agg(F.sum("duration").alias("t_pings"), F.count(lit(1)).alias("n_pings"))
)

# Share of nighttime pings (pct_n) and nighttime elapsed time (pct_t) at home.
# Default (inner) join: (date, device) pairs with no nighttime home ping are
# absent from the result.
duration_nighttime_id = (
    home_df
    .join(all_df, on=['cuebiq_id', "date"])
    .withColumn("pct_n", col("n_home") / col("n_pings"))
    .withColumn("pct_t", col("t_home") / col("t_pings"))
)


In [10]:
# Sanity check on one device and one date: the day and night figures should add
# up to the all-day figures.
check_id = '00021be4c0d93c136d417bbf1f3409fb892732f9e796ac3f734fa519a621c22a'
check_date = '2020-01-31'

frames_to_check = (
    ("All", duration_date_id),
    ("Day", duration_daytime_id),
    ("Night", duration_nighttime_id),
)
for label, frame in frames_to_check:
    print(label)
    frame.where((frame.cuebiq_id == check_id) & (frame.date == check_date)).show()

In [11]:
print('Save')

# From here on results_path points at the '/dbfs/...' form of the results root.
# NOTE(review): presumably the local-file mount of DBFS, needed because pandas
# writes through the ordinary filesystem API — confirm for this cluster.
results_path = '/dbfs/results/'

# Each aggregate is collected to the driver with toPandas() and written as CSV.
# The frames saved are the duration_* aggregates built in the cells above.
output_dir = os.path.join(results_path, source, 'processed', country)
frames_to_save = (
    ('duration_date_id.csv', duration_date_id),
    ('duration_daytime_id.csv', duration_daytime_id),
    ('duration_nighttime_id.csv', duration_nighttime_id),
)
for file_name, frame in frames_to_save:
    frame.toPandas().to_csv(os.path.join(output_dir, file_name), index=False)

# Figures