spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.appMasterEnv.SPARK_HOME=/share/apps/spark/spark-2.4.0-bin-hadoop2.6 --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 40G --num-executors 40 --executor-cores 15 --executor-memory 40G ./covid/py/get-summary-statistics.py

In [2]:
import os
from datetime import datetime

import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.functions import col, hour, date_format, to_timestamp, struct
from pyspark.sql.types import LongType, StringType, StructType, StructField, FloatType

# Params

In [4]:
def dbutils_path_exists(path):
  """Report whether `path` can be listed with ``dbutils.fs.ls``.

  Args:
    path: Location passed unchanged to ``dbutils.fs.ls``.

  Returns:
    True when the listing succeeds; False (after printing a notice) when the
    listing raises.
  """
  try:
    dbutils.fs.ls(path)
  except Exception:
    # Best-effort probe: any listing failure counts as "not there". Catching
    # Exception rather than using a bare `except:` lets KeyboardInterrupt and
    # SystemExit propagate.
    print("{} not found".format(path))
    return False
  return True

In [5]:
source='cuebiq'
country='ID'
admin_id='ADM4_PCODE'
start_date='2020-01-01'
end_date=datetime.today().strftime('%Y-%m-%d')

# Root of the raw input; daily folders live at <data_root>/<country>/<YYYYMMDD00>/.
# A single absolute root is used both for the existence probe and for the load
# globs so the two can never point at different locations.
data_root = "/mnt/wbgggscecovid19dev-datapartnership/data/cuebiq/covid-19"

# Every calendar day from start_date to end_date, formatted as its folder name.
candidate_days = [x.strftime('%Y%m%d00') for x in pd.date_range(start_date, end_date)]

# Keep only the days whose folder can actually be listed.
days_to_consider = [
  day for day in candidate_days
  if dbutils_path_exists("{}/{}/{}/".format(data_root, country, day))
]
paths_to_data = ["{}/{}/{}/*.csv.gz".format(data_root, country, day) for day in days_to_consider]
admin_path = "/admin/"
results_path = "/results/"

# Schema handed to the CSV reader in load_data: (column name, Spark type) pairs
# in file order. Every field is declared with nullable=False.
schema = StructType([
  StructField(column_name, spark_type, False)
  for column_name, spark_type in (
    ("timestamp", LongType()),
    ("cuebiq_id", StringType()),
    ("device_type", FloatType()),
    ("latitude", FloatType()),
    ("longitude", FloatType()),
    ("accuracy", FloatType()),
    ("time_zone_offset", LongType()),
    ("classification_type", StringType()),
    ("transformation_type", StringType()),
  )
])

# Load Data By Chunk

In [7]:
def load_data(paths_to_data):
  """Read the raw tab-delimited files and derive time and location columns.

  Args:
    paths_to_data: Non-empty collection of paths/globs passed to the Spark
      CSV reader.

  Returns:
    A DataFrame with the columns cuebiq_id, device_type, time, date, hour,
    point and classification_type.

  Raises:
    ValueError: If `paths_to_data` is empty. ValueError is a subclass of
      Exception, so handlers written for a generic Exception still match.
  """
  if not paths_to_data:
    raise ValueError("No paths given as input!")

  reader = spark.read.format("csv").option('header', 'false').option("delimiter", "\t").schema(schema)
  raw = reader.load(paths_to_data)

  # The offset is added to the timestamp before conversion.
  # NOTE(review): presumably both are in seconds and the sum is local
  # wall-clock time -- confirm against the data provider's spec.
  shifted_time = to_timestamp(raw["timestamp"] + raw["time_zone_offset"])

  enriched = (
    raw.withColumn("time", shifted_time)
    .withColumn("date", date_format(col("time"), "yyyy-MM-dd"))
    .withColumn('hour', hour("time"))
    # Coordinates are packed as (longitude, latitude) into one struct column.
    .withColumn('point', struct('longitude', 'latitude'))
  )
  return enriched.select('cuebiq_id','device_type','time','date','hour','point','classification_type')

# Summary Statistics

In [9]:

df=load_data(paths_to_data)
# Mark the input for caching. Caching is lazy: the data is only materialised by
# the first action that runs on a DataFrame derived from df.
df.cache()

# One row per device with a device_type taken from one of its records.
# NOTE(review): F.first without an explicit ordering picks an arbitrary record;
# fine only if device_type is constant per cuebiq_id -- confirm.
device_id=df.groupby('cuebiq_id').agg(F.first("device_type").alias("device_type"))

# Number of records per device, date and hour.
n_pings_id_date_hour=df.groupby('cuebiq_id','date','hour').count().withColumnRenamed('count','n_pings')

# Same count restricted to PERSONAL_AREA records, additionally keyed by point.
n_pings_id_personal_date_hour=df.filter(df['classification_type']=='PERSONAL_AREA').groupby(
'cuebiq_id','point','date','hour').count().withColumnRenamed('count','n_pings')

# df is deliberately NOT unpersisted here. Everything above is a lazy
# transformation, so no action has run yet; unpersisting at this point would
# drop the cache before it is ever filled and each later action would re-read
# the raw files from scratch.

# Aggregate Chunks

In [11]:
def _total_pings(frame, *keys):
  # Sum the n_pings column within each combination of `keys`; the result
  # column is again called n_pings.
  return frame.groupby(*keys).agg(F.sum('n_pings').alias('n_pings'))


# Re-total the per-date/hour counts on their own grouping keys.
n_pings_id_date_hour = _total_pings(n_pings_id_date_hour, 'cuebiq_id', 'date', 'hour')
n_pings_id_personal_date_hour = _total_pings(
  n_pings_id_personal_date_hour, 'cuebiq_id', 'point', 'date', 'hour')

# Collapse dates into a day-of-week bucket and total again.
# NOTE(review): the "u" pattern letter is interpreted by the Spark version in
# use -- confirm its meaning before changing Spark versions.
n_pings_id_day_hour = _total_pings(
  n_pings_id_date_hour.withColumn('dayofweek', date_format("date", "u")),
  'cuebiq_id', 'dayofweek', 'hour')
n_pings_id_personal_day_hour = _total_pings(
  n_pings_id_personal_date_hour.withColumn('dayofweek', date_format("date", "u")),
  'cuebiq_id', 'point', 'dayofweek', 'hour')

# Save

In [13]:
# Write every result table as parquet to
# <results_path>/<source>/processed/<country>/<table name>, overwriting any
# existing output. Tables are written in the order listed.
output_dir = os.path.join(results_path, source, 'processed', country)
for table_name, table in (
  ('device_id', device_id),
  ('n_pings_id_date_hour', n_pings_id_date_hour),
  ('n_pings_id_day_hour', n_pings_id_day_hour),
  ('n_pings_id_personal_date_hour', n_pings_id_personal_date_hour),
  ('n_pings_id_personal_day_hour', n_pings_id_personal_day_hour),
):
  table.write.mode("overwrite").parquet(os.path.join(output_dir, table_name))