spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.appMasterEnv.SPARK_HOME=/share/apps/spark/spark-2.4.0-bin-hadoop2.6 --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 40G --num-executors 40 --executor-cores 15 --executor-memory 40G ./covid/py/get-summary-statistics.py

In [None]:
import os
from datetime import datetime
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,hour,dayofmonth,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

In [None]:
# Reuse `spark` when the name is already bound; otherwise build a session
# (with an empty application name) and bind it.
try:
    spark
except NameError:
    session_builder=SparkSession.builder.appName("")
    spark=session_builder.getOrCreate()

# Params

In [None]:
# Data provider and country whose files are processed.
source='cuebiq'
country='MX'
# Number of chunks the list of input files is split into.
n_chunks=1
# Date range scanned for daily input directories (up to today).
start_date='2020-01-01'
end_date=datetime.today().strftime('%Y-%m-%d')
# The root data directory depends on the cluster the job runs on.
# NOTE: `directories` is only built later, when the data files are listed, so
# it must not be sliced here (doing so raised NameError on PRINCE). The PRINCE
# run is still restricted to a single file by `paths=paths[:1]` further down.
if os.getenv('CLUSTER')=='PRINCE':
    path_to_data='/scratch/spf248/covid/data'
else:
    path_to_data='/user/spf248/covid/data'

In [None]:
# List data files
# One directory per day between start_date and end_date, named YYYYMMDD00.
paths=[]
fs=spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
directories=[x.strftime('%Y-%m-%d').replace('-','')+'00' for x in pd.date_range(start_date,end_date)]
for directory in directories:
    path_to_directory=os.path.join(path_to_data,source,'s3',country,directory)
    # Build the Hadoop Path once and reuse it for the existence check and the listing.
    hadoop_path=spark._jvm.org.apache.hadoop.fs.Path(path_to_directory)
    if not fs.exists(hadoop_path):
        # Days without a directory are skipped silently.
        continue
    list_status=fs.listStatus(hadoop_path)
    # Strip the filesystem scheme/authority so only the plain path remains.
    paths.extend([status.getPath().toString().replace('hdfs://dumbo','').replace('file:','') for status in list_status])
    print(directory)
# Keep only the gzipped CSV files, in sorted order. Filtering and sorting once
# after the loop yields the same list as redoing it after every directory,
# without re-sorting the accumulated list on each iteration.
paths=sorted(path for path in paths if '.csv.gz' in path)
    
# Split files list into chunks to avoid memory issues
# On PRINCE only the first file, and then only the first chunk, is kept.
on_prince=os.getenv('CLUSTER')=='PRINCE'
if on_prince:
    paths=paths[:1]
paths_chunks=np.array_split(paths,n_chunks)
if on_prince:
    paths_chunks=paths_chunks[:1]
n_files=sum(len(paths_chunk) for paths_chunk in paths_chunks)
print('# Files:', n_files)
print('# Chunks:', len(paths_chunks))
    
# Schema of the raw, header-less input files. Columns are positional; the
# names in the trailing comments are the ones load_data assigns to them.
#
# `_c0` is read as a double rather than a 32-bit float: load_data adds the
# time-zone offset to it and converts the sum with to_timestamp, i.e. it is
# treated as epoch seconds (assumed from that usage). A 32-bit float has a
# 24-bit significand, so values around 1.5e9 would be rounded to multiples of
# 128 seconds, which can shift pings across hour/date boundaries.
schema= StructType([
StructField("_c0", DoubleType(), False),  # timestamp
StructField("_c1", StringType(), False),  # cuebiq_id
StructField("_c2", FloatType(), False),   # device_type
StructField("_c3", FloatType(), False),   # latitude
StructField("_c4", FloatType(), False),   # longitude
StructField("_c5", FloatType(), False),   # accuracy
StructField("_c6", FloatType(), False),   # time_zone_offset
StructField("_c7", StringType(), False),  # classification_type
StructField("_c8", StringType(), False),])  # transformation_type

# Load Data By Chunk

In [None]:
def load_data(paths_chunk):
    """Read one chunk of input files and derive the time and location columns.

    The files are read as gzip-compressed, tab-delimited, header-less CSV
    using the module-level ``schema`` and the module-level ``spark`` session.

    Parameters
    ----------
    paths_chunk : iterable of str
        Paths of the files to read; converted to a list before being passed
        to the reader.

    Returns
    -------
    DataFrame
        Columns ``cuebiq_id``, ``device_type``, ``time``, ``date``, ``hour``,
        ``point`` and ``classification_type``, where ``time`` is
        ``timestamp + time_zone_offset`` converted with ``to_timestamp``,
        ``date`` is ``time`` formatted as ``yyyy-MM-dd``, ``hour`` is the hour
        of ``time`` and ``point`` is a ``(longitude, latitude)`` struct.
    """
    reader_options=(
        ('compression', 'gzip'),
        ('header', 'false'),
        ("multiLine", "true"),
        ('escape','"'),
        ("encoding", "UTF-8"),
        ("delimiter", "\t"),
    )
    reader=spark.read
    for key,value in reader_options:
        reader=reader.option(key,value)
    raw=reader.schema(schema).csv(list(paths_chunk))

    # Positional rename of the schema's _c0.._c8 columns.
    pings=raw.toDF(
        'timestamp',
        'cuebiq_id',
        'device_type',
        'latitude',
        'longitude',
        'accuracy',
        'time_zone_offset',
        'classification_type',
        'transformation_type')

    # Shift the timestamp by the time-zone offset before converting it
    # (presumably to obtain local time -- confirm against the data spec).
    shifted_time=to_timestamp(pings["timestamp"]+pings["time_zone_offset"])
    pings=pings.withColumn("time",shifted_time)
    pings=pings.withColumn("date", date_format(col("time"), "yyyy-MM-dd"))
    pings=pings.withColumn('hour',hour("time"))
    pings=pings.withColumn('point', struct('longitude','latitude'))

    return pings.select('cuebiq_id','device_type','time','date','hour','point','classification_type')

# Summary Statistics by Chunk

In [None]:
# For every chunk, build three aggregates and append them (unionByName) to the
# running results:
#   device_id                     - one device_type per cuebiq_id
#   n_pings_id_date_hour          - ping count per cuebiq_id/date/hour
#   n_pings_id_personal_date_hour - ping count per cuebiq_id/point/date/hour,
#                                   restricted to PERSONAL_AREA pings
for i,paths_chunk in enumerate(paths_chunks):

    df=load_data(paths_chunk)
    # NOTE(review): cache() is lazy and nothing in this loop appears to
    # trigger a Spark action before unpersist() below -- confirm the cache is
    # actually being used.
    df.cache()

    chunk_device_id=df.groupby('cuebiq_id').agg(
        {'device_type':'first'}).withColumnRenamed('first(device_type)','device_type')

    chunk_n_pings=df.groupby(
        'cuebiq_id','date','hour').count().withColumnRenamed('count','n_pings')

    personal_pings=df.filter(df['classification_type']=='PERSONAL_AREA')
    chunk_n_pings_personal=personal_pings.groupby(
        'cuebiq_id','point','date','hour').count().withColumnRenamed('count','n_pings')

    if i==0:
        # First chunk: start the running results.
        device_id=chunk_device_id
        n_pings_id_date_hour=chunk_n_pings
        n_pings_id_personal_date_hour=chunk_n_pings_personal
    else:
        # Later chunks: append; duplicates across chunks are merged afterwards.
        device_id=device_id.unionByName(chunk_device_id)
        n_pings_id_date_hour=n_pings_id_date_hour.unionByName(chunk_n_pings)
        n_pings_id_personal_date_hour=n_pings_id_personal_date_hour.unionByName(chunk_n_pings_personal)

    df.unpersist()

# Aggregate Chunks

In [None]:
def _sum_pings(frame,*keys):
    """Sum n_pings within each group of `keys`, keeping the column name n_pings."""
    return frame.groupby(*keys).agg({'n_pings':'sum'}).withColumnRenamed('sum(n_pings)','n_pings')


def _with_dayofweek(frame):
    """Add a `dayofweek` column: the `date` column formatted with pattern "u"."""
    # With the Spark 2.4 build named in the submit command, "u" should be the
    # SimpleDateFormat day-number-of-week (1 = Monday ... 7 = Sunday) -- confirm
    # before running on another Spark version.
    return frame.withColumn('dayofweek',date_format("date","u"))


# Merge the per-chunk partial results: one row per key across all chunks.
device_id=device_id.groupby('cuebiq_id').agg(
    {'device_type':'first'}).withColumnRenamed('first(device_type)','device_type')
n_pings_id_date_hour=_sum_pings(n_pings_id_date_hour,'cuebiq_id','date','hour')
n_pings_id_personal_date_hour=_sum_pings(n_pings_id_personal_date_hour,'cuebiq_id','point','date','hour')
# Collapse dates into day-of-week profiles.
n_pings_id_day_hour=_sum_pings(
    _with_dayofweek(n_pings_id_date_hour),'cuebiq_id','dayofweek','hour')
n_pings_id_personal_day_hour=_sum_pings(
    _with_dayofweek(n_pings_id_personal_date_hour),'cuebiq_id','point','dayofweek','hour')

# Save

In [None]:
# Write every result as parquet under <path_to_data>/<source>/processed/<country>/<name>,
# replacing any existing output.
output_root=os.path.join(path_to_data,source,'processed',country)
outputs=[
    ('device_id',device_id),
    ('n_pings_id_date_hour',n_pings_id_date_hour),
    ('n_pings_id_day_hour',n_pings_id_day_hour),
    ('n_pings_id_personal_date_hour',n_pings_id_personal_date_hour),
    ('n_pings_id_personal_day_hour',n_pings_id_personal_day_hour),
]
for output_name,output_frame in outputs:
    output_frame.write.mode("overwrite").parquet(os.path.join(output_root,output_name))