spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 20G --num-executors 40 --executor-cores 15 --executor-memory 25G ./covid/py/get-daily-counts-pyspark.py

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

In [2]:
# Reuse a `spark` session that already exists in this namespace (for example
# one injected by an interactive shell); build one only when the name is
# not defined yet.
try:
    spark
except NameError:
    spark = (
        SparkSession.builder
        .appName("")
        .getOrCreate()
    )

In [3]:
# Data provider and country; both are used as path components under
# `path_to_data`.
source = 'cuebiq'
country = 'US'

# Directory glob patterns '*000', '*100', ..., '*900'.
directories = ['*{}00'.format(digit) for digit in range(10)]

if os.environ.get('CLUSTER') == 'PRINCE':
    # On PRINCE only the '*100' pattern and one specific part file are read.
    path_to_data = '/scratch/spf248/covid/data'
    directories = directories[1:2]
    file = 'part-00000-0428e20d-9019-4cbf-b5ce-bc9414007fec-c000.csv.gz'
else:
    # Everywhere else: all ten patterns and every file inside them.
    path_to_data = '/user/spf248/covid/data'
    file = '*'

In [4]:
# Column layout of the headerless, tab-separated input files. The positional
# names _c0.._c8 are renamed inside load_data(); the trailing comments give
# the name each column receives there.
#
# _c0 is read as DoubleType rather than FloatType: it is later passed to
# to_timestamp(), i.e. treated as epoch seconds, and a 32-bit float only has
# a 24-bit significand, so values around 1.5e9 would be rounded to multiples
# of 128 seconds. That can move pings close to midnight onto the wrong date.
# Latitude/longitude are intentionally left as FloatType so the `point`
# grouping keys keep their current values.
schema = StructType([
    StructField("_c0", DoubleType(), False),  # timestamp
    StructField("_c1", StringType(), False),  # cuebiq_id
    StructField("_c2", FloatType(), False),   # device_type
    StructField("_c3", FloatType(), False),   # latitude
    StructField("_c4", FloatType(), False),   # longitude
    StructField("_c5", FloatType(), False),   # accuracy
    StructField("_c6", FloatType(), False),   # time_zone_offset
    StructField("_c7", StringType(), False),  # classification_type
    StructField("_c8", StringType(), False),  # transformation_type
])

# Load Dataset

In [5]:
def load_data(directory):
    """Read the ping files matching one directory pattern and reduce them
    to the columns needed for counting.

    The files under ``path_to_data/source/country/<directory>/<file>`` are
    read as gzip-compressed, headerless, tab-delimited CSV using the
    module-level ``schema``.

    Parameters
    ----------
    directory : str
        Directory name or glob pattern below the country folder.

    Returns
    -------
    DataFrame
        Columns ``cuebiq_id``, ``classification_type``, ``date``
        (``yyyy-MM-dd`` string) and ``point`` (struct of longitude and
        latitude).
    """
    input_glob = os.path.join(path_to_data, source, country, directory, file)

    reader = (
        spark.read
        .option('compression', 'gzip')
        .option('header', 'false')
        .option("multiLine", "true")
        .option('escape', '"')
        .option("encoding", "UTF-8")
        .option("delimiter", "\t")
        .schema(schema)
    )
    raw = reader.csv(input_glob)

    # Mark the raw rows for caching; callers derive more than one aggregate
    # from the frame returned below.
    raw.cache()

    pings = raw.toDF(
        'timestamp',
        'cuebiq_id',
        'device_type',
        'latitude',
        'longitude',
        'accuracy',
        'time_zone_offset',
        'classification_type',
        'transformation_type',
    )

    # Shift the timestamp by the row's own offset before extracting the date.
    # NOTE(review): date_format renders in the Spark session time zone, so
    # this presumably expects a UTC session -- confirm.
    shifted = pings["timestamp"] + pings["time_zone_offset"]
    pings = (
        pings
        .withColumn("time", to_timestamp(shifted))
        .withColumn("date", date_format(col("time"), "yyyy-MM-dd"))
        .withColumn('point', struct('longitude', 'latitude'))
    )

    return pings.drop(
        'timestamp',
        'device_type',
        'accuracy',
        'time_zone_offset',
        'transformation_type',
        'time',
        'latitude',
        'longitude',
    )

In [6]:
# Compute per-directory partial counts and stack them with unionByName.
# The stacked frames can still hold several rows per key; they are summed
# in the cells that follow.
for i, directory in enumerate(directories):

    df = load_data(directory)

    # Pings per device and day within this directory.
    daily_part = df.groupby('cuebiq_id', 'date').count()

    # Pings per device, location and day, restricted to PERSONAL_AREA rows.
    is_personal = df['classification_type'] == 'PERSONAL_AREA'
    personal_part = df.filter(is_personal).groupby(
        'cuebiq_id', 'point', 'date').count()

    if i == 0:
        # The first directory seeds both accumulators.
        counts_daily = daily_part
        counts_daily.cache()
        counts_personal = personal_part
        counts_personal.cache()
        continue

    counts_daily = counts_daily.unionByName(daily_part)
    counts_personal = counts_personal.unionByName(personal_part)

In [7]:
print('Compute Pings per Day')

# counts_daily is a union of per-directory partial counts, so the same
# (cuebiq_id, date) key may appear more than once; sum the partial counts.
daily_totals = counts_daily.groupby('cuebiq_id', 'date').agg({'count': 'sum'})
counts_daily = daily_totals.withColumnRenamed('sum(count)', 'n_pings')

# Replace any previous result stored under <path_to_data>/<source>/<country>.
counts_daily.write.mode("overwrite").parquet(
    os.path.join(path_to_data, source, country, 'counts_daily'))

Compute Pings per Day


In [9]:
# counts_daily.show()

+--------------------+----------+-------+
|           cuebiq_id|      date|n_pings|
+--------------------+----------+-------+
|40c5cd542fd6e8e2a...|2019-12-31|    247|
|43f7982855fa3f980...|2019-12-31|    139|
|3f56e7349f32b5aca...|2019-12-31|     16|
|a764905f3693b62bb...|2020-01-01|     88|
|dd8c17e237cc95a83...|2019-12-31|     85|
|451265a51d3a6fdeb...|2019-12-31|    107|
|647ce70cd7dc52820...|2020-01-01|     25|
|f2a0e7e72ca6d9355...|2019-12-31|     10|
|e01b1035cbc48408d...|2019-12-31|    216|
|83820e28dd1c20bd7...|2019-12-30|      7|
|6073f2c828b93f8b7...|2020-01-01|     21|
|7a3f48a5e285c94aa...|2020-01-01|      6|
|1c53ada43dd4efc3d...|2020-01-01|    120|
|2eeea3efb7e743314...|2020-01-01|     42|
|3a4f6447d98999bcd...|2020-01-01|     23|
|322e84c668f308e32...|2020-01-01|     36|
|d0353d8a36ea0145a...|2020-01-01|     54|
|45ba4f9d53917c4aa...|2019-12-31|    304|
|fce8fc42d3eb452cc...|2020-01-01|     51|
|ba899e2158a42b8bd...|2020-01-01|     41|
+--------------------+----------+-------+

In [8]:
print('Compute Pings per Day at Personal Areas')

# counts_personal is a union of per-directory partial counts, so the same
# (cuebiq_id, point, date) key may appear more than once; sum the partial
# counts.
personal_totals = counts_personal.groupby('cuebiq_id', 'point', 'date').agg(
    {'count': 'sum'})
counts_personal = personal_totals.withColumnRenamed('sum(count)', 'n_pings')

# Replace any previous result stored under <path_to_data>/<source>/<country>.
counts_personal.write.mode("overwrite").parquet(
    os.path.join(path_to_data, source, country, 'counts_personal'))

Compute Pings per Day at Personal Areas


In [10]:
# counts_personal.show()

+--------------------+--------------------+----------+-------+
|           cuebiq_id|               point|      date|n_pings|
+--------------------+--------------------+----------+-------+
|5fc29c86f16874927...|[-102.17165, 31.9...|2020-01-01|     18|
|3d9027dc8c309ecaf...|[-80.26523, 25.93...|2019-12-31|     22|
|0c3ef66ce5d83906e...|[-77.74162, 36.18...|2019-12-31|     62|
|e01b1035cbc48408d...|[-117.39334, 47.6...|2019-12-31|     67|
|b5be0aa82ddd9c575...|[-86.17238, 36.98...|2020-01-01|     11|
|16ba421959630a1bb...|[-74.04457, 43.11...|2020-01-01|      2|
|b944a212c59fdf4d8...|[-98.39095, 29.51...|2019-12-31|     37|
|7462e464f98265547...|[-76.420265, 36.8...|2019-12-30|      3|
|a647b7b9323c1afea...|[-104.95956, 39.9...|2019-12-31|      8|
|f53f1ba18c490267e...|[-84.23694, 39.97...|2019-12-31|     17|
|1d1dd9523fa53890a...|[-95.72366, 31.66...|2019-12-31|     72|
|26c6731377df4f60f...|[-84.42471, 42.25...|2020-01-01|      3|
|5f6bceb419f25fd2c...|[-84.38356, 33.74...|2019-12-31| 