spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --conf spark.yarn.submit.waitAppCompletion=false \
  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
  --conf spark.speculation=false \
  --conf spark.executorEnv.LANG=en_US.UTF-8 \
  --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 \
  --driver-cores 20 \
  --driver-memory 20G \
  --num-executors 40 \
  --executor-cores 15 \
  --executor-memory 25G \
  ./covid/py/get-users-statistics-pyspark.py

In [9]:
# Difference of two integer literals divided by 1000; evaluates to 2533.015.
# NOTE(review): presumably two epoch-millisecond timestamps (e.g. job start and
# finish) converted to elapsed seconds — confirm where the values came from.
(1585594165677-1585591632662)/1000

2533.015

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

In [2]:
try:
    # Probe for an already-bound `spark` name (e.g. one provided by the
    # environment that launched this notebook); a bare reference raises
    # NameError when no such name exists.
    spark
except NameError:
    # No session in scope: obtain one through the builder. The application
    # name is deliberately left as the empty string, as in the original.
    spark = (
        SparkSession.builder
        .appName("")
        .getOrCreate()
    )

In [3]:
# Dataset identifiers; they become path components when reading and writing.
source = 'cuebiq'
country = 'US'

# Pick the data root and the input selection from the CLUSTER environment
# variable: on 'PRINCE' a single named part file in one directory is used,
# anywhere else the glob patterns match every directory and every file.
path_to_data, directory, file = (
    (
        '/scratch/spf248/covid/data',
        '2020010100',
        'part-00000-0428e20d-9019-4cbf-b5ce-bc9414007fec-c000.csv.gz',
    )
    if os.getenv('CLUSTER') == 'PRINCE'
    else ('/user/spf248/covid/data', '*', '*')
)

# Load Dataset

In [4]:
print('Load Dataset')

# Explicit schema for the headerless, tab-delimited input files (nine
# positional columns, renamed to meaningful names in the preprocessing cell).
#
# _c0 is read as DoubleType rather than FloatType: it is later renamed to
# 'timestamp', added to the time-zone offset and passed to to_timestamp(),
# i.e. it is used as seconds since the epoch. A 4-byte float has a 24-bit
# significand, so values around 1.6e9 are rounded to multiples of 128, which
# would distort every ping time and can move pings across a date boundary.
schema = StructType([
    StructField("_c0", DoubleType(), False),
    StructField("_c1", StringType(), False),
    StructField("_c2", FloatType(), False),
    StructField("_c3", FloatType(), False),
    StructField("_c4", FloatType(), False),
    StructField("_c5", FloatType(), False),
    StructField("_c6", FloatType(), False),
    StructField("_c7", StringType(), False),
    StructField("_c8", StringType(), False),
])

# Read every file matched by <path_to_data>/<source>/<country>/<directory>/<file>
# as gzip-compressed, UTF-8, tab-separated text without a header row.
df = (
    spark.read
    .option('compression', 'gzip')
    .option('header', 'false')
    .option("multiLine", "true")
    .option('escape', '"')
    .option("encoding", "UTF-8")
    .option("delimiter", "\t")
    .schema(schema)
    .csv(os.path.join(path_to_data, source, country, directory, file))
)

Load Dataset


In [5]:
print('Preprocess Dataset')

# Meaningful names for the nine positional columns, in file order.
column_names = [
    'timestamp', 'cuebiq_id', 'device_type',
    'latitude', 'longitude', 'accuracy',
    'time_zone_offset', 'classification_type', 'transformation_type',
]
df = df.toDF(*column_names)

# 'time' is the timestamp shifted by the row's time-zone offset and converted
# with to_timestamp(); 'date' is that value rendered as a yyyy-MM-dd string.
# NOTE(review): date_format() renders in the Spark session time zone, so this
# shift-then-format approach presumably expects the session zone to be UTC —
# confirm the cluster setting.
df = (
    df
    .withColumn("time", to_timestamp(col("timestamp") + col("time_zone_offset")))
    .withColumn("date", date_format(col("time"), "yyyy-MM-dd"))
)

Preprocess Dataset


In [7]:
print('Compute Users Statistics')

# Two-stage aggregation producing one row per cuebiq_id:
#   1. per (user, date): number of pings and a device_type value;
#   2. per user: number of distinct active dates (n_days), a device_type
#      value, and the total number of pings (n_pings).
#
# Aggregates are named explicitly with alias() instead of renaming Spark's
# auto-generated names ("sum(count(time))", ...) afterwards:
# withColumnRenamed() silently does nothing when the generated name differs,
# and the dict form of agg() does not fix the order of the output columns.
users = (
    df.groupBy('cuebiq_id', 'date')
    .agg(
        F.count('time').alias('n_pings'),
        F.first('device_type').alias('device_type'),
    )
    .groupBy('cuebiq_id')
    .agg(
        F.count('date').alias('n_days'),
        F.first('device_type').alias('device_type'),
        F.sum('n_pings').alias('n_pings'),
    )
)

Compute Users Statistics


In [None]:
print('Save')

# Persist the per-user table as Parquet under
# <path_to_data>/<source>/<country>/users, replacing any existing output there.
users.write.parquet(
    os.path.join(path_to_data, source, country, 'users'),
    mode="overwrite",
)

In [6]:
# '2020-03-31'
# Pings: 236913719038
# Pings within the time window: 235448985572

In [8]:
# users.show()

+--------------------+------+-----------+-------+
|           cuebiq_id|n_days|device_type|n_pings|
+--------------------+------+-----------+-------+
|322e84c668f308e32...|     2|          0|     75|
|5476c2d50fa3396a6...|     3|          0|     17|
|a28a817158e80e35d...|     2|          0|     36|
|39d3c113a30f1fd7a...|     2|          1|    179|
|ba5844efcdf7b415a...|     2|          1|    135|
|2b3f9d1040ff00bef...|     2|          1|    381|
|80212d453ce93ce85...|     2|          1|    662|
|b2e54fa9d2420ed40...|     2|          1|     80|
|c05b4cf1328f36cce...|     2|          0|    234|
|a63778ecb57bbe24c...|     2|          1|     56|
|8eb9cdc18ed67ce85...|     2|          0|     29|
|21f619f1cc118d432...|     2|          1|    185|
|d8cd97ae992d235a1...|     4|          0|      6|
|14c967be056940b6e...|     2|          1|    293|
|20132c25e188c1543...|     2|          1|    159|
|3415decfb70d9cb42...|     2|          0|     37|
|0e59242866089fc17...|     2|          0|     39|


In [7]:
# df.groupby('time_zone_offset').count().show()

+----------------+------+
|time_zone_offset| count|
+----------------+------+
|          3600.0|     8|
|        -14400.0|     5|
|        -32400.0|  1629|
|        -25200.0| 77683|
|        -36000.0|  3198|
|        -21600.0|488452|
|        -18000.0|605074|
|        -28800.0|145252|
|             0.0|   440|
+----------------+------+

