spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 20G --num-executors 40 --executor-cores 15 --executor-memory 25G ./covid/py/get-personal-locations-pyspark.py

In [1]:
print('Computing Time:',(1585581654521-1585577712693)/1000,'sec')

Computing Time: 3941.828 sec


In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,array,max
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

In [3]:
try:
    spark
except NameError:
    spark=SparkSession.builder.appName("").getOrCreate()

In [4]:
source='cuebiq'
country='US'

if os.getenv('CLUSTER')=='PRINCE':
    path_to_data='/scratch/spf248/covid/data'
    directories=['*'+str(x)+'00' for x in range(10)][1:2]
    file='part-00000-0428e20d-9019-4cbf-b5ce-bc9414007fec-c000.csv.gz'
else:
    path_to_data='/user/spf248/covid/data'
    directories=['*'+str(x)+'00' for x in range(10)]
    file='*'

# Load Dataset

In [5]:
schema= StructType([
StructField("_c0", FloatType(), False),
StructField("_c1", StringType(), False),
StructField("_c2", FloatType(), False),
StructField("_c3", FloatType(), False),
StructField("_c4", FloatType(), False),
StructField("_c5", FloatType(), False),
StructField("_c6", FloatType(), False),
StructField("_c7", StringType(), False),
StructField("_c8", StringType(), False),])

In [16]:
def get_summary(directory):
    
    df=spark.read.option(
    'compression', 'gzip').option(
    'header', 'false').option(
    "multiLine", "true").option(
    'escape','"').option(
    "encoding", "UTF-8").option(
    "delimiter", "\t").schema(schema).csv(
    os.path.join(
    path_to_data,
    source,
    country,
    directory,
    file))
    
    df.cache()

    column_names=[
    'timestamp',
    'cuebiq_id',
    'device_type',
    'latitude',
    'longitude',
    'accuracy',
    'time_zone_offset',
    'classification_type',
    'transformation_type']
    df=df.toDF(*column_names)

    df=df.withColumn("time",to_timestamp(df["timestamp"]+df["time_zone_offset"]))
    df=df.filter(df['classification_type']=='PERSONAL_AREA')
    df=df.withColumn('hour',hour("time"))
    df=df.withColumn('dayofweek',dayofweek("time"))
    df=df.withColumn('point',struct('longitude','latitude'))

    return df.groupby('cuebiq_id','point','dayofweek','hour').count()

In [19]:
for i,directory in enumerate(directories):
    if not i:
        users=get_summary(directory)
        users.cache()
    else:
        users=users.unionByName(get_summary(directory))

In [None]:
print('Save Personal Locations')
users=users.groupby('cuebiq_id','point','dayofweek','hour').agg(
{'count':'sum'}).withColumnRenamed('sum(count)','count')
users.write.mode("overwrite").parquet(os.path.join(path_to_data,source,country,'personal'))

In [22]:
users.show()

+--------------------+--------------------+---------+----+-----+
|           cuebiq_id|               point|dayofweek|hour|count|
+--------------------+--------------------+---------+----+-----+
|21a253b012efda42d...|[-118.25058, 34.0...|        3|  17|    1|
|4c035b57f75f041ac...|[-89.65056, 41.81...|        4|   0|    5|
|4ceb457337b65dcc5...|[-88.087524, 39.9...|        4|   3|    2|
|9883f2f68789cdfdb...|[-85.49249, 38.23...|        3|  19|    5|
|45ed687a83fe33bc6...|[-75.67466, 41.42...|        3|  16|    6|
|2214401b40bb7dca0...|[-70.96851, 42.54...|        4|  12|    1|
|bd705702d90e59802...|[-82.642876, 27.8...|        3|   8|    1|
|640419b48a5c3bb74...|[-105.24425, 35.6...|        4|   1|    1|
|a98e8849986e03c23...|[-80.212105, 26.0...|        3|  17|    3|
|480279bd3fa0f8b8f...|[-85.00785, 41.20...|        4|  13|    2|
|480279bd3fa0f8b8f...|[-85.00785, 41.20...|        4|  10|    1|
|08208a616a43c6c63...|[-91.59524, 41.70...|        4|   4|    1|
|e9839857f51ab6cdf...|[-7