spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 20G --num-executors 40 --executor-cores 15 --executor-memory 25G ./covid/py/get-personal-locations-pyspark.py

In [1]:
print('Computing Time:',(1585581654521-1585577712693)/1000,'sec')

Computing Time: 3941.828 sec


In [2]:
import os
from timeit import default_timer as timer

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,array,max
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

In [3]:
try:
    spark
except NameError:
    spark=SparkSession.builder.appName("").getOrCreate()

In [4]:
source='cuebiq'
country='US'

if os.getenv('CLUSTER')=='PRINCE':
    path_to_data='/scratch/spf248/covid/data'
    directories=['*'+str(x)+'00' for x in range(10)][1:2]
    file='part-00000-0428e20d-9019-4cbf-b5ce-bc9414007fec-c000.csv.gz'
else:
    path_to_data='/user/spf248/covid/data'
    directories=['*'+str(x)+'00' for x in range(10)]
    file='*'

# Load Dataset

In [5]:
schema= StructType([
StructField("_c0", FloatType(), False),
StructField("_c1", StringType(), False),
StructField("_c2", FloatType(), False),
StructField("_c3", FloatType(), False),
StructField("_c4", FloatType(), False),
StructField("_c5", FloatType(), False),
StructField("_c6", FloatType(), False),
StructField("_c7", StringType(), False),
StructField("_c8", StringType(), False),])

In [21]:
def load_data(directory):
    
    df=spark.read.option(
    'compression', 'gzip').option(
    'header', 'false').option(
    "multiLine", "true").option(
    'escape','"').option(
    "encoding", "UTF-8").option(
    "delimiter", "\t").schema(schema).csv(
    os.path.join(
    path_to_data,
    source,
    country,
    directory,
    file))
    
    column_names=[
    'timestamp',
    'cuebiq_id',
    'device_type',
    'latitude',
    'longitude',
    'accuracy',
    'time_zone_offset',
    'classification_type',
    'transformation_type']
    df=df.toDF(*column_names)

    df=df.withColumn("time",to_timestamp(df["timestamp"]+df["time_zone_offset"]))# Use local timezone
    df=df.filter(df['classification_type']=='PERSONAL_AREA')
    df=df.withColumn('hour',hour("time")) 
    df=df.withColumn('dayofweek',dayofweek("time")-1)
    df=df.withColumn('point',struct('longitude','latitude'))
    return df.select('cuebiq_id','point','dayofweek','hour')

In [24]:
for i,directory in enumerate(directories):
    print('Directory:',i)
    start = timer()
    df=load_data(directory)
    if not i:
        hours_week=df.filter(df['dayofweek']<6).groupby('cuebiq_id','point','hour').count()
        hours_weekend=df.filter(df['dayofweek']>=6).groupby('cuebiq_id','point','hour').count()
    else:
        hours_week=hours_week.unionByName(
        df.filter(df['dayofweek']<6).groupby('cuebiq_id','point','hour').count())
        hours_weekend=hours_weekend.unionByName(
        df.filter(df['dayofweek']>=6).groupby('cuebiq_id','point','hour').count())
    print("Done in", round(timer()-start), "sec")

Directory: 0
Done in 1 sec


In [25]:
print('Save Weekday Hours')
start = timer()
hours_week=hours_week.groupby('cuebiq_id','point','hour').agg(
{'count':'sum'}).withColumnRenamed('sum(count)','count')
hours_week.write.mode("overwrite").parquet(os.path.join(path_to_data,source,country,'personal_week'))
print("Done in", round(timer()-start), "sec")

Save Weekday Hours
Done in 0 sec


In [26]:
hours_week.show()

+--------------------+--------------------+----+-----+
|           cuebiq_id|               point|hour|count|
+--------------------+--------------------+----+-----+
|e9de98d34c1033666...|[-119.065704, 35....|  21|   23|
|941a6ed8314972278...|[-90.682556, 29.5...|  23|    3|
|b9a5f598ab9566819...|[-76.62132, 39.17...|  13|    6|
|4ceb457337b65dcc5...|[-88.087524, 39.9...|   6|    2|
|9883f2f68789cdfdb...|[-85.49249, 38.23...|  16|    4|
|2b8bc177e9d3fbbb5...|[-122.56232, 45.1...|   7|    1|
|d447feb07db2d5bd3...|[-122.14287, 37.8...|  22|    6|
|63478062c03b7f6c0...|[-101.89764, 35.1...|   4|    4|
|9d95eb0e0e4c6c280...|[-93.33092, 37.13...|  13|    1|
|85f44a52d7ed5f322...|[-117.09156, 33.4...|   7|    1|
|0d513b99fcc54ba2d...|[-101.72858, 34.1...|  15|    7|
|187ab762032491b88...|[-97.707596, 30.3...|  10|   42|
|cd9fd5c259f261f51...|[-95.18838, 29.48...|  10|    2|
|04ee726a919cd0f44...|[-96.831314, 34.8...|  14|    8|
|68f5a0ffd3238796f...|[-82.23125, 37.62...|   4|    5|
|e7f9dcef4

In [10]:
print('Save Weekends Hours')
start = timer()
hours_weekend=hours_weekend.groupby('cuebiq_id','point','hour').agg(
{'count':'sum'}).withColumnRenamed('sum(count)','count')
hours_weekend.write.mode("overwrite").parquet(os.path.join(path_to_data,source,country,'personal_weekend'))
print("Done in", round(timer()-start), "sec")

Save Weekends Hours


In [12]:
# hours_weekend.show()

+--------------------+--------------------+----+-----+
|           cuebiq_id|               point|hour|count|
+--------------------+--------------------+----+-----+
|b96660af2a3b93bb1...|[-89.98306, 38.52...|  16|    1|
|9ef6a2c7e2f1aafa4...|[-84.53045, 39.06...|  13|    1|
|aed7e744134f3e36c...|[-87.66809, 42.01...|  23|    1|
|81b43fbec387a132f...|[-121.44829, 38.4...|  10|    1|
|ea79e7feab1986f03...|[-93.372635, 44.9...|  15|    2|
|7ad2f2509b9d5857a...|[-111.76617, 33.4...|  21|    1|
|7d172eded0fccee41...|[-80.32448, 26.00...|  10|    1|
|992c2afd83d276e05...|[-117.87656, 33.8...|  11|    1|
|edbde6dc915eb5bf1...|[-104.74783, 38.8...|   3|    1|
|3f79aed557e438fb9...|[-120.38679, 35.6...|  14|    1|
|16ba421959630a1bb...|[-74.04457, 43.11...|  15|    1|
|3f79aed557e438fb9...|[-120.38679, 35.6...|  21|    2|
|3f7d09b7048e79fa7...|[-117.08142, 33.4...|  12|    1|
|edbde6dc915eb5bf1...|[-104.74783, 38.8...|   6|    1|
|31cd3f94b267a0afa...|[-118.8976, 34.18...|  15|    1|
|0beb5dab7