spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 55G --num-executors 30 --executor-cores 15 --executor-memory 30G

In [8]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,array,max
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

In [9]:
# Reuse a `spark` session if the runtime already defines one; otherwise
# build (or fetch) a session ourselves.
try:
    spark
except NameError:
    # NOTE(review): the application name is an empty string, so the job
    # carries no descriptive name of its own.
    spark = (
        SparkSession
        .builder
        .appName("")
        .getOrCreate()
    )

In [3]:
# Data provider and country; both are used as path components below the
# data root when reading and writing.
source, country = 'cuebiq', 'US'

# The CLUSTER environment variable selects where the data lives and how
# much of it is read.
if os.getenv('CLUSTER') != 'PRINCE':
    # Default: '*' for both the directory and the file component.
    path_to_data = '/user/spf248/covid/data'
    directory, file = '*', '*'
else:
    # PRINCE: a single directory and a single part file.
    path_to_data = '/scratch/spf248/covid/data'
    directory = '2020010100'
    file = 'part-00000-0428e20d-9019-4cbf-b5ce-bc9414007fec-c000.csv.gz'

# Load Dataset

In [4]:
print('Load Dataset')

# Explicit schema for the nine header-less columns. They keep positional
# names (_c0 .. _c8) here, and every field is declared with nullable=False.
schema = (
    StructType()
    .add("_c0", DoubleType(), False)
    .add("_c1", StringType(), False)
    .add("_c2", IntegerType(), False)
    .add("_c3", FloatType(), False)
    .add("_c4", FloatType(), False)
    .add("_c5", DoubleType(), False)
    .add("_c6", DoubleType(), False)
    .add("_c7", StringType(), False)
    .add("_c8", StringType(), False)
)

# Read the gzip-compressed, tab-delimited files found under
# <path_to_data>/<source>/<country>/<directory>/<file>.
df = (
    spark.read
    .options(
        compression='gzip',
        header='false',
        multiLine='true',
        escape='"',
        encoding='UTF-8',
        delimiter='\t',
    )
    .schema(schema)
    .csv(os.path.join(path_to_data, source, country, directory, file))
)

Load Dataset


In [5]:
print('Preprocess Dataset')

# Names applied, in order, to the nine positional columns of `df`.
column_names = [
    'timestamp', 'cuebiq_id', 'device_type',
    'latitude', 'longitude', 'accuracy',
    'time_zone_offset', 'classification_type', 'transformation_type',
]

# Keep only rows classified as PERSONAL_AREA and reduce each one to the
# device id plus a (longitude, latitude) struct named `point`.
# NOTE(review): `df` is rebound to the two-column result, so running this
# cell a second time without re-running the load cell would apply nine
# names to a two-column frame.
df = (
    df.toDF(*column_names)
    .filter(col('classification_type') == 'PERSONAL_AREA')
    .select('cuebiq_id', struct('longitude', 'latitude').alias('point'))
)

# One row per distinct (cuebiq_id, point) pair.
personal_locations = df.dropDuplicates(['cuebiq_id', 'point'])

Preprocess Dataset


In [7]:
# Preview the de-duplicated (cuebiq_id, point) rows: first 20 rows, with
# long cell values truncated (the defaults of `show`, spelled out).
personal_locations.show(n=20, truncate=True)

+--------------------+--------------------+
|           cuebiq_id|               point|
+--------------------+--------------------+
|49bbc26b27698b96e...|[-94.98346, 43.03...|
|39b7657b8e073bee0...|[-74.489044, 40.5...|
|f392a77531dc0d880...|[-84.39135, 42.38...|
|233bb51d8226ab036...|[-95.88674, 29.12...|
|6b523b390d82527ea...|[-82.52493, 28.04...|
|02935969fc9db984b...|[-77.49745, 38.29...|
|a3eeb2817d8be12bc...|[-81.85312, 27.20...|
|6355972325e22683d...|[-81.40665, 28.62...|
|25ccaba6d8291a7e4...|[-80.88046, 39.67...|
|b96660af2a3b93bb1...|[-89.98306, 38.52...|
|317d2d0ee9880186b...|[-73.687065, 41.0...|
|3f7611f451e2a6321...|[-78.67569, 36.09...|
|96e42da0c24d13b4d...|[-84.56239, 42.98...|
|44aa00f2c1e0bf4c5...|[-91.87545, 35.16...|
|45457176b988545ad...|[-85.72122, 38.22...|
|8864753eed00a621d...|[-93.49896, 45.02...|
|0caea2e347fba488a...|[-94.69574, 38.86...|
|2ada38aeda2495db4...|[-78.73021, 35.78...|
|105d556951d46c802...|[-76.240875, 40.1...|
|f5d5be02433837638...|[-71.1474,

In [6]:
print('Save Personal Locations')

# Write the result as Parquet to
# <path_to_data>/<source>/<country>/personal_locations, replacing any
# output that already exists there.
personal_locations.write.parquet(
    os.path.join(path_to_data, source, country, 'personal_locations'),
    mode="overwrite",
)

Save Personal Locations
