spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 55G --num-executors 30 --executor-cores 15 --executor-memory 30G

In [6]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,array,max
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

In [2]:
# Reuse a `spark` session when one is already bound in this namespace;
# only build one when the bare name lookup raises NameError.
try:
    spark
except NameError:
    spark = (
        SparkSession.builder
        .appName("")  # NOTE(review): the application name is left empty
        .getOrCreate()
    )

In [3]:
# Data provider and country; both are used as path components further down.
source, country = 'cuebiq', 'US'

# When CLUSTER=PRINCE, point at a single file under /scratch; otherwise use
# wildcards for both the directory and the file name under /user.
if os.environ.get('CLUSTER') == 'PRINCE':
    path_to_data, directory, file = (
        '/scratch/spf248/covid/data',
        '2020010100',
        'part-00000-0428e20d-9019-4cbf-b5ce-bc9414007fec-c000.csv.gz',
    )
else:
    path_to_data, directory, file = ('/user/spf248/covid/data', '*', '*')

# Load Dataset

In [4]:
print('Load Dataset')

# Explicit schema for the nine headerless columns, named _c0.._c8 in file
# order; every field is declared with nullable=False.
# NOTE(review): _c3/_c4 are read as 32-bit floats and later become the
# coordinates of `point` -- confirm that single precision is sufficient.
# NOTE(review): confirm whether Spark enforces nullable=False for CSV reads.
schema = StructType([
    StructField("_c%d" % position, column_type, False)
    for position, column_type in enumerate((
        DoubleType(),
        StringType(),
        IntegerType(),
        FloatType(),
        FloatType(),
        DoubleType(),
        DoubleType(),
        StringType(),
        StringType(),
    ))
])

# Read the gzip-compressed, tab-delimited, headerless files found at
# <path_to_data>/<source>/<country>/<directory>/<file>.
df = (
    spark.read
    .options(
        compression='gzip',
        header='false',
        multiLine='true',
        escape='"',
        encoding='UTF-8',
        delimiter='\t',
    )
    .schema(schema)
    .csv(os.path.join(path_to_data, source, country, directory, file))
)

Load Dataset


In [5]:
print('Preprocess Dataset')

# Readable names for the nine positional columns (_c0.._c8), in file order.
column_names=[
'timestamp',
'cuebiq_id',
'device_type',
'latitude',
'longitude',
'accuracy',
'time_zone_offset',
'classification_type',
'transformation_type']
df=df.toDF(*column_names)

# Keep only the rows classified as a personal area.
df=df.filter(df['classification_type']=='PERSONAL_AREA')

# Local hour of day (0-23), derived arithmetically from the offset-shifted
# epoch value. The previous version cast the shifted value to a timestamp and
# called hour() on it; hour() is evaluated in the Spark session time zone, so
# the result was only the local hour when that zone happened to be UTC.
# Working on seconds-within-the-day directly makes the result independent of
# spark.sql.session.timeZone.
# NOTE(review): assumes `timestamp` and `time_zone_offset` are both expressed
# in seconds (the original code added them together as well) -- confirm.
df=df.withColumn(
    'hour',
    expr('CAST(FLOOR(PMOD(`timestamp` + `time_zone_offset`, 86400) / 3600) AS INT)'))

# Pack the coordinates into one struct column, longitude first.
df=df.withColumn('point',struct('longitude','latitude'))

# Keep only what the aggregation needs, in the same column order as before.
df=df.select('cuebiq_id','hour','point')

Preprocess Dataset


In [6]:
print('Save Personal Locations')

# For every (cuebiq_id, point) pair, find the hour with the most rows.
# `max` is the Spark aggregate imported from pyspark.sql.functions, not the
# Python builtin. It is applied to [count, hour] arrays, which compare element
# by element: the largest count wins and ties go to the larger hour.
# Item 1 of the winning array is the hour.
users = (
    df.groupby('cuebiq_id', 'point', 'hour')
    .count()
    .withColumn("count_hour", array("count", "hour"))
    .groupby('cuebiq_id', 'point')
    .agg(max("count_hour").getItem(1).alias("most_freq_hour"))
)

# Write the result as Parquet under <path_to_data>/<source>/<country>/personal,
# replacing any previous output at that location.
(
    users.write
    .mode("overwrite")
    .parquet(os.path.join(path_to_data, source, country, 'personal'))
)

Save Personal Locations


In [11]:
# Print a preview of the result; the defaults are spelled out explicitly
# (20 rows, long cell values truncated).
users.show(n=20, truncate=True)

+--------------------+--------------------+--------------+
|           cuebiq_id|               point|most_freq_hour|
+--------------------+--------------------+--------------+
|025ed431cb56ef2ad...|[-97.40806, 27.71...|            14|
|02935969fc9db984b...|[-77.49745, 38.29...|             5|
|06305350630236c29...|[-82.51925, 35.43...|             5|
|0caea2e347fba488a...|[-94.69574, 38.86...|            11|
|0f9bc4160d4ea9435...|[-82.39684, 34.87...|            10|
|105d556951d46c802...|[-76.240875, 40.1...|            16|
|1e6d8201bdb9543b6...|[-96.99278, 28.85...|            16|
|20c6601b4fdece0d2...|[-93.64436, 36.59...|            14|
|233bb51d8226ab036...|[-95.88674, 29.12...|            18|
|25ccaba6d8291a7e4...|[-80.88046, 39.67...|            10|
|2ada38aeda2495db4...|[-78.73021, 35.78...|            13|
|317d2d0ee9880186b...|[-73.687065, 41.0...|            10|
|39b7657b8e073bee0...|[-74.489044, 40.5...|            13|
|3f7611f451e2a6321...|[-78.67569, 36.09...|            2