spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 20G --num-executors 40 --executor-cores 15 --executor-memory 25G ./covid/py/get-accuracy-statistics-pyspark.py

In [13]:
# Difference between two 13-digit integers, divided by 1000.
# Presumably epoch-millisecond start/end times of a job run, which would make
# the displayed result an elapsed time in seconds — TODO confirm.
(1585595574642-1585591654570)/1000

3920.072

In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

In [3]:
# Reuse `spark` when the name is already bound (presumably injected by an
# interactive shell/kernel — confirm); otherwise the bare lookup raises
# NameError and a session is built (or fetched, via getOrCreate) here.
# NOTE(review): the application name is an empty string; a descriptive name
# would make the job easier to identify.
try:
    spark
except NameError:
    spark = (
        SparkSession
        .builder
        .appName("")
        .getOrCreate()
    )

In [4]:
# Run configuration: data provider and country select the sub-tree under
# `path_to_data`; `directory` and `file` select what is read inside it.
source = 'cuebiq'
country = 'US'

# When the CLUSTER environment variable equals 'PRINCE', read one fixed
# directory/file under /scratch; otherwise read everything ('*' globs)
# under /user.
path_to_data, directory, file = (
    (
        '/scratch/spf248/covid/data',
        '2020010100',
        'part-00000-0428e20d-9019-4cbf-b5ce-bc9414007fec-c000.csv.gz',
    )
    if os.getenv('CLUSTER') == 'PRINCE'
    else (
        '/user/spf248/covid/data',
        '*',
        '*',
    )
)

# Load Dataset

In [5]:
print('Load Dataset')

# Explicit schema for the headerless input: nine positional columns
# (_c0.._c8) that the preprocessing step renames to descriptive names.
#
# _c0 becomes 'timestamp'. It is declared DoubleType rather than FloatType:
# a 32-bit float has a 24-bit significand, so any integer above 2**24
# (16,777,216) is rounded on parse, whereas a 64-bit double is exact up to
# 2**53. Assumes the column holds epoch-style numeric values — TODO confirm.
#
# NOTE(review): _c3/_c4 (latitude/longitude) are still 32-bit floats, i.e.
# roughly 7 significant digits; switch them to DoubleType as well if the
# source carries more decimal places than that.
# NOTE(review): nullable=False may not be enforced by the CSV reader — confirm
# before relying on it.
schema = StructType([
    StructField("_c0", DoubleType(), False),   # timestamp
    StructField("_c1", StringType(), False),   # cuebiq_id
    StructField("_c2", FloatType(), False),    # device_type
    StructField("_c3", FloatType(), False),    # latitude
    StructField("_c4", FloatType(), False),    # longitude
    StructField("_c5", FloatType(), False),    # accuracy
    StructField("_c6", FloatType(), False),    # time_zone_offset
    StructField("_c7", StringType(), False),   # classification_type
    StructField("_c8", StringType(), False),   # transformation_type
])

# Read <path_to_data>/<source>/<country>/<directory>/<file> as gzip-compressed,
# headerless, tab-delimited text using the explicit `schema`.
# `directory` and `file` may be '*' globs, in which case every match is read.
df = (
    spark.read
    .options(
        compression='gzip',
        header='false',
        multiLine='true',  # a record may span several physical lines
        escape='"',        # quote character doubles as the escape character
        encoding='UTF-8',
        delimiter='\t',
    )
    .schema(schema)
    .csv(os.path.join(path_to_data, source, country, directory, file))
)

Load Dataset


In [6]:
print('Preprocess Dataset')

# Descriptive names for the positional columns, in schema order (_c0 → _c8).
column_names = [
    'timestamp',            # _c0
    'cuebiq_id',            # _c1
    'device_type',          # _c2
    'latitude',             # _c3
    'longitude',            # _c4
    'accuracy',             # _c5
    'time_zone_offset',     # _c6
    'classification_type',  # _c7
    'transformation_type',  # _c8
]

# toDF renames all columns positionally, so the list length must match the
# number of columns in `df`.
df = df.toDF(*column_names)

Preprocess Dataset


In [11]:
print('Accuracy Statistics')

# One row per distinct 'accuracy' value with the number of records that have
# it (columns: accuracy, count). This is lazy; nothing runs until an action.
accuracy = df.groupBy(col('accuracy')).count()

Accuracy Statistics


In [None]:
print('Save')

# Write the per-value counts as Parquet to
# <path_to_data>/<source>/<country>/accuracy, replacing any existing output.
accuracy.write.parquet(
    os.path.join(path_to_data, source, country, 'accuracy'),
    mode="overwrite",
)

In [12]:
# accuracy.show()

+--------+-----+
|accuracy|count|
+--------+-----+
|  5039.0|    1|
| 11189.0|    1|
|  2976.0|    2|
|  1575.0|    8|
|   714.0|   11|
|  2679.0|    3|
|  2098.0|    4|
|  2318.0|   11|
|  1033.0|    2|
|  3238.0|    1|
|  3599.0|    2|
| 11378.0|    1|
|  5750.0|    1|
|  7460.0|    1|
|  3675.0|    1|
| 11827.0|    1|
|  1393.0|    5|
|  1585.0|    4|
|  1081.0|    5|
|   550.0|    2|
+--------+-----+
only showing top 20 rows

