https://datasystemslab.github.io/GeoSpark/tutorial/geospark-sql-python/

https://github.com/DataSystemsLab/GeoSpark/tree/master/python

spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.appMasterEnv.SPARK_HOME=/share/apps/spark/spark-2.4.0-bin-hadoop2.6 --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 55G --num-executors 30 --executor-cores 15 --executor-memory 30G ./covid/py/merge-admin-blocks-pyspark.py

In [1]:
import os
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,lit,date_format,from_json
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

from geospark.register import GeoSparkRegistrator
from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer
from geospark.register import upload_jars

In [2]:
# Make the GeoSpark jar files available to Spark. This runs before the cell
# below that builds the SparkSession (presumably required so the JVM side can
# load the GeoSpark classes -- confirm against the GeoSpark Python tutorial).
upload_jars()

True

In [3]:
# Reuse a `spark` session if one is already defined in this namespace;
# otherwise build one configured with Kryo serialization and the GeoSpark
# Kryo registrator.
try:
    spark
except NameError:
    print('Create Spark')
    spark = (
        SparkSession.builder
        .appName("")
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
        .getOrCreate()
    )

Create Spark


In [4]:
# Register the GeoSpark SQL functions on this session; the queries further
# down rely on ST_Point, ST_GeomFromText and ST_Intersects.
GeoSparkRegistrator.registerAll(spark)

True

In [5]:
# --- Run parameters -------------------------------------------------------
# Data provider and country; both are used as path components for input
# and output locations.
source = 'cuebiq'
country = 'US'

# Inclusive date window (compared against 'yyyy-MM-dd' strings) and the
# maximum accepted value of the `accuracy` column.
start_date = '2020-01-01'
end_date = '2020-03-19'
accuracy_threshold = 100

# State / territory codes; one polygon CSV is loaded per entry.
states = [
    'AK', 'AL', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
    'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA',
    'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH',
    'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY',
]

# Glob patterns '*000', '*100', ..., '*900' selecting the input directories.
directories = ['*{}00'.format(digit) for digit in range(10)]

if os.getenv('CLUSTER') != 'PRINCE':
    # Default environment: process every directory and every file.
    path_to_data = '/user/spf248/covid/data'
    file = '*'
else:
    # PRINCE cluster: restrict the run to a single directory pattern, a
    # single input file and a single state.
    path_to_data = '/scratch/spf248/covid/data'
    directories = directories[1:2]
    file = 'part-00000-0428e20d-9019-4cbf-b5ce-bc9414007fec-c000.csv.gz'
    states = ['NJ']

In [6]:
# Explicit schema for the header-less ping files. Columns keep positional
# names here; load_pings() renames them, in order, to: timestamp, cuebiq_id,
# device_type, latitude, longitude, accuracy, time_zone_offset,
# classification_type, transformation_type.
# Every field is declared with nullable=False.
schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("_c0", DoubleType()),
        ("_c1", StringType()),
        ("_c2", IntegerType()),
        ("_c3", FloatType()),
        ("_c4", FloatType()),
        ("_c5", DoubleType()),
        ("_c6", DoubleType()),
        ("_c7", StringType()),
        ("_c8", StringType()),
    )
])

In [7]:
def load_pings(directory):
    """Read the ping files under ``directory`` and register them as view "pings".

    The files are read from
    ``path_to_data/source/country/<directory>/<file>`` as gzip-compressed,
    tab-delimited CSV without a header, using the module-level ``schema``.

    Processing steps:
      * rename the positional columns to descriptive names;
      * derive ``time`` as ``to_timestamp(timestamp + time_zone_offset)``
        (presumably both are in seconds -- confirm against the data spec);
      * derive ``date`` as the 'yyyy-MM-dd' rendering of ``time``;
      * keep rows with ``start_date <= date <= end_date`` and
        ``accuracy <= accuracy_threshold``;
      * add a GeoSpark ``point`` geometry built from (longitude, latitude).

    Args:
        directory: directory name or glob pattern below the country folder.

    Returns:
        DataFrame with columns time, cuebiq_id, latitude, longitude,
        accuracy, classification_type and point. The same DataFrame is
        registered as the temp view "pings", replacing any previous one.
    """
    input_path = os.path.join(path_to_data, source, country, directory, file)

    # CSV reader settings, applied one by one in this order.
    reader_settings = (
        ('compression', 'gzip'),
        ('header', 'false'),
        ("multiLine", "true"),
        ('escape', '"'),
        ("encoding", "UTF-8"),
        ("delimiter", "\t"),
    )
    reader = spark.read
    for setting_name, setting_value in reader_settings:
        reader = reader.option(setting_name, setting_value)
    raw_pings = reader.schema(schema).csv(input_path)

    # Replace the positional _c0.._c8 names with descriptive ones.
    named_pings = raw_pings.toDF(
        'timestamp',
        'cuebiq_id',
        'device_type',
        'latitude',
        'longitude',
        'accuracy',
        'time_zone_offset',
        'classification_type',
        'transformation_type',
    )

    # Shift the timestamp by the offset, then filter on date and accuracy.
    filtered_pings = (
        named_pings
        .withColumn("time", to_timestamp(col("timestamp") + col("time_zone_offset")))
        .withColumn("date", date_format(col("time"), "yyyy-MM-dd"))
        .filter(col("date") >= lit(start_date))
        .filter(col("date") <= lit(end_date))
        .filter(col("accuracy") <= lit(accuracy_threshold))
    )

    # The SQL below reads from the view named "pings", so the filtered data
    # is registered under that name first and then replaced by the result.
    filtered_pings.createOrReplaceTempView("pings")

    pings = spark.sql("""select time
    , cuebiq_id
    , latitude
    , longitude
    , accuracy
    , classification_type
    , ST_Point(cast(pings.longitude as Decimal(24,20))
    , cast(pings.latitude as Decimal(24,20))) as point
    from pings
    """)
    pings.createOrReplaceTempView("pings")

    return pings

In [8]:
def load_admin(state):
    """Read the polygon CSV for ``state`` and register it as view "admin".

    The file ``path_to_data/shapefiles/country/polygons/<state>.csv`` is read
    with a header row. Its ``geometry`` column is parsed into a geometry with
    ``ST_GeomFromText`` (presumably WKT text -- confirm against the files).

    Args:
        state: state code used as the CSV file name (without extension).

    Returns:
        DataFrame with columns GEOID10 and polygon. The same DataFrame is
        registered as the temp view "admin", replacing any previous one.
    """
    polygons_path = os.path.join(
        path_to_data, 'shapefiles', country, 'polygons', state + '.csv')

    raw_admin = spark.read.option("header", "true").csv(polygons_path)

    # The SQL below reads from the view named "admin", so the raw rows are
    # registered under that name first and then replaced by the result.
    raw_admin.createOrReplaceTempView("admin")

    admin = spark.sql("""select admin.GEOID10 as GEOID10
    , ST_GeomFromText(admin.geometry) as polygon
    from admin
    """)
    admin.createOrReplaceTempView("admin")

    return admin

In [9]:
# Join every ping to the polygon(s) it intersects. The query reads the temp
# views "pings" and "admin", which load_pings() / load_admin() (re)register
# on each call.
spatial_join_query = """
            SELECT p.time
            , p.cuebiq_id
            , p.latitude
            , p.longitude
            , p.accuracy
            , p.classification_type
            , s.GEOID10
            FROM pings AS p, admin AS s
            WHERE ST_Intersects(p.point, s.polygon)
        """

for directory in directories:

    # NOTE(review): `pings` is not cached, so it is presumably re-read from
    # the source files for every state written below -- consider persisting.
    pings = load_pings(directory)

    for state in states:

        admin = load_admin(state)

        spatial_join = spark.sql(spatial_join_query)

        # One parquet dataset per (state, directory); existing output is
        # overwritten.
        # NOTE(review): `directory` is a glob pattern such as '*100', so the
        # output path contains a literal '*' -- confirm this is intended.
        output_path = os.path.join(path_to_data, source, country, state, directory)
        spatial_join.write.mode("overwrite").parquet(output_path)

In [10]:
# Preview the join result left over from the last (directory, state) pair
# processed by the loop above.
spatial_join.show()

+-------------------+--------------------+--------+---------+--------+-------------------+---------------+
|               time|           cuebiq_id|latitude|longitude|accuracy|classification_type|        GEOID10|
+-------------------+--------------------+--------+---------+--------+-------------------+---------------+
|2020-01-01 11:02:25|222a36f677448d751...|40.86519|-74.13686|    15.0|      PERSONAL_AREA|340311755003010|
|2020-01-01 11:22:13|222a36f677448d751...|40.86519|-74.13686|     8.0|      PERSONAL_AREA|340311755003010|
|2020-01-01 11:22:12|222a36f677448d751...|40.86519|-74.13686|     8.0|      PERSONAL_AREA|340311755003010|
|2020-01-01 07:24:52|222a36f677448d751...|40.86519|-74.13686|     5.0|      PERSONAL_AREA|340311755003010|
|2020-01-01 07:22:56|222a36f677448d751...|40.86519|-74.13686|    15.0|      PERSONAL_AREA|340311755003010|
|2020-01-01 10:32:25|222a36f677448d751...|40.86519|-74.13686|    15.0|      PERSONAL_AREA|340311755003010|
|2020-01-01 08:11:22|222a36f677448d75