https://datasystemslab.github.io/GeoSpark/tutorial/geospark-sql-python/

https://github.com/DataSystemsLab/GeoSpark/tree/master/python

https://datasystemslab.github.io/GeoSpark/tutorial/geospark-core-python/

https://medium.com/@karijdempsey/efficient-geospatial-analysis-with-spark-363ba50c5248

Example command for submitting the script version of this job to YARN in cluster mode:

```
spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.appMasterEnv.SPARK_HOME=/share/apps/spark/spark-2.4.0-bin-hadoop2.6 --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 55G --num-executors 40 --executor-cores 15 --executor-memory 55G ./covid/py/merge-census-blocks-pyspark.py
```

In [3]:
import os
from datetime import datetime

import pandas as pd
from geospark.register import GeoSparkRegistrator
from pyspark.sql.functions import to_timestamp
from pyspark.sql.types import LongType, StringType, StructType, StructField, FloatType

In [4]:
# Register GeoSpark on the Spark session. The SQL queries further down call
# ST_GeomFromText, ST_Point and ST_Intersects, which presumably depend on this
# registration having run first.
# NOTE(review): `spark` is not defined in the visible cells — assumed to be
# provided by the notebook runtime; confirm.
GeoSparkRegistrator.registerAll(spark)

# Params

In [None]:
def dbutils_path_exists(path):
    """Return True if `path` can be listed with `dbutils.fs.ls`, otherwise False.

    A failed listing is reported on stdout and treated as "path does not exist".

    Args:
        path: Path string handed unchanged to `dbutils.fs.ls`.

    Returns:
        bool: True when the listing succeeds, False when it raises.

    Note:
        `dbutils` is not defined in the visible cells; it is assumed to be
        injected by the notebook runtime.
    """
    try:
        dbutils.fs.ls(path)
    except Exception:
        # Any listing failure counts as "missing". Catching Exception (not a bare
        # except) lets KeyboardInterrupt / SystemExit propagate, so a long scan
        # over many dates can still be interrupted.
        print("{} not found".format(path))
        return False
    return True

In [6]:
# Run parameters: data source, country, and the admin column to geocode against.
source = 'cuebiq'
country = 'ID'
admin_id = 'ADM4_PCODE'

# Date window to scan: from start_date up to (and including) today.
start_date = '2020-01-01'
end_date = datetime.today().strftime('%Y-%m-%d')

# Single absolute root used for BOTH the existence check and the load paths, so
# the two can no longer disagree (previously one used "mnt/..." and the other "/mnt/...").
data_root = "/mnt/wbgggscecovid19dev-datapartnership/data/cuebiq/covid-19/{}".format(country)

# One folder key per calendar day, formatted as YYYYMMDD followed by "00".
day_keys = [day.strftime('%Y%m%d00') for day in pd.date_range(start_date, end_date)]

# Keep only the days whose folder can actually be listed.
days_to_consider = [
    day_key
    for day_key in day_keys
    if dbutils_path_exists("{}/{}/".format(data_root, day_key))
]
paths_to_data = ["{}/{}/*.csv.gz".format(data_root, day) for day in days_to_consider]

admin_path = "/admin/"
results_path = "/results/"

# Explicit column layout applied when the ping files are read.
# NOTE(review): device_type is declared as FloatType — confirm this matches the data.
schema = StructType([
    StructField("timestamp", LongType(), False),
    StructField("cuebiq_id", StringType(), False),
    StructField("device_type", FloatType(), False),
    StructField("latitude", FloatType(), False),
    StructField("longitude", FloatType(), False),
    StructField("accuracy", FloatType(), False),
    StructField("time_zone_offset", LongType(), False),
    StructField("classification_type", StringType(), False),
    StructField("transformation_type", StringType(), False),
])

# Load Admin Data

In [8]:
# Read the admin-unit CSV for the selected country (first row is the header)
# and expose it to Spark SQL under the view name "admin".
admin_csv = os.path.join(admin_path, country, 'admin.csv')
admin = spark.read.option("header", "true").csv(admin_csv)
admin.createOrReplaceTempView("admin")

# Keep only the admin code column and turn the `geometry` column into a
# `polygon` column via ST_GeomFromText; the result replaces the "admin" view.
query = "select admin.{0} as {0}, ST_GeomFromText(admin.geometry) as polygon from admin".format(admin_id)
admin = spark.sql(query)
admin.createOrReplaceTempView("admin")

# Cache the admin frame and report how many rows it holds.
admin.cache()
admin_unit_count = admin.count()
print('# Admin Units:', admin_unit_count)

# Geocode Pings By Chunk

In [10]:
def geocode_pings(pings):
    """Attach an admin-unit code to each ping through a spatial join.

    Steps:
      1. Add a `time` column: `to_timestamp` of `timestamp + time_zone_offset`.
      2. Build a `point` geometry from longitude/latitude with ST_Point.
      3. Join against the "admin" temp view, keeping ping/admin pairs for which
         ST_Intersects(point, polygon) holds.

    Side effects: the temp view "pings" is (re)registered twice; after the call
    it refers to the frame that carries the `point` column.

    Relies on the module-level `spark` session, the module-level `admin_id`
    column name and an existing "admin" temp view with a `polygon` column.

    Args:
        pings: DataFrame providing the columns `timestamp`, `time_zone_offset`,
            `cuebiq_id`, `latitude`, `longitude`, `accuracy` and
            `classification_type`.

    Returns:
        DataFrame with time, cuebiq_id, latitude, longitude, accuracy,
        classification_type and the admin column named by `admin_id`.
    """
    # Stage 1: derive the `time` column and publish the frame as view "pings".
    shifted_time = to_timestamp(pings["timestamp"] + pings["time_zone_offset"])
    pings_timed = pings.withColumn("time", shifted_time)
    pings_timed.createOrReplaceTempView("pings")

    # Stage 2: add the point geometry (longitude first, then latitude).
    point_sql = (
        "select time\n"
        "    , cuebiq_id\n"
        "    , latitude\n"
        "    , longitude\n"
        "    , accuracy\n"
        "    , classification_type\n"
        "    , ST_Point(cast(pings.longitude as Decimal(24,20))\n"
        "    , cast(pings.latitude as Decimal(24,20))) as point\n"
        "    from pings\n"
        "    "
    )
    pings_with_point = spark.sql(point_sql)
    pings_with_point.createOrReplaceTempView("pings")

    # Stage 3: spatial join of points against admin polygons.
    join_sql = (
        "SELECT p.time\n"
        "    , p.cuebiq_id\n"
        "    , p.latitude\n"
        "    , p.longitude\n"
        "    , p.accuracy\n"
        "    , p.classification_type\n"
        "    , s.{admin_column} \n"
        "    FROM pings AS p, admin AS s WHERE ST_Intersects(p.point, s.polygon)"
    ).format(admin_column=admin_id)
    return spark.sql(join_sql)

In [11]:
# Load every selected day's ping files in one read: tab-delimited CSV without a
# header row, parsed with the explicit schema declared above.
pings = (
    spark.read.format("csv")
    .option('header', 'false')
    .option("delimiter", "\t")
    .schema(schema)
    .load(paths_to_data)
)


In [12]:
# Geocode the loaded pings against the admin polygons.
pings_geocoded = geocode_pings(pings)

# Write the result as Parquet, overwriting whatever already exists at the target.
pings_geocoded_path = os.path.join(results_path, source, 'processed', country, 'pings_geocoded')
pings_geocoded.write.mode("overwrite").parquet(pings_geocoded_path)