https://datasystemslab.github.io/GeoSpark/tutorial/geospark-sql-python/

https://github.com/DataSystemsLab/GeoSpark/tree/master/python

https://datasystemslab.github.io/GeoSpark/tutorial/geospark-core-python/

https://medium.com/@karijdempsey/efficient-geospatial-analysis-with-spark-363ba50c5248


spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.appMasterEnv.SPARK_HOME=/share/apps/spark/spark-2.4.0-bin-hadoop2.6 --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 55G --num-executors 40 --executor-cores 15 --executor-memory 55G ./covid/py/merge-census-blocks-pyspark.py

In [2]:
import os
from datetime import datetime
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,lit,date_format,from_json,broadcast
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

from geospark.register import GeoSparkRegistrator
from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer
from geospark.register import upload_jars
from geospark.utils.adapter import Adapter

In [3]:
# NOTE(review): presumably makes the GeoSpark jar files available to Spark.
# It is called before the SparkSession is created further down - confirm
# against the geospark documentation that this ordering is required.
upload_jars()

True

In [4]:
# Reuse `spark` when that name is already bound; otherwise create a session
# that uses the Kryo serializer together with GeoSpark's Kryo registrator.
try:
    spark
except NameError:
    print('Create Spark')
    spark=(
        SparkSession.builder
        .appName("")
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
        .getOrCreate()
    )

Create Spark


In [5]:
# Register GeoSpark with this session.
# NOTE(review): the ST_* SQL functions used in the queries further down
# (ST_GeomFromText, ST_Point, ST_Intersects) presumably depend on this call.
GeoSparkRegistrator.registerAll(spark)

True

In [6]:
# ---------------------------------------------------------------------------
# Run configuration and discovery of the input files.
# ---------------------------------------------------------------------------
source='cuebiq'
country='MX'
admin_id='ageb'  # name of the identifier column selected from the admin table
n_chunks=10      # number of pieces the list of input files is split into
start_date='2020-01-01'
end_date=datetime.today().strftime('%Y-%m-%d')
# One directory name per day between start_date and today: YYYYMMDD + '00'.
directories=[x.strftime('%Y%m%d')+'00' for x in pd.date_range(start_date,end_date)]
fs=spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())

# The data root depends on the CLUSTER environment variable; on PRINCE only
# the first day, first file and first chunk are kept.
on_prince=os.getenv('CLUSTER')=='PRINCE'
if on_prince:
    path_to_data='/scratch/spf248/covid/data'
    directories=directories[:1]
else:
    path_to_data='/user/spf248/covid/data'

paths=[]
for directory in directories:
    path_to_directory=os.path.join(path_to_data,source,'s3',country,directory)
    hadoop_path=spark._jvm.org.apache.hadoop.fs.Path(path_to_directory)
    # Days without a directory are skipped silently.
    if not fs.exists(hadoop_path):
        continue
    list_status=fs.listStatus(hadoop_path)
    # Strip the 'hdfs://dumbo' and 'file:' prefixes from the listed URIs.
    paths.extend([status.getPath().toString().replace('hdfs://dumbo','').replace('file:','') for status in list_status])
    print(directory)

# Filter and sort once, after all directories have been listed, instead of
# re-processing the whole accumulated list on every iteration.
paths=sorted([path for path in paths if '.csv.gz' in path])

if on_prince:
    paths=paths[:1]
    paths_chunks=np.array_split(paths,n_chunks)[:1]
else:
    paths_chunks=np.array_split(paths,n_chunks)

print('# Files:', sum(len(paths_chunk) for paths_chunk in paths_chunks))
print('# Chunks:', len(paths_chunks))
    
# Schema of the raw, header-less ping files (columns are named in geocode_pings).
# _c0 becomes 'timestamp', which geocode_pings adds to the time-zone offset and
# passes to to_timestamp, i.e. it is used as epoch seconds.  A 32-bit FloatType
# can only represent values of that magnitude (~1.5e9) in steps of 128, so the
# column is read as DoubleType to keep second-level precision.
schema= StructType([
StructField("_c0", DoubleType(), False),
StructField("_c1", StringType(), False),
StructField("_c2", FloatType(), False),
StructField("_c3", FloatType(), False),
StructField("_c4", FloatType(), False),
StructField("_c5", FloatType(), False),
StructField("_c6", FloatType(), False),
StructField("_c7", StringType(), False),
StructField("_c8", StringType(), False),])

2020010100
# Files: 1
# Chunks: 1


In [None]:
# Load <path_to_data>/admin/<country>/admin.csv (first row is the header) and
# expose it to Spark SQL as the temp view "admin".
admin_csv=os.path.join(path_to_data,'admin',country,'admin.csv')
admin=spark.read.option("header", "true").csv(admin_csv)
admin.createOrReplaceTempView("admin")

# Keep only the identifier column and the geometry converted with
# ST_GeomFromText (exposed as `polygon`), then replace the "admin" view.
query="select admin.{0} as {0}, ST_GeomFromText(admin.geometry) as polygon from admin".format(admin_id)
admin=spark.sql(query)
admin.createOrReplaceTempView("admin")
# The view is queried once per chunk by geocode_pings, hence the cache;
# count() is an action, so it also forces the data to be computed.
admin.cache()
print('# Admin Units:',admin.count())

In [7]:
def geocode_pings(paths_chunk):
    """Read a chunk of ping files and attach the admin unit of every ping.

    Relies on the module-level ``spark`` session, ``schema`` and ``admin_id``,
    and on a temp view ``admin`` exposing the columns ``<admin_id>`` and
    ``polygon``.  Registers (and overwrites) the temp view ``pings``.

    Parameters
    ----------
    paths_chunk : iterable of str
        Paths of the gzip-compressed, tab-delimited, header-less ping files.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns time, cuebiq_id, latitude, longitude, accuracy,
        classification_type and ``<admin_id>``; one row per pair of ping and
        polygon for which ST_Intersects holds, so pings matching no polygon
        do not appear.
    """
    reader=spark.read.options(
        compression='gzip',
        header='false',
        multiLine='true',
        escape='"',
        encoding='UTF-8',
        delimiter='\t').schema(schema)
    raw=reader.csv(list(paths_chunk))

    named=raw.toDF(
        'timestamp',
        'cuebiq_id',
        'device_type',
        'latitude',
        'longitude',
        'accuracy',
        'time_zone_offset',
        'classification_type',
        'transformation_type')

    # `time` is the timestamp shifted by the time-zone offset
    # (presumably both in seconds, giving local time - TODO confirm).
    shifted_time=to_timestamp(named["timestamp"]+named["time_zone_offset"])
    named.withColumn("time",shifted_time).createOrReplaceTempView("pings")

    # Build a point geometry from (longitude, latitude) for every ping.
    points_sql="""select time
    , cuebiq_id
    , latitude
    , longitude
    , accuracy
    , classification_type
    , ST_Point(cast(pings.longitude as Decimal(24,20))
    , cast(pings.latitude as Decimal(24,20))) as point
    from pings
    """
    spark.sql(points_sql).createOrReplaceTempView("pings")

    # Spatial join of the points against the admin polygons.
    join_sql="""SELECT p.time
    , p.cuebiq_id
    , p.latitude
    , p.longitude
    , p.accuracy
    , p.classification_type
    , s."""+admin_id+" \n    FROM pings AS p, admin AS s WHERE ST_Intersects(p.point, s.polygon)"

    return spark.sql(join_sql)

In [None]:
# Geocode the chunks one at a time.  Chunk i is written as parquet to
# <path_to_data>/<source>/pings_geocoded/<country>/<i>, overwriting any
# existing output at that location.
for i,paths_chunk in enumerate(paths_chunks):
    print('Chunk:', i)
    output_path=os.path.join(path_to_data,source,'pings_geocoded',country,str(i))
    pings_geocoded=geocode_pings(paths_chunk)
    pings_geocoded.write.mode("overwrite").parquet(output_path)