In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,explode
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

from geospark.register import GeoSparkRegistrator
from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer
from geospark.register import upload_jars

In [2]:
# Runs before any SparkSession is built in this notebook.
# NOTE(review): presumably this makes the GeoSpark JAR files available to
# Spark so the session created afterwards can load them -- confirm against
# the geospark package documentation.
upload_jars()

True

In [3]:
# Reuse a session already bound to the name `spark` (the bare lookup below
# raises NameError only when no such name exists); otherwise build one.
try:
    spark
except NameError:
    print('Create Spark')
    # The session is configured with the Kryo serializer and GeoSpark's Kryo
    # registrator, both imported from geospark.utils.
    spark = (
        SparkSession.builder
        .appName("")
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
        .getOrCreate()
    )

Create Spark


In [4]:
# Register GeoSpark with the active session.
# NOTE(review): presumably this is what makes the ST_* SQL functions used by
# the queries in this notebook (ST_Point, ST_GeomFromText, ST_Intersects)
# resolvable -- confirm against the geospark documentation.
GeoSparkRegistrator.registerAll(spark)

True

In [5]:
# Dataset identifiers used as path components under `path_to_data`.
source = 'cuebiq'
country = 'US'

# The CLUSTER environment variable selects the data root and which state
# polygon file(s) to load: 'NJ' only on PRINCE, the '*' pattern elsewhere.
path_to_data, state = (
    ('/scratch/spf248/covid/data', 'NJ')
    if os.getenv('CLUSTER') == 'PRINCE'
    else ('/user/spf248/covid/data', '*')
)

In [7]:
# Load the parquet dataset stored at <path_to_data>/<source>/<country>/personal_locations.
personal_locations=spark.read.parquet(os.path.join(path_to_data,source,country,'personal_locations'))

# Expose the raw data to Spark SQL so the query below can read from it.
personal_locations.createOrReplaceTempView("personal_locations")

# Project each row to cuebiq_id, the latitude/longitude fields of the `point`
# struct, and a geometry built by ST_Point from those coordinates cast to
# Decimal(24,20). Note the argument order: longitude first, then latitude.
# The geometry is aliased `point`, replacing the original struct column.
personal_locations=spark.sql("""select cuebiq_id
, point.latitude as latitude
, point.longitude as longitude
, ST_Point(cast(personal_locations.point.longitude as Decimal(24,20))
, cast(personal_locations.point.latitude as Decimal(24,20))) as point
from personal_locations
""")

# Re-register under the same view name: from here on, SQL that references
# `personal_locations` sees the projected frame, not the raw parquet data.
personal_locations.createOrReplaceTempView("personal_locations")

In [9]:
def load_admin(state):
    """Load the polygon CSV for ``state`` and expose it as the ``admin`` temp view.

    Reads ``<path_to_data>/shapefiles/<country>/polygons/<state>.csv`` with the
    first row treated as a header, using the notebook-level ``spark``,
    ``path_to_data`` and ``country``. The ``geometry`` column is converted by
    ``ST_GeomFromText`` into a ``polygon`` column.

    Args:
        state: File stem of the polygon CSV. The notebook also passes ``'*'``;
            presumably Spark expands that as a glob over every state file --
            TODO confirm.

    Returns:
        DataFrame with the columns ``GEOID10`` and ``polygon``.

    Side effects:
        Registers the temp view ``admin`` twice: first over the raw CSV so the
        query can read it, then over the projected result, which is the one
        left registered when the function returns.
    """
    csv_path = os.path.join(path_to_data, 'shapefiles', country, 'polygons', state + '.csv')

    raw_polygons = spark.read.option("header", "true").csv(csv_path)
    raw_polygons.createOrReplaceTempView("admin")

    parsed_polygons = spark.sql("""select admin.GEOID10 as GEOID10
    , ST_GeomFromText(admin.geometry) as polygon
    from admin
    """)
    # Same view name on purpose: later SQL against `admin` gets the geometries.
    parsed_polygons.createOrReplaceTempView("admin")

    return parsed_polygons

In [16]:
# Load the polygons; this also (re)registers the `admin` temp view that the
# join below reads from.
admin = load_admin(state)

# Keep every (location, polygon) pair whose point intersects the polygon and
# carry the polygon's GEOID10 alongside the location's id and coordinates.
personal_geocoded = spark.sql(
"""
    SELECT p.cuebiq_id
    , p.latitude
    , p.longitude
    , s.GEOID10
    FROM personal_locations AS p, admin AS s
    WHERE ST_Intersects(p.point, s.polygon)
"""
)

# Persist to <path_to_data>/<source>/<country>/personal_geocoded, replacing
# any output already at that path.
personal_geocoded.write.parquet(
    os.path.join(path_to_data, source, country, 'personal_geocoded'),
    mode="overwrite",
)