In [1]:
import os
from timeit import default_timer as timer

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,explode
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

from geospark.register import GeoSparkRegistrator
from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer
from geospark.register import upload_jars

In [2]:
# Upload the GeoSpark JARs via geospark.register.upload_jars.
# This is called before the Spark session is created further down.
# NOTE(review): presumably the JARs must be in place before the JVM starts — confirm against the geospark docs.
upload_jars()

True

In [3]:
# Reuse a `spark` session that already exists in this kernel; only build a
# new one when the name is undefined (e.g. on a fresh kernel).
try:
    spark
except NameError:
    print('Create Spark')
    # Both settings point Spark at the Kryo classes shipped with GeoSpark.
    spark = (
        SparkSession.builder
        .appName("")
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
        .getOrCreate()
    )

Create Spark


In [4]:
# Register GeoSpark with the active session.
# NOTE(review): presumably this is what makes the ST_* SQL functions used in the
# queries below (ST_Point, ST_GeomFromText, ST_Intersects) resolvable — confirm.
GeoSparkRegistrator.registerAll(spark)

True

In [5]:
# Run configuration: data provider and country code used to build every
# input/output path in this notebook.
source, country = 'cuebiq', 'ID'

# The data root depends on where the notebook runs: the scratch location is
# used when the CLUSTER environment variable equals 'PRINCE', otherwise the
# /user location is used.
path_to_data = (
    '/scratch/spf248/covid/data'
    if os.getenv('CLUSTER') == 'PRINCE'
    else '/user/spf248/covid/data'
)

In [6]:
# Load the personal-location aggregate and keep one row per distinct `point`
# value, then expose it to Spark SQL under the view name "personal".
personal = (
    spark.read
    .parquet(os.path.join(path_to_data, source, 'aggregates', country, 'n_pings_id_personal'))
    .select('point')
    .drop_duplicates(subset=['point'])
)
personal.createOrReplaceTempView("personal")

# Flatten the `point` struct into latitude/longitude columns and build a
# geometry with ST_Point(longitude, latitude). Both coordinates are cast to
# Decimal(24,20) before being handed to ST_Point.
# The result replaces both the `personal` variable and the "personal" view.
personal = spark.sql("""select point.latitude as latitude
, point.longitude as longitude
, ST_Point(cast(personal.point.longitude as Decimal(24,20))
, cast(personal.point.latitude as Decimal(24,20))) as point
from personal
""")
personal.createOrReplaceTempView("personal")

In [7]:
# print('# Personal Locations:', personal.count()) # US: 218137 / MX: 46545 / ID: 26379

In [8]:
# Read the administrative-unit table (CSV with a header row) for the selected
# country and register it as the "admin" view.
admin = (
    spark.read
    .option("header", "true")
    .csv(os.path.join(path_to_data, 'admin', country, 'admin.csv'))
)
admin.createOrReplaceTempView("admin")

# Build a SELECT that passes every attribute column through unchanged and
# replaces the `geometry` text column with a `polygon` column produced by
# ST_GeomFromText. Columns already named 'geometry' or 'polygon' are left out
# of the pass-through list.
query = (
    "select"
    + "".join(
        " admin." + name + " as " + name + ","
        for name in admin.columns
        if name not in ('geometry', 'polygon')
    )
    + " ST_GeomFromText(admin.geometry) as polygon from admin"
)

# The parsed table replaces both the `admin` variable and the "admin" view.
admin = spark.sql(query)
admin.createOrReplaceTempView("admin")

In [9]:
# print('# Admin Units:', admin.count()) # US: 220320/ MX: 56177 / ID: 72300

In [10]:
# Spatial join: pair every personal location with the admin unit(s) whose
# polygon it intersects, carrying along all admin attribute columns (every
# column except 'geometry' and 'polygon').
query = (
    "SELECT p.latitude, p.longitude"
    + "".join(
        ", s." + name
        for name in admin.columns
        if name not in ('geometry', 'polygon')
    )
    + " FROM personal as p, admin as s WHERE ST_Intersects(p.point, s.polygon)"
)

# Re-pack the coordinates into a single `point` struct (longitude first, then
# latitude) and drop the two flat coordinate columns.
personal_admin = (
    spark.sql(query)
    .withColumn('point', struct('longitude', 'latitude'))
    .drop('longitude', 'latitude')
)

In [11]:
# print('# Matched Locations:', personal_admin.count()) # US: 217190 / MX: 24512 / ID: 24711

In [12]:
# Persist the geocoded locations as parquet, replacing any previous output in
# the same directory, and report the wall-clock time of the write.
print('Save Geocoded Locations')
start = timer()

(
    personal_admin.write
    .mode("overwrite")
    .parquet(os.path.join(path_to_data, source, 'aggregates', country, 'personal_admin'))
)

print("Done in", round(timer() - start), "sec")

Save Geocoded Locations
Done in 98 sec


In [13]:
# Print a preview of the geocoded table (admin attributes plus the `point` struct).
# NOTE(review): `personal_admin` is not cached anywhere in the visible cells, so this
# preview may re-run the spatial join after the write above — consider reading the
# saved parquet back instead; confirm.
personal_admin.show()

+------------+------------------+----------+---------------+----------+-------+----------+----------+------------------+----------------+--------------------+
|  ADM4_PCODE|           ADM3_EN|ADM3_PCODE|        ADM2_EN|ADM2_PCODE|ADM1_EN|ADM1_PCODE|median_age|      wealth_index|total_population|               point|
+------------+------------------+----------+---------------+----------+-------+----------+----------+------------------+----------------+--------------------+
|ID1172010007|          Sukajaya| ID1172010|    Kota Sabang|    ID1172|   Aceh|      ID11|      26.0|0.7582239837506708|          5510.0|[95.33386, 5.8859...|
|ID1172010010|          Sukajaya| ID1172010|    Kota Sabang|    ID1172|   Aceh|      ID11|      25.0|0.7305132011677065|          3652.0|[95.33386, 5.8914...|
|ID1172020007|         Sukakarya| ID1172020|    Kota Sabang|    ID1172|   Aceh|      ID11|      27.0|0.7187204600457259|          3069.0|[95.31189, 5.8914...|
|ID1172010007|          Sukajaya| ID1172010|  