In [31]:
import os
from timeit import default_timer as timer

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,explode
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

from geospark.register import GeoSparkRegistrator
from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer
from geospark.register import upload_jars

In [32]:
# Call GeoSpark's jar-upload helper before any Spark session is created below.
# NOTE(review): presumably this makes the GeoSpark JARs visible to Spark --
# confirm against the geospark package documentation.
upload_jars()

True

In [33]:
# Reuse `spark` if the kernel already has one bound; otherwise build a session
# that uses the Kryo serializer together with GeoSpark's Kryo registrator.
try:
    spark
except NameError:
    print('Create Spark')
    spark = (
        SparkSession.builder
        .appName("")
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
        .getOrCreate()
    )

In [34]:
# Register GeoSpark on the active session.
# NOTE(review): the ST_Point / ST_GeomFromText / ST_Intersects SQL functions used
# in later cells presumably become available through this call -- confirm.
GeoSparkRegistrator.registerAll(spark)

True

In [47]:
# Run configuration: data provider and country code used to build every path below.
source = 'cuebiq'
country = 'ID'

# The data root depends on the CLUSTER environment variable:
# 'PRINCE' selects the /scratch location, anything else (or unset) the /user one.
path_to_data = (
    '/scratch/spf248/covid/data'
    if os.getenv('CLUSTER') == 'PRINCE'
    else '/user/spf248/covid/data'
)

In [48]:
# Load the 'n_pings_id_personal' aggregate for the configured source and country.
personal = spark.read.parquet(
    os.path.join(path_to_data, source, 'aggregates', country, 'n_pings_id_personal')
)

In [49]:
# Count rows per (latitude, longitude) pair, expose the count as 'n_users', and
# write the result as CSV with a header from a single partition, overwriting
# any previous export.
# NOTE(review): the 'n_users' label assumes one input row per user at a given
# point -- confirm against the schema of n_pings_id_personal.
(
    personal
    .groupby('point.latitude', 'point.longitude')
    .count()
    .withColumnRenamed('count', 'n_users')
    .repartition(1)
    .write
    .mode("overwrite")
    .option('header', 'true')
    .csv(os.path.join(path_to_data, source, 'aggregates', country, 'personal'))
)

In [50]:
# Keep one row per distinct `point` struct and expose it to Spark SQL.
personal = personal.select('point').dropDuplicates(['point'])
personal.createOrReplaceTempView("personal")

# Flatten the struct into latitude/longitude columns and replace `point` with
# the result of ST_Point(longitude, latitude), both coordinates cast to
# Decimal(24,20). The "personal" view is then re-registered on the new frame.
# NOTE: `personal` is rebound here, so re-running this cell without first
# re-running the parquet load operates on the already-transformed frame.
personal = spark.sql(
    """select point.latitude as latitude
, point.longitude as longitude
, ST_Point(cast(personal.point.longitude as Decimal(24,20))
, cast(personal.point.latitude as Decimal(24,20))) as point
from personal
"""
)
personal.createOrReplaceTempView("personal")

In [51]:
# print('# Personal Locations:', personal.count()) # US: 218137 / MX: 46545 / ID: 26379

# Personal Locations: 26379


In [52]:
# Load the administrative-unit table for the configured country from CSV
# (first line is the header) and expose it to Spark SQL as "admin".
admin = (
    spark.read
    .option("header", "true")
    .csv(os.path.join(path_to_data, 'admin', country, 'admin.csv'))
)
admin.createOrReplaceTempView("admin")

# Build a projection that keeps every attribute column under its own name and
# replaces the textual `geometry` column with ST_GeomFromText(geometry) as
# `polygon`. Any pre-existing `polygon` column is skipped so the new one does
# not collide with it.
# NOTE(review): `geometry` is assumed to hold WKT text -- confirm against admin.csv.
#
# Column names come straight from the CSV header, so each one is backtick-quoted
# (embedded backticks doubled). Without quoting, a header containing spaces,
# dots, hyphens or reserved words would yield invalid SQL.
query = "select"
for column in admin.columns:
    if column in ('geometry', 'polygon'):
        continue
    query += " admin.`{0}` as `{0}`,".format(column.replace("`", "``"))
query += " ST_GeomFromText(admin.geometry) as polygon from admin"

admin = spark.sql(query)
admin.createOrReplaceTempView("admin")

In [53]:
# print('# Admin Units:', admin.count()) # US: 220320/ MX: 56177 / ID: 72300

# Admin Units: 72300


In [54]:
# Spatial join: pair every personal location with each admin unit whose polygon
# intersects the location's point, carrying over all admin attribute columns
# (the geometry/polygon columns themselves are left out of the result).
#
# Admin column names originate from a CSV header, so each one is backtick-quoted
# (embedded backticks doubled) to keep the generated SQL valid for names that
# contain spaces, dots, hyphens or reserved words.
query = "SELECT p.latitude, p.longitude"
for column in admin.columns:
    if column in ('geometry', 'polygon'):
        continue
    query += ", s.`{}`".format(column.replace("`", "``"))
query += " FROM personal as p, admin as s WHERE ST_Intersects(p.point, s.polygon)"

personal_admin = spark.sql(query)

# Re-pack the coordinates into a single `point` struct (longitude first, then
# latitude) and drop the two flat columns.
personal_admin = (
    personal_admin
    .withColumn('point', struct('longitude', 'latitude'))
    .drop('longitude', 'latitude')
)

In [55]:
# print('# Matched Locations:', personal_admin.count()) # US: 217190 / MX: 24512 / ID: 24711

# Matched Locations: 24711


In [56]:
# Persist the geocoded locations as parquet (overwriting any earlier run) and
# report the wall-clock time the write took, rounded to whole seconds.
print('Save Geocoded Locations')
start = timer()

personal_admin.write.mode("overwrite").parquet(
    os.path.join(path_to_data, source, 'aggregates', country, 'personal_admin')
)

print("Done in", round(timer() - start), "sec")

Save Geocoded Locations
Done in 73 sec


In [57]:
# Print a preview of the geocoded locations using show()'s default settings.
personal_admin.show()

+------------+------------------+----------+---------------+----------+-------+----------+----------+------------------+----------------+--------------------+
|  ADM4_PCODE|           ADM3_EN|ADM3_PCODE|        ADM2_EN|ADM2_PCODE|ADM1_EN|ADM1_PCODE|median_age|      wealth_index|total_population|               point|
+------------+------------------+----------+---------------+----------+-------+----------+----------+------------------+----------------+--------------------+
|ID1108090028|        Ingin Jaya| ID1108090|     Aceh Besar|    ID1108|   Aceh|      ID11|      22.5|0.6212808719422072|             602|[95.34485, 5.5178...|
|ID1108090018|        Ingin Jaya| ID1108090|     Aceh Besar|    ID1108|   Aceh|      ID11|      19.0|0.5681051121306846|             593|[95.388794, 5.506...|
|ID1171011005|         Jaya Baru| ID1171011|Kota Banda Aceh|    ID1171|   Aceh|      ID11|      24.0|0.7736237394582959|            2441|[95.3009, 5.5233765]|
|ID1171040013|       Syiah Kuala| ID1171040|Ko