In [None]:
import os
from glob import glob
import pandas as pd
from timeit import default_timer as timer
import numpy as np
from collections import Counter
import pygeohash as pgh

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,explode,collect_list,array,length
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

In [None]:
# Reuse a Spark session that already exists under the name `spark`
# (referencing the bare name raises NameError when it does not);
# otherwise build a new one.
try:
    spark
except NameError:
    print('Create Spark')
    spark = (
        SparkSession.builder
        .appName("")
        .getOrCreate()
    )

In [None]:
# --- Run configuration -------------------------------------------------------
# Sub-directory names used to build the input/output paths below.
source = 'cuebiq'
country = 'ID'

# Precision passed to pygeohash when encoding candidate home locations.
# NOTE(review): the original inline note read "Max precision for MX is 10",
# which does not match country='ID' -- confirm the intended value.
geohash_precision = 15

# Each list holds the alternative thresholds that are combined into
# home-detection specifications further down:
#   day_cutoffs     -> upper bound on the 'dayofweek' column
#   morning_cutoffs -> upper bound on the 'hour' column (early hours)
#   night_cutoffs   -> lower bound on the 'hour' column (late hours)
day_cutoffs = [5, 7]
morning_cutoffs = [7, 9]
night_cutoffs = [21, 23]

# Root data directory depends on the CLUSTER environment variable.
path_to_data = (
    '/scratch/spf248/covid/data'
    if os.environ.get('CLUSTER') == 'PRINCE'
    else '/user/spf248/covid/data'
)

# Import Processed Data

In [None]:
# Load the processed device table and keep it cached, since it is
# counted here and reused when building the home candidates.
device_id = spark.read.parquet(
    os.path.join(path_to_data, source, 'processed', country, 'device_id')
).cache()
print('# Users:', device_id.count())

In [None]:
# Load the ping counts table (it carries the 'n_pings', 'dayofweek', 'hour',
# 'cuebiq_id' and 'point' columns used below) and cache it: it is scanned once
# per specification in the home-detection loop.
n_pings_id_personal_day_hour = spark.read.parquet(
    os.path.join(path_to_data, source, 'processed', country, 'n_pings_id_personal_day_hour')
).cache()

# Global sum of 'n_pings': the aggregate yields a single row with a single column.
total_pings = (
    n_pings_id_personal_day_hour
    .select('n_pings')
    .groupby()
    .sum()
    .collect()[0][0]
)
print('# Pings at personal Locations:', total_pings)

# Compute Primary Home Specifications

In [None]:
@udf
def point2geohash(x, y):
    """Encode a coordinate pair as a geohash string of `geohash_precision` characters.

    Arguments are forwarded to `pgh.encode` in the order given; the caller
    below passes (latitude, longitude).
    """
    return pgh.encode(x, y, precision=geohash_precision)


def geohash2point(geohash):
    """Decode a geohash and return its coordinates as (longitude, latitude).

    `pgh.decode` yields the pair in the opposite order, so it is swapped
    here to line up with the field order of `schema`.
    """
    first, second = pgh.decode(geohash)
    return (second, first)


# Struct layout of the decoded point: non-nullable 32-bit floats.
schema = StructType([
    StructField("longitude", FloatType(), False),
    StructField("latitude", FloatType(), False),
])
geohash2point_udf = udf(geohash2point, schema)

In [None]:
# Start from the list of devices; every (day, morning, night) specification
# appends a 'point_<day>_<morning>_<night>' and a 'geohash_<day>_<morning>_<night>'
# column. The joins are inner joins, so a device is kept only if it has a
# candidate under every specification.
primary_home = device_id.select('cuebiq_id')
for day_cutoff in day_cutoffs:
    print()
    print('Cutoff day:', day_cutoff)
    for morning_cutoff in morning_cutoffs:
        print('Cutoff morning:', morning_cutoff)
        for night_cutoff in night_cutoffs:
            print('Cutoff night:', night_cutoff)

            # Column suffix identifying the current specification
            spec_suffix = '_'.join(map(str, (day_cutoff, morning_cutoff, night_cutoff)))

            # Row selection: days up to the cutoff, hours late at night or early in the morning
            is_selected_day = col('dayofweek') <= day_cutoff
            is_night_or_morning = (col('hour') >= night_cutoff) | (col('hour') <= morning_cutoff)

            # Total pings per device and personal area within the selected window
            n_pings_id_personal_filtered = (
                n_pings_id_personal_day_hour
                .filter(is_selected_day)
                .filter(is_night_or_morning)
                .groupby('cuebiq_id', 'point')
                .agg({'n_pings': 'sum'})
                .withColumnRenamed('sum(n_pings)', 'n_pings')
            )

            # Highest ping total reached by each device
            n_pings_id_max = (
                n_pings_id_personal_filtered
                .groupby('cuebiq_id')
                .agg({'n_pings': 'max'})
                .withColumnRenamed('max(n_pings)', 'n_pings')
            )

            # Personal area attaining that maximum; when several areas tie,
            # drop_duplicates keeps a single arbitrary one per device.
            n_pings_id_personal_filtered_max = (
                n_pings_id_personal_filtered
                .join(n_pings_id_max, on=['cuebiq_id', 'n_pings'])
                .drop_duplicates(subset=['cuebiq_id', 'n_pings'])
                .drop('n_pings')
            )

            # Attach the candidate, geohash it, then tag both columns with the specification
            primary_home = (
                primary_home
                .join(n_pings_id_personal_filtered_max, on=['cuebiq_id'])
                .withColumn('geohash', point2geohash(col('point.latitude'), col('point.longitude')))
                .withColumnRenamed('point', 'point_' + spec_suffix)
                .withColumnRenamed('geohash', 'geohash_' + spec_suffix)
            )

# Infer Primary Home

In [None]:
# Per-specification geohash columns ('geohash_<day>_<morning>_<night>').
# The list is built once, before any derived column is added, and matched on
# the 'geohash_' prefix so that the consensus column 'geohash' is never fed
# back into the arrays below (including when this cell is re-run).
geohash_columns=[x for x in primary_home.columns if x.startswith('geohash_')]

# Home = most frequent geohash across the specifications of morning/night/week.
# Counter.most_common returns the first-seen value when counts tie.
mode=udf(lambda arr: Counter(arr).most_common(1)[0][0], StringType())
primary_home=primary_home.withColumn('geohash', mode(array(geohash_columns)))

# Flag (1/0) whether every specification agrees on the same geohash.
# Uses the builtin int: the np.int alias was removed in NumPy 1.24 and would
# raise AttributeError inside the UDF.
is_unique=udf(lambda arr: int(len(set(arr))==1), IntegerType())
primary_home=primary_home.withColumn('perfect_match', is_unique(array(geohash_columns)))

# Map baseline home back to coordinates
primary_home=primary_home.withColumn('point',geohash2point_udf('geohash'))

# Save

In [None]:
# Persist the inferred home (id, decoded point, agreement flag) as parquet,
# replacing any previous output, and report the elapsed wall-clock time.
print('Save')
start = timer()
(
    primary_home
    .select('cuebiq_id', 'point', 'perfect_match')
    .write
    .mode("overwrite")
    .parquet(os.path.join(path_to_data, source, 'processed', country, 'primary_home'))
)
print("Done in", round(timer() - start), "sec")