In [8]:
import os
from glob import glob
import pandas as pd
from timeit import default_timer as timer
import numpy as np
from collections import Counter
import pygeohash as pgh

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,explode,collect_list,array
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

In [2]:
# Reuse a SparkSession already bound to the name `spark` in this namespace;
# only build a new one when that name is missing.
if 'spark' not in globals():
    print('Create Spark')
    spark=SparkSession.builder.appName("").getOrCreate()

Create Spark


In [3]:
source='cuebiq'
# country='US'
country='MX'

if os.getenv('CLUSTER')=='PRINCE':
    path_to_data='/scratch/spf248/covid/data' 
    path_to_fig='/scratch/spf248/covid/fig'    
    %matplotlib inline
    import matplotlib.pyplot as plt
    import seaborn as sns
else:
    path_to_data='/user/spf248/covid/data'
    path_to_fig='/home/spf248/covid/fig'
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns

  import pandas.util.testing as tm


# Import Aggregated Data

In [4]:
# Load the per-user profile aggregates for the selected source/country and
# cache them, since later cells read this table again.
users_profile_path=os.path.join(path_to_data,source,'aggregates',country,'users_profile')
users_profile=spark.read.parquet(users_profile_path).cache()
# NOTE(review): the count previously noted here (21783569) does not match the
# output recorded for this run -- presumably it came from another country; confirm.
print('# Users:', users_profile.count())

# Users: 301967


In [5]:
# Load the ping counts per (user, personal area) and cache the table.
n_pings_id_personal_path=os.path.join(path_to_data,source,'aggregates',country,'n_pings_id_personal')
n_pings_id_personal=spark.read.parquet(n_pings_id_personal_path).cache()
# NOTE(review): the count previously noted here (34963009) does not match the
# output recorded for this run -- presumably it came from another country; confirm.
print('# Personal Areas:', n_pings_id_personal.count())

# Personal Areas: 442269


In [6]:
# Load the ping counts broken down by day of week and hour, and cache them:
# the specification loop further down scans this table many times.
n_pings_id_personal_day_hour_path=os.path.join(
    path_to_data,source,'aggregates',country,'n_pings_id_personal_day_hour')
n_pings_id_personal_day_hour=spark.read.parquet(n_pings_id_personal_day_hour_path).cache()

# Global (ungrouped) sum of n_pings; the single result row holds the total.
total_pings=n_pings_id_personal_day_hour.agg({'n_pings':'sum'}).first()[0]
# NOTE(review): the total previously noted here (84668795782) does not match the
# output recorded for this run -- presumably it came from another country; confirm.
print('# Pings at personal Locations:', total_pings)

# Pings at personal Locations: 295165987


# Infer Primary Home Location

In [7]:
# Start from the list of user ids; every specification below appends one
# candidate column named point_<cutoff_day>_<cutoff_morning>_<cutoff_night>.
primary_home=users_profile.select('cuebiq_id')

for cutoff_day in [5,7]:
    print()
    print('Cutoff day:',cutoff_day)
    for cutoff_morning in [5,7,9]:
        print()
        print('Cutoff morning:',cutoff_morning)
        for cutoff_night in [19,21,23]:
            print('Cutoff night:',cutoff_night)

            # Keep rows whose day of week is at most cutoff_day and whose hour is
            # either at/after cutoff_night or at/before cutoff_morning.
            # NOTE(review): the numbering convention of 'dayofweek' is not visible
            # here -- confirm which days `<= cutoff_day` actually selects.
            in_window=(col('dayofweek')<=cutoff_day)&(
                (col('hour')>=cutoff_night)|(col('hour')<=cutoff_morning))

            # Total pings per (user, point) inside the window.
            n_pings_id_personal_filtered=n_pings_id_personal_day_hour.where(
                in_window).groupby('cuebiq_id','point').agg(
                expr('sum(n_pings)').alias('n_pings'))

            # Highest per-point total for each user.
            n_pings_id_max=n_pings_id_personal_filtered.groupby('cuebiq_id').agg(
                expr('max(n_pings)').alias('n_pings'))

            # The inner join on (user, count) keeps only the points that reach the
            # user's maximum; drop_duplicates then keeps a single row per user, so
            # when several points tie, which one survives is not specified here.
            column_name='_'.join(
                ['point',str(cutoff_day),str(cutoff_morning),str(cutoff_night)])
            n_pings_id_personal_filtered_max=n_pings_id_personal_filtered.join(
                n_pings_id_max,on=['cuebiq_id','n_pings']).drop_duplicates(
                subset=['cuebiq_id','n_pings']).drop('n_pings').withColumnRenamed(
                'point',column_name)

            # Inner join: users without a point under this specification are dropped.
            primary_home=primary_home.join(
                n_pings_id_personal_filtered_max,on=['cuebiq_id'])


Cutoff day: 5

Cutoff morning: 5
Cutoff night: 19
Cutoff night: 21
Cutoff night: 23

Cutoff morning: 7
Cutoff night: 19
Cutoff night: 21
Cutoff night: 23

Cutoff morning: 9
Cutoff night: 19
Cutoff night: 21
Cutoff night: 23

Cutoff day: 7

Cutoff morning: 5
Cutoff night: 19
Cutoff night: 21
Cutoff night: 23

Cutoff morning: 7
Cutoff night: 19
Cutoff night: 21
Cutoff night: 23

Cutoff morning: 9
Cutoff night: 19
Cutoff night: 21
Cutoff night: 23


In [None]:
def _encode_geohash(x, y):
    """Encode the pair (x, y) with pygeohash using its default precision."""
    return pgh.encode(x, y)

# Spark UDF wrapper; StringType is also what udf() uses when no type is given.
geohash = udf(_encode_geohash, StringType())

In [None]:
# Columns holding one candidate home per (day, morning, night) specification.
# Matching on the 'point_' prefix (instead of the substring 'point') keeps the
# derived 'point' column out of the list, so both UDFs see exactly the same
# inputs and the cell can be re-run without feeding its own output back in.
spec_cols=[c for c in primary_home.columns if c.startswith('point_')]

def _most_common(arr):
    """Return the most frequent element of arr (ties resolved by Counter.most_common)."""
    return Counter(arr).most_common(1)[0][0]

def _all_equal(arr):
    """Return 1 when every element of arr is the same value, 0 otherwise."""
    # Built-in int/set replace np.int (removed from recent NumPy releases) and
    # np.unique; like Counter above they only require hashable elements.
    return int(len(set(arr))==1)

# Baseline = Most Frequently Allocated Block Across Specification
# The UDF returns one of the candidate values unchanged, so its declared return
# type must be the candidates' own type; it is read from the schema rather than
# hard-coded (an empty StructType() would not describe the returned value).
point_type=primary_home.schema[spec_cols[0]].dataType
mode=udf(_most_common, point_type)
primary_home=primary_home.withColumn('point', mode(array(spec_cols)))

# Check unicity across specifications
is_unique=udf(_all_equal, IntegerType())
primary_home=primary_home.withColumn('perfect_match', is_unique(array(spec_cols)))

In [None]:
# Write the primary-home table as parquet, replacing any previous output, and
# report how long the write took.
primary_home_path=os.path.join(path_to_data,source,'aggregates',country,'primary_home')
print('Save Primary Home Location')
start = timer()
primary_home.write.parquet(primary_home_path, mode="overwrite")
print("Done in", round(timer()-start), "sec")

In [None]:
# Personal Locations: 34273578
# Users with personal locations: 21417460
# Home detected: 20992177
# Non-home locations: 13281401
# Non-home locations: 11624061