In [None]:
import os
from glob import glob
import pandas as pd
from timeit import default_timer as timer
import numpy as np
from collections import Counter

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,explode,collect_list,array
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType

In [None]:
# Reuse the session already bound to `spark` in this kernel; otherwise create one.
if 'spark' not in globals():
    print('Create Spark')
    spark=SparkSession.builder.appName("").getOrCreate()

In [None]:
# Run configuration: data source, country code and the name of the
# administrative-unit column used throughout the notebook.
# NOTE: the three MX assignments below are immediately overwritten by the US
# ones that follow, so only the US settings are in effect when this cell runs.
source='cuebiq'
country='MX'
admin_id='ageb'

source='cuebiq'
country='US'
admin_id='census_block_group'

# Pick data/figure roots and the plotting setup from the CLUSTER environment variable.
if os.getenv('CLUSTER')=='PRINCE':
    path_to_data='/scratch/spf248/covid/data' 
    path_to_fig='/scratch/spf248/covid/fig'    
    # IPython magic: render figures inline in the notebook
    %matplotlib inline
    import matplotlib.pyplot as plt
    import seaborn as sns
else:
    path_to_data='/user/spf248/covid/data'
    path_to_fig='/home/spf248/covid/fig'
    import matplotlib as mpl
    # Select the non-interactive Agg backend before pyplot is imported
    mpl.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns

# Import Aggregated Data

In [None]:
# Load the per-user profile table and keep it cached for the later joins.
users_profile=spark.read.parquet(
    os.path.join(path_to_data,source,'aggregates',country,'users_profile')).cache()
print('# Users:', users_profile.count()) # 21783569

In [None]:
# Load the personal_admin table (it carries the admin unit and 'point' columns) and cache it.
personal_admin=spark.read.parquet(
    os.path.join(path_to_data,source,'aggregates',country,'personal_admin')).cache()
print('# Personal Areas:', personal_admin.count()) # 217190

In [None]:
# Load the (user, personal location) ping counts and cache them.
n_pings_id_personal=spark.read.parquet(os.path.join(path_to_data,source,'aggregates',country,'n_pings_id_personal'))
n_pings_id_personal.cache()
# The row count here is the number of personal locations, not of admin areas.
print('# Personal Locations:', n_pings_id_personal.count()) # 34963009

In [None]:
# Load ping counts broken down by user, personal location, day of week and hour.
n_pings_id_personal_day_hour=spark.read.parquet(
    os.path.join(path_to_data,source,'aggregates',country,'n_pings_id_personal_day_hour')).cache()
# Global (ungrouped) sum of n_pings; the aggregate returns a single row.
total_pings=n_pings_id_personal_day_hour.agg({'n_pings':'sum'}).first()[0]
print('# Pings at personal Locations:', total_pings) # 84668795782

# Infer Primary Home Location

In [None]:
# Keep only the admin-unit column and the point it is attached to.
personal_admin=personal_admin.select([admin_id,'point'])

In [None]:
# For every combination of day-of-week cutoff, morning-hour cutoff and night-hour
# cutoff, attach to each user the personal point with the most pings inside that
# window, together with its admin unit. Columns are suffixed with the cutoffs.
primary_home=users_profile.select('cuebiq_id')

for cutoff_day in [5,7]:
    print()
    print('Cutoff day:',cutoff_day)
    for cutoff_morning in [5,7,9]:
        print()
        print('Cutoff morning:',cutoff_morning)
        for cutoff_night in [19,21,23]:
            print('Cutoff night:',cutoff_night)
            suffix='_'.join(str(v) for v in (cutoff_day,cutoff_morning,cutoff_night))

            # Pings on days up to the cutoff, at or after the night hour or at or before the morning hour
            in_window=(col('dayofweek')<=cutoff_day)&(
                (col('hour')>=cutoff_night)|(col('hour')<=cutoff_morning))
            n_pings_id_personal_filtered=(
                n_pings_id_personal_day_hour
                .filter(in_window)
                .groupby('cuebiq_id','point')
                .agg({'n_pings':'sum'})
                .withColumnRenamed('sum(n_pings)','n_pings'))

            # Highest windowed ping count reached by each user
            n_pings_id_max=(
                n_pings_id_personal_filtered
                .groupby('cuebiq_id')
                .agg({'n_pings':'max'})
                .withColumnRenamed('max(n_pings)','n_pings'))

            # Point(s) achieving that maximum; ties are collapsed to a single row per user
            n_pings_id_personal_filtered_max=(
                n_pings_id_personal_filtered
                .join(n_pings_id_max,on=['cuebiq_id','n_pings'])
                .drop_duplicates(subset=['cuebiq_id','n_pings'])
                .drop('n_pings'))

            # Inner joins: a user stays only if a point was found for this specification
            primary_home=(
                primary_home
                .join(n_pings_id_personal_filtered_max,on=['cuebiq_id'])
                .join(personal_admin,on=['point'])
                .withColumnRenamed('point','point_'+suffix)
                .withColumnRenamed(admin_id,admin_id+'_'+suffix))

In [None]:
# Per-specification admin columns, listed once before the baseline column
# (named exactly `admin_id`) is added, so both UDFs below receive only the
# values produced by the individual cutoff specifications.
spec_columns=[x for x in primary_home.columns if admin_id in x]

# Baseline = Most Frequently Allocated Block Across Specifications
mode=udf(lambda arr: Counter(arr).most_common(1)[0][0], StringType())
primary_home=primary_home.withColumn(admin_id, mode(array(spec_columns)))

# Rematch with corresponding point
primary_home=primary_home.join(personal_admin,on=admin_id)

# Check uniqueness across specifications: 1 when every specification agrees, else 0.
# Built-in int/set are used instead of np.int (removed in NumPy 1.24) and np.unique,
# so the UDF does not depend on the NumPy version installed on the executors.
is_unique=udf(lambda arr: int(len(set(arr))==1), IntegerType())
primary_home=primary_home.withColumn('perfect_match', is_unique(array(spec_columns)))

In [None]:
print('Save Primary Home Location')
start = timer()
# Persist the result as parquet, replacing any previous output at that path
primary_home_path=os.path.join(path_to_data,source,'aggregates',country,'primary_home')
primary_home.write.parquet(primary_home_path,mode="overwrite")
print("Done in", round(timer()-start), "sec")

# Secondary Personal Locations

In [None]:
# Reload the saved primary-home table so later cells read it from disk
# instead of re-evaluating the chain of joins that produced it.
primary_home=spark.read.parquet(os.path.join(path_to_data,source,'aggregates',country,'primary_home'))
primary_home.cache()
# This count is the number of rows in primary_home (users with a detected home),
# not the number of non-home locations.
print('# Home detected:', primary_home.count())

In [None]:
# Personal locations whose (user, point) pair does not appear in primary_home.
personal_others=n_pings_id_personal.join(
    primary_home,
    on=['cuebiq_id','point'],
    how='left_anti').cache()
print('# Non-home Locations:', personal_others.count())

In [None]:
# (user, point) pairs for each location category:
#   primary_high_conf - home on which all cutoff specifications agree
#   primary_low_conf  - home on which the specifications disagree
#   non_primary       - every other personal location
location_name_to_data={
    'primary_high_conf':primary_home.where(col('perfect_match')==1).select('cuebiq_id','point'),
    'primary_low_conf':primary_home.where(col('perfect_match')==0).select('cuebiq_id','point'),
    'non_primary':personal_others.select('cuebiq_id','point'),
}

In [None]:
# Number of users whose primary home falls in each admin unit,
# written as a single CSV part file with a header row.
admin_users_pop=(
    primary_home
    .groupby(admin_id)
    .agg({'cuebiq_id':'count'})
    .withColumnRenamed('count(cuebiq_id)','n_users'))
admin_users_pop_path=os.path.join(path_to_data,source,'aggregates',country,'admin_users_pop')
admin_users_pop.repartition(1).write.mode("overwrite").option('header', 'true').csv(admin_users_pop_path)

# Hours Spent Across Personal Locations

In [None]:
print('Hour spent at personal location')
start = timer()

# Hourly ping totals for each (n_personal, min_days, device_type, location category,
# day of week) combination; each result is written to its own parquet directory.
hours={}
hours_root=os.path.join(path_to_data,source,'aggregates',country,'hours')
for personal_value in [1,2,3]:
    print('Personal Value:', personal_value)
    for min_days in [0,70]:
        print('Min Days:', min_days)
        for device_type in [0,1]:
            print('Device:', device_type)
            # Users matching the current profile selection (same for every location and day)
            selected_users=users_profile.filter(
                (col('n_personal')==personal_value)&
                (col('device_type')==device_type)&
                (col('n_days')>min_days)).select('cuebiq_id')
            for location_name in location_name_to_data:
                print('Location:', location_name)
                for day_value in range(1,8):
                    print('Day:', day_value)
                    key=(personal_value,min_days,device_type,location_name,day_value)
                    pings_of_day=n_pings_id_personal_day_hour.filter(col('dayofweek')==day_value)
                    hours[key]=(
                        pings_of_day
                        .join(selected_users,on='cuebiq_id')
                        .join(location_name_to_data[location_name],on=['cuebiq_id','point'])
                        .groupby('hour')
                        .agg({'n_pings':'sum'})
                        .withColumnRenamed('sum(n_pings)','n_pings'))

                    out_dir=os.path.join(
                        hours_root,str(personal_value),str(min_days),str(device_type),location_name,str(day_value))
                    hours[key].write.mode("overwrite").parquet(out_dir)

print("Done in", round(timer()-start), "sec")

# Figures

In [None]:
print('Hour spent at personal location')
start = timer()

# Create the country-specific figure directory up front; without it every
# savefig call below would fail.
os.makedirs(os.path.join(path_to_fig,country),exist_ok=True)

# Reload each hourly aggregate as a pandas frame and save one bar chart per
# (n_personal, min_days, device_type, location category, day of week) combination.
hours={}
for personal_value in [1,2,3]:
    print('Personal Value:', personal_value)
    for min_days in [0,70]:
        print('Min Days:', min_days)
        for device_type in [0,1]:
            print('Device:', device_type)
            for location_name in ['primary_high_conf','primary_low_conf','non_primary']:
                print('Location:', location_name)
                for day_value in range(1,8):
                    print('Day:', day_value)
                    key=(personal_value,min_days,device_type,location_name,day_value)
                    hours[key]=spark.read.parquet(
                    os.path.join(path_to_data,source,'aggregates',country,'hours',str(personal_value),str(min_days),str(device_type),location_name,str(day_value))).toPandas()
                    fig=None
                    try:
                        fig,ax=plt.subplots(figsize=(8,5))
                        hours[key].set_index('hour')['n_pings'].sort_index().plot(
                        ax=ax,kind='bar',color='k')
                        ax.tick_params(which='both',direction='in',pad=3)
                        plt.xticks(rotation=0)
                        ax.set_xlabel('Weekday hour')
                        ax.set_ylabel('Number of pings')
                        fig.savefig(os.path.join(path_to_fig,country,'hours-day-'+str(day_value)+'-'+str(location_name)+'-'+str(personal_value)+'-personal-location-device-'+str(device_type)+'-min-days-'+str(min_days)+'.pdf'),bbox_inches='tight')
                    except Exception as error:
                        # Best effort: keep going with the remaining figures, but report
                        # which one failed and why instead of discarding the error.
                        print('Figure skipped for', key, ':', repr(error))
                    finally:
                        # Release the figure; the loop creates up to 252 of them.
                        if fig is not None:
                            plt.close(fig)
print("Done in", round(timer()-start), "sec")

In [None]:
# Personal Locations: 34273578
# Users with personal locations: 21417460
# Home detected: 20992177
# Non-home locations: 13281401
# Non-home locations: 11624061