In [None]:
import os
from glob import glob
import pandas as pd
from timeit import default_timer as timer

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,explode,collect_list
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType
import pyspark.sql.functions as F
from pyspark.sql import Window

In [None]:
# Reuse the `spark` session when the name is already bound in this kernel;
# otherwise obtain one through SparkSession.builder.
try:
    spark
except NameError:
    # `spark` is not defined yet, so create (or fetch) the session here.
    print('Create Spark')
    spark=SparkSession.builder.appName("").getOrCreate()

In [None]:
source='cuebiq'
country='US'

if os.getenv('CLUSTER')=='PRINCE':
    path_to_data='/scratch/spf248/covid/data' 
    path_to_fig='/scratch/spf248/covid/fig'    
    %matplotlib inline
    import matplotlib.pyplot as plt
    import seaborn as sns

else:
    path_to_data='/user/spf248/covid/data'
    path_to_fig='/home/spf248/covid/fig'
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns

# Import

In [None]:
# Personal Locations: 34273578
# Users with personal locations: 21417460
# Home detected: 20992177
# Non-home locations: 13281401
# Secondary (most frequent non-home) locations: 11624061

In [None]:
# Load the 'users' aggregate table for the selected source/country and mark it for caching.
# Later cells read its cuebiq_id, n_personal and device_type columns.
users_dir=os.path.join(path_to_data,source,'aggregates',country,'users')
users=spark.read.parquet(users_dir)
users.cache()
# print('# Users:', users.count()) # 21783569

In [None]:
# Load the 'users_personal' aggregate table and mark it for caching.
# Later cells read its cuebiq_id, point, n_weeknights and most_freq_hour columns.
users_personal_dir=os.path.join(path_to_data,source,'aggregates',country,'users_personal')
users_personal=spark.read.parquet(users_personal_dir)
users_personal.cache()
# print('# Users:', users_personal.count()) # 33664706

In [None]:
# Load the 'hourly_personal' aggregate table and mark it for caching.
# Later cells read its cuebiq_id, point, hour and n_pings columns.
hourly_personal_dir=os.path.join(path_to_data,source,'aggregates',country,'hourly_personal')
hourly_personal=spark.read.parquet(hourly_personal_dir)
hourly_personal.cache()
# print('# Personal Pings on weekdays:', hourly_personal.count())

# Classify Locations

In [None]:
# Grand totals over the whole tables; the plotting cells below use them as upper y-axis limits.
total_hours=hourly_personal.agg(F.sum('n_pings')).first()[0]
total_locations=users.agg(F.sum('n_personal')).first()[0]

In [None]:
# Home location = for each user, the personal location carrying the largest n_weeknights value.
weeknight_max=users_personal.groupby('cuebiq_id').agg(
    F.max('n_weeknights').alias('n_weeknights'))

# Keep the location rows that reach the per-user maximum. drop_duplicates leaves a single row
# per (cuebiq_id, n_weeknights), so when several locations tie only one of them is kept.
home_candidates=users_personal.join(weeknight_max,on=['cuebiq_id','n_weeknights'])
users_home=home_candidates.drop_duplicates(
    subset=['cuebiq_id','n_weeknights']).select('cuebiq_id','point')
users_home.cache()
# print('# Home detected:',users_home.count())#21335363

In [None]:
# Personal locations that were not selected as a home: the left anti join keeps the
# users_personal rows whose (cuebiq_id, point) pair has no match in users_home.
users_non_home=users_personal.join(
    users_home,
    on=['cuebiq_id','point'],
    how='left_anti',
)
users_non_home.cache()
# print('# Non-home locations:',users_non_home.count())#12329343

In [None]:
print('Hour spent at personal location')
start = timer()

# hours[(personal_value, device_type, location_name)] -> pandas frame with the columns
# 'hour' and 'n_pings' (n_pings summed per hour for that group of users and locations).
hours={}

# Folder receiving the PDFs written below.
fig_dir=os.path.join(path_to_fig,country)

for personal_value in range(1,4):

    print('Personal Value:', personal_value)

    for device_type in [0,1]:

        print('Device:', device_type)

        # Users with exactly `personal_value` personal locations on this device type.
        # Independent of the location loop, so it is built once per device type.
        selected_users=users.filter(users['n_personal']==personal_value).filter(
            users['device_type']==device_type).select('cuebiq_id')

        for (location_name,location_type) in zip(['home','non_home'],[users_home,users_non_home]):

            print('Location:', location_name)

            key=(personal_value,device_type,location_name)

            try:

                # Restrict the hourly table to the selected users and to the locations of
                # this type, then sum n_pings per hour and bring the result to pandas.
                hours[key]=hourly_personal.join(selected_users,on='cuebiq_id').join(
                    location_type.select('cuebiq_id','point'),on=['cuebiq_id','point']).groupby(
                    'hour').agg({'n_pings':'sum'}).withColumnRenamed('sum(n_pings)','n_pings').toPandas()

                fig,ax=plt.subplots(figsize=(8,5))
                hours[key].set_index('hour')['n_pings'].sort_index().plot(
                    ax=ax,kind='bar',color='k')
                ax.tick_params(which='both',direction='in',pad=3)
                plt.xticks(rotation=0)
                ax.set_xlabel('Weekday hour')
                ax.set_ylabel('Number of pings')
                # Create the output folder if needed so savefig does not fail on a missing directory.
                os.makedirs(fig_dir,exist_ok=True)
                plt.savefig(os.path.join(fig_dir,'weekday-hours-'+str(personal_value)+'-personal-location-device-'+str(device_type)+'-'+str(location_name)+'.pdf'),bbox_inches='tight')
                # NOTE(review): these limits are applied after savefig, so they do not affect
                # the PDF written above - confirm whether that is intended.
                ax.set_ylim([0,total_hours])
                ax.set_xlim([0,23])
                print('Location:', location_name)

            except Exception as error:

                # Best effort: one failing combination must not stop the others, but the
                # failure is reported instead of being silently discarded. Catching Exception
                # (not everything) lets KeyboardInterrupt stop the loop.
                print('Failed for', key, ':', repr(error))

print("Done in", round(timer()-start), "sec")

In [None]:
print('Most Frequent Hour')
start = timer()

# most_freq_hour[(personal_value, device_type, location_name)] -> pandas frame with the columns
# 'most_freq_hour' and 'count' (number of users_personal rows per most_freq_hour value).
most_freq_hour={}

# Folder receiving the PDFs written below.
fig_dir=os.path.join(path_to_fig,country)

for personal_value in range(1,4):

    print('Personal Value:', personal_value)

    for device_type in [0,1]:

        print('Device:', device_type)

        # Users with exactly `personal_value` personal locations on this device type.
        # Independent of the location loop, so it is built once per device type.
        selected_users=users.filter(users['n_personal']==personal_value).filter(
            users['device_type']==device_type).select('cuebiq_id')

        for (location_name,location_type) in zip(['home','non_home'],[users_home,users_non_home]):

            print('Location:', location_name)

            key=(personal_value,device_type,location_name)

            try:

                # Restrict the personal locations to the selected users and to the locations
                # of this type, then count rows per most_freq_hour value.
                most_freq_hour[key]=users_personal.join(selected_users,on='cuebiq_id').join(
                    location_type.select('cuebiq_id','point'),on=['cuebiq_id','point']).groupby(
                    'most_freq_hour').count().toPandas()

                fig,ax=plt.subplots(figsize=(8,5))
                most_freq_hour[key].set_index('most_freq_hour')['count'].sort_index().plot(
                    ax=ax,kind='bar',color='k')
                ax.tick_params(which='both',direction='in',pad=3)
                plt.xticks(rotation=0)
                ax.set_xlabel('Most frequent weekday hour')
                ax.set_ylabel('Number of locations')
                # Create the output folder if needed so savefig does not fail on a missing directory.
                os.makedirs(fig_dir,exist_ok=True)
                plt.savefig(os.path.join(fig_dir,'most-freq-weekday-hour-'+str(personal_value)+'-personal-location-device-'+str(device_type)+'-'+str(location_name)+'.pdf'),bbox_inches='tight')
                # NOTE(review): these limits are applied after savefig, so they do not affect
                # the PDF written above - confirm whether that is intended.
                ax.set_ylim([0,total_locations])
                ax.set_xlim([0,23])

            except Exception as error:

                # Best effort: one failing combination must not stop the others, but the
                # failure is reported instead of being silently discarded. Catching Exception
                # (not everything) lets KeyboardInterrupt stop the loop.
                print('Failed for', key, ':', repr(error))

print("Done in", round(timer()-start), "sec")

# Backup

In [None]:
# Sum of the 'count' column per (cuebiq_id, point) over the weekday and weekend tables together.
# NOTE(review): personal_week / personal_weekend are not defined in the cells visible here -
# confirm where they are created before running this section.
personal_all=personal_week.unionByName(personal_weekend)
personal_counts=personal_all.groupby('cuebiq_id','point').agg(
    F.sum('count').alias('count'))
print('# Personal Locations:',personal_counts.count())#34273578

# Number of (cuebiq_id, point) rows per user, attached to `users` as n_personal.
# The inner join keeps only users that have at least one personal location.
locations_per_user=personal_counts.groupby('cuebiq_id').count().withColumnRenamed(
    'count','n_personal')
users=users.join(locations_per_user,on='cuebiq_id')
print('# Users with personal locations:',users.count())#21417460

In [None]:
# Weekday and weekend rows stacked into a single table.
personal_hours=personal_week.unionByName(personal_weekend)

# Night-time rows: hour at or before cutoff_morning, or at or after cutoff_night.
# NOTE(review): cutoff_morning / cutoff_night are not defined in the cells visible here - confirm.
is_night=(personal_hours['hour']<=cutoff_morning)|(personal_hours['hour']>=cutoff_night)
personal_night=personal_hours.filter(is_night).groupby('cuebiq_id','point').agg(
    F.sum('count').alias('count'))

# Largest night-time total of each user.
personal_night_max=personal_night.groupby('cuebiq_id').agg(
    F.max('count').alias('count'))

# Home = the location reaching that maximum; drop_duplicates leaves one row per
# (cuebiq_id, count), so a tie between locations keeps a single one of them.
home_matches=personal_night.join(personal_night_max,on=['cuebiq_id','count'])
personal_home=home_matches.drop_duplicates(
    subset=['cuebiq_id','count']).select('cuebiq_id','point')
print('# Home detected:',personal_home.count())#21091919

personal_home_dir=os.path.join(path_to_data,source,country,'personal_home')
personal_home.write.mode("overwrite").parquet(personal_home_dir)

In [None]:
# Personal locations that are not the detected home of their user (left anti join on
# cuebiq_id and point against personal_home).
personal_other=personal_counts.join(personal_home,on=['cuebiq_id','point'],how='left_anti')
print('# Non-home locations:',personal_other.count())#13181659

# Largest 'count' among each user's non-home locations.
personal_most_freq_other=personal_other.groupby('cuebiq_id').agg(
{'count':'max'}).withColumnRenamed('max(count)','count')

# Secondary location = the non-home location reaching that maximum; drop_duplicates leaves
# one row per (cuebiq_id, count), so a tie between locations keeps a single one of them.
personal_secondary=personal_other.join(
personal_most_freq_other,on=['cuebiq_id','count']).drop_duplicates(
subset=['cuebiq_id','count']).select('cuebiq_id','point')
# This is one row per user, not the full set of non-home locations printed above, so it
# gets its own label instead of repeating '# Non-home locations:'.
print('# Secondary locations:',personal_secondary.count())#11529108

personal_secondary.write.mode("overwrite").parquet(os.path.join(path_to_data,source,country,'personal_secondary'))

In [None]:
# Spark frames keyed by (personal_value, device_type): 'count' summed per hour,
# kept separately for the weekday and the weekend tables.
hours_week={}
hours_weekend={}

for personal_value in range(1,4):

    print('Personal Value:', personal_value)

    for device_type in [0,1]:

        print('Device:', device_type)

        group=(personal_value,device_type)

        # Same processing for both periods: join with the matching users, sum 'count' per
        # hour, then write the result under .../personal_hours/<period>/<personal_value>/<device_type>.
        for (period,period_table,store) in [('week',personal_week,hours_week),
                                            ('weekend',personal_weekend,hours_weekend)]:

            group_users=users.filter(users['n_personal']==personal_value).filter(
                users['device_type']==device_type)

            store[group]=period_table.join(group_users,on='cuebiq_id').groupby('hour').agg(
                {'count':'sum'}).withColumnRenamed('sum(count)','count')

            target_dir=os.path.join(path_to_data,source,country,'personal_hours',period,
                                    str(personal_value),str(device_type))
            store[group].write.mode("overwrite").parquet(target_dir)

In [None]:
# Spark frames keyed by (personal_value, device_type, location_name): 'count' summed per
# hour, kept separately for the weekday and the weekend tables.
hours_week={}
hours_weekend={}

for personal_value in range(1,4):

    print('Personal Value:', personal_value)

    for device_type in [0,1]:

        print('Device:', device_type)

        for (location_name,location_type) in zip(['home','secondary'],[personal_home,personal_secondary]):

            print('Location:', location_name)

            group=(personal_value,device_type,location_name)

            # Same processing for both periods: keep the matching users, keep the rows at
            # this type of location, sum 'count' per hour, then write the result under
            # .../personal_hours/<period>/<personal_value>/<device_type>/<location_name>.
            for (period,period_table,store) in [('week',personal_week,hours_week),
                                                ('weekend',personal_weekend,hours_weekend)]:

                group_users=users.filter(users['n_personal']==personal_value).filter(
                    users['device_type']==device_type)

                at_location=period_table.join(group_users,on='cuebiq_id').join(
                    location_type,on=['cuebiq_id','point'])
                store[group]=at_location.groupby('hour').agg(
                    {'count':'sum'}).withColumnRenamed('sum(count)','count')

                target_dir=os.path.join(path_to_data,source,country,'personal_hours',period,
                                        str(personal_value),str(device_type),location_name)
                store[group].write.mode("overwrite").parquet(target_dir)

# Load and Plots

In [None]:
# Read the hourly aggregates back as pandas frames, keyed by (personal_value, device_type).
hours_week={}
hours_weekend={}

for personal_value in range(1,4):

    print('Personal Value:', personal_value)

    for device_type in [0,1]:

        print('Device:', device_type)

        group=(personal_value,device_type)

        # One parquet folder per period: .../personal_hours/<period>/<personal_value>/<device_type>
        for (period,store) in [('week',hours_week),('weekend',hours_weekend)]:

            source_dir=os.path.join(path_to_data,source,country,'personal_hours',period,
                                    str(personal_value),str(device_type))
            store[group]=spark.read.parquet(source_dir).toPandas()

In [None]:
# One bar chart of 'count' by hour per (personal_value, device_type) and per period
# (week / weekend), each saved as a PDF.
for personal_value in range(1,4):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
        
        print('Device:', device_type)
    
        # The week and weekend figures are built the same way; only the source dictionary and
        # the file name differ.
        for (period,hours_by_group) in [('week',hours_week),('weekend',hours_weekend)]:

            fig,ax=plt.subplots(figsize=(8,5))
            hours_by_group[(personal_value,device_type)].set_index('hour')['count'].sort_index().plot(ax=ax,kind='bar',color='k')
            ax.tick_params(which='both',direction='in',pad=3)
            plt.xticks(rotation=0)
            ax.set_xlabel('Hour at Personal Location')
            ax.set_ylabel('Number of Pings')
            # Save under path_to_fig, the figure root chosen with the data root at the top of
            # the notebook, rather than a folder derived from path_to_data: the two only
            # coincide when CLUSTER is 'PRINCE'.
            plt.savefig(os.path.join(path_to_fig,'hours-'+period+'-'+str(personal_value)+'-personal-device-'+str(device_type)+'.pdf'),bbox_inches='tight')

In [None]:
# Read the per-location hourly aggregates back as pandas frames, keyed by
# (personal_value, device_type, location_name).
hours_week={}
hours_weekend={}

# range(1,4) matches the cell that wrote these folders and the plotting cell that reads
# these dictionaries; stopping at 2 would leave personal_value 3 missing for the plots.
for personal_value in range(1,4):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
        
        print('Device:', device_type)
        
        for location_name in ['home','secondary']:
            
            print('Location:', location_name)
            
            hours_week[(personal_value,device_type,location_name)]=spark.read.parquet(os.path.join(path_to_data,source,country,'personal_hours','week',str(personal_value),str(device_type),location_name)).toPandas()

            hours_weekend[(personal_value,device_type,location_name)]=spark.read.parquet(os.path.join(path_to_data,source,country,'personal_hours','weekend',str(personal_value),str(device_type),location_name)).toPandas()

In [None]:
# One bar chart of 'count' by hour per (personal_value, device_type, location_name) and per
# period (week / weekend), each saved as a PDF.
for personal_value in range(1,4):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
        
        print('Device:', device_type)
        
        for location_name in ['home','secondary']:
            
            print('Location:', location_name)

            # The week and weekend figures are built the same way; only the source dictionary
            # and the file name differ.
            for (period,hours_by_group) in [('week',hours_week),('weekend',hours_weekend)]:

                fig,ax=plt.subplots(figsize=(8,5))
                hours_by_group[(personal_value,device_type,location_name)].set_index('hour')['count'].sort_index().plot(ax=ax,kind='bar',color='k')
                ax.tick_params(which='both',direction='in',pad=3)
                plt.xticks(rotation=0)
                ax.set_xlabel('Hour at Personal Location')
                ax.set_ylabel('Number of Pings')
                # Save under path_to_fig, the figure root chosen with the data root at the top
                # of the notebook, rather than a folder derived from path_to_data: the two only
                # coincide when CLUSTER is 'PRINCE'.
                plt.savefig(os.path.join(path_to_fig,'hours-'+period+'-'+str(personal_value)+'-personal-device-'+str(device_type)+'-'+str(location_name)+'.pdf'),bbox_inches='tight')