In [1]:
import os
import matplotlib as mpl
if os.environ.get('DISPLAY','') == '':
    mpl.use('Agg')

import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import pandas as pd
from timeit import default_timer as timer

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,explode,collect_list
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType
import pyspark.sql.functions as F
from pyspark.sql import Window

  import pandas.util.testing as tm


In [2]:
try:
    spark
except NameError:
    print('Create Spark')
    spark=SparkSession.builder.appName("").getOrCreate()

Create Spark


In [3]:
source='cuebiq'
country='US'
cutoff_morning=8
cutoff_night=22

if os.getenv('CLUSTER')=='PRINCE':
    path_to_data='/scratch/spf248/covid/data'
    file_week='*00-099fe598-7adb-4411-b68c-a60d0c28cb7a-c000.snappy.parquet'
    file_weekend='*00-54e95071-216b-4a7a-89f3-de881c0bf379-c000.snappy.parquet'
    file_users='*0-ca003b73-1dea-4468-acde-e43360e39ed4-c000.snappy.parquet'
else:
    path_to_data='/user/spf248/covid/data'
    file_week='*'
    file_weekend='*'
    file_users='*'

# Distributions of Personal Hours

In [25]:
# Personal Locations: 34273578
# Users with personal locations: 21417460
# Home detected: 20992177
# Non-home locations: 13281401
# Non-home locations: 11624061

In [4]:
users=spark.read.parquet(os.path.join(path_to_data,source,country,'users',file_users))
users.cache()

DataFrame[cuebiq_id: string, n_days: bigint, device_type: float, n_pings: bigint]

In [5]:
personal_week=spark.read.parquet(os.path.join(path_to_data,source,country,'personal_week',file_week))
personal_week.cache()
personal_weekend=spark.read.parquet(os.path.join(path_to_data,source,country,'personal_weekend',file_weekend))
personal_weekend.cache()

DataFrame[cuebiq_id: string, point: struct<longitude:float,latitude:float>, hour: int, count: bigint]

In [6]:
personal_counts=(personal_week.unionByName(personal_weekend)).groupby(
'cuebiq_id','point').agg({'count':'sum'}).withColumnRenamed('sum(count)','count')
print('# Personal Locations:',personal_counts.count())#34273578

users=users.join(personal_counts.groupby('cuebiq_id').count().withColumnRenamed(
'count','n_personal'),on='cuebiq_id')
print('# Users with personal locations:',users.count())#21417460

In [12]:
personal_hours=personal_week.unionByName(personal_weekend)

personal_night=personal_hours.filter(
(personal_hours['hour']<=cutoff_morning)|(personal_hours['hour']>=cutoff_night)).groupby(
'cuebiq_id','point').agg({'count':'sum'}).withColumnRenamed('sum(count)','count')

personal_night_max=personal_night.groupby('cuebiq_id').agg(
{'count':'max'}).withColumnRenamed('max(count)','count')

personal_home=personal_night.join(
personal_night_max,on=['cuebiq_id','count']).drop_duplicates(
subset=['cuebiq_id','count']).select('cuebiq_id','point')
print('# Home detected:',personal_home.count())#21091919

personal_home.write.mode("overwrite").parquet(os.path.join(path_to_data,source,country,'personal_home'))

In [52]:
personal_other=personal_counts.join(personal_home,on=['cuebiq_id','point'],how='left_anti')
print('# Non-home locations:',personal_other.count())#13181659

personal_most_freq_other=personal_other.groupby('cuebiq_id').agg(
{'count':'max'}).withColumnRenamed('max(count)','count')

personal_secondary=personal_other.join(
personal_most_freq_other,on=['cuebiq_id','count']).drop_duplicates(
subset=['cuebiq_id','count']).select('cuebiq_id','point')
print('# Non-home locations:',personal_secondary.count())#11529108

personal_secondary.write.mode("overwrite").parquet(os.path.join(path_to_data,source,country,'personal_secondary'))

In [None]:
hours_week={}
hours_weekend={}

for personal_value in range(1,4):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
        
        print('Device:', device_type)
    
        hours_week[(personal_value,device_type)]=(personal_week.join(users.filter(users['n_personal']==personal_value).filter(users['device_type']==device_type),on='cuebiq_id')).groupby('hour').agg({'count':'sum'}).withColumnRenamed('sum(count)','count')
        
        hours_week[(personal_value,device_type)].write.mode("overwrite").parquet(os.path.join(path_to_data,source,country,'personal_hours','week',str(personal_value),str(device_type)))

        hours_weekend[(personal_value,device_type)]=(personal_weekend.join(users.filter(users['n_personal']==personal_value).filter(users['device_type']==device_type),on='cuebiq_id')).groupby('hour').agg({'count':'sum'}).withColumnRenamed('sum(count)','count')
        
        hours_weekend[(personal_value,device_type)].write.mode("overwrite").parquet(os.path.join(path_to_data,source,country,'personal_hours','weekend',str(personal_value),str(device_type)))

In [None]:
hours_week={}
hours_weekend={}

for personal_value in range(1,4):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
        
        print('Device:', device_type)
    
        for (location_name,location_type) in zip(['home','secondary'],[personal_home,personal_secondary]):
            
            print('Location:', location_name)
            
            hours_week[(personal_value,device_type,location_name)]=((personal_week.join(users.filter(users['n_personal']==personal_value).filter(users['device_type']==device_type),on='cuebiq_id')).join(location_type,on=['cuebiq_id','point'])).groupby('hour').agg({'count':'sum'}).withColumnRenamed('sum(count)','count')
            
            hours_week[(personal_value,device_type,location_name)].write.mode("overwrite").parquet(os.path.join(path_to_data,source,country,'personal_hours','week',str(personal_value),str(device_type),location_name))

            hours_weekend[(personal_value,device_type,location_name)]=((personal_weekend.join(users.filter(users['n_personal']==personal_value).filter(users['device_type']==device_type),on='cuebiq_id')).join(location_type,on=['cuebiq_id','point'])).groupby('hour').agg({'count':'sum'}).withColumnRenamed('sum(count)','count')
            
            hours_weekend[(personal_value,device_type,location_name)].write.mode("overwrite").parquet(os.path.join(path_to_data,source,country,'personal_hours','weekend',str(personal_value),str(device_type),location_name))

# Load and Plots

In [20]:
hours_week={}
hours_weekend={}

for personal_value in range(1,4):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
        
        print('Device:', device_type)
    
        hours_week[(personal_value,device_type)]=spark.read.parquet(os.path.join(path_to_data,source,country,'personal_hours','week',str(personal_value),str(device_type))).toPandas()

        hours_weekend[(personal_value,device_type)]=spark.read.parquet(os.path.join(path_to_data,source,country,'personal_hours','weekend',str(personal_value),str(device_type))).toPandas()

Personal Value: 1
Device: 0
Device: 1
Personal Value: 2
Device: 0
Device: 1
Personal Value: 3
Device: 0
Device: 1


In [21]:
for personal_value in range(1,4):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
        
        print('Device:', device_type)
    
        fig,ax=plt.subplots(figsize=(8,5))
        hours_week[(personal_value,device_type)].set_index('hour')['count'].sort_index().plot(ax=ax,kind='bar',color='k')
        ax.tick_params(which='both',direction='in',pad=3)
        plt.xticks(rotation=0)
        ax.set_xlabel('Hour at Personal Location')
        ax.set_ylabel('Number of Pings')
        plt.savefig(os.path.join(path_to_data+'/../fig','hours-week-'+str(personal_value)+'-personal-device-'+str(device_type)+'.pdf'),bbox_inches='tight')

        fig,ax=plt.subplots(figsize=(8,5))
        hours_weekend[(personal_value,device_type)].set_index('hour')['count'].sort_index().plot(ax=ax,kind='bar',color='k')
        ax.tick_params(which='both',direction='in',pad=3)
        plt.xticks(rotation=0)
        ax.set_xlabel('Hour at Personal Location')
        ax.set_ylabel('Number of Pings')
        plt.savefig(os.path.join(path_to_data+'/../fig','hours-weekend-'+str(personal_value)+'-personal-device-'+str(device_type)+'.pdf'),bbox_inches='tight')

Personal Value: 1
Device: 0


  if __name__ == '__main__':


Device: 1


  if __name__ == '__main__':


Personal Value: 2
Device: 0


  if __name__ == '__main__':


Device: 1


  if __name__ == '__main__':


Personal Value: 3
Device: 0


  if __name__ == '__main__':


Device: 1


  if __name__ == '__main__':


In [22]:
hours_week={}
hours_weekend={}

for personal_value in range(1,3):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
        
        print('Device:', device_type)
        
        for location_name in ['home','secondary']:
            
            print('Location:', location_name)
            
            hours_week[(personal_value,device_type,location_name)]=spark.read.parquet(os.path.join(path_to_data,source,country,'personal_hours','week',str(personal_value),str(device_type),location_name)).toPandas()

            hours_weekend[(personal_value,device_type,location_name)]=spark.read.parquet(os.path.join(path_to_data,source,country,'personal_hours','weekend',str(personal_value),str(device_type),location_name)).toPandas()

Personal Value: 1
Device: 0
Location: home
Location: secondary
Device: 1
Location: home
Location: secondary
Personal Value: 2
Device: 0
Location: home
Location: secondary
Device: 1
Location: home
Location: secondary


In [24]:
for personal_value in range(1,4):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
        
        print('Device:', device_type)
        
        for location_name in ['home','secondary']:
            
            print('Location:', location_name)

            fig,ax=plt.subplots(figsize=(8,5))
            hours_week[(personal_value,device_type,location_name)].set_index('hour')['count'].sort_index().plot(ax=ax,kind='bar',color='k')
            ax.tick_params(which='both',direction='in',pad=3)
            plt.xticks(rotation=0)
            ax.set_xlabel('Hour at Personal Location')
            ax.set_ylabel('Number of Pings')
            plt.savefig(os.path.join(path_to_data+'/../fig','hours-week-'+str(personal_value)+'-personal-device-'+str(device_type)+'-'+str(location_name)+'.pdf'),bbox_inches='tight')

            fig,ax=plt.subplots(figsize=(8,5))
            hours_weekend[(personal_value,device_type,location_name)].set_index('hour')['count'].sort_index().plot(ax=ax,kind='bar',color='k')
            ax.tick_params(which='both',direction='in',pad=3)
            plt.xticks(rotation=0)
            ax.set_xlabel('Hour at Personal Location')
            ax.set_ylabel('Number of Pings')
            plt.savefig(os.path.join(path_to_data+'/../fig','hours-weekend-'+str(personal_value)+'-personal-device-'+str(device_type)+'-'+str(location_name)+'.pdf'),bbox_inches='tight')

Personal Value: 1
Device: 0
Location: home


  del sys.path[0]


Location: secondary


  del sys.path[0]


Device: 1
Location: home


  del sys.path[0]


Location: secondary


  del sys.path[0]


Personal Value: 2
Device: 0
Location: home


  del sys.path[0]


Location: secondary


  del sys.path[0]


Device: 1
Location: home


  del sys.path[0]


Location: secondary


  del sys.path[0]


Personal Value: 3
Device: 0
Location: home


  del sys.path[0]


KeyError: (3, 0, 'home')