In [28]:
import os
import matplotlib as mpl
if os.environ.get('DISPLAY','') == '':
    mpl.use('Agg')

import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import pandas as pd
from timeit import default_timer as timer

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,hour,dayofweek,to_timestamp,size,isnan,lit,date_format,to_timestamp,struct,expr,explode,collect_list
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType, DoubleType
import pyspark.sql.functions as F
from pyspark.sql import Window

In [2]:
try:
    spark
except NameError:
    print('Create Spark')
    spark=SparkSession.builder.appName("").getOrCreate()

Create Spark


In [42]:
source='cuebiq'
country='US'
cutoff_morning=8
cutoff_night=20

if os.getenv('CLUSTER')=='PRINCE':
    path_to_data='/scratch/spf248/covid/data'
    file_week='*00-099fe598-7adb-4411-b68c-a60d0c28cb7a-c000.snappy.parquet'
    file_weekend='*00-54e95071-216b-4a7a-89f3-de881c0bf379-c000.snappy.parquet'
    file_users='*0-ca003b73-1dea-4468-acde-e43360e39ed4-c000.snappy.parquet'
else:
    path_to_data='/user/spf248/covid/data'
    file_week='*'
    file_weekend='*'
    file_users='*'

# Distributions of Personal Hours

In [5]:
users=spark.read.parquet(os.path.join(path_to_data,source,country,'users',file_users))
users.cache()

DataFrame[cuebiq_id: string, n_days: bigint, device_type: float, n_pings: bigint]

In [6]:
personal_week=spark.read.parquet(os.path.join(path_to_data,source,country,'personal_week',file_week))
personal_week.cache()
personal_weekend=spark.read.parquet(os.path.join(path_to_data,source,country,'personal_weekend',file_weekend))
personal_weekend.cache()

DataFrame[cuebiq_id: string, point: struct<longitude:float,latitude:float>, hour: int, count: bigint]

In [9]:
personal_counts=(personal_week.unionByName(personal_weekend)).groupby(
'cuebiq_id','point').agg({'count':'sum'}).withColumnRenamed('sum(count)','count')

users=users.join(personal_counts.groupby('cuebiq_id').count().withColumnRenamed(
'count','n_personal'),on='cuebiq_id')

In [52]:
window = Window.partitionBy("cuebiq_id")

personal_home=personal_week.filter(
(personal_week['hour']<=cutoff_morning)|(personal_week['hour']>=cutoff_night)).groupby(
'cuebiq_id','point').agg({'count':'sum'}).withColumnRenamed('sum(count)','count').withColumn(
'max_count', F.max('count').over(window)).select('cuebiq_id','point')

personal_others=personal_counts.join(personal_home, on=['cuebiq_id','point'], how='left_anti')

In [64]:
personal_others.show()

+--------------------+--------------------+-----+
|           cuebiq_id|               point|count|
+--------------------+--------------------+-----+
|0013e58eddce6e000...|[-71.408295, 42.5...|    2|
|001b9e8dba94897bc...|[-121.24312, 37.7...|  565|
|001da3968dd884d3e...|[-90.66987, 42.71...|    1|
|0023f8ad4466c55da...|[-75.24996, 39.73...|  295|
|002b6489da17cb3ad...|[-98.39718, 29.38...|    1|
|002f30e939deb6193...|[-115.07455, 36.1...|    5|
|003256136e933886d...|[-98.48584, 29.58...|   13|
|00348fbf9cef92d8c...|[-77.4644, 37.45237]|   12|
|00379da09103a77fc...|[-81.243546, 34.1...|  391|
|004139e03dd5342aa...|[-102.145386, 31....|  345|
|0041c28ed5c01b65c...|[-83.67403, 34.02...|  139|
|0044e720c222c49db...|[-86.86765, 34.22...|  161|
|0045a6949e0402970...|[-118.007416, 34....|  429|
|0045c75fcb9552d5b...|[-84.874054, 32.5...|  148|
|0049734b96a4e77eb...|[-82.1938, 34.23725]|  107|
|004bc78f9d027894a...|[-87.65749, 41.83...|    2|
|00651a98bd8e60254...|[-83.96301, 39.66...|   13|


In [None]:
hours_week={}
hours_weekend={}

for personal_value in range(1,4):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
    
        hours_week[(personal_value,device_type)]=personal_week.join(users.filter(
        users['n_personal']==personal_value).filter(
        users['device_type']==device_type),on='cuebiq_id').groupby(
        'hour').agg({'count':'sum'}).withColumnRenamed('sum(count)','count').write.mode("overwrite").parquet(
        os.path.join(path_to_data,source,country,'personal_hours','week',str(personal_value),str(device_type)))

        hours_weekend[(personal_value,device_type)]=personal_weekend.join(
        users.filter(users['n_personal']==personal_value).filter(
        users['device_type']==device_type),on='cuebiq_id').groupby(
        'hour').agg({'count':'sum'}).withColumnRenamed('sum(count)','count').write.mode("overwrite").parquet(
        os.path.join(path_to_data,source,country,'personal_hours','weekend',str(personal_value),str(device_type)))

# Load and Plots

In [28]:
hours_week={}
hours_weekend={}

for personal_value in range(1,4):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
    
        hours_week[(personal_value,device_type)]=spark.read.parquet(
        os.path.join(path_to_data,source,country,'personal_hours','week',str(personal_value),str(device_type))).toPandas()

        hours_weekend[(personal_value,device_type)]=spark.read.parquet(
        os.path.join(path_to_data,source,country,'personal_hours','weekend',str(personal_value),str(device_type))).toPandas()

Personal Value: 1
Personal Value: 2
Personal Value: 3


In [29]:
for personal_value in range(1,4):
    
    print('Personal Value:', personal_value)
    
    for device_type in [0,1]:
    
        fig,ax=plt.subplots(figsize=(8,5))
        hours_week[(personal_value,device_type)].set_index('hour')['count'].sort_index().plot(ax=ax,kind='bar',color='k')
        ax.tick_params(which='both',direction='in',pad=3)
        plt.xticks(rotation=0)
        ax.set_xlabel('Hour at Personal Location')
        ax.set_ylabel('Number of Pings')
        plt.savefig(os.path.join(path_to_data+'/../fig','hours-week-'+str(personal_value)+'-device-'+str(device_type)+'.pdf'),bbox_inches='tight')

        fig,ax=plt.subplots(figsize=(8,5))
        hours_weekend[(personal_value,device_type)].set_index('hour')['count'].sort_index().plot(ax=ax,kind='bar',color='k')
        ax.tick_params(which='both',direction='in',pad=3)
        plt.xticks(rotation=0)
        ax.set_xlabel('Hour at Personal Location')
        ax.set_ylabel('Number of Pings')
        plt.savefig(os.path.join(path_to_data+'/../fig','hours-weekend-'+str(personal_value)+'-device-'+str(device_type)+'.pdf'),bbox_inches='tight')

Personal Value: 1
Personal Value: 2


  from ipykernel import kernelapp as app


Personal Value: 3


  import sys
  from ipykernel import kernelapp as app
  import sys
  from ipykernel import kernelapp as app
