In [None]:
import pandas as pd
import csv
import string
from tqdm import tqdm_notebook as tqdm
import numpy as np
import time
import math


def haversine(coord1, coord2):
    """Return the great-circle distance, in meters, between two GPS points.

    Both arguments are (latitude, longitude) pairs given in degrees; they are
    converted to radians before the haversine formula is applied.
    """
    earth_radius_m = 6372800  # Earth radius in meters
    (lat_a, lon_a), (lat_b, lon_b) = coord1, coord2

    lat_a_rad = math.radians(lat_a)
    lat_b_rad = math.radians(lat_b)
    delta_lat = math.radians(lat_b - lat_a)
    delta_lon = math.radians(lon_b - lon_a)

    # Haversine term: squared half-chord length between the two points.
    h = math.sin(delta_lat/2)**2 + \
        math.cos(lat_a_rad)*math.cos(lat_b_rad)*math.sin(delta_lon/2)**2

    return 2*earth_radius_m*math.atan2(math.sqrt(h), math.sqrt(1 - h))


# Assign every census block group (CBG) the ZIP code whose coordinates are
# closest to the CBG's coordinates, measured with haversine().
df_block_group = pd.read_csv('cbg_geographic_data.csv')    # This is the truncated file that only includes IL ZIP codes
df_zipcodes = pd.read_csv('datasets_5391_8097_zip_lat_long.csv') # This is the truncated file that only includes GPS coordinates in or close to IL

# Extract the ZIP candidates once, instead of re-running iterrows() over
# df_zipcodes for every single block group.
zip_candidates = list(zip(df_zipcodes['LAT'], df_zipcodes['LNG'], df_zipcodes['ZIP']))

output = {}   # maps census block group (int) -> nearest ZIP code (int)
for cbg, cbg_lat, cbg_lng in zip(df_block_group['census_block_group'],
                                 df_block_group['latitude'],
                                 df_block_group['longitude']):
    distance = 100000000 # very large number
    zipcode = '000000'   # fallback when no candidate is closer than the initial distance
    cbg_coord = [cbg_lat, cbg_lng]
    for zip_lat, zip_lng, zip_code in zip_candidates:
        # Compute each distance once and reuse it for both the comparison
        # and the update (strict '<' keeps the first of equally near ZIPs).
        candidate = haversine(cbg_coord, [zip_lat, zip_lng])
        if candidate < distance:
            distance = candidate
            zipcode = zip_code
    output[int(cbg)] = int(zipcode)


In [None]:
# Save the CBG -> ZIP mapping held in `output`.
# The header row supplies the 'CBG' and 'Zipcode' column names that the
# following cells look up after loading this data with pd.read_csv.
# The `with` block guarantees the file is flushed and closed, and newline=""
# is what the csv module requires for files handed to csv.writer.
with open("CBG_zipcode.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerow(["CBG", "Zipcode"])
    w.writerows(output.items())

In [None]:
# This snippet eliminates all non-Cook-county ZIP codes from CBG_zipcode.csv and
# writes the surviving rows to CBG_zipcode_2.csv. Repeatedly truncating the data
# like this was necessary in order to reduce computation time.

df_cbg_zipcode = pd.read_csv('CBG_zipcode.csv')

df_Cook_zip = pd.read_csv('cook_zipcodes.csv')

# Build the set of Cook county ZIP codes once, so each row needs a single
# membership test instead of a full scan over df_Cook_zip.
cook_zipcodes = set(df_Cook_zip['Cook_zipcodes'])

with open("CBG_zipcode_2.csv", "w", newline="") as f:
    w = csv.writer(f)
    # Carry the input's header over, so the output can be read back by column name.
    w.writerow(df_cbg_zipcode.columns)
    for index1, row1 in df_cbg_zipcode.iterrows():
        if row1['Zipcode'] in cook_zipcodes:
            w.writerow(row1)

In [None]:
# Code for getting data from SAFEGRAPH only for Cook county CBGs between the dates Feb 01 - Apr 30.

# Note: This code was modified and run on the Midway cluster, as 90 different small scripts, each for a single date.
# The 90 scripts were run in parallel and then the resulting CSVs concatenated using the cat command in bash.

# For this code to run, the social distancing data, which is downloaded as folders of gzipped files corresponding to
# each date, must be copied to a single folder and unzipped. The resulting file for Feb 01 2020, for example, has the
# name 2020-02-01-social-distancing.csv.

df_CBG_zipcode = pd.read_csv('CBG_zipcode_2.csv')

# Date label written to the output (month and day are NOT zero-padded).
date = '2020-{0}-{1}'

# Date part of the input file names (month and day zero-padded to two digits).
file_date = '2020-{0:02d}-{1:02d}'

days_in_month = [29,31,30]   # lengths of months 2, 3 and 4 as iterated below

# Look-up table from CBG to its (CBG, Zipcode) rows, built once so every
# social-distancing row needs one dictionary access instead of a scan over
# df_CBG_zipcode. A list is kept per key so repeated CBG rows still each
# produce an output row.
cbg_lookup = {}
for cbg, zipcode in zip(df_CBG_zipcode['CBG'], df_CBG_zipcode['Zipcode']):
    cbg_lookup.setdefault(cbg, []).append((cbg, zipcode))

# Columns that are replaced by the leading CBG / zipcode / date fields.
skipped_columns = ('origin_census_block_group', 'date_range_start', 'date_range_end')

# csv.writer needs an open file object (anything with a write() method), not a
# file name; the `with` block also makes sure the output is flushed and closed.
with open("Social_distancing_data_Cook_county.csv", "w", newline="") as f:
    w = csv.writer(f)
    header_written = False
    for i in range(2,5):
        for j in range(1,days_in_month[i-2]+1):
            df_social_dist = pd.read_csv(file_date.format(i,j) + '-social-distancing.csv')
            kept_columns = [column for column in df_social_dist.columns
                            if column not in skipped_columns]
            if not header_written:
                # Written once, taken from the first daily file, so the result
                # can be read back by column name.
                w.writerow(['CBG', 'zipcode', 'date'] + kept_columns)
                header_written = True
            date_local = date.format(i,j)
            for index1, row1 in df_social_dist.iterrows():
                for cbg, zipcode in cbg_lookup.get(row1['origin_census_block_group'], ()):
                    w.writerow([cbg, zipcode, date_local] + [row1[column] for column in kept_columns])


In [None]:
# This snippet of code aggregates the data per ZIP code. If the data is a MEDIAN, then it computes the MEDIAN of
# all values across different CBG's. If the data is a count, it adds together the values across different CBG's. 

def read_list(line):
    """Parse a bracketed, comma-separated list of integers, e.g. '[1,2,3]'.

    Surrounding whitespace (such as a trailing newline) and the enclosing
    square brackets are removed first. An empty list ('[]') or an empty
    string yields [] instead of raising.

    Raises:
        ValueError: if any element is not a valid integer literal.
    """
    inner = line.strip().strip('[]').strip()
    if not inner:
        # ''.split(',') would give [''], and int('') raises ValueError.
        return []
    return [int(x) for x in inner.split(',')]


df_Cook_zip = pd.read_csv('cook_zipcodes.csv')
df_social_dist = pd.read_csv('Social_distancing_data_Cook_county.csv')

columns = ['zipcode', 'date', 'device_count', 'distance_traveled_from_home', 'completely_home_device_count', 'median_home_dwell_time', 'part_time_work_behavior_devices', 'full_time_work_behavior_devices', 'delivery_behavior_devices', 'median_non_home_dwell_time', 'candidate_device_count', 'median_percentage_time_home']

# Everything after 'zipcode' and 'date' is aggregated. The columns listed in
# median_columns are combined with a median; the remaining ones are summed.
value_columns = columns[2:]
median_columns = {'distance_traveled_from_home', 'median_home_dwell_time', 'median_non_home_dwell_time', 'median_percentage_time_home'}

date = '2020-{0}-{1}'
days_in_month = [29,31,30]

# Index the rows once by (zipcode, date), so that each ZIP/date combination
# is a dictionary look-up instead of a full pass over df_social_dist.
rows_by_zip_date = {}
for record in zip(*(df_social_dist[column] for column in columns)):
    rows_by_zip_date.setdefault(record[:2], []).append(record[2:])

# The `with` block guarantees the output file is flushed and closed.
with open("Social_distancing_data_Cook_febr_april.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerow(columns)

    for cook_zip in df_Cook_zip['Cook_zipcodes']:
        for i in range(2,5):
            for j in range(1,days_in_month[i-2]+1):
                date_local = date.format(i,j)
                matches = rows_by_zip_date.get((cook_zip, date_local), [])
                output = [int(cook_zip), date_local]
                for position, column in enumerate(value_columns):
                    values = [match[position] for match in matches]
                    if column in median_columns:
                        # A ZIP/date without data is still written, with NaN
                        # medians; np.median([]) would return the same NaN
                        # but emits a RuntimeWarning.
                        output.append(np.median(values) if values else float('nan'))
                    else:
                        output.append(int(sum(values)))
                w.writerow(output)