In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from shapely.geometry import MultiPolygon, Polygon, Point
import numpy as np
import contextily as ctx
import h3
import geopandas as gpd

In [238]:
# Root directory for the input CSVs loaded by the cells below.
# NOTE(review): machine-specific absolute path. It ends with '/', and several
# later cells append suffixes that also start with '/', so those paths contain '//'.
data_path = '/Users/shivyucel/Documents/SDS_2021.nosync/SDS_2020-2021/SDS_Thesis/Data/h3/'

## Load in Hexagon ID Info

In [198]:
# Lookup table relating hexagon identifiers to numeric zone ids. Later cells
# join on its '0' column and rename its 'Unnamed: 0' column to a hexagon id.
hex_ids_csv = data_path + '/commuting/h3_IDs.csv'
hex_ids = pd.read_csv(hex_ids_csv)

In [199]:
# Zone ids in order of first appearance in column 0 (SOURCE) of the commuting
# edge list. They are read straight from the CSV so this cell no longer depends
# on `commuting`, which is only created in a later cell; the old form raised
# NameError on a fresh Restart & Run All.
# pd.unique preserves first-appearance order, exactly like Series.unique().
# The later drop_duplicates on (SOURCE, TARGET) keeps first occurrences, so it
# does not alter this ordering either.
# NOTE(review): assumes the `commuting` that used to be in scope here was built
# from this same file - confirm.
zones = pd.unique(
    np.loadtxt(data_path + "/commuting/real_commuting_for_ED.csv", delimiter=",")[:, 0]
)

## Load in Population, Income, and In-Degree

In [200]:
# Population values parsed from a comma-delimited text file into a NumPy array.
populations_csv = data_path + 'population/new_h3_pop.csv'
populations = np.loadtxt(populations_csv, delimiter=',')

In [201]:
# Sanity check: number of entries loaded into `populations` (2599 in the recorded output).
len(populations)

2599

In [202]:
# Keep only the 'h3' and 'weighted_income_avg_head' columns of the income file,
# renamed to 'h3_id' and 'income' so they line up with the other tables.
income = (
    pd.read_csv(data_path + '/demographics/h3_income.csv')
    .loc[:, ['h3', 'weighted_income_avg_head']]
    .rename(columns={'h3': 'h3_id', 'weighted_income_avg_head': 'income'})
)

In [203]:
# Commuting edge list, read as a plain numeric matrix. Columns 0, 1 and 2 are
# labelled SOURCE, TARGET and FLUX; for repeated (SOURCE, TARGET) pairs only the
# first row is kept (drop_duplicates default).
edge_matrix = np.loadtxt(data_path + f"/commuting/real_commuting_for_ED.csv", delimiter=",")
commuting = (
    pd.DataFrame(edge_matrix)
    .rename(columns={0: 'SOURCE', 1: 'TARGET', 2: 'FLUX'})
    .drop_duplicates(['SOURCE', 'TARGET'])
)

In [204]:
# In-degree per target zone: the number of rows in `commuting` (non-null FLUX
# entries) that share a TARGET value.
target_counts = commuting.groupby('TARGET').count().reset_index()[['TARGET', 'FLUX']]

# Attach the hexagon id by joining the numeric TARGET id to hex_ids' '0' column,
# then keep the first three columns by position.
# NOTE(review): presumably these are numeric_id, in_degree, h3_id, which holds
# only if hex_ids has 'Unnamed: 0' before '0' - confirm.
in_degree = (
    target_counts
    .merge(hex_ids, left_on='TARGET', right_on='0')
    .rename(columns={'Unnamed: 0': 'h3_id', 'FLUX': 'in_degree', 'TARGET': 'numeric_id'})
    .iloc[:, :3]
)

## Load in Weighted Infection Delays

In [205]:
weighted_ID = pd.read_csv(
    '/Users/shivyucel/Documents/SDS_2021.nosync/SDS_2020-2021/SDS_Thesis/Data/paper_data/weighted_infection_delay.csv'
).drop(columns='Unnamed: 0')

# The CSV headers are integer positions into `zones`; replace them with the
# corresponding zone ids.
weighted_ID.columns = zones[weighted_ID.columns.astype(int)]

# Look up the hexagon id of every zone id by joining against hex_ids' '0' column.
# NOTE(review): the assignment below is positional, so it relies on this inner
# merge returning exactly one row per column, in column order - confirm that
# every zone id occurs once in hex_ids.
zone_labels = pd.DataFrame(weighted_ID.columns)
weighted_ID_hex = (
    zone_labels
    .merge(hex_ids, left_on=0, right_on='0')
    .rename(columns={'Unnamed: 0': 'hexids'})
)
weighted_ID.columns = weighted_ID_hex['hexids']

## Create Files with All Info

In [208]:
# Inner join of income and in-degree on the shared hexagon id column.
merged = pd.merge(income, in_degree, on='h3_id')

In [209]:
# Per-column median over the first 10 rows of weighted_ID, reshaped into a
# two-column frame (former column label, median value).
median_ID = weighted_ID.head(10).median().to_frame().reset_index()

In [210]:
# Give the median column (default label 0) a descriptive name.
median_ID = median_ID.rename(columns={0: 'weighted_ID'})

In [211]:
# Combine income / in-degree with the median infection delay per hexagon and
# keep only the columns used downstream.
master_columns = ['h3_id', 'income', 'in_degree', 'numeric_id', 'weighted_ID']
master = pd.merge(merged, median_ID, left_on='h3_id', right_on='hexids')[master_columns]

In [212]:
# Quartile labels (1 = lowest, 4 = highest) for income and in-degree, from
# equal-frequency binning with pd.qcut.
master = master.assign(
    income_quartile=pd.qcut(master['income'], q=4, labels=[1, 2, 3, 4]),
    in_degree_quartile=pd.qcut(master['in_degree'], q=4, labels=[1, 2, 3, 4]),
)

In [218]:
# Order rows by numeric zone id (ascending).
master = master.sort_values(by='numeric_id')

In [221]:
# Attach populations positionally: row i of `master` receives populations[i].
# NOTE(review): this is only meaningful if `master` has one row per entry of
# `populations` and its current row order matches the order of that file -
# confirm; a length mismatch raises ValueError.
master = master.assign(pop=populations)

In [240]:
# Persist the per-hexagon table; the DataFrame index is written as the first
# CSV column (to_csv default).
weighted_hexagon_csv = '/Users/shivyucel/Documents/SDS_2021.nosync/SDS_2020-2021/SDS_Thesis/Data/paper_data/weighted_hexagon_data.csv'
master.to_csv(weighted_hexagon_csv)

## Outbreak Splitting Infection Delay

In [69]:
# Infection-delay tables for the two outbreak groups ("low" and "high").
grouped_low_csv = '/Users/shivyucel/Documents/SDS_2021.nosync/SDS_2020-2021/SDS_Thesis/Data/paper_data/v2_grouped_low.csv'
grouped_high_csv = '/Users/shivyucel/Documents/SDS_2021.nosync/SDS_2020-2021/SDS_Thesis/Data/paper_data/v2_grouped_high.csv'
grouped_low = pd.read_csv(grouped_low_csv)
grouped_high = pd.read_csv(grouped_high_csv)

In [70]:
# Discard the 'Unnamed: 0' column that came in with each CSV.
grouped_low = grouped_low.drop(columns='Unnamed: 0')
grouped_high = grouped_high.drop(columns='Unnamed: 0')

In [72]:
# Per-column median over the first 10 rows of the low group, as a one-column frame.
low_delays = grouped_low.head(10).median().to_frame()

In [73]:
# Per-column median over the first 10 rows of the high group, as a one-column frame.
high_delays = grouped_high.head(10).median().to_frame()

In [74]:
# Name the median column (default label 0) after its outbreak group.
low_delays = low_delays.rename({0: 'low_ID'}, axis='columns')
high_delays = high_delays.rename({0: 'high_ID'}, axis='columns')

In [75]:
# Move the former column labels out of the index into a regular 'index' column.
low_delays = low_delays.reset_index()
high_delays = high_delays.reset_index()

In [76]:
# Cast the 'index' column to float so it can be joined against master's numeric_id.
low_delays = low_delays.astype({'index': float})
high_delays = high_delays.astype({'index': float})

In [78]:
# Attach the per-hexagon attributes from `master` to the low-group delays.
low_delays = pd.merge(low_delays, master, left_on='index', right_on='numeric_id')

In [79]:
# Attach the per-hexagon attributes from `master` to the high-group delays.
high_delays = pd.merge(high_delays, master, left_on='index', right_on='numeric_id')

In [81]:
# Income quartile (1-4), computed within the low group only.
low_delays = low_delays.assign(
    income_bin=pd.qcut(low_delays['income'], q=4, labels=[1, 2, 3, 4])
)

In [82]:
# Income quartile (1-4), computed within the high group only.
high_delays = high_delays.assign(
    income_bin=pd.qcut(high_delays['income'], q=4, labels=[1, 2, 3, 4])
)

In [84]:
# In-degree quartile (1-4), computed within the low group only.
low_delays = low_delays.assign(
    centrality_bin=pd.qcut(low_delays['in_degree'], q=4, labels=[1, 2, 3, 4])
)

In [85]:
# In-degree quartile (1-4), computed within the high group only.
high_delays = high_delays.assign(
    centrality_bin=pd.qcut(high_delays['in_degree'], q=4, labels=[1, 2, 3, 4])
)

In [86]:
# Tag every row with its outbreak group so the two frames can be stacked.
low_delays = low_delays.assign(outbreak_centrality='Low Outbreak Centrality')
high_delays = high_delays.assign(outbreak_centrality='High Outbreak Centrality')

In [87]:
# Use one shared column name for the delay value in both groups.
low_delays = low_delays.rename(columns={'low_ID': 'infection_delay'})
high_delays = high_delays.rename(columns={'high_ID': 'infection_delay'})

In [88]:
# Stack the low- and high-group rows into one long-form table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and, like append, keeps each frame's original
# index values (no ignore_index).
final = pd.concat([low_delays, high_delays])

In [90]:
# Persist the long-form table; the DataFrame index is written as the first CSV
# column (to_csv default).
longform_delays_csv = '/Users/shivyucel/Documents/SDS_2021.nosync/SDS_2020-2021/SDS_Thesis/Data/paper_data/longform_outbreak_split_delays.csv'
final.to_csv(longform_delays_csv)