In [2]:
import pandas as pd
import numpy as np

In [2]:
# Prefix that is string-concatenated onto the population, income and commuting
# file paths in the loading cells. It is empty here, so those paths are used
# exactly as written.
data_path = ''

## Load in Hexagon ID Info
`h3_IDs.csv` (loaded below) is a simple lookup table that pairs a numerical index for each unique hexagon (1–2599) with that hexagon's ID code.

In [198]:
# Hexagon ID lookup table. Later cells join on its '0' column (matched against
# the numeric zone ids) and rename its 'Unnamed: 0' column to the hexagon id.
# NOTE(review): read relative to the working directory, without `data_path`.
hex_ids = pd.read_csv('h3_IDs.csv')

In [199]:
# Zone identifiers: the distinct values of the commuting table's SOURCE column
# (column 0 of the file), in order of first appearance.
#
# They are read straight from the file instead of from the `commuting` frame,
# because that frame is only created in a later cell; referencing it here
# raised NameError on a fresh kernel (Restart & Run All). De-duplicating
# (SOURCE, TARGET) pairs keeps the first occurrence of every row, so the
# distinct SOURCE values and their order are the same either way.
zones = pd.unique(
    np.loadtxt(
        data_path + "/commuting/real_commuting_for_ED.csv",
        delimiter=",",
        usecols=0,  # SOURCE column only
        ndmin=1,    # stay 1-D even for a single-row file
    )
)

## Load in Population, Income, and In-Degree

In [7]:
# Population values parsed from a comma-delimited file into a NumPy array.
# The array is later assigned positionally to `master['pop']` after `master`
# is sorted by numeric_id, so it is presumably ordered by numeric hexagon id
# and has one entry per row of `master` — TODO confirm against the file.
# NOTE(review): unlike the income/commuting paths, this one has no leading '/'
# after `data_path`.
populations = np.loadtxt(data_path + 'population/new_h3_pop.csv', delimiter =',')

In [202]:
# Income table reduced to two columns, renamed to the names used by the later
# merges: 'h3' -> 'h3_id' and 'weighted_income_avg_head' -> 'income'.
income = (
    pd.read_csv(data_path + '/demographics/h3_income.csv')[['h3', 'weighted_income_avg_head']]
    .rename(columns={'h3': 'h3_id', 'weighted_income_avg_head': 'income'})
)

In [203]:
# Commuting edge list: the three columns of the comma-delimited file become
# SOURCE, TARGET and FLUX. Only the first row of every (SOURCE, TARGET) pair
# is kept.
commuting = (
    pd.DataFrame(np.loadtxt(data_path + f"/commuting/real_commuting_for_ED.csv", delimiter=","))
    .rename(columns={0: 'SOURCE', 1: 'TARGET', 2: 'FLUX'})
    .drop_duplicates(['SOURCE', 'TARGET'])
)

In [204]:
# In-degree per zone: the number of commuting rows (with a non-null FLUX) that
# point at each TARGET. The counts are then joined to the hexagon lookup on the
# numeric id, and only the first three columns are kept, renamed to
# numeric_id / in_degree / h3_id.
in_degree = (
    commuting.groupby('TARGET')
    .count()
    .reset_index()
    .loc[:, ['TARGET', 'FLUX']]
    .merge(hex_ids, left_on='TARGET', right_on='0')
    .rename(columns={'Unnamed: 0': 'h3_id', 'FLUX': 'in_degree', 'TARGET': 'numeric_id'})
    .iloc[:, :3]
)

## Load in Weighted Infection Delays

In [205]:
# Weighted infection delay table, without its 'Unnamed: 0' column.
# NOTE(review): hard-coded absolute path.
weighted_ID = pd.read_csv(
    '/Users/shivyucel/Documents/SDS_2021.nosync/SDS_2020-2021/SDS_Thesis/Data/paper_data/weighted_infection_delay.csv'
).drop('Unnamed: 0', axis=1)

# The column labels are positions into `zones`: cast them to int and swap each
# one for the zone id stored at that position.
weighted_ID.columns = zones[weighted_ID.columns.astype(int)]

# Look the zone ids up in the hexagon table (its '0' column) to get the
# matching hexagon ids, then use those as the final column labels.
weighted_ID_hex = (
    pd.DataFrame(weighted_ID.columns)
    .merge(hex_ids, left_on=0, right_on='0')
    .rename(columns={'Unnamed: 0': 'hexids'})
)
weighted_ID.columns = weighted_ID_hex.hexids

## Create Files with All Info

In [208]:
# Income and in-degree side by side, matched on the hexagon id.
merged = income.merge(in_degree, on='h3_id')

# Median infection delay per hexagon over the first 10 rows of the table
# (the first 10 days, per the original note), as a two-column frame:
# the hexagon id and 'weighted_ID'.
median_ID = (
    pd.DataFrame(weighted_ID.head(10).median())
    .reset_index()
    .rename(columns={0: 'weighted_ID'})
)

In [209]:
# One row per hexagon: income, in-degree and median weighted infection delay,
# plus quartile labels (1-4) for income and in-degree, ordered by numeric id.
master = (
    merged.merge(median_ID, left_on='h3_id', right_on='hexids')
    .loc[:, ['h3_id', 'income', 'in_degree', 'numeric_id', 'weighted_ID']]
    .assign(
        income_quartile=lambda frame: pd.qcut(frame['income'], q=4, labels=[1, 2, 3, 4]),
        in_degree_quartile=lambda frame: pd.qcut(frame['in_degree'], q=4, labels=[1, 2, 3, 4]),
    )
    .sort_values(by='numeric_id')
)

# Positional assignment: `populations` is a plain array, so its i-th value goes
# to the i-th row of the sorted frame.
master['pop'] = populations

In [221]:
# Save the combined per-hexagon table as CSV.
# NOTE(review): hard-coded absolute path; the "Testing" section reads
# 'data_files/result_data/weighted_hexagon_data.csv' — presumably a copy of
# this output, confirm.
master.to_csv('/Users/shivyucel/Documents/SDS_2021.nosync/SDS_2020-2021/SDS_Thesis/Data/paper_data/weighted_hexagon_data.csv')

## Outbreak Splitting Infection Delay

In [69]:
# Infection-delay tables for the two outbreak scenarios; rows derived from
# them are later labelled 'Low Outbreak Centrality' / 'High Outbreak Centrality'.
# NOTE(review): hard-coded absolute, machine-specific paths.
grouped_low = pd.read_csv('/Users/shivyucel/Documents/SDS_2021.nosync/SDS_2020-2021/SDS_Thesis/Data/paper_data/v2_grouped_low.csv')
grouped_high = pd.read_csv('/Users/shivyucel/Documents/SDS_2021.nosync/SDS_2020-2021/SDS_Thesis/Data/paper_data/v2_grouped_high.csv')

In [70]:
# Discard the 'Unnamed: 0' column from both scenario tables.
grouped_low = grouped_low.drop(columns='Unnamed: 0')
grouped_high = grouped_high.drop(columns='Unnamed: 0')

In [72]:
# Median infection delay per column over the first 10 rows of each table
# (the first 10 days, per the original note), kept as one-column frames.
low_delays = grouped_low.head(10).median().to_frame()
high_delays = grouped_high.head(10).median().to_frame()

In [73]:
# For each scenario: name the median column, move the former column labels out
# of the index into an 'index' column, cast them to float so they compare equal
# to master's numeric_id, attach the per-hexagon attributes, and bin income
# into quartiles (1-4) within the merged frame.
low_delays = (
    low_delays.rename(columns={0: 'low_ID'})
    .reset_index()
    .astype({'index': float})
    .merge(master, left_on='index', right_on='numeric_id')
    .assign(income_bin=lambda frame: pd.qcut(frame['income'], q=4, labels=[1, 2, 3, 4]))
)

high_delays = (
    high_delays.rename(columns={0: 'high_ID'})
    .reset_index()
    .astype({'index': float})
    .merge(master, left_on='index', right_on='numeric_id')
    .assign(income_bin=lambda frame: pd.qcut(frame['income'], q=4, labels=[1, 2, 3, 4]))
)

In [84]:
# Add in-degree quartile labels (1-4) and tag every row with its outbreak
# scenario so the two frames can later be stacked in long form.
low_delays = low_delays.assign(
    centrality_bin=lambda frame: pd.qcut(frame['in_degree'], q=4, labels=[1, 2, 3, 4]),
    outbreak_centrality='Low Outbreak Centrality',
)
high_delays = high_delays.assign(
    centrality_bin=lambda frame: pd.qcut(frame['in_degree'], q=4, labels=[1, 2, 3, 4]),
    outbreak_centrality='High Outbreak Centrality',
)

In [85]:
# Give both frames the same name for the delay column so they can be stacked.
low_delays = low_delays.rename(columns={'low_ID': 'infection_delay'})
high_delays = high_delays.rename(columns={'high_ID': 'infection_delay'})

In [86]:
# Stack the low- and high-scenario frames into one long-form table and save it.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Like append, it keeps each frame's own row index.
# NOTE(review): hard-coded absolute output path.
final = pd.concat([low_delays, high_delays])
final.to_csv('/Users/shivyucel/Documents/SDS_2021.nosync/SDS_2020-2021/SDS_Thesis/Data/paper_data/longform_outbreak_split_delays.csv')

## Testing

### Part I

In [6]:
from scipy.stats import f_oneway

In [2]:
# Load the weighted infection delay table. The ANOVA cell below uses its
# income_quartile, in_degree_quartile and weighted_ID columns.
df = pd.read_csv('data_files/result_data/weighted_hexagon_data.csv')

In [15]:
# Control for income: within each income quartile, run a one-way ANOVA that
# compares the weighted infection delay across the four in-degree (centrality)
# quartiles. One result is printed per income quartile, in the order 1, 2, 3, 4.
#
# A loop replaces sixteen hand-copied filter expressions; the groups passed to
# f_oneway and the printed output are unchanged.
QUARTILES = (1, 2, 3, 4)

for income_q in QUARTILES:
    in_income_q = df['income_quartile'] == income_q
    # One array of delays per in-degree quartile, restricted to this income quartile.
    delay_groups = [
        df.loc[in_income_q & (df['in_degree_quartile'] == degree_q), 'weighted_ID'].values
        for degree_q in QUARTILES
    ]
    print(f_oneway(*delay_groups))


F_onewayResult(statistic=721.2482412060185, pvalue=1.0882675548422818e-205)
F_onewayResult(statistic=613.9531231104972, pvalue=1.247013782581693e-188)
F_onewayResult(statistic=483.9489566317073, pvalue=1.274798421935157e-164)
F_onewayResult(statistic=552.1207479199736, pvalue=9.1059413001952e-178)


### Part II

In [8]:
from scipy.stats import ttest_ind

In [3]:
# Long-form table of infection delays, split by outbreak scenario.
# NOTE: this rebinds `df`, replacing the Part I table loaded under that name.
df = pd.read_csv('data_files/result_data/longform_outbreak_split_delays.csv')

In [7]:
# Summary statistics of infection_delay for each outbreak_centrality label.
df.groupby('outbreak_centrality')['infection_delay'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
outbreak_centrality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
High Outbreak Centrality,2599.0,7.249165,0.570394,4.717822,6.894848,7.290538,7.640882,8.924146
Low Outbreak Centrality,2599.0,7.56893,0.471436,5.369714,7.29298,7.569273,7.850733,9.24625


In [12]:
# Extract the infection delays of each outbreak scenario as NumPy arrays so
# the two distributions can be compared.
high_outbreak_centrality = df.loc[
    df['outbreak_centrality'] == 'High Outbreak Centrality', 'infection_delay'
].values
low_outbreak_centrality = df.loc[
    df['outbreak_centrality'] == 'Low Outbreak Centrality', 'infection_delay'
].values

In [13]:
# Two-sample t-test for independent samples (called with its default options)
# on the high- vs. low-scenario infection delays.
# NOTE(review): both groups have 2599 rows and come from merges against the
# same `master` table, so they presumably describe the same hexagons under two
# scenarios — confirm whether a paired test would be more appropriate.
print(ttest_ind(high_outbreak_centrality, low_outbreak_centrality))

Ttest_indResult(statistic=-22.029353806637893, pvalue=6.79466867301726e-103)
