In [2]:
import geopandas as gpd
import pandas as pd
from covid19dh import covid19
import requests
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from shapely.geometry import Point, LineString
import contextily as ctx

## Airports

In [28]:
# Load the airport point layer from the raw-data shapefile.
airports = gpd.read_file('../raw data/Trans_AirportPoint.shp')

In [29]:
# Keep rows whose feature code is 20000 and whose owner type is 1.
# NOTE(review): presumably these codes identify publicly owned airports -
# confirm against the layer's data dictionary.
has_fcode = airports['fcode'] == 20000
has_ownertype = airports['ownertype'] == 1
public_airports = airports[has_fcode & has_ownertype]
public_airports

In [30]:
# Keep airports whose name contains "ational Airport", which matches both
# "National Airport" and "International Airport".
# regex=False makes this a literal substring test; na=False treats a missing
# name as "no match" so the boolean mask never contains NaN.
name_matches = public_airports['name'].str.contains('ational Airport', regex=False, na=False)
key_airports = public_airports[name_matches]

In [31]:
# List the names of the selected airports for a quick visual check.
key_airports.name.to_list()

In [58]:
# Save the selected airports as a GeoPackage in the working directory, then display them.
key_airports.to_file('key_airports_locs.gpkg', index=False, driver="GPKG")
key_airports

In [32]:
# Reverse-geocode every key airport to its county with the Census geocoder.
# Each entry of `locs` is [county GEOID, county NAME] as returned by the API.
GEOCODER_URL = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"

locs = []
with requests.Session() as session:  # reuse one connection for all lookups
    for point in key_airports.geometry:
        response = session.get(
            GEOCODER_URL,
            params={
                # x/y are used as longitude/latitude; this assumes the layer is
                # stored in a geographic CRS - TODO confirm.
                "x": point.x,
                "y": point.y,
                "benchmark": "Public_AR_Current",
                "vintage": "Current_Current",
                "format": "json",
            },
            timeout=30,  # never hang the notebook on a stalled request
        )
        response.raise_for_status()  # surface HTTP errors instead of a confusing KeyError

        counties = response.json()['result']['geographies']['Counties']
        if not counties:
            raise ValueError(f"No county returned for coordinates ({point.x}, {point.y})")

        county = counties[0]
        locs.append([county['GEOID'], county['NAME']])

In [33]:
# Number of [fips, county] entries collected so far.
len(locs)

In [34]:
# Tabulate the geocoding results: one row per airport, with its county FIPS code and county name.
locs_df = pd.DataFrame(locs, columns=['fips', 'county_name'])
locs_df

## Highways

In [37]:
# Load the first of the two road-segment shapefiles.
rd0 = gpd.read_file('../raw data/Trans_RoadSegment_0.shp')

In [38]:
# Load the second of the two road-segment shapefiles.
rd1 = gpd.read_file('../raw data/Trans_RoadSegment_1.shp')

In [39]:
# Stack both road-segment files row-wise into one table.
# The original row labels are kept, so index values may repeat.
roads = pd.concat([rd0, rd1], axis=0)

In [40]:
# Build one row per (interstate, county) pair from the road segments.
# `interstate` may hold several comma-separated designations, so the value is
# split and exploded. Using .loc plus .assign produces a new frame, so no
# column is ever written onto a filtered slice of `roads`.
interstates = (
    roads.loc[roads['interstate'].notna(), ['interstate', 'stco_fipsc']]
    .assign(interstate=lambda d: d['interstate'].astype(str).str.split(','))
    .explode('interstate')
    # Strip padding so "I-55" and " I-55" are not treated as different roads.
    .assign(interstate=lambda d: d['interstate'].str.strip())
)

In [41]:
# County x interstate incidence matrix: 1 when the county has at least one
# segment of that interstate, 0 otherwise.
interstate_flags = interstates.assign(value=1)
interstate_matrix = pd.pivot_table(
    interstate_flags,
    values='value',
    index='stco_fipsc',
    columns='interstate',
    aggfunc='max',
    fill_value=0,
)

In [42]:
# Display the county x interstate incidence matrix.
interstate_matrix

## Exogenous factors

### Negative values are NOT real negatives: the minus sign only flags the value as a best guess

See https://covid19datahub.io/articles/docs.html

In [7]:
# Load the exogenous-variable table for the USA from the raw-data folder.
data = pd.read_csv('../raw data/USA_exog_var.csv')

In [8]:
# Keep only rows that have a level-3 administrative area.
# NOTE(review): presumably level 3 means county-level records - confirm.
has_level_3 = data['administrative_area_level_3'].notna()
exo_county = data[has_level_3]

In [9]:
# Restrict the county-level rows to Illinois.
# .copy() detaches the result from `exo_county`, because later cells modify
# `exo_il_counties` in place; without it pandas raises SettingWithCopyWarning.
exo_il_counties = exo_county[exo_county.administrative_area_level_2 == 'Illinois'].copy()
exo_il_counties.head()

In [10]:
# Drop identifier and key columns that later cells do not use.
# Rebinding the name (rather than inplace=True) avoids mutating a slice, and
# errors='ignore' lets the cell be re-run after the columns are already gone.
unused_columns = [
    'id', 'administrative_area_level_2', 'iso_alpha_2', 'iso_alpha_3',
    'iso_currency', 'iso_numeric', 'key_google_mobility',
    'key_apple_mobility', 'key_jhu_csse', 'key_gadm',
]
exo_il_counties = exo_il_counties.drop(columns=unused_columns, errors='ignore')

In [24]:
# Preview the remaining columns.
exo_il_counties.head()

### County coordinates (temporary-ish)

In [25]:
# One row per distinct (key_local, latitude, longitude) combination.
# NOTE(review): another cell in this notebook renames 'key_local' to
# 'location', so this cell only works if it runs before that rename.
counties_coords = exo_il_counties[['key_local', 'latitude', 'longitude']].drop_duplicates()

In [26]:
# Convert the county key to a plain integer string and use it as the index.
# Going through float -> int handles 17001.0, 17001 and "17001" alike. The
# earlier "drop the last two characters" approach only worked for floats and
# silently truncated any other representation.
counties_coords = (
    counties_coords
    .assign(key_local=lambda d: d['key_local'].map(lambda v: str(int(float(v)))))
    .set_index('key_local')
)
counties_coords

## County Adjacencies

Source: U.S. Census Bureau county adjacency file, https://www.census.gov/geographies/reference-files/time-series/geo/county-adjacency.html

In [50]:
# Read the pipe-delimited county adjacency file.
# dtype=str keeps every column as text, including the numeric-looking ones.
county_adj = pd.read_csv('../raw data/county_adjacency2025.txt', delimiter='|', dtype=str)

In [51]:
# Display the raw adjacency table.
county_adj

In [52]:
# Take the last two characters of the county name as the state.
# NOTE(review): assumes names end in a two-letter state code - confirm.
# The vectorised .str accessor replaces a per-row lambda and leaves missing
# names as NaN instead of raising.
county_adj['state'] = county_adj['County Name'].str[-2:]

In [53]:
# Adjacency rows whose county is in Illinois.
# .copy() makes `il_adj` independent, so the new column below is not written
# onto a filtered view of `county_adj`.
il_adj = county_adj[county_adj.state == 'IL'].copy()
# Last two characters of the neighbor name, used as the neighbor's state.
il_adj['neighbor_state'] = il_adj['Neighbor Name'].str[-2:]
il_adj

In [54]:
# County x neighbor matrix of `Length` for pairs that are both in Illinois;
# pairs with no entry get 0.
# The adjacency file is read with dtype=str, so `Length` must be converted to
# a number first. Otherwise aggfunc='sum' concatenates text instead of adding.
il_pairs = il_adj[il_adj.neighbor_state == 'IL']
il_pairs = il_pairs.assign(Length=pd.to_numeric(il_pairs['Length']))
il_county_adj_matrix_base = il_pairs.pivot_table(
    index='County GEOID', columns='Neighbor GEOID', values='Length',
    aggfunc='sum', fill_value=0
)

In [55]:
# Display the border adjacency matrix.
il_county_adj_matrix_base

## R0
https://pmc.ncbi.nlm.nih.gov/articles/PMC10067514

In [157]:
# One R0 value per variant.
# Building the frame from a dict keeps `r0` as a float column; the earlier
# list-of-lists + transpose construction produced object dtype.
r0 = pd.DataFrame({
    'variant': ['Alpha', 'Beta', 'Gamma', 'Delta', 'Omicron'],
    'r0': [1.19, 1.15, 1.21, 1.17, 1.53],
})

In [159]:
# Write the variant/R0 table to the working directory without the row index.
r0.to_csv('r0.csv', index=False)

## Airport Adjacency Matrix

In [36]:
# Display the airport -> county lookup again for reference.
locs_df

In [35]:
# All ordered pairs (x, y) of distinct airport county FIPS codes.
# merge(how='cross') builds the Cartesian product directly, so the temporary
# constant 'key' column is no longer needed.
unique_airport_fips = locs_df['fips'].drop_duplicates()
airport_fips = unique_airport_fips.to_frame('x')
airport_fips_dummy = unique_airport_fips.to_frame('y')

airport_fips_cartesian = airport_fips.merge(airport_fips_dummy, how='cross')
airport_fips_cartesian

In [43]:
# All-zero matrix with the same row and column labels as the border matrix,
# with both sets of labels cast to integers.
airport_rows = il_county_adj_matrix_base.index.astype(int)
airport_cols = il_county_adj_matrix_base.columns.astype(int)
il_county_adj_matrix_airport = pd.DataFrame(0, index=airport_rows, columns=airport_cols)

In [44]:
# Mark every (x, y) pair of airport counties as connected.
for pair in airport_fips_cartesian.itertuples(index=False):
    il_county_adj_matrix_airport.loc[int(pair.x), int(pair.y)] = 1

In [45]:
# Zero the diagonal so no county is linked to itself.
# The diagonal is filled on an explicit copy and the frame is rebuilt:
# `DataFrame.values` is not guaranteed to be a writable view, and under pandas
# Copy-on-Write it is read-only.
airport_values = il_county_adj_matrix_airport.to_numpy(copy=True)
np.fill_diagonal(airport_values, 0)
il_county_adj_matrix_airport = pd.DataFrame(
    airport_values,
    index=il_county_adj_matrix_airport.index,
    columns=il_county_adj_matrix_airport.columns,
)
il_county_adj_matrix_airport

In [46]:
# Save the airport adjacency matrix to the working directory.
# NOTE(review): the same matrix is also written as 'airport_adj_matrix.csv'
# elsewhere in this notebook - confirm whether both files are needed.
il_county_adj_matrix_airport.to_csv('major_airports_adj_matrix.csv', index=True)

## Highway Adjacency Matrix

In [60]:
# County x county matrix: each entry is the number of interstates that the
# two counties share (incidence matrix times its transpose).
shared_interstates = interstate_matrix.dot(interstate_matrix.T)

# Zero the diagonal on an explicit copy and rebuild the frame. Writing through
# `DataFrame.values` is not guaranteed to reach the frame, and under pandas
# Copy-on-Write it is read-only.
highway_values = shared_interstates.to_numpy(copy=True)
np.fill_diagonal(highway_values, 0)
il_county_adj_matrix_highway = pd.DataFrame(
    highway_values,
    index=shared_interstates.index,
    columns=shared_interstates.columns,
)

In [61]:
# Align the highway matrix to the airport matrix's county labels on both axes;
# counties missing from the highway data become all-zero rows/columns.
# NOTE(review): the airport index was cast to int. If 'stco_fipsc' labels are
# strings, no label will match and the whole result will be 0 - confirm dtypes.
il_county_adj_matrix_highway = il_county_adj_matrix_highway.reindex(index=il_county_adj_matrix_airport.index, columns=il_county_adj_matrix_airport.index, fill_value=0)

## Combined Adjacency Matrix

In [63]:
# Save the three adjacency matrices (border, airport, highway) to the working directory.
# NOTE(review): the next cell reads these file names from '../processed data/'.
# Confirm that the working directory is that folder, or that the files are
# moved there, so a fresh run does not load stale copies.
il_county_adj_matrix_base.to_csv('border_adj_matrix.csv', index=True)
il_county_adj_matrix_airport.to_csv('airport_adj_matrix.csv', index=True)
il_county_adj_matrix_highway.to_csv('highway_adj_matrix.csv', index=True)

In [49]:
# Reload the three adjacency matrices from the processed-data folder, using
# the first CSV column as the row index. This overwrites the in-memory versions.
# NOTE(review): after a CSV round trip the column labels are read from the
# header as text while the index is parsed from data - confirm that the three
# frames still align on both axes.
il_county_adj_matrix_base = pd.read_csv('../processed data/border_adj_matrix.csv', index_col=0)
il_county_adj_matrix_airport = pd.read_csv('../processed data/airport_adj_matrix.csv', index_col=0)
il_county_adj_matrix_highway = pd.read_csv('../processed data/highway_adj_matrix.csv', index_col=0)

In [50]:
def linear_comb(border_weight, airport_weight, highway_weight,
                *, border=None, airport=None, highway=None):
    """Return the weighted sum of the border, airport and highway matrices.

    Parameters
    ----------
    border_weight, airport_weight, highway_weight : float
        Multipliers applied to the respective adjacency matrix.
    border, airport, highway : pandas.DataFrame, optional
        Matrices to combine. When omitted, the notebook-level
        `il_county_adj_matrix_base`, `il_county_adj_matrix_airport` and
        `il_county_adj_matrix_highway` are used, so existing calls behave
        as before.

    Returns
    -------
    pandas.DataFrame
        Float matrix, aligned on row and column labels by pandas.
    """
    border = il_county_adj_matrix_base if border is None else border
    airport = il_county_adj_matrix_airport if airport is None else airport
    highway = il_county_adj_matrix_highway if highway is None else highway
    # astype(float) is the vectorised form of casting every cell with float().
    return (border.astype(float) * border_weight +
            airport_weight * airport.astype(float) +
            highway_weight * highway.astype(float))

In [51]:
# Combine the three matrices with border / airport / highway weights.
# NOTE(review): the origin of these weights is not documented in the notebook -
# record where they come from, or move them to named constants.
adj_mat = linear_comb(0.001020023053, 100, 20)

In [52]:
# Display the combined adjacency matrix.
adj_mat

## NYT Cases Data

In [171]:
# Load the county case files, one per year from 2020 to 2023.
cases_20, cases_21, cases_22, cases_23 = map(
    pd.read_csv,
    [
        '../raw data/us-counties-2020.csv',
        '../raw data/us-counties-2021.csv',
        '../raw data/us-counties-2022.csv',
        '../raw data/us-counties-2023.csv',
    ],
)

In [172]:
# Display the 2021 file to inspect its columns.
cases_21

In [173]:
# (rows, columns) of each yearly file.
cases_20.shape, cases_21.shape, cases_22.shape, cases_23.shape

In [174]:
# Stack the four yearly files row-wise.
# The per-file row labels are kept, so index values repeat across years.
cases = pd.concat([cases_20, cases_21, cases_22, cases_23], axis=0)
cases.head()

In [175]:
# Strip the "USA-" prefix from the geoid, leaving the numeric part.
# removeprefix only removes the prefix when it is present, so re-running the
# cell does not cut further characters off an already-cleaned value.
cases['geoid'] = cases['geoid'].astype(str).str.removeprefix('USA-')
cases.head()

## Variant Prevalence

In [161]:
# Load the three Illinois variant-prevalence files from the raw-data folder.
variant = pd.read_csv('../raw data/variant_prevalence_il.csv')
variant_2 = pd.read_csv('../raw data/variant_prevalence_2_il.csv')
variant_omicron = pd.read_csv('../raw data/omicron_prevalence_il.csv')

In [4]:
# (rows, columns) of each prevalence file.
variant.shape, variant_2.shape, variant_omicron.shape

In [5]:
# Stack the three prevalence files row-wise into one table.
combined_variant = pd.concat([variant, variant_2, variant_omicron], axis=0)

In [6]:
# Preview the stacked prevalence table.
combined_variant.head()

In [11]:
# Display the Illinois exogenous-variable table for reference.
exo_il_counties

In [12]:
# Keep only the columns needed for the prevalence time series.
# .copy() makes `props` independent of `combined_variant`, so the column
# rewrite in the following cell does not trigger SettingWithCopyWarning.
props = combined_variant[['date', 'proportion', 'lineage', 'location']].copy()
props.head()

In [13]:
# Reduce `location` to its last five characters.
# NOTE(review): presumably this is the 5-digit county FIPS code - confirm.
# .assign returns a new frame, so nothing is written onto a slice of
# `combined_variant`, and .str[-5:] replaces the per-row lambda.
props = props.assign(location=props['location'].str[-5:])
props

In [14]:
# Wide table: one row per (location, date), one column per lineage, 0 where a
# lineage has no record. pivot_table's default aggregation (mean) applies
# when a (location, date, lineage) combination occurs more than once.
props_ts = props.pivot_table(values='proportion', index=['location', 'date'], columns='lineage', fill_value=0).reset_index()

In [15]:
# Parse the date column into datetimes.
props_ts['date'] = pd.to_datetime(props_ts['date'])

In [16]:
# Parse dates and turn the county key into an integer-formatted string.
exo_il_counties['date'] = pd.to_datetime(exo_il_counties['date'])
exo_il_counties['key_local'] = exo_il_counties['key_local'].map(lambda code: str(int(code)))

In [17]:
# Rename the county key to 'location', the column name used as a merge key later.
exo_il_counties = exo_il_counties.rename(columns={'key_local': 'location'})

In [18]:
# Replace the values in column positions 1-28 with their absolute values.
# NOTE(review): the positional range depends on the current column order -
# confirm that it covers exactly the intended columns.
exo_il_counties.iloc[:, 1:29] = exo_il_counties.iloc[:, 1:29].abs()

In [147]:
# exo_il_counties.to_csv('cases_and_exogenous_il.csv', index=False)

In [19]:
# Preview the wide prevalence table.
props_ts.head()

In [20]:
# Largest row total across all columns from position 2 onwards (the lineage columns).
props_ts.iloc[:,2:].sum(axis=1).max()

In [35]:
# Save the wide prevalence table to the working directory without the row index.
props_ts.to_csv('pivoted_prevalences.csv', index=False)

In [20]:
# Total Omicron share = sum of the four Omicron sub-lineage columns.
# Columns are selected by name rather than "the last four columns": that
# positional form gave a different result if the cell was run a second time,
# because 'Omicron' itself had then become the last column.
omicron_parts = ['omicron_1', 'omicron_2', 'omicron_3', 'omicron_4']
props_ts['Omicron'] = props_ts[omicron_parts].sum(axis=1)

In [21]:
# Remove the four Omicron sub-lineage columns.
props_ts = props_ts.drop(columns=['omicron_1', 'omicron_2', 'omicron_3', 'omicron_4'])

In [22]:
# Display the prevalence table after the sub-lineage columns were dropped.
props_ts

In [23]:
def _to_daily(location_frame):
    """Drop the location level and resample one location's rows to daily frequency."""
    return location_frame.droplevel(0).asfreq('D')


# Put every location on a daily calendar; dates with no record become
# all-NaN rows.
indexed_props = props_ts.set_index(['location', 'date'])
prop_ts_dfilled = indexed_props.groupby(level=0).apply(_to_daily).reset_index()


In [29]:
# Display the daily-frequency prevalence table.
prop_ts_dfilled

In [34]:
# Count the missing values in every column, per location.
prop_ts_dfilled.isna().groupby(prop_ts_dfilled['location']).sum()

In [None]:
# Overlay each location's Omicron series on one set of axes.
shared_ax = plt.gca()
for _, location_rows in prop_ts_dfilled.groupby('location'):
    location_rows.plot(x='date', y='Omicron', ax=shared_ax)
plt.show()

In [38]:
# Time-weighted interpolation of every numeric column within each location.
# limit_area='inside' fills only gaps that lie between two observed values.
# groupby(...)[num_cols].transform works on the numeric columns alone, so the
# result does not depend on groupby.apply passing the grouping column to the
# callback (deprecated in pandas 2.2), and the frame is indexed by date once
# rather than once per column.
num_cols = prop_ts_dfilled.select_dtypes(include='number').columns

props_by_date = prop_ts_dfilled.set_index('date')
props_by_date[num_cols] = (
    props_by_date.groupby('location')[num_cols]
    .transform(lambda s: s.interpolate(method='time', limit_area='inside'))
)
out = props_by_date.reset_index()


In [None]:
# Overlay each location's interpolated Alpha series on one set of axes.
shared_ax = plt.gca()
for _, location_rows in out.groupby('location'):
    location_rows.plot(x='date', y='Alpha', ax=shared_ax)
plt.show()

In [68]:
# Latest date present in the interpolated prevalence table.
out.date.max()

In [42]:
# Largest row total across the numeric columns after interpolation.
out[num_cols].sum(axis=1).max()

In [181]:
# Rename 'geoid' to 'location' so it matches the merge key used below.
cases = cases.rename(columns={'geoid': 'location'})

In [179]:
# Parse the case dates into datetimes.
cases['date'] = pd.to_datetime(cases['date'])

In [311]:
# Join cases with the interpolated prevalences.
# how='inner' keeps only the (date, location) pairs present in both tables.
combined = cases.merge(out, on=['date', 'location'], how='inner')

In [312]:
# (rows, columns) of the merged table.
combined.shape

In [313]:
# Drop the county/state names and all death-related columns.
combined = combined.drop(columns=['county', 'state', 'deaths', 'deaths_avg', 'deaths_avg_per_100k'])
combined.head()

In [314]:
def expand_to_daily(g, start='2021-01-01', end='2023-01-01'):
    """Reindex one location's date-indexed rows onto a full daily calendar.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows for a single location, indexed by date.
    start, end : str or pandas.Timestamp, optional
        Inclusive bounds of the daily calendar. The defaults are the
        window this notebook was written with.

    Returns
    -------
    pandas.DataFrame
        `g` reindexed to every day in [start, end]; days without a record
        are all-NaN rows.
    """
    daily_index = pd.date_range(pd.Timestamp(start), pd.Timestamp(end), freq='D', name='date')
    return g.reindex(daily_index)


combined = combined.set_index('date')

# Only the non-key columns are handed to the callback. The location comes
# back as the outer index level (group_keys=True), so the result does not
# depend on groupby.apply also passing the grouping column (deprecated in
# pandas 2.2).
value_cols = [col for col in combined.columns if col != 'location']
combined = (combined.groupby('location', group_keys=True)[value_cols]
            .apply(expand_to_daily))

# Name both index levels, then turn them back into ordinary columns.
combined.index = combined.index.set_names(['location', 'date'])
combined = combined.reset_index()

In [315]:
# Days added by the daily expansion have no case record; treat them as zero.
case_columns = ['cases', 'cases_avg', 'cases_avg_per_100k']
combined[case_columns] = combined[case_columns].fillna(0)
combined.head()

In [316]:
# Linearly interpolate the numeric columns within each location.
# transform restricts the work to numeric columns and keeps the frame's
# existing index and 'location' column, so it does not rely on groupby.apply
# passing the grouping column to the callback (deprecated in pandas 2.2).
numeric_cols = combined.select_dtypes(include='number').columns
combined[numeric_cols] = (
    combined.groupby('location')[numeric_cols]
    .transform(lambda s: s.interpolate())
)

In [317]:
# List the columns of the merged table.
combined.columns

In [326]:
# Per-row total of the seven named variant proportions.
combined[['Alpha', 'Beta', 'Delta', 'Epsilon', 'Gamma', 'Iota', 'Omicron']].sum(axis=1)

In [327]:
# 'Other' is whatever share is left after the seven named variants (1 - their sum).
combined['Other'] = 1 - combined[['Alpha', 'Beta', 'Delta', 'Epsilon', 'Gamma', 'Iota', 'Omicron']].sum(axis=1)

In [329]:
# Where there are no cases, set every variant share (including 'Other') to 0.
share_columns = ['Alpha', 'Beta', 'Delta', 'Epsilon', 'Gamma', 'Iota', 'Omicron', 'Other']
no_cases = combined['cases'] == 0
combined.loc[no_cases, share_columns] = 0

In [330]:
# Discard the current index and replace it with a fresh 0..n-1 range.
combined.reset_index(drop=True, inplace=True)

In [331]:
# Estimated cases per variant = variant share x case count, row by row,
# placed next to the location/date keys.
share_columns = ['Alpha', 'Beta', 'Delta', 'Epsilon', 'Gamma', 'Iota', 'Omicron', 'Other']
variant_case_counts = combined[share_columns].multiply(combined['cases'], axis=0)
var_cases = pd.concat([combined[['location', 'date']], variant_case_counts], axis=1)

In [332]:
# Display the per-variant case table.
var_cases

In [333]:
# Keep the merge keys plus every column whose name contains 'index'.
index_columns = exo_il_counties.filter(like='index').columns.to_list()
indices = exo_il_counties[['location', 'date'] + index_columns]

In [334]:
# Attach the index covariates to the per-variant cases (left join keeps every
# case row), then linearly interpolate the numeric columns within each location.
# transform works on numeric columns only and preserves the row order, so it
# does not rely on groupby.apply passing the grouping column to the callback
# (deprecated in pandas 2.2).
cases_with_indices = var_cases.merge(indices, how='left', on=['location', 'date'])
numeric_cols = cases_with_indices.select_dtypes(include='number').columns
cases_with_indices[numeric_cols] = (
    cases_with_indices.groupby('location')[numeric_cols]
    .transform(lambda s: s.interpolate())
)
combined_covariates_v3 = cases_with_indices.reset_index(drop=True)

In [335]:
# Replace negative values with 0 in every column except the two keys.
is_value_column = ~combined_covariates_v3.columns.isin(['location', 'date'])
non_negative = combined_covariates_v3.select_dtypes(include='number').clip(lower=0)
combined_covariates_v3.loc[:, is_value_column] = non_negative

In [336]:
# Display the combined covariate table.
combined_covariates_v3

In [284]:
# Small multiples: one panel per location showing Alpha, Omicron and Delta cases.
groups = combined_covariates_v3['location'].unique()

# Derive the grid from the number of locations, so no location is silently
# dropped if there are more than a fixed grid can hold.
n_cols = 11
n_rows = max(1, int(np.ceil(len(groups) / n_cols)))
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(30, 30), sharex=True, squeeze=False)

axes = axes.flatten()

for ax, g in zip(axes, groups):
    sub = combined_covariates_v3[combined_covariates_v3['location'] == g]
    ax.plot(sub['date'], sub['Alpha'], color='red', label='Alpha')
    ax.plot(sub['date'], sub['Omicron'], color='blue', label='Omicron')
    ax.plot(sub['date'], sub['Delta'], color='green', label='Delta')
    ax.set_title(f'County {g}')
    ax.set_ylabel('y')

# One legend is enough: every panel uses the same colours.
if len(groups) > 0:
    axes[0].legend(loc='upper left')

axes[-1].set_xlabel('date')
plt.tight_layout()
plt.show()


In [287]:
# Totals per date across all locations.
# numeric_only=True keeps the text 'location' column out of the sum; without
# it pandas concatenates every location string for every date.
tmp = combined_covariates_v3.groupby('date').sum(numeric_only=True).reset_index()

plt.plot(tmp['date'], tmp['Alpha'], color='red', label='Alpha')
plt.plot(tmp['date'], tmp['Omicron'], color='blue', label='Omicron')
plt.plot(tmp['date'], tmp['Delta'], color='green', label='Delta')
plt.legend()
plt.tight_layout()
plt.show()


In [299]:
# Display the combined covariate table again before smoothing.
combined_covariates_v3

In [337]:
def _centered_weekly_mean(series):
    """Centered 7-row rolling mean; shorter windows are used at the edges."""
    return series.rolling(7, min_periods=1, center=True).mean()


# Smooth every column after the first two (the keys), separately per location.
value_columns = combined_covariates_v3.columns.to_list()[2:]
combined_covariates_v3_rolling = combined_covariates_v3.copy()
combined_covariates_v3_rolling[value_columns] = (
    combined_covariates_v3.groupby('location')[value_columns]
    .transform(_centered_weekly_mean)
)
combined_covariates_v3_rolling.head()

In [354]:
# Totals per date across all locations; numeric_only=True keeps the text
# 'location' column out of the sum.
tmp_3 = combined_covariates_v3_rolling.groupby('date').sum(numeric_only=True).reset_index()

# Dividing the summed index values by the number of locations gives their
# average. The count is derived from the data instead of being hard-coded.
n_locations = combined_covariates_v3_rolling['location'].nunique()

fig, ax1 = plt.subplots(figsize=(12,8))

ax2 = ax1.twinx()  # secondary y-axis for the policy indices

variant_styles = [
    ('Alpha', 'red', 'Alpha'),
    ('Omicron', 'orange', 'Omicron'),
    ('Delta', 'blue', 'Delta'),
    ('Beta', 'brown', 'Beta'),
    ('Epsilon', 'yellow', 'Epsilon'),
    ('Gamma', 'green', 'Gamma'),
    ('Other', 'gray', 'Other Variants'),
]
for column, color, label in variant_styles:
    ax1.plot(tmp_3['date'], tmp_3[column], color=color, label=label)

index_styles = [
    ('government_response_index', 'black', 'Government Response'),
    ('stringency_index', 'lightsteelblue', 'Stringency'),
    ('containment_health_index', 'pink', 'Containment Health'),
]
for column, color, label in index_styles:
    ax2.plot(tmp_3['date'], tmp_3[column] / n_locations, label=label, color=color)

lines_1, labels_1 = ax1.get_legend_handles_labels()
lines_2, labels_2 = ax2.get_legend_handles_labels()
ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc='upper right')
ax1.set_ylabel('7-day Rolling Average of Cases')
ax2.set_ylabel('Average Index Strength')
ax2.set_ylim(bottom=0, top=100)
# Label the x-axis on ax1: the twin axes' own x-axis is invisible, so a label
# set through plt.xlabel (which targets the most recent axes) never shows.
ax1.set_xlabel('Date')
plt.tight_layout()
plt.show()


In [339]:
# Save the smoothed covariate table to the processed-data folder without the row index.
combined_covariates_v3_rolling.to_csv('../processed data/rolling_covariates_v3.csv', index=False)