In [1]:
import geopandas as gpd
import pandas as pd
import requests
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from shapely.geometry import LineString, MultiLineString, Point, MultiPoint
from shapely import set_precision
import contextily as ctx
from shapely.ops import unary_union, linemerge, snap
from shapely.validation import make_valid
from math import isfinite
from scipy.spatial import cKDTree
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable
import matplotlib.lines as mlines
import matplotlib.patches as mpatches


## States

In [None]:
states = gpd.read_file('../raw data/state level data/tl_2024_us_state/tl_2024_us_state.shp').to_crs(5070)
states.head()

In [None]:
states.shape

In [None]:
# State FIPS codes to exclude from the analysis.
# NOTE(review): '02'/'15' look like Alaska/Hawaii and '60'/'66'/'69'/'72'/'78' the island
# territories; '03'/'07'/'14'/'43'/'52' appear to be reserved legacy codes — confirm.
EXCLUDED_STATEFP = ['02', '03', '07', '14', '15', '43', '52', '60', '66', '69', '72', '78']

# .copy() detaches the filtered frame from the original so that adding columns to
# `states` later does not raise SettingWithCopyWarning.
states = states[~states.STATEFP.isin(EXCLUDED_STATEFP)].copy()
states.shape

In [None]:
states['centroid'] = states.geometry.centroid
states.head()

## Highways

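Weights are $W^H_{ij} = \frac{M_i^{\alpha} M_j^{\alpha}}{C_{ij}^{\beta}}$ (with $\alpha = 1$ and $\beta = 2$ in the code below), where $M_i = \sum_{s \in i} \text{AADT}_s \times \text{lanes}_s \times \text{miles}_s$ sums over the interstate segments $s$ within state $i$, and $C_{ij}$ is the shortest interstate travel time (in minutes) between the graph nodes nearest to the centroids of states $i$ and $j$. The diagonal is set to zero, and the weights are then normalized by dividing by the maximum value.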

In [None]:
highways = gpd.read_file('../raw data/state level data/NHS/National_Highway_System_(NHS).shp').to_crs(5070)

In [None]:
highways.head()

In [None]:
highways.shape

In [None]:
interstates = highways[highways.SIGNT1 == 'I']

In [None]:
interstates = interstates[['STFIPS', 'CTFIPS', 'ROUTEID', 'SIGNN1', 'LNAME', 'SPEED_LIMI','AADT','THROUGH_LA','MILES', 'geometry']]

In [None]:
# Normalise state FIPS codes to zero-padded two-character strings.
# .assign() returns a new frame, so nothing is written into the filtered view of
# `highways` (avoids SettingWithCopyWarning).
interstates = interstates.assign(
    STFIPS=interstates.STFIPS.astype(int).astype(str).str.zfill(2)
)

In [None]:
interstates = interstates[~interstates.STFIPS.isin(['02', '15', '72'])]

In [None]:
# Replace speed limits below 45 with 65.
# NOTE(review): presumably values under 45 are missing/placeholder entries for an
# interstate — confirm against the NHS data dictionary.
# NaN compares False against 45, so missing speed limits are left as NaN (unchanged
# from the previous element-wise version).
speed_limit = interstates.SPEED_LIMI
interstates = interstates.assign(SPEED_LIMI=speed_limit.mask(speed_limit < 45, 65))

## Graph construction

In [None]:
interstates["geometry"] = interstates["geometry"].apply(make_valid)

In [None]:
# explode multilines so we work with LineString pieces
interstates = interstates.explode(index_parts=False).reset_index(drop=True)


In [None]:
# Snap every geometry to a precision grid of size 200, in CRS units (presumably
# metres, since the layer was reprojected with .to_crs(5070) — TODO confirm).
# NOTE(review): presumably done so nearly coincident endpoints coincide exactly
# before the union/noding step below.
interstates["geometry"] = interstates.geometry.apply(lambda g: set_precision(g, 200))

In [None]:
# Merge all interstate geometries into one geometry; the pieces are extracted
# from it below with iter_lines().
U = unary_union(interstates.geometry)  # noding happens here

def iter_lines(geom):
    """Yield every LineString contained in ``geom``.

    MultiLineStrings are walked recursively. Any other geometry type is passed
    through ``linemerge`` and the result is walked only if it is a LineString or
    MultiLineString. Empty geometries yield nothing.
    """
    if geom.is_empty:
        return

    if isinstance(geom, MultiLineString):
        for part in geom.geoms:
            yield from iter_lines(part)
        return

    if isinstance(geom, LineString):
        yield geom
        return

    # Anything else: try to merge it into line work and recurse on the result.
    merged = linemerge(geom)
    if isinstance(merged, (LineString, MultiLineString)):
        yield from iter_lines(merged)

# Flatten the unioned geometry into individual LineString pieces and give each
# piece an integer id (`nid`, taken from the row index) that is used as the
# grouping/merge key in the cells below.
noded_lines = list(iter_lines(U))
noded = gpd.GeoDataFrame(geometry=noded_lines, crs=interstates.crs)
noded = noded.reset_index().rename(columns={"index":"nid"})

In [None]:
# Attach the original segment attributes (AADT, lanes, speed limit) to the noded
# pieces with an "identity" overlay. One piece may come back as several rows,
# which is why the result is aggregated per `nid` further down.
# keep_geom_type=False keeps overlay results of any geometry type.
# NOTE(review): pieces with no overlapping source segment presumably carry NaN
# attributes — confirm.
noded_attr = gpd.overlay(noded, interstates[['AADT', 'THROUGH_LA', 'SPEED_LIMI', "geometry"]],
                         how="identity", keep_geom_type=False)

In [None]:
noded_attr["miles_piece"] = noded_attr.length * 0.000621371

In [None]:
# Collapse the overlay rows back to one row per noded piece (`nid`).
# Weights are the row lengths in miles, floored at 1e-6 so that zero-length rows
# cannot make the weighted averages divide by a zero weight sum.
w = noded_attr["miles_piece"].clip(lower=1e-6)
agg = (noded_attr
       .assign(w=w)
       .groupby("nid")
       # Per piece: total miles, plus length-weighted means of speed, AADT and lanes.
       .apply(lambda df: pd.Series({
           "miles_piece": df["miles_piece"].sum(),
           "speed": np.average(df['SPEED_LIMI'], weights=df["w"]),
           "AADT":  np.average(df['AADT'],  weights=df["w"]),
           "lanes": np.average(df['THROUGH_LA'],  weights=df["w"]),
       }))
       .reset_index())

# Left merge keeps every noded piece, including ones without aggregated attributes.
# travel_min = miles / speed * 60 (minutes if speed is in miles per hour — TODO confirm units);
# speed is floored at 1e-6 to avoid division by zero.
# NOTE(review): np.average propagates NaN, so a single missing attribute in a
# group makes that group's average NaN.
noded_final = noded.merge(agg, on="nid", how="left")
noded_final["travel_min"] = (noded_final["miles_piece"] / noded_final["speed"].clip(lower=1e-6)) * 60.0


In [None]:
# Restore the original NHS column names on the aggregated attributes.
column_names = {'speed': 'SPEED_LIMI', 'lanes': 'THROUGH_LA', 'miles_piece': 'MILES'}
noded_final = noded_final.rename(columns=column_names)

In [None]:
# saved
# seg_in_state = gpd.overlay(
#     interstates[['AADT', 'THROUGH_LA', 'MILES', "geometry"]],
#     states[['STATEFP', "geometry"]],
#     how="intersection",
#     keep_geom_type=False,
# )


In [None]:
# seg_in_state.to_file('../raw data/state level data/seg_in_state.shp')

In [None]:
# Load the cached interstate-by-state intersection instead of recomputing the overlay.
# NOTE(review): the commented-out save cell writes to
# '../raw data/state level data/seg_in_state.shp', which is not the path read here —
# confirm this file is the output of that overlay.
# NOTE(review): no .to_crs() is applied here; the length-to-miles conversion in the
# next cell assumes the file's CRS uses metres — confirm.
seg_in_state = gpd.read_file('../raw data/state level data/NHS/highway_overlay_shp/seg_in_state.shp')
seg_in_state.head()

In [None]:
seg_in_state["part_miles"] = seg_in_state.length * 0.000621371

In [None]:
# Per-segment capacity = AADT x through lanes x miles inside the state.
# Missing AADT counts as 0, missing lane counts as 1, and miles are floored at 1e-6.
aadt = seg_in_state['AADT'].astype(float).fillna(0.0)
lanes = seg_in_state['THROUGH_LA'].astype(float).fillna(1.0)
miles = seg_in_state["part_miles"].clip(lower=1e-6)
seg_in_state["cap"] = aadt * lanes * miles

# State mass M = sum of segment capacities within each state.
M_by_state = seg_in_state.groupby('STATEFP', as_index=False).agg(M=("cap", "sum"))


In [None]:
# Attach the highway mass to each state; states without any interstate segment get 0.
# Any existing "M" column is dropped first so that re-running this cell does not
# produce M_x / M_y columns (and a KeyError on the fillna line).
states = (
    states
    .drop(columns="M", errors="ignore")
    .merge(M_by_state, on='STATEFP', how="left")
)
states["M"] = states["M"].fillna(0.0)


In [None]:
def iter_lines(geom):
    """Yield the LineString parts of ``geom``.

    A LineString is yielded as-is; a MultiLineString yields each of its non-empty
    members. ``None``, empty geometries and any other geometry type yield nothing.
    """
    if geom is None or geom.is_empty:
        return
    if isinstance(geom, MultiLineString):
        yield from (part for part in geom.geoms if not part.is_empty)
    elif isinstance(geom, LineString):
        yield geom

def coords2node(x, y, ndp=6):
    """Return ``(x, y)`` as floats rounded to ``ndp`` decimals.

    The rounded pair is used as a graph node key, so coordinates that differ only
    by floating-point noise map to the same node.
    """
    return tuple(round(float(value), ndp) for value in (x, y))


In [None]:
G = nx.Graph()

# One undirected edge per noded piece, joining its two (rounded) endpoints and
# weighted by travel time in minutes.
for seg in noded_final.itertuples(index=False):
    speed = float(seg.SPEED_LIMI)
    miles = float(seg.MILES)
    # Skip rows whose speed or length is missing, non-finite or non-positive.
    if not isfinite(speed) or not isfinite(miles) or miles <= 0 or speed <= 0:
        continue
    travel_min = (miles / speed) * 60.0

    for ls in iter_lines(seg.geometry):
        # Only the end points become nodes; interior vertices are ignored.
        x0, y0 = ls.coords[0]
        x1, y1 = ls.coords[-1]
        u = coords2node(x0, y0)
        v = coords2node(x1, y1)

        edge = G.get_edge_data(u, v)
        if edge is None:
            G.add_edge(u, v, travel_min=travel_min, miles=miles)
        else:
            # Parallel edges collapse to the smallest time and the smallest length.
            edge["travel_min"] = min(edge["travel_min"], travel_min)
            edge["miles"] = min(edge["miles"], miles)


In [None]:
components = list(nx.connected_components(G))
len(components)

In [None]:
[len(c) for c in components]

In [None]:
plt.hist([len(c) for c in components])
plt.show()

In [None]:
largest_nodes = max(components, key=len)

In [None]:
G = G.subgraph(largest_nodes).copy()

In [None]:
def graph_edges_gdf(G, crs):
    """Return the edges of ``G`` as a GeoDataFrame of straight two-point lines.

    Each row holds the edge's end nodes (``u``, ``v``), its ``travel_min`` and
    ``miles`` attributes (NaN when absent) and a LineString drawn directly from
    ``u`` to ``v``. Node keys are therefore expected to be coordinate tuples.
    """
    records = [
        {
            "u": start,
            "v": end,
            "travel_min": attrs.get("travel_min", np.nan),
            "miles": attrs.get("miles", np.nan),
            "geometry": LineString([start, end]),
        }
        for start, end, attrs in G.edges(data=True)
    ]
    return gpd.GeoDataFrame(records, geometry="geometry", crs=crs)

edges_gdf = graph_edges_gdf(G, crs=states.crs)  # G nodes are (x,y) in same CRS as `states`
edges_gdf.plot()

In [None]:
node_xy = np.array(list(G.nodes))

In [None]:
kdt = cKDTree(node_xy)
def nearest_node(pt):
    """Return the graph node (as a coordinate tuple) closest to point ``pt``.

    Looks up the nearest row of ``node_xy`` through the module-level KD-tree ``kdt``.
    """
    nearest_idx = kdt.query([pt.x, pt.y])[1]
    return tuple(node_xy[nearest_idx])
states["graph_node"] = states["centroid"].apply(nearest_node)

In [None]:
states.graph_node

In [None]:
# Every state anchor must be a node of G, otherwise the shortest-path search
# below cannot start from it. Fail loudly and name the offenders.
missing_nodes = [node for node in states.graph_node if node not in G]
assert not missing_nodes, (
    f"{len(missing_nodes)} state anchor node(s) are not in G: {missing_nodes[:5]}"
)

In [None]:
# T[i, j] = shortest travel time (minutes) between the anchor nodes of states i and j.
# np.inf marks "no path"; the weight computation below turns those entries into 0.
n = len(states)
T = np.full((n, n), np.inf, dtype=float)
state_ids = states['STATEFP'].tolist()
anchor_nodes = states["graph_node"].tolist()

# One single-source Dijkstra per origin state.
for i, src in enumerate(anchor_nodes):
    dist = nx.single_source_dijkstra_path_length(G, src, weight="travel_min")
    for j, dst in enumerate(anchor_nodes):
        # Unreachable destinations are absent from `dist`; keep the inf sentinel
        # instead of raising KeyError.
        T[i, j] = dist.get(dst, np.inf)

In [None]:
# A state's travel time to itself is set to inf so that its own gravity
# weight (mass / time) becomes 0.
np.fill_diagonal(T, np.inf)


In [None]:
alpha = 1 # state mass weight
beta = 2 # travel time weight

In [None]:
# Load previously saved state masses and travel times from disk.
# NOTE(review): this rebinds `T`, replacing the matrix computed by the Dijkstra loop
# above with a DataFrame; the cells below then work from the cached values.
# NOTE(review): index_col=0 without a dtype lets pandas infer the row labels, so FIPS
# codes such as "01" presumably come back as integers while the column labels stay
# strings (cf. the later `weights.loc[17, '18']` lookup) — confirm.
# NOTE(review): these files are read from a 'highway weight components' subfolder,
# whereas the save cells further down write directly to '../processed data/state_level/'.
M = pd.read_csv('../processed data/state_level/highway weight components/highway_masses.csv', index_col=0)
T = pd.read_csv('../processed data/state_level/highway weight components/travel_times.csv', index_col=0)

In [None]:
# Gravity weights from the cached components.
# NOTE(review): a later cell recomputes M, C and W with M taken from `states["M"]`,
# overwriting the results of this cell.
M = M.mass.to_numpy()
# Impedance matrix C = T (minutes). Avoid divide-by-zero/infs later.
# Non-finite or non-positive travel times become NaN here.
C = np.where(np.isfinite(T) & (T > 0), T, np.nan)

# w_ij = (M_i^alpha * M_j^alpha) / (C_ij^beta)
W = (M[:, None]**alpha) * (M[None, :]**alpha) / (C**beta)
# NaN/inf weights (from invalid travel times) and self-weights are set to 0.
W[~np.isfinite(W)] = 0.0
np.fill_diagonal(W, 0.0)



In [None]:
M = states["M"].to_numpy()  # shape (n,)

# Impedance matrix C = T (minutes); non-finite or non-positive times become NaN.
usable_time = np.isfinite(T) & (T > 0)
C = np.where(usable_time, T, np.nan)

# w_ij = (M_i^alpha * M_j^alpha) / (C_ij^beta)
mass_product = np.outer(M**alpha, M**alpha)
W = mass_product / (C**beta)

# Invalid (NaN/inf) weights and the diagonal are zeroed.
W = np.where(np.isfinite(W), W, 0.0)
np.fill_diagonal(W, 0.0)

# row_sums = W.sum(axis=1, keepdims=True)
# # Avoid division by zero for isolated states (no interstate mass or disconnected)
# W = np.divide(W, np.where(row_sums == 0, 1.0, row_sums))


In [None]:
pd.DataFrame(M, index=state_ids, columns=['mass']).to_csv('../processed data/state_level/highway_masses.csv', index=True)

In [None]:
# np.asarray() strips any existing labels from T before relabelling. If T is the
# DataFrame loaded from CSV, passing it to the constructor together with
# index=/columns= would reindex by label, and labels that do not match `state_ids`
# would be written out as NaN.
# NOTE(review): this assumes T's rows/columns are in the same order as `state_ids` — confirm.
pd.DataFrame(np.asarray(T), index=state_ids, columns=state_ids).to_csv('../processed data/state_level/travel_times.csv', index=True)

In [None]:
# Label the weight matrix with the row/column labels of T.
# NOTE(review): this requires T to be the DataFrame loaded from CSV; a NumPy T has
# no .index/.columns. The labels keep whatever dtype that load produced.
weights = pd.DataFrame(W, index=T.index, columns=T.columns)

In [None]:
weights

In [None]:
weights.to_csv('../processed data/state_level/highway_weights_unscaled.csv', index=True)

In [None]:
weights = weights / weights.max().max()
# weights.index = state_ids
# weights.columns = state_ids

In [None]:
weights # normalized weights

In [None]:
weights.loc[17, '18']

In [None]:
weights.to_csv('../processed data/state_level/state_level_highway_weights.csv', index=True)

In [None]:
weights.max().max()

In [None]:
M # state masses

In [None]:
T # travel time (minutes)

In [None]:
# Travel-time matrix relabelled (positionally) with the state FIPS codes.
travel_times = (
    pd.DataFrame(T)
    .set_axis(state_ids, axis=0)
    .set_axis(state_ids, axis=1)
)

In [None]:
travel_times.loc['36', :]

In [None]:
W_norm = W/W.max()

In [None]:
W_norm.max()

In [None]:
r, c = weights.stack().idxmax()   # (row_label, col_label)
v = weights.loc[r, c]

In [None]:
r, c

In [None]:
W_norm

In [None]:
weights

In [None]:
STATE_ID_COL = "STATEFP"
anchors = states.copy()

# 1) Build an edges table (top-K per state to reduce clutter)
K = 10
rows = []
ids = anchors[STATE_ID_COL].tolist()
pts = list(anchors.geometry.centroid)  # one anchor point per state, same order as `ids`

for i in range(len(ids)):
    # get top-K j (exclude self, nonpositive)
    wrow = W_norm[i, :].copy()
    wrow[i] = 0.0
    js = np.argsort(wrow)[::-1][:K]
    for j in js:
        if wrow[j] <= 0:
            continue
        rows.append({
            "i": ids[i], "j": ids[j],
            "w": float(wrow[j]),
            "geometry": LineString([(pts[i].x, pts[i].y), (pts[j].x, pts[j].y)])
        })

if not rows:
    raise ValueError("No edges to plot (all weights were zero or missing).")

edges_df = gpd.GeoDataFrame(rows, geometry="geometry", crs=states.crs)

# 2) Map weights to color/linewidth
vals = edges_df["w"].to_numpy()
colors = plt.cm.inferno(vals)  # weights are already scaled to [0, 1]

# Line width follows the 5th-95th percentile range. Values outside that range are
# clipped so a few very large weights cannot produce arbitrarily thick (or
# negative-width) lines, and a constant `vals` cannot divide by zero.
lo = np.percentile(vals, 5)
hi = np.percentile(vals, 95)
if hi - lo < 1e-12:
    lo = vals.min()
    hi = vals.max() + 1e-12
lw = 0.4 + 3.0 * np.clip((vals - lo) / (hi - lo), 0.0, 1.0)

# 3) Plot
fig, ax = plt.subplots(figsize=(11, 8))
states.boundary.plot(ax=ax, linewidth=0.6, color="lightgray", facecolor='gray')
edges_df.plot(ax=ax, color=colors, linewidth=lw, alpha=0.9)
# Plot the anchor points, not `anchors` itself: its active geometry is the state
# polygons, which would be drawn as filled black shapes on top of the edges.
gpd.GeoSeries(pts, crs=states.crs).plot(ax=ax, markersize=25, color="black")

# Colorbar on the same 0-1 scale that `colors` was sampled from.
sm = ScalarMappable(norm=Normalize(vmin=0.0, vmax=1.0), cmap="inferno")
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax, fraction=0.03, pad=0.01)
cbar.set_label("gravity weight w_ij")

ax.set_axis_off()
ax.set_title(f"Top-{K} State-to-State Gravity Edges")
plt.tight_layout()
plt.show()


## Airports

Weights are $W^A_{ij} = \frac{E_i E_j}{\max_k E_k}$ for $i \neq j$ (and $0$ on the diagonal), where $E_i$ is the total enplanement of all large-, medium-, and small-hub airports located within state $i$ or within 20 miles of its boundary, and $\max_k E_k$ is the largest such state total. Weights are then normalized by dividing by the maximum value.

In [None]:
airports = pd.read_excel('../raw data/state level data/all-airport-data.xlsx')
airports.head()

In [None]:
airports['NPIAS Hub'].value_counts()

In [None]:
airports = airports.merge(states[['STUSPS', 'STATEFP']], how='inner', left_on='State Id', right_on='STUSPS')

In [None]:
airports = airports[['Loc Id', 'STATEFP', 'NPIAS Hub', 'ARP Latitude DD', 'ARP Longitude DD']]

In [None]:
airports['geometry'] = airports.apply(lambda row: Point(row['ARP Longitude DD'], row['ARP Latitude DD']), axis=1)

In [None]:
geo_airports = gpd.GeoDataFrame(airports, geometry='geometry', crs=4326).to_crs(states.crs)

In [None]:
large = geo_airports[geo_airports['NPIAS Hub'] == 'Large']
medium = geo_airports[geo_airports['NPIAS Hub'] == 'Medium']
small = geo_airports[geo_airports['NPIAS Hub'] == 'Small']

In [None]:
relevant_airports = geo_airports[geo_airports['NPIAS Hub'].isin(['Large', 'Medium', 'Small'])]

In [None]:
# (airports, colour, marker, map marker size, legend marker size, z-order, legend label)
hub_layers = [
    (large, 'red', 'H', 20, 15, 3, 'Large Airports'),
    (medium, 'black', 's', 10, 10, 2, 'Medium Airports'),
    (small, 'green', '^', 5, 5, 2, 'Small Airports'),
]

fig, ax = plt.subplots(figsize=(10, 10))
states.plot(ax=ax, facecolor='grey', edgecolor='lightgray', linewidth=0.5, zorder=1)

# Draw each hub class and build its matching legend entry in one pass.
handles = []
for hub_airports, colour, marker, map_size, legend_size, z, label in hub_layers:
    hub_airports.plot(ax=ax, facecolor='white', edgecolor=colour, linewidth=0.5,
                      zorder=z, marker=marker, markersize=map_size)
    handles.append(
        mlines.Line2D([], [], color=colour, marker=marker, markersize=legend_size,
                      markerfacecolor='white', label=label)
    )

plt.legend(handles=handles, loc='lower left')
plt.show()

In [None]:
MILES_20_M = 20 * 1609.344

In [None]:
states["geom_20mi"] = states.geometry.buffer(MILES_20_M)
states.head()

In [None]:
# Frame of 20-mile state buffers, with the buffer as the active geometry.
buf_gdf = states[['STATEFP', "geom_20mi"]].rename(columns={"geom_20mi":"geometry"}).set_geometry("geometry")
# One row per (airport, state) pair whose buffer contains the airport, so an airport
# near a border can be matched to more than one state. The join adds the buffer's
# STATEFP next to the airport's own (suffixed _left/_right, renamed in a later cell).
cand = gpd.sjoin(relevant_airports, buf_gdf, how="inner", predicate="within")  # airports matched to states whose 20mi buffer contains them


In [None]:
traffic = pd.read_excel('../raw data/state level data/ARP-cy2024-all-enplanements.xlsx')
traffic.head()

In [None]:
cand_traffic = cand.merge(traffic[['Locid', 'CY 24 Enplanements']], how='left', left_on='Loc Id', right_on='Locid')

In [None]:
cand_traffic = cand_traffic.drop(['STATEFP_left', 'ARP Latitude DD', 'ARP Longitude DD', 'index_right', 'Locid'], axis=1).rename({'STATEFP_right':'STATEFP', 'CY 24 Enplanements':'enplanements'}, axis=1)

In [None]:
cand_traffic.head()

In [None]:
mass = cand_traffic.pivot_table(values='enplanements', index='STATEFP', aggfunc='sum')

In [None]:
mass.to_csv('../processed data/state_level/airport_masses.csv', index=True)

In [None]:
mass_norm = mass / mass.max()

In [None]:
airweights = pd.DataFrame(np.array(mass_norm) * np.array(mass.T))

In [None]:
np.allclose(airweights, airweights.T, atol=1e-9)

In [None]:
airweights.index = airweights.columns = mass_norm.index

# Zero the self-weights on an explicit copy of the data. Writing through
# `airweights.values` only works when pandas returns a writable view; with
# Copy-on-Write enabled that array is read-only and np.fill_diagonal raises.
pair_weights = airweights.to_numpy(copy=True)
np.fill_diagonal(pair_weights, 0)
airweights = pd.DataFrame(pair_weights, index=airweights.index, columns=airweights.columns)

In [None]:
airweights.to_csv('../processed data/state_level/airport_weights_unscaled.csv', index=True)

In [None]:
airweights = airweights / airweights.max().max()
airweights

In [None]:
# Restrict the highway weights to the states that have airport weights.
# `weights` takes its labels from the cached travel-time CSV, whose row labels can be
# integers (e.g. 17) while `airweights` is indexed by zero-padded FIPS strings; both
# axes are normalised to two-character strings first so the label lookup cannot
# fail with a KeyError.
highway_subset = weights.copy()
highway_subset.index = highway_subset.index.astype(str).str.zfill(2)
highway_subset.columns = highway_subset.columns.astype(str).str.zfill(2)
# NOTE(review): this writes to the working directory, not '../processed data/state_level/' — confirm intended.
highway_subset.loc[airweights.index, airweights.index].to_csv('state_level_highway_weights.csv', index=True)

In [None]:
airweights.to_csv('state_level_airport_weights.csv', index=True)

In [None]:
anchors = states[[STATE_ID_COL, "geometry"]].copy()
anchors = anchors.set_index(STATE_ID_COL)
anchors["anchor_pt"] = anchors.geometry.centroid  # make anchor points
ptmap = anchors["anchor_pt"].to_dict()            # {state_id: Point(...)}

# Only keep states that exist in airweights
ids = [s for s in airweights.index if s in ptmap]
air = airweights.loc[ids, ids]  # reindex to the intersection/order

# 1) Build an edges table (top-K per state to reduce clutter)
K = 3
rows = []

for src_id in ids:
    wrow = air.loc[src_id].copy()         # Series indexed by destination state IDs
    wrow.loc[src_id] = 0.0                # exclude self
    wrow = wrow[wrow > 0]                 # keep positive weights only
    top = wrow.sort_values(ascending=False).head(K)  # top-K by value

    for dst_id, w in top.items():
        # Skip if either point missing
        if (src_id not in ptmap) or (dst_id not in ptmap):
            continue
        p_src = ptmap[src_id]
        p_dst = ptmap[dst_id]
        rows.append({
            "i": src_id,
            "j": dst_id,
            "w": float(w),
            "geometry": LineString([(p_src.x, p_src.y), (p_dst.x, p_dst.y)])
        })

# Check before building the frame: an empty list has no "geometry" column to use.
if not rows:
    raise ValueError("No edges to plot (all weights were zero or missing).")

edges_df = gpd.GeoDataFrame(rows, geometry="geometry", crs=states.crs)

# 2) Map weights to color/linewidth
vals = edges_df["w"].to_numpy()

# robust normalization to 5th–95th percentiles; guard against constant vals
lo = np.percentile(vals, 5)
hi = np.percentile(vals, 95)
if hi - lo < 1e-12:
    lo = vals.min()
    hi = vals.max() + 1e-12

cmap = plt.cm.inferno
colors = cmap(vals)  # weights are already scaled to [0, 1]
# Clip to the percentile range so a few very large weights cannot produce
# arbitrarily thick (or negative-width) lines.
lw = 0.4 + 3.0 * np.clip((vals - lo) / (hi - lo), 0.0, 1.0)

# 3) Plot
fig, ax = plt.subplots(figsize=(11, 8))

# states outline (optional facecolor remove to avoid tint)
states.boundary.plot(ax=ax, linewidth=0.6, color="lightgray")

# edges colored by weight
edges_df.plot(ax=ax, color=colors, linewidth=lw, alpha=0.9)

# anchors on top (optional)
gpd.GeoSeries([ptmap[s] for s in ids], crs=states.crs).plot(ax=ax, markersize=25, color="black")

# colorbar on the same 0-1 scale that `colors` was sampled from
sm = ScalarMappable(norm=Normalize(vmin=0.0, vmax=1.0), cmap=cmap)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax, fraction=0.03, pad=0.01)
cbar.set_label("gravity weight w_ij")

ax.set_axis_off()
ax.set_title(f"Top-{K} State-to-State Gravity Edges")
plt.tight_layout()
plt.show()


## Adjacency Weights

In [None]:
county_adj = pd.read_csv('../raw data/county_adjacency2025.txt', delimiter='|', dtype=str)

In [None]:
# State FIPS = first two characters of the county GEOID. The vectorised .str
# accessor leaves missing GEOIDs as NaN instead of raising on them.
county_adj['state_fips'] = county_adj['County GEOID'].str[:2]
# .copy() so the column assignments below act on an independent frame
# (avoids SettingWithCopyWarning).
county_adj = county_adj[~county_adj.state_fips.isin(['02', '14', '15', '72', '07', '03', '43', '52', '78', '72', '69', '66', '60'])].copy()
county_adj['neighbor_state_fips'] = county_adj['Neighbor GEOID'].str[:2]
# The file is read with dtype=str, so the border length has to be converted explicitly.
county_adj['Length'] = county_adj['Length'].astype(float)
county_adj.head()

In [None]:
# Total shared county-border length between every pair of states.
border_adj_matrix = county_adj.pivot_table(index='state_fips', columns='neighbor_state_fips', aggfunc='sum', values='Length', fill_value=0)

# Force a square matrix with identical row/column order, so that "the diagonal"
# really is state-with-itself even if some state appears only as a row or only
# as a column.
state_order = border_adj_matrix.index
border_adj_matrix = border_adj_matrix.reindex(index=state_order, columns=state_order, fill_value=0)

# Zero the within-state lengths on an explicit copy; `.values` is not guaranteed
# to be a writable view of the frame (it is read-only under Copy-on-Write).
border_lengths = border_adj_matrix.to_numpy(dtype=float, copy=True)
np.fill_diagonal(border_lengths, 0)
border_adj_matrix = (
    pd.DataFrame(border_lengths, index=state_order, columns=state_order)
    .rename_axis(index='state_fips', columns='neighbor_state_fips')
)
border_adj_matrix

In [None]:
border_adj_matrix.to_csv('../processed data/state_level/border_weights_unscaled.csv', index=True)

In [None]:
# Share of cells that equal their transposed counterpart (1.0 = perfectly symmetric).
# Divide by the actual number of cells rather than a hard-coded 49*49.
(border_adj_matrix == border_adj_matrix.T).sum().sum() / border_adj_matrix.size

In [None]:
border_adj_matrix = border_adj_matrix / border_adj_matrix.max().max()
border_adj_matrix

In [None]:
border_adj_matrix.to_csv('../processed data/state_level/state_level_border_weights.csv', index=True)

## State-level Variant Data

In [None]:
variants = pd.read_csv('../raw data/state level data/variant_prevalence_state_level.csv')
variants.head()

In [None]:
variants.date = pd.to_datetime(variants.date)

In [None]:
prevalences = variants.pivot_table(index=['date', 'location'], columns='lineage', values='proportion').reset_index()

In [None]:
prevalences_21_22 = prevalences[(prevalences.date >= '2021-01-01') & (prevalences.date < '2023-01-01')].reset_index(drop=True)

In [None]:
prevalences_21_22.shape

In [None]:
all_days = pd.date_range('2021-01-01', '2022-12-31', freq='D')

def interp_group(g, key='location', days=None):
    """Put one group's rows on a daily grid and interpolate its numeric columns.

    Parameters
    ----------
    g : DataFrame
        Rows of a single group; must contain a ``'date'`` column and ``key``.
    key : str, default 'location'
        Name of the grouping column. Its first value is written back onto every
        row of the daily grid.
    days : DatetimeIndex, optional
        Target daily grid. Defaults to the module-level ``all_days``.

    Returns
    -------
    DataFrame
        One row per day in ``days``, with ``'date'`` as a regular column.
        Numeric columns are time-interpolated only between observed values
        (``limit_area='inside'``); days before the first or after the last
        observation are filled with 0 rather than extrapolated.
    """
    if days is None:
        days = all_days

    group_label = g[key].iloc[0]
    daily = g.set_index('date').reindex(days)  # daily grid
    daily.index.name = 'date'
    daily[key] = group_label

    num_cols = daily.select_dtypes('number').columns
    daily[num_cols] = (daily[num_cols]
                       .interpolate(method='time', limit_area='inside')
                       .fillna(0))
    return daily.reset_index()

prev_interp = (prevalences_21_22.groupby('location', group_keys=False)
         .apply(interp_group)
         .rename(columns={'index': 'date'}))


In [None]:
prev_interp['Omicron'] = prev_interp[['omicron_1', 'omicron_2', 'omicron_3', 'omicron_4']].sum(axis=1)

In [None]:
prev_interp.drop(['omicron_1', 'omicron_2', 'omicron_3', 'omicron_4'], axis=1, inplace=True)
prev_interp

In [None]:
# One small panel per location showing the interpolated Alpha / Omicron / Delta series.
groups = prev_interp['location'].unique()
fig, axes = plt.subplots(nrows=7, ncols=7, figsize=(30, 30), sharex=True)
axes = axes.flatten()

variant_colors = (('Alpha', 'red'), ('Omicron', 'blue'), ('Delta', 'green'))

# zip() stops at the shorter of the two, so at most 49 locations are drawn.
for ax, g in zip(axes, groups):
    sub = prev_interp.loc[prev_interp['location'] == g]
    for variant, color in variant_colors:
        ax.plot(sub['date'], sub[variant], color=color)
    ax.set_title(f'{g[-2:]}')
    ax.set_ylabel('y')

axes[-1].set_xlabel('date')
plt.tight_layout()
plt.show()


In [None]:
prev_interp.location = prev_interp.location.apply(lambda x: x[-2:])

In [None]:
prev_interp_comb = prev_interp.merge(states[['STATEFP', 'STUSPS']], left_on='location', right_on='STUSPS', how='left').drop(['location', 'STUSPS'], axis=1).rename({'STATEFP':'location'}, axis=1)

In [None]:
prev_interp_comb.to_csv('../processed data/state_level/state_level_prevalences.csv', index=False)

In [None]:
# Reload the saved prevalences with explicit types: `location` holds zero-padded
# FIPS strings (numeric inference would turn "01" into 1, or into 1.0 if any value
# is missing), and `date` should be a datetime rather than a string.
prev_interp_comb = pd.read_csv(
    '../processed data/state_level/state_level_prevalences.csv',
    dtype={'location': str},
    parse_dates=['date'],
)

## Exogenous Variables

In [None]:
exog = pd.read_csv('../raw data/state level data/state level covariates.csv')
exog.head()

In [None]:
exog_states = exog[exog.administrative_area_level_1 == 'United States']
exog_states.head()

In [None]:
# Normalise the local key to a zero-padded two-character FIPS string.
# .assign() returns a new frame, so nothing is written into the filtered view of
# `exog` (avoids SettingWithCopyWarning).
exog_states = exog_states.assign(
    key_local=exog_states.key_local.astype(int).astype(str).str.zfill(2)
)

In [None]:
exog_states = exog_states[~exog_states.key_local.isin(['02', '14', '15', '72', '07', '03', '43', '52', '78', '72', '69', '66', '60'])]

In [None]:
exog_states.key_local.value_counts()

In [None]:
# Columns carried forward. 'people_vaccinated' and 'people_fully_vaccinated' are
# included because the final column selection on `fully_combined` expects them;
# leaving them out here makes that later selection fail with a KeyError.
# NOTE(review): assumes both columns exist in the raw covariates file — confirm.
exog_states = exog_states[['date', 'people_vaccinated', 'people_fully_vaccinated',
       'school_closing', 'workplace_closing', 'cancel_events',
       'gatherings_restrictions', 'transport_closing',
       'stay_home_restrictions', 'internal_movement_restrictions',
       'international_movement_restrictions', 'information_campaigns',
       'testing_policy', 'contact_tracing', 'facial_coverings',
       'vaccination_policy', 'elderly_people_protection',
       'government_response_index', 'stringency_index',
       'containment_health_index', 'economic_support_index', 'key_local', 'population']]

In [None]:
# Policy indicators and composite indices whose sign is discarded.
# NOTE(review): presumably a negative code flags a variant of the same policy level
# (e.g. a different geographic scope) in the source data — confirm before relying on it.
policy_cols = ['school_closing', 'workplace_closing', 'cancel_events',
       'gatherings_restrictions', 'transport_closing',
       'stay_home_restrictions', 'internal_movement_restrictions',
       'international_movement_restrictions', 'information_campaigns',
       'testing_policy', 'contact_tracing', 'facial_coverings',
       'vaccination_policy', 'elderly_people_protection',
       'government_response_index', 'stringency_index',
       'containment_health_index', 'economic_support_index']

# Work on an independent copy: `exog_states` was produced by column selection, and
# assigning several columns into such a slice raises SettingWithCopyWarning.
exog_states = exog_states.copy()
exog_states[policy_cols] = exog_states[policy_cols].abs()

In [None]:
exog_states.date = pd.to_datetime(exog_states.date, format='mixed')

In [None]:
all_days = pd.date_range('2021-01-01', '2022-12-31', freq='D')

def interp_group(g, key='key_local', days=None):
    """Put one group's rows on a daily grid and interpolate its numeric columns.

    Parameters
    ----------
    g : DataFrame
        Rows of a single group; must contain a ``'date'`` column and ``key``.
    key : str, default 'key_local'
        Name of the grouping column. Its first value is written back onto every
        row of the daily grid.
    days : DatetimeIndex, optional
        Target daily grid. Defaults to the module-level ``all_days``.

    Returns
    -------
    DataFrame
        One row per day in ``days``, with ``'date'`` as a regular column.
        Numeric columns are time-interpolated only between observed values
        (``limit_area='inside'``); days before the first or after the last
        observation are filled with 0 rather than extrapolated.
    """
    if days is None:
        days = all_days

    group_label = g[key].iloc[0]
    daily = g.set_index('date').reindex(days)  # daily grid
    daily.index.name = 'date'
    daily[key] = group_label

    num_cols = daily.select_dtypes('number').columns
    daily[num_cols] = (daily[num_cols]
                       .interpolate(method='time', limit_area='inside')
                       .fillna(0))
    return daily.reset_index()

exog_states_interp = (exog_states.groupby('key_local', group_keys=False)
         .apply(interp_group)
         .rename(columns={'index': 'date'}))


In [None]:
exog_states_interp

In [None]:
prev_interp_comb.location = prev_interp_comb.location.astype(str).str.zfill(2)
prev_interp_comb.date = pd.to_datetime(prev_interp_comb.date)

In [None]:
covariates = exog_states_interp.merge(prev_interp_comb, left_on=['date', 'key_local'], right_on=['date', 'location'], how='inner')

In [None]:
covariates.columns

In [None]:
cases = pd.read_csv('../raw data/state level data/us_states_daily cases.csv')
cases.head()

In [None]:
cases['location'] = cases.geoid.apply(lambda x: x[-2:])
cases.date = pd.to_datetime(cases.date)

In [None]:
fully_combined = covariates.drop('key_local', axis=1).merge(cases[['date', 'location', 'cases', 'deaths']], on=['date', 'location'], how='left')

In [None]:
fully_combined['Other'] = 1 - fully_combined[['Alpha', 'Beta', 'Delta', 'Epsilon', 'Gamma', 'Iota', 'Omicron']].sum(axis=1)

In [None]:
# Turn variant shares into variant case counts (share x cases), flooring negative
# results at 0.
variant_cols = ['Alpha', 'Beta', 'Delta', 'Epsilon', 'Gamma', 'Iota', 'Omicron', 'Other']
variant_cases = fully_combined[variant_cols].multiply(fully_combined.cases, axis=0)
fully_combined[variant_cols] = variant_cases.clip(lower=0)

In [None]:
fully_combined.columns

In [None]:
fully_combined.drop('cases', axis=1, inplace=True)

In [None]:
# One small panel per location showing the Alpha / Omicron / Delta case series.
groups = fully_combined['location'].unique()
fig, axes = plt.subplots(nrows=7, ncols=7, figsize=(30, 30), sharex=True)
axes = axes.flatten()

variant_colors = (('Alpha', 'red'), ('Omicron', 'blue'), ('Delta', 'green'))

# zip() stops at the shorter of the two, so at most 49 locations are drawn.
for ax, g in zip(axes, groups):
    sub = fully_combined.loc[fully_combined['location'] == g]
    for variant, color in variant_colors:
        ax.plot(sub['date'], sub[variant], color=color)
    ax.set_title(f'{g[-2:]}')
    ax.set_ylabel('y')

axes[-1].set_xlabel('date')
plt.tight_layout()
plt.show()


In [None]:
fully_combined = fully_combined[['date', 'location', 'people_vaccinated', 'people_fully_vaccinated',
       'school_closing', 'workplace_closing', 'cancel_events',
       'gatherings_restrictions', 'transport_closing',
       'stay_home_restrictions', 'internal_movement_restrictions',
       'international_movement_restrictions', 'information_campaigns',
       'testing_policy', 'contact_tracing', 'facial_coverings',
       'vaccination_policy', 'elderly_people_protection',
       'government_response_index', 'stringency_index',
       'containment_health_index', 'economic_support_index', 'population',
       'Alpha', 'Beta', 'Delta', 'Epsilon', 'Gamma', 'Iota', 'Omicron',
       'deaths', 'Other']]

In [None]:
fully_combined.to_csv('../processed data/state_level/daily_covariates_state_level.csv', index=False)

In [None]:
# Parse `date` on load: read as plain strings, the per-location plots below would
# treat every date as a separate category on the x-axis instead of a time axis.
# NOTE(review): `location` is still inferred by pandas here, so FIPS codes lose
# their zero padding on reload — confirm that is acceptable downstream.
fully_combined = pd.read_csv(
    '../processed data/state_level/daily_covariates_state_level.csv',
    parse_dates=['date'],
)
fully_combined.head()

In [None]:
fully_combined.sort_values(['location', 'date'], inplace=True)

In [None]:
# Trailing 7-row mean of every column after the first two, computed separately per
# location. min_periods=1 means the first rows of each location average over fewer
# than 7 observations instead of being NaN. The window counts rows, not calendar
# days, so it relies on the frame being sorted by location and date with one row per day.
rolled = fully_combined.groupby('location')[fully_combined.columns.to_list()[2:]].transform(lambda s: s.rolling(7, min_periods=1).mean())

In [None]:
fully_combined_rolling = pd.concat([fully_combined[['date', 'location']].reset_index(drop=True), rolled.reset_index(drop=True)], axis=1)

In [None]:
fully_combined_rolling.head()

In [None]:
# One small panel per location showing the 7-day-averaged Alpha / Omicron / Delta series.
groups = fully_combined_rolling['location'].unique()
fig, axes = plt.subplots(nrows=7, ncols=7, figsize=(30, 30), sharex=True)
axes = axes.flatten()

variant_colors = (('Alpha', 'red'), ('Omicron', 'blue'), ('Delta', 'green'))

# zip() stops at the shorter of the two, so at most 49 locations are drawn.
for ax, g in zip(axes, groups):
    sub = fully_combined_rolling.loc[fully_combined_rolling['location'] == g]
    for variant, color in variant_colors:
        ax.plot(sub['date'], sub[variant], color=color)
    ax.set_title(f'{g}')
    ax.set_ylabel('y')

axes[-1].set_xlabel('date')
plt.tight_layout()
plt.show()


In [None]:
fully_combined_rolling.to_csv('../processed data/state_level/rolled_covariates_state_level.csv', index=False)

## Extra Covariates

In [4]:
pop = pd.read_csv('../raw data/state level data/extra covariates/population.csv', dtype={'state_fips':str})

Unnamed: 0,state_fips,Name,Population,median_age
0,1,Alabama,5157699,39.6
1,4,Arizona,7582384,39.4
2,5,Arkansas,3088354,39.1
3,6,California,39431263,38.4
4,8,Colorado,5957494,38.0


In [6]:
income = pd.read_excel('../raw data/state level data/extra covariates/median_income.xlsx')
income.head()

Unnamed: 0,State,Income
0,Alabama,65560
1,Alaska,91260
2,Arizona,84700
3,Arkansas,64840
4,California,100600


In [12]:
density = pd.read_excel('../raw data/state level data/extra covariates/density.xlsx')
density.State = density.State.str.strip()
density.head()

Unnamed: 0,State,Density_per_mile
0,District of Columbia,11131.0
1,New Jersey,1263.0
2,Rhode Island,1060.0
3,Massachusetts,898.0
4,Connecticut,747.0


In [15]:
# Combine population, median income and density by state name, then drop the
# name columns that are no longer needed.
# Inner joins: a state is kept only if its name matches in all three tables.
# NOTE(review): `density.State` is stripped of whitespace before this point but
# `income.State` is not — confirm no state is lost to a name mismatch.
pop_income = pop.merge(income, left_on='Name', right_on='State', how='inner')
pop_income_density = pop_income.merge(density, left_on='Name', right_on='State', how='inner')
exog = pop_income_density.drop(['Name', 'State_x', 'State_y'], axis=1)

In [16]:
exog.to_csv('../processed data/state_level/state_level_characteristics.csv')