# Imports

In [13]:
import  numpy as np
import  pandas as pd
import  matplotlib.pyplot as plt

from    datetime import datetime
import  copy
import  math
import  os
import  pickle
pd.set_option('display.max_columns', 200)

from    google.colab import drive
drive.mount('/content/gdrive')

PROJECT_PATH  = '/content/gdrive/MyDrive/OperAI/final-project'
VIZ_PATH      = os.path.join(PROJECT_PATH, 'viz')
DATA_PATH     = os.path.join(PROJECT_PATH, 'data')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [14]:
from    sklearn.model_selection import train_test_split
from    sklearn.preprocessing import OneHotEncoder, StandardScaler

from    tqdm.notebook import tqdm as blue_tqdm

# Data Import and Info

The model will be a national prediciton model, where each prediciton of daily deaths is based on individual models for each state. This will require aggregating the data at the state level. Over counties $j$ and states $i$ for age group $a$:

$$y^\text{national}_t = \sum_j \text{deaths}_{j,t}$$

$$\hat{y}^\text{national}_t = \sum_i f_{i}(X_{i,t})$$

where $f$ is a state-level model.

$X_{i,t}$ will comprise:

- Deaths
  - `deaths` (target)
  - `deaths` (lagged)
- Dose Administration Percentages
  - `dose_admin_pct_<Age group>`: Dose Administration Percentage (DAP)
  
  $$\text{doses administered}_{i,t,a} = \sum_{j \in i} \frac{\text{doses administered}_{j,a}}{\text{completeness}_{j,a}}$$

  $$\text{doses available}_{i,t,a} = \sum_{j \in i} \text{doses available}_{t,a} \times \text{population}_{j,t,a}$$

  $$\text{DAP}_{i,t,a} = \frac{\text{doses administered}_{i,t,a}}{\text{doses available}_{i,t,a}}$$



- Social Vulnerability Index (SVI) Population Exposure
  - `pop_in_<SVI category>`: Total number of people exposed to counties with given SVI index.
- Metro Area Population
  - `pop_in_<Metro category>`: Total number of people in metro versus non-metro areas.
- Time
  - `month`: categorical for seasonal trends
  - `dayofweek`: categorical for week data trends

# State-Level Preprocessing

In [15]:
df = pd.read_pickle(os.path.join(DATA_PATH, 'df_state_timeseries_v2.pkl'))
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34164 entries, 0 to 34163
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   recip_state             34164 non-null  object        
 1   date                    34164 non-null  datetime64[ns]
 2   phase_1_train           34164 non-null  bool          
 3   phase_1_test            34164 non-null  bool          
 4   phase_2_train           34164 non-null  bool          
 5   phase_2_test            34164 non-null  bool          
 6   month                   34164 non-null  category      
 7   dayofweek               34164 non-null  category      
 8   population              34164 non-null  float64       
 9   pct_pop_in_svi_ctgy_A   34164 non-null  float64       
 10  pct_pop_in_svi_ctgy_B   34164 non-null  float64       
 11  pct_pop_in_svi_ctgy_C   34164 non-null  float64       
 12  pct_pop_in_svi_ctgy_D   34164 non-null  float6

## Convert Categorical to OneHot Vectors

In [16]:
X_vars_cat = [
    'month',
    'dayofweek',
]

ohe = OneHotEncoder(sparse_output=False, drop='first').fit(df.loc[:, X_vars_cat])

ohe_df = pd.DataFrame(
    data = ohe.transform(df.loc[:, X_vars_cat]),
    columns = ohe.get_feature_names_out(),
    index = df.index
)

df = pd.concat([df, ohe_df], axis=1)
df.columns

Index(['recip_state', 'date', 'phase_1_train', 'phase_1_test', 'phase_2_train',
       'phase_2_test', 'month', 'dayofweek', 'population',
       'pct_pop_in_svi_ctgy_A', 'pct_pop_in_svi_ctgy_B',
       'pct_pop_in_svi_ctgy_C', 'pct_pop_in_svi_ctgy_D', 'pct_pop_in_metro',
       'pct_pop_in_nonmetro', 'pct_doses_admin_5plus',
       'pct_doses_admin_12plus', 'pct_doses_admin_18plus',
       'pct_doses_admin_65plus', 'cum_deaths', 'daily_deaths', 'cum_death_pct',
       'daily_death_pct', 'month_2', 'month_3', 'month_4', 'month_5',
       'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
       'month_12', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4',
       'dayofweek_5', 'dayofweek_6'],
      dtype='object')

## Segment into train and test

In [17]:
# Define vars ------------------------------------------------------------------

id_vars = [
    'recip_state',
    'date',
]

X_vars_init = [
    'population',
    'pct_pop_in_svi_ctgy_A',
    'pct_pop_in_svi_ctgy_B',
    'pct_pop_in_svi_ctgy_C',
    'pct_pop_in_svi_ctgy_D',
    'pct_pop_in_metro',
    'pct_pop_in_nonmetro',
    'pct_doses_admin_5plus',
    'pct_doses_admin_12plus',
    'pct_doses_admin_18plus',
    'pct_doses_admin_65plus',
    'daily_death_pct',
]

X_vars_ohe = list(ohe.get_feature_names_out())

X_num = X_vars_init + X_vars_ohe

X_vars = id_vars + X_num

y_vars = [
    'recip_state',
    'date',
    'daily_deaths',
    # 'daily_death_pct'
]

# Segment ----------------------------------------------------------------------

X_tr_phase1 = df.loc[df['phase_1_train'], X_vars]
X_te_phase1 = df.loc[df['phase_1_test'],  X_vars]
X_tr_phase2 = df.loc[df['phase_2_train'], X_vars]
X_te_phase2 = df.loc[df['phase_2_test'],  X_vars]

y_tr_phase1 = df.loc[df['phase_1_train'], y_vars]
y_te_phase1 = df.loc[df['phase_1_test'],  y_vars]
y_tr_phase2 = df.loc[df['phase_2_train'], y_vars]
y_te_phase2 = df.loc[df['phase_2_test'],  y_vars]


print('Shapes')
print('-'*60)
print(f'[Phase 1 Train]  x: {X_tr_phase1.shape}\t y: {y_tr_phase1.shape}')
print(f'[Phase 1 Test]   x: {X_te_phase1.shape}\t y: {y_te_phase1.shape}')
print(f'[Phase 2 Train]  x: {X_tr_phase2.shape}\t y: {y_tr_phase2.shape}')
print(f'[Phase 2 Test]   x: {X_te_phase2.shape}\t y: {y_te_phase2.shape}')
print('-'*60)

Shapes
------------------------------------------------------------
[Phase 1 Train]  x: (24648, 31)	 y: (24648, 3)
[Phase 1 Test]   x: (4732, 31)	 y: (4732, 3)
[Phase 2 Train]  x: (29380, 31)	 y: (29380, 3)
[Phase 2 Test]   x: (4784, 31)	 y: (4784, 3)
------------------------------------------------------------


## Fit scaler on train and transform both train and test

In [18]:
scaler1 = StandardScaler()
scaler2 = StandardScaler()

# Fit to train data
scaler1.fit(X_tr_phase1.loc[:, X_num])
scaler2.fit(X_tr_phase2.loc[:, X_num])

# Transform train and test data

## Phase 1---------------------------------------------------------

### X_tr_phase1
X_tr_phase1.loc[:, X_num] = pd.DataFrame(
    data = scaler1.transform(X_tr_phase1.loc[:, X_num]),
    columns = X_tr_phase1.loc[:, X_num].columns,
    index = X_tr_phase1.loc[:, X_num].index
)

### X_te_phase1
X_te_phase1.loc[:, X_num] = pd.DataFrame(
    data = scaler1.transform(X_te_phase1.loc[:, X_num]),
    columns = X_te_phase1.loc[:, X_num].columns,
    index = X_te_phase1.loc[:, X_num].index
)

## Phase 2---------------------------------------------------------
### X_tr_phase2
X_tr_phase2.loc[:, X_num] = pd.DataFrame(
    data = scaler1.transform(X_tr_phase2.loc[:, X_num]),
    columns = X_tr_phase2.loc[:, X_num].columns,
    index = X_tr_phase2.loc[:, X_num].index
)

### X_tr_phase1
X_te_phase2.loc[:, X_num] = pd.DataFrame(
    data = scaler1.transform(X_te_phase2.loc[:, X_num]),
    columns = X_te_phase2.loc[:, X_num].columns,
    index = X_te_phase2.loc[:, X_num].index
)

# Check
X_te_phase2.head()

Unnamed: 0,recip_state,date,population,pct_pop_in_svi_ctgy_A,pct_pop_in_svi_ctgy_B,pct_pop_in_svi_ctgy_C,pct_pop_in_svi_ctgy_D,pct_pop_in_metro,pct_pop_in_nonmetro,pct_doses_admin_5plus,pct_doses_admin_12plus,pct_doses_admin_18plus,pct_doses_admin_65plus,daily_death_pct,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6
29380,AK,2022-07-01,-0.803137,0.377298,1.230778,-0.890944,-0.992242,-2.590342,2.590342,1.340906,0.136414,0.192408,0.405424,0.374329,-0.366021,-0.387925,-0.259938,-0.264532,-0.259938,3.780254,-0.264532,-0.259938,-0.264532,-0.259938,-0.343401,-0.409253,-0.409253,-0.409253,2.464676,-0.405733,-0.409253
29432,AK,2022-07-02,-0.803137,0.377298,1.230778,-0.890944,-0.992242,-2.590342,2.590342,1.340906,0.136414,0.192408,0.405424,0.374329,-0.366021,-0.387925,-0.259938,-0.264532,-0.259938,3.780254,-0.264532,-0.259938,-0.264532,-0.259938,-0.343401,-0.409253,-0.409253,-0.409253,-0.405733,2.464676,-0.409253
29484,AK,2022-07-03,-0.803137,0.377298,1.230778,-0.890944,-0.992242,-2.590342,2.590342,1.340906,0.136414,0.192408,0.405424,0.374329,-0.366021,-0.387925,-0.259938,-0.264532,-0.259938,3.780254,-0.264532,-0.259938,-0.264532,-0.259938,-0.343401,-0.409253,-0.409253,-0.409253,-0.405733,-0.405733,2.443479
29536,AK,2022-07-04,-0.803137,0.377298,1.230778,-0.890944,-0.992242,-2.590342,2.590342,1.340906,0.136414,0.192408,0.405424,0.374329,-0.366021,-0.387925,-0.259938,-0.264532,-0.259938,3.780254,-0.264532,-0.259938,-0.264532,-0.259938,-0.343401,-0.409253,-0.409253,-0.409253,-0.405733,-0.405733,-0.409253
29588,AK,2022-07-05,-0.803137,0.377298,1.230778,-0.890944,-0.992242,-2.590342,2.590342,1.340906,0.136414,0.192408,0.405424,0.374329,-0.366021,-0.387925,-0.259938,-0.264532,-0.259938,3.780254,-0.264532,-0.259938,-0.264532,-0.259938,-0.343401,2.443479,-0.409253,-0.409253,-0.405733,-0.405733,-0.409253


## Export

In [19]:
covid_datasets = (
    X_tr_phase1, y_tr_phase1,
    X_te_phase1, y_te_phase1,
    X_tr_phase2, y_tr_phase2,
    X_te_phase2, y_te_phase2
)

with open(os.path.join(DATA_PATH, 'covid_preprocessed_data_v3.pkl'), 'wb') as f:
    pickle.dump(covid_datasets, f)

# County-Level Preprocessing

In [20]:
df = pd.read_pickle(os.path.join(DATA_PATH, 'df_county_timeseries.pkl'))
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2114883 entries, 0 to 2114882
Data columns (total 23 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   fips                    object        
 1   date                    datetime64[ns]
 2   phase_1_train           bool          
 3   phase_1_test            bool          
 4   phase_2_train           bool          
 5   phase_2_test            bool          
 6   month                   category      
 7   dayofweek               category      
 8   population              float64       
 9   pct_pop_in_svi_ctgy_A   float64       
 10  pct_pop_in_svi_ctgy_B   float64       
 11  pct_pop_in_svi_ctgy_C   float64       
 12  pct_pop_in_svi_ctgy_D   float64       
 13  pct_pop_in_metro        float64       
 14  pct_pop_in_nonmetro     float64       
 15  pct_doses_admin_5plus   float64       
 16  pct_doses_admin_12plus  float64       
 17  pct_doses_admin_18plus  float64       
 18  pc

## Convert Categorical to OneHot Vectors

In [21]:
X_vars_cat = [
    'month',
    'dayofweek',
]

ohe = OneHotEncoder(sparse_output=False, drop='first').fit(df.loc[:, X_vars_cat])

ohe_df = pd.DataFrame(
    data = ohe.transform(df.loc[:, X_vars_cat]),
    columns = ohe.get_feature_names_out(),
    index = df.index
)

df = pd.concat([df, ohe_df], axis=1)
df.columns

Index(['fips', 'date', 'phase_1_train', 'phase_1_test', 'phase_2_train',
       'phase_2_test', 'month', 'dayofweek', 'population',
       'pct_pop_in_svi_ctgy_A', 'pct_pop_in_svi_ctgy_B',
       'pct_pop_in_svi_ctgy_C', 'pct_pop_in_svi_ctgy_D', 'pct_pop_in_metro',
       'pct_pop_in_nonmetro', 'pct_doses_admin_5plus',
       'pct_doses_admin_12plus', 'pct_doses_admin_18plus',
       'pct_doses_admin_65plus', 'cum_deaths', 'daily_deaths', 'cum_death_pct',
       'daily_death_pct', 'month_2', 'month_3', 'month_4', 'month_5',
       'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
       'month_12', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4',
       'dayofweek_5', 'dayofweek_6'],
      dtype='object')

## Segement into train and test

In [23]:
# Define vars ------------------------------------------------------------------

id_vars = [
    'fips',
    'date',
]

X_vars_init = [
    'population',
    'pct_pop_in_svi_ctgy_A',
    'pct_pop_in_svi_ctgy_B',
    'pct_pop_in_svi_ctgy_C',
    'pct_pop_in_svi_ctgy_D',
    'pct_pop_in_metro',
    'pct_pop_in_nonmetro',
    'pct_doses_admin_5plus',
    'pct_doses_admin_12plus',
    'pct_doses_admin_18plus',
    'pct_doses_admin_65plus',
    'daily_death_pct',
]

X_vars_ohe = list(ohe.get_feature_names_out())

X_num = X_vars_init + X_vars_ohe

X_vars = id_vars + X_num

y_vars = [
    'fips',
    'date',
    'daily_deaths',
    # 'daily_death_pct'
]

# Segment ----------------------------------------------------------------------

X_tr_phase1 = df.loc[df['phase_1_train'], X_vars]
X_te_phase1 = df.loc[df['phase_1_test'],  X_vars]
X_tr_phase2 = df.loc[df['phase_2_train'], X_vars]
X_te_phase2 = df.loc[df['phase_2_test'],  X_vars]

y_tr_phase1 = df.loc[df['phase_1_train'], y_vars]
y_te_phase1 = df.loc[df['phase_1_test'],  y_vars]
y_tr_phase2 = df.loc[df['phase_2_train'], y_vars]
y_te_phase2 = df.loc[df['phase_2_test'],  y_vars]


print('Shapes')
print('-'*60)
print(f'[Phase 1 Train]  x: {X_tr_phase1.shape}\t y: {y_tr_phase1.shape}')
print(f'[Phase 1 Test]   x: {X_te_phase1.shape}\t y: {y_te_phase1.shape}')
print(f'[Phase 2 Train]  x: {X_tr_phase2.shape}\t y: {y_tr_phase2.shape}')
print(f'[Phase 2 Test]   x: {X_te_phase2.shape}\t y: {y_te_phase2.shape}')
print('-'*60)

Shapes
------------------------------------------------------------
[Phase 1 Train]  x: (1525806, 31)	 y: (1525806, 3)
[Phase 1 Test]   x: (292929, 31)	 y: (292929, 3)
[Phase 2 Train]  x: (1818735, 31)	 y: (1818735, 3)
[Phase 2 Test]   x: (296148, 31)	 y: (296148, 3)
------------------------------------------------------------


## Fit scaler on train and transform both train and test

In [24]:
scaler1 = StandardScaler()
scaler2 = StandardScaler()

# Fit to train data
scaler1.fit(X_tr_phase1.loc[:, X_num])
scaler2.fit(X_tr_phase2.loc[:, X_num])

# Transform train and test data

## Phase 1---------------------------------------------------------

### X_tr_phase1
X_tr_phase1.loc[:, X_num] = pd.DataFrame(
    data = scaler1.transform(X_tr_phase1.loc[:, X_num]),
    columns = X_tr_phase1.loc[:, X_num].columns,
    index = X_tr_phase1.loc[:, X_num].index
)

### X_te_phase1
X_te_phase1.loc[:, X_num] = pd.DataFrame(
    data = scaler1.transform(X_te_phase1.loc[:, X_num]),
    columns = X_te_phase1.loc[:, X_num].columns,
    index = X_te_phase1.loc[:, X_num].index
)

## Phase 2---------------------------------------------------------
### X_tr_phase2
X_tr_phase2.loc[:, X_num] = pd.DataFrame(
    data = scaler1.transform(X_tr_phase2.loc[:, X_num]),
    columns = X_tr_phase2.loc[:, X_num].columns,
    index = X_tr_phase2.loc[:, X_num].index
)

### X_tr_phase1
X_te_phase2.loc[:, X_num] = pd.DataFrame(
    data = scaler1.transform(X_te_phase2.loc[:, X_num]),
    columns = X_te_phase2.loc[:, X_num].columns,
    index = X_te_phase2.loc[:, X_num].index
)

# Check
X_te_phase2.head()

Unnamed: 0,fips,date,population,pct_pop_in_svi_ctgy_A,pct_pop_in_svi_ctgy_B,pct_pop_in_svi_ctgy_C,pct_pop_in_svi_ctgy_D,pct_pop_in_metro,pct_pop_in_nonmetro,pct_doses_admin_5plus,pct_doses_admin_12plus,pct_doses_admin_18plus,pct_doses_admin_65plus,daily_death_pct,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6
1818735,1001,2022-07-01,-0.185311,-0.577948,-0.576513,1.733128,-0.577948,1.267469,-1.267469,3.172457,1.870896,1.849631,0.919417,-0.166465,-0.366021,-0.387925,-0.259938,-0.264532,-0.259938,3.780254,-0.264532,-0.259938,-0.264532,-0.259938,-0.343401,-0.409253,-0.409253,-0.409253,2.464676,-0.405733,-0.409253
1821954,1001,2022-07-02,-0.185311,-0.577948,-0.576513,1.733128,-0.577948,1.267469,-1.267469,3.172457,1.870896,1.849631,0.919417,-0.166465,-0.366021,-0.387925,-0.259938,-0.264532,-0.259938,3.780254,-0.264532,-0.259938,-0.264532,-0.259938,-0.343401,-0.409253,-0.409253,-0.409253,-0.405733,2.464676,-0.409253
1825173,1001,2022-07-03,-0.185311,-0.577948,-0.576513,1.733128,-0.577948,1.267469,-1.267469,3.172457,1.870896,1.849631,0.919417,-0.166465,-0.366021,-0.387925,-0.259938,-0.264532,-0.259938,3.780254,-0.264532,-0.259938,-0.264532,-0.259938,-0.343401,-0.409253,-0.409253,-0.409253,-0.405733,-0.405733,2.443479
1828392,1001,2022-07-04,-0.185311,-0.577948,-0.576513,1.733128,-0.577948,1.267469,-1.267469,3.172457,1.870896,1.849631,0.919417,-0.166465,-0.366021,-0.387925,-0.259938,-0.264532,-0.259938,3.780254,-0.264532,-0.259938,-0.264532,-0.259938,-0.343401,-0.409253,-0.409253,-0.409253,-0.405733,-0.405733,-0.409253
1831611,1001,2022-07-05,-0.185311,-0.577948,-0.576513,1.733128,-0.577948,1.267469,-1.267469,3.172457,1.870896,1.849631,0.919417,-0.166465,-0.366021,-0.387925,-0.259938,-0.264532,-0.259938,3.780254,-0.264532,-0.259938,-0.264532,-0.259938,-0.343401,2.443479,-0.409253,-0.409253,-0.405733,-0.405733,-0.409253


## Export

In [25]:
covid_datasets = (
    X_tr_phase1, y_tr_phase1,
    X_te_phase1, y_te_phase1,
    X_tr_phase2, y_tr_phase2,
    X_te_phase2, y_te_phase2
)

with open(os.path.join(DATA_PATH, 'covid_preprocessed_data_county.pkl'), 'wb') as f:
    pickle.dump(covid_datasets, f)