# Setup

In [7]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.nn import BCELoss
from sklearn.metrics import brier_score_loss
from scipy.stats import norm
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
df = pd.read_csv('/content/drive/MyDrive/Colab_Projects/AML_final_project/data_geo_hos.csv')
df.head()

Unnamed: 0,hosp_yn,death_yn,sex_female,age_0,age_18,age_50,age_65,race_native,race_asian,race_black,race_other,race_native_pacific,race_white,ethnicity_hispanic,case_month,state_fips_code,county_fips_code,case_onset_interval,symptom_status,used_bed_ratio,used_icu_ratio
0,False,False,True,False,True,False,False,False,True,False,False,False,False,True,2021-02,34,34003,0,True,0.718141,0.451212
1,False,False,True,False,True,False,False,False,False,False,True,False,False,True,2021-02,51,51700,0,True,0.746529,0.774469
2,False,False,True,False,True,False,False,False,False,False,False,False,True,True,2021-02,51,51540,0,True,0.86641,0.755197
3,False,False,True,False,True,False,False,False,False,False,False,False,True,True,2020-08,36,36013,0,True,0.761232,0.659091
4,False,False,True,False,True,False,False,False,False,False,False,False,True,True,2020-12,39,39023,0,True,0.660607,0.855514


# Prepare Training- and Test Data

In [36]:
df = df.drop(columns = ['ethnicity_hispanic', 'case_month', 'state_fips_code', 'county_fips_code', 'case_onset_interval', 'symptom_status'])
df = df.sample(frac=1)

data = df.values

TRAIN_SPLIT = 0.8
train_size = int(TRAIN_SPLIT*data.shape[0])

data_train = data[:train_size]
data_test  = data[train_size:]

H_train        = data_train[:,0].astype(bool)
F_train        = data_train[:,1].astype(bool)
features_train = data_train[:,2:]

H_test        = data_test[:,0].astype(bool)
F_test        = data_test[:,1].astype(bool)
features_test = data_test[:,2:]

# Fit Naive Bayes Model to Data

In [37]:
# hospitalization

bayes_params = {}

features_train_H = features_train[H_train]
N = features_train_H.shape[0]

features_train_not_H = features_train[np.logical_not(H_train)]
N_not = features_train_not_H.shape[0]

for i, (feature, feature_not) in enumerate(zip(features_train_H.transpose(), features_train_not_H.transpose())):
  bayes_params[i] = {}

  if type(feature[0]) is bool:
    feature     = feature.astype(bool)
    feature_not = feature_not.astype(bool)
    p     = feature.sum()/N
    p_not = feature_not.sum()/N_not
    bayes_params[i]['type'] = bool
    bayes_params[i]['params_H'] = [p, p_not]

  elif type(feature[0]) is float:
    feature = feature.astype(float)
    feature_not = feature_not.astype(float)
    mu, std         = norm.fit(feature)
    mu_not, std_not = norm.fit(feature_not)
    bayes_params[i]['type'] = float
    bayes_params[i]['params_H'] = [[mu, std], [mu_not, std_not]]

  else:
    raise TypeError('Feature types must be either bool or float!')

In [38]:
# fatality

features_train_F = features_train[F_train]
N = features_train_F.shape[0]

features_train_not_F = features_train[np.logical_not(F_train)]
N_not = features_train_not_F.shape[0]

for i, (feature, feature_not) in enumerate(zip(features_train_F.transpose(), features_train_not_F.transpose())):

  if type(feature[0]) is bool:
    feature     = feature.astype(bool)
    feature_not = feature_not.astype(bool)
    p     = feature.sum()/N
    p_not = feature_not.sum()/N_not
    bayes_params[i]['params_F'] = [p, p_not]

  elif type(feature[0]) is float:
    feature = feature.astype(float)
    feature_not = feature_not.astype(float)
    mu, std         = norm.fit(feature)
    mu_not, std_not = norm.fit(feature_not)
    bayes_params[i]['params_F'] = [[mu, std], [mu_not, std_not]]

  else:
    raise TypeError('Feature types must be either bool or float!')

# Define Prediction Function

In [39]:
def predict_NBC(features, H_train, F_train, bayes_params):

  for i, feature in enumerate(features):
    if type(feature) is not bayes_params[i]['type']:
      raise TypeError('Feature types not compatible with parameter dictionary!')

  Q_H = H_train.sum()/H_train.shape[0] 
  P_H = 1 - Q_H
  Q_F = F_train.sum()/F_train.shape[0] 
  P_F = 1 - Q_F

  for i, feature in enumerate(features):
    if bayes_params[i]['type'] is bool:
      p_H, p_not_H = bayes_params[i]['params_H']
      p_F, p_not_F = bayes_params[i]['params_F']

      if feature:
        Q_H *= p_H
        P_H *= p_not_H
        Q_F *= p_F
        P_F *= p_not_F
      else:
        Q_H *= 1 - p_H
        P_H *= 1 - p_not_H
        Q_F *= 1 - p_F
        P_F *= 1 - p_not_F

    elif bayes_params[i]['type'] is float:
      norm_H, norm_not_H = bayes_params[i]['params_H']
      norm_F, norm_not_F = bayes_params[i]['params_F'] 

      Q_H *= norm.pdf(feature, loc = norm_H[0], scale = norm_H[0])
      P_H *= norm.pdf(feature, loc = norm_not_H[0], scale = norm_not_H[0])
      Q_F *= norm.pdf(feature, loc = norm_F[0], scale = norm_F[0])
      P_F *= norm.pdf(feature, loc = norm_not_F[0], scale = norm_not_F[0])

    else:
      raise TypeError('Encountered invalid type in parameter dictionary!')

  p_cond_H = Q_H/(Q_H + P_H) 
  p_cond_F = Q_F/(Q_F + P_F)
  return p_cond_H, p_cond_F 

# Evaluate Performance

In [None]:
# create predictions for test set
pred_H = np.empty((data_test.shape[0],))
pred_F = np.empty((data_test.shape[0],))

for i, feature in enumerate(tqdm(features_test, leave = True, position = 0)):
  H, F = predict_NBC(feature, H_train, F_train, bayes_params)
  pred_H[i] = H
  pred_F[i] = F

100%|██████████| 630290/630290 [1:12:55<00:00, 144.04it/s]


In [None]:
# save result to reload later
np.save('/content/drive/MyDrive/Colab_Projects/AML_final_project/NBC_H_prediction.npy', pred_H)
np.save('/content/drive/MyDrive/Colab_Projects/AML_final_project/NBC_F_prediction.npy', pred_F)

In [25]:
# create prediction and target array
pred_H = np.load('/content/drive/MyDrive/Colab_Projects/AML_final_project/NBC_H_prediction.npy')
pred_F = np.load('/content/drive/MyDrive/Colab_Projects/AML_final_project/NBC_F_prediction.npy')

prediction = np.empty((pred_H.shape[0], 2))
prediction[:,0] = pred_H
prediction[:,1] = pred_F

target = np.empty((pred_H.shape[0], 2))
target[:,0] = H_test.astype(float)
target[:,1] = F_test.astype(float)

In [27]:
# calculate binary cross entropy loss
loss = BCELoss()
with torch.no_grad():
  L = loss(torch.tensor(prediction), torch.tensor(target))

In [29]:
# calculate brier score
brier_H = brier_score_loss(target[:,0], prediction[:,0])
brier_F = brier_score_loss(target[:,1], prediction[:,1])

In [55]:
print("Binary Cross Entropy Loss: ", L.item())
print("Brier Score (hospitalization): ", brier_H)
print("Brier Score (fatality): ", brier_F)

Binary Cross Entropy Loss:  0.3149935660561603
Brier Score (hospitalization):  0.08173240212604245
Brier Score (fatality):  0.022537784287806627


In [44]:
# check wether the network produces reasonable results for simple test cases:
# case_1 -> male, <18 years old, white
# case_2 -> male, 65+ years old, white
case_1 = [False, True, False, False, False, False, False, False, False, False, True, 0.718141, 0.451212] # young
case_2 = [False, False, False, False, True, False, False, False, False, False, True, 0.718141, 0.451212] # old

hosp_1, fata_1 = predict_NBC(case_1, H_train, F_train, bayes_params)
hosp_2, fata_2 = predict_NBC(case_2, H_train, F_train, bayes_params)
hosp_1 = round(100*hosp_1, 2)
fata_1 = round(100*fata_1, 2)
hosp_2 = round(100*hosp_2, 2)
fata_2 = round(100*fata_2, 2)

print("case_1 (young)")
print("probability of hospitalization: ", hosp_1, "%")
print("probability of death: ", fata_1, "%")
print("\n")
print("case_2 (old)")
print("probability of hospitalization: ", hosp_2, "%")
print("probability of death: ", fata_2, "%")

case_1 (young)
probability of hospitalization:  1.35 %
probability of death:  0.0 %


case_2 (old)
probability of hospitalization:  42.04 %
probability of death:  32.89 %
