# Setup

In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import torch
from torch.nn import BCELoss
from sklearn.metrics import brier_score_loss
from scipy.stats import norm
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive (Colab only) so the data files read and written below
# under /content/drive/MyDrive are reachable.
drive.mount('/content/drive')

Mounted at /content/drive


# Prepare Training and Test Data

In [2]:
# Training split: unpickle the (features, labels) pair and cast each column.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
train_path = '/content/drive/MyDrive/Colab_Projects/AML_final_project/data/data_train.pt'
with open(train_path, 'rb') as file:
    X_train, y_train = pickle.load(file)

# Indices of the columns that are cast to bool; all remaining columns become float.
bool_col = [0,1,2,3,4,5,6,7,8,9,10,11,13]

# One array per entry of X_train.transpose()
# (presumably one per feature column -- assumes X_train is 2-D, TODO confirm).
features_train = [
    column.astype(bool) if position in bool_col else column.astype(float)
    for position, column in enumerate(X_train.transpose())
]

# Label column 0 is H, label column 1 is F; both are used as boolean masks later.
H_train, F_train = (y_train[:, label_idx].astype(bool) for label_idx in (0, 1))

In [3]:
# Test split: same unpickle-and-cast procedure as for the training split.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
test_path = '/content/drive/MyDrive/Colab_Projects/AML_final_project/data/data_test.pt'
with open(test_path, 'rb') as file:
    X_test, y_test = pickle.load(file)

# Indices of the columns that are cast to bool; all remaining columns become float.
bool_col = [0,1,2,3,4,5,6,7,8,9,10,11,13]

# One array per entry of X_test.transpose()
# (presumably one per feature column -- assumes X_test is 2-D, TODO confirm).
features_test = [
    column.astype(bool) if position in bool_col else column.astype(float)
    for position, column in enumerate(X_test.transpose())
]

# Label column 0 is H, label column 1 is F.
H_test, F_test = (y_test[:, label_idx].astype(bool) for label_idx in (0, 1))

# Fit Naive Bayes Model to Data

In [4]:
# Fit the per-feature class-conditional parameters for BOTH targets (H and F).
# bayes_params[i] describes feature i:
#   'type'     -> bool or float
#   'params_H' -> parameters given H / given not-H
#   'params_F' -> parameters given F / given not-F
# For bool features a parameter is the fraction of True values in the class;
# for float features it is the [mean, std] pair returned by norm.fit.

bayes_params = {}

# Negated label masks are loop-invariant, so build them once.
not_H_train = np.logical_not(H_train)
not_F_train = np.logical_not(F_train)

# Class sizes, used as denominators for the bool-feature frequencies.
N_H     = H_train.sum()
N_H_not = not_H_train.sum()
N_F     = F_train.sum()
N_F_not = not_F_train.sum()

for idx, column in enumerate(features_train):
  # Split the column by class membership for each target.
  col_H, col_H_not = column[H_train], column[not_H_train]
  col_F, col_F_not = column[F_train], column[not_F_train]
  entry = bayes_params[idx] = {}

  if column.dtype == bool:
    freq_H = [col_H.sum()/N_H, col_H_not.sum()/N_H_not]
    freq_F = [col_F.sum()/N_F, col_F_not.sum()/N_F_not]
    entry.update(type = bool, params_H = freq_H, params_F = freq_F)

  elif column.dtype == float:
    gauss_H = [list(norm.fit(col_H)), list(norm.fit(col_H_not))]
    gauss_F = [list(norm.fit(col_F)), list(norm.fit(col_F_not))]
    entry.update(type = float, params_H = gauss_H, params_F = gauss_F)

  else:
    raise TypeError('Feature types must be either bool or float!')

# Define Prediction Function

In [5]:
def predict_NBC(features, H_train, F_train, bayes_params):
  """Naive Bayes posterior probabilities for the two targets H and F.

  Parameters
  ----------
  features : sequence
      Entry i holds the value(s) of feature i and must expose a ``.dtype``
      equal to ``bayes_params[i]['type']``. Each entry may be a numpy scalar
      (one sample) or a numpy array (many samples at once); arrays must
      broadcast against each other.
  H_train, F_train : boolean numpy arrays
      Training labels; only their mean is used, as the class prior.
  bayes_params : dict
      ``bayes_params[i]`` has keys ``'type'`` (``bool`` or ``float``),
      ``'params_H'`` and ``'params_F'``. For bool features each is
      ``[p_given_class, p_given_not_class]``; for float features each is
      ``[[mu, std], [mu_not, std_not]]``.

  Returns
  -------
  (p_cond_H, p_cond_F)
      Posterior probability of H and of F given the features, with the same
      shape as the broadcast feature entries.

  Raises
  ------
  TypeError
      If a feature dtype does not match the parameter dictionary, or the
      dictionary holds a type other than bool/float.
  """

  # Validate all features before doing any arithmetic.
  for i, feature in enumerate(features):
    if feature.dtype != bayes_params[i]['type']:
      raise TypeError('Feature types not compatible with parameter dictionary!')

  # Priors from the training label frequencies.
  # Q_* accumulates prior * likelihood for the positive class, P_* for the negative class.
  Q_H = H_train.sum()/H_train.shape[0]
  P_H = 1 - Q_H
  Q_F = F_train.sum()/F_train.shape[0]
  P_F = 1 - Q_F

  for i, feature in enumerate(features):
    params = bayes_params[i]

    if params['type'] is bool:
      p_H, p_not_H = params['params_H']
      p_F, p_not_F = params['params_F']

      # Bernoulli likelihood: p where the feature is True, 1 - p where it is False.
      # np.where keeps this valid for both a single value and a whole array.
      Q_H = Q_H * np.where(feature, p_H, 1 - p_H)
      P_H = P_H * np.where(feature, p_not_H, 1 - p_not_H)
      Q_F = Q_F * np.where(feature, p_F, 1 - p_F)
      P_F = P_F * np.where(feature, p_not_F, 1 - p_not_F)

    elif params['type'] is float:
      (mu_H, std_H), (mu_not_H, std_not_H) = params['params_H']
      (mu_F, std_F), (mu_not_F, std_not_F) = params['params_F']

      # Gaussian likelihood: loc is the fitted mean, scale is the fitted std.
      # (The scale must be the second fitted parameter, not the mean.)
      Q_H = Q_H * norm.pdf(feature, loc = mu_H, scale = std_H)
      P_H = P_H * norm.pdf(feature, loc = mu_not_H, scale = std_not_H)
      Q_F = Q_F * norm.pdf(feature, loc = mu_F, scale = std_F)
      P_F = P_F * norm.pdf(feature, loc = mu_not_F, scale = std_not_F)

    else:
      raise TypeError('Encountered invalid type in parameter dictionary!')

  # Normalise the two joint terms into a posterior for the positive class.
  p_cond_H = Q_H/(Q_H + P_H)
  p_cond_F = Q_F/(Q_F + P_F)
  return p_cond_H, p_cond_F

# Create Test Predictions

In [8]:
# Score every test sample with the fitted model, one predict_NBC call per sample.
# Both outputs start as NaN so any slot that is never written stays recognisable.
total = y_test.shape[0]
pred_H = np.full((total,), np.nan)
pred_F = np.full((total,), np.nan)

# zip(*features_test) turns the per-feature arrays into per-sample tuples.
samples = zip(*features_test)
for row_idx, sample in enumerate(tqdm(samples, leave = True, position = 0, total = total)):
  pred_H[row_idx], pred_F[row_idx] = predict_NBC(list(sample), H_train, F_train, bayes_params)

100%|██████████| 578417/578417 [2:23:09<00:00, 67.34it/s]


In [14]:
# Persist both prediction arrays as one pickled dict keyed by outcome name.
pred = {"hosp": pred_H, "death": pred_F}
out_path = '/content/drive/MyDrive/Colab_Projects/AML_final_project/data/NBC.pt'
with open(out_path, 'wb') as handle:
    pickle.dump(pred, handle)