In [1]:
import numpy as np
import pandas as pd
# from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, brier_score_loss

import matplotlib.pyplot as plt

# Data

In [2]:
data = pd.read_csv('../data/data.csv')
data = data.drop(columns=['Unnamed: 0', 'state_fips_code', 'county_fips_code', 'case_month'])
data.pop_estimate_2019 = data.pop_estimate_2019.str.replace(',', '')
data

Unnamed: 0,hosp_yn,death_yn,sex_female,age_0,age_18,age_50,age_65,race_native,race_asian,race_black,...,economic_typology_2015,pop_estimate_2019,less_hs_pct,hs_only_pct,some_college_pct,bachelor_higher_pct,unempl_rate,med_hh_income_19,med_hh_income_pct_state_total_19,poverty_pct_19
0,False,False,True,False,True,False,False,False,False,False,...,0.0,55869,11.5,33.6,28.4,26.6,4.9,58233.0,112.481888,12.1
1,False,False,True,False,True,False,False,False,False,False,...,0.0,55869,11.5,33.6,28.4,26.6,4.9,58233.0,112.481888,12.1
2,False,False,True,False,False,True,False,False,False,False,...,0.0,55869,11.5,33.6,28.4,26.6,4.9,58233.0,112.481888,12.1
3,False,False,True,False,False,True,False,False,False,False,...,0.0,55869,11.5,33.6,28.4,26.6,4.9,58233.0,112.481888,12.1
4,False,False,True,False,True,False,False,False,False,False,...,0.0,55869,11.5,33.6,28.4,26.6,4.9,58233.0,112.481888,12.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240793,False,False,False,False,False,True,False,False,False,False,...,0.0,72999,7.3,38.0,33.0,21.6,6.7,57325.0,89.323280,10.7
240794,False,False,False,False,True,False,False,False,False,False,...,0.0,72999,7.3,38.0,33.0,21.6,6.7,57325.0,89.323280,10.7
240795,False,False,False,False,True,False,False,False,False,False,...,0.0,72999,7.3,38.0,33.0,21.6,6.7,57325.0,89.323280,10.7
240796,False,False,True,False,False,False,True,False,False,False,...,0.0,72999,7.3,38.0,33.0,21.6,6.7,57325.0,89.323280,10.7


In [3]:
Z = data.iloc[:, :].values.astype(float)
# del data

X, y = Z[:, 2:], Z[:, :2]
del Z
print(f'{X.shape = }')
print(f'{y.shape = }')

X.shape = (240798, 28)
y.shape = (240798, 2)


# Preprocessing

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
sc_X_train = StandardScaler()
X_train = sc_X_train.fit_transform(X_train)
X_test = sc_X_train.transform(X_test)

# Training

In [6]:
# classifier = SVC(kernel='linear', C=0.01)
classifier_hosp = SGDClassifier(loss='log', alpha=0.0001, shuffle=True, n_jobs=8)
classifier_hosp.fit(X_train, y_train[:, 0])

SGDClassifier(loss='log', n_jobs=8)

In [7]:
classifier_death = SGDClassifier(loss='log', alpha=0.0001, shuffle=True, n_jobs=8)
classifier_death.fit(X_train, y_train[:, 1])

SGDClassifier(loss='log', n_jobs=8)

# Evaluation

In [8]:
y_pred_hosp = classifier_hosp.predict(X_test)
y_pred_death = classifier_death.predict(X_test)

y_pred_proba_hosp = classifier_hosp.predict_proba(X_test)
y_pred_proba_death = classifier_death.predict_proba(X_test)

In [20]:
print(f'accuracy: {accuracy_score(y_test[:, 0], y_pred_hosp).round(4)}')
print('confusion matrix:')
print(confusion_matrix(y_test[:, 0], y_pred_hosp))
print(f'brier loss: {brier_score_loss(y_test[:, 0], y_pred_proba_hosp[:, 1]).round(4)}')

accuracy: 0.9461
confusion matrix:
[[45536    22]
 [ 2575    27]]
brier loss: 0.0472


In [19]:
print(f'accuracy: {accuracy_score(y_test[:, 1], y_pred_death).round(4)}')
print('confusion matrix:')
print(confusion_matrix(y_test[:, 1], y_pred_death))
print(f'brier loss: {brier_score_loss(y_test[:, 1], y_pred_proba_death[:, 1]).round(4)}')

accuracy: 0.9948
confusion matrix:
[[47889     0]
 [  249    22]]
brier loss: 0.0049


In [11]:
# 'hosp_yn', 'death_yn', 'sex_female', 'age_0', 'age_18', 'age_50',
# 'age_65', 'race_native', 'race_asian', 'race_black', 'race_other',
# 'race_native_pacific', 'race_white', 'ethnicity_hispanic',
# 'case_onset_interval', 'symptom_status', 'used_bed_ratio',
# 'used_icu_ratio', 'rural-urban_Continuum Code_2013',
# 'urban_influence_code_2013', 'economic_typology_2015',
# 'pop_estimate_2019', 'less_hs_pct', 'hs_only_pct', 'some_college_pct',
# 'bachelor_higher_pct', 'unempl_rate', 'med_hh_income_19',
# 'med_hh_income_pct_state_total_19', 'poverty_pct_19'

In [46]:
case1 = np.array([[
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0.2, 0.1, # used_icu_ratio
    2, 2, 4, 0.2, 50000, 0.4, 0.25, 0.15, 0.03, 24892, 24892, 0.2
]])


print(f'hosp probability: {classifier_hosp.predict_proba(sc_X_train.transform(case1))[0, 1]}')
print(f'death probability: {classifier_death.predict_proba(sc_X_train.transform(case1))[0, 1]}')

hosp probability: 0.9999999999996012
death probability: 0.0
