**IMPORTS**

In [13]:
import keras, pickle
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import RandomOverSampler

from pathlib import Path

# Hospital patient survival

In [2]:
# Load the reduced patient-survival table (relative path, resolved against the
# kernel's working directory).
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# row index written out by an earlier export; it is carried into the features
# downstream. Consider `index_col=0` here (the model's input width would then
# need to change as well) — confirm intent.
survival_df = pd.read_csv('datasets/final_patient_survival_reduced.csv')

In [3]:
# Preview the first rows to sanity-check column names and value ranges.
survival_df.head()

Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,pre_icu_days,weight,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor,hospital_death
0,68.0,22.73,0,0.0,0.0,180.3,0.541667,73.9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,77.0,27.42,0,0.0,1.0,160.0,0.927778,70.2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,25.0,31.95,0,0.0,1.0,172.7,0.000694,95.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,67.0,27.56,0,0.0,0.0,190.5,0.000694,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
4,59.0,57.45,0,0.0,1.0,165.1,0.000694,156.6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0


In [4]:
# Class counts of the target column; the output below shows label 1 is far
# rarer than label 0, which is why the training data is rebalanced later on.
survival_df["hospital_death"].value_counts()

0    73471
1     7025
Name: hospital_death, dtype: int64

## Prepare data

*isolate independent and dependent variables*

In [5]:
# Features used for prediction: every column except the target, as a 2-D array.
X = survival_df.drop(columns="hospital_death").values

# Value to predict: the double-bracket selection keeps a single-column 2-D array.
y = survival_df[["hospital_death"]].values

In [6]:
# Confirm the feature matrix and the target have the same number of rows.
X.shape, y.shape

((80496, 16), (80496, 1))

*split training and test data*

In [7]:
# Hold out 20% of the rows for testing. `stratify` keeps the rare positive class
# at the same proportion in both splits, so the test metrics are not skewed by
# an unlucky draw; `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y.ravel(),
)

*scale features*

In [8]:
# Standardise the features. The scaler is fitted on the training split only and
# then reused on the test split, so no test-set statistics leak into training.
# The names are rebound instead of writing into `X_train[:, :]`: slice
# assignment stores the result in the existing buffer and would silently cast
# the scaled floats to that buffer's dtype.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

*balance the target variable by randomly oversampling the minority class (training split only)* — source: https://towardsdatascience.com/how-to-balance-a-dataset-in-python-36dff9d12704

In [9]:
# Randomly re-draw minority-class rows until the classes are balanced.
# Only the training split is resampled; the test split keeps its original
# class distribution. `random_state` fixes which rows get duplicated.
over_sampler = RandomOverSampler(random_state=45)
X_train_bal, y_train_bal = over_sampler.fit_resample(X_train, y_train)

## DL

**Keras model setup:** a fully connected network with three Dense layers — ReLU activations on the two hidden layers and a sigmoid activation on the single-unit output layer.

*source:* https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/ (works with a dataset about diabetes among Pima women)

### Build DL model

In [17]:
# Define the model: a fully connected binary classifier with two ReLU hidden
# layers (90 and 80 units) and a single sigmoid output unit.
# The input width is read from the training matrix rather than hard-coded, and
# it is declared once through `keras.Input`. The previous version also passed
# `input_dim` to the second Dense layer, which is redundant because that
# layer's input size is fixed by the layer before it.
n_features = X_train_bal.shape[1]

model = keras.Sequential(
    [
        keras.Input(shape=(n_features,)),
        keras.layers.Dense(90, activation='relu'),
        keras.layers.Dense(80, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ]
)

**Compile model**

In [18]:
# Binary cross-entropy pairs with the single sigmoid output.
# The optimizer comes from the same `keras` namespace that built the model
# (not `tf.keras`), so the two can never resolve to different Keras
# installations.
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy']
)

**Train model**

In [19]:
# Train on the oversampled (class-balanced) training data.
model.fit(
    x=X_train_bal,
    y=y_train_bal,
    batch_size=120,
    epochs=75,
)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


<keras.callbacks.History at 0x19cbed75b50>

**Test model**

In [20]:
# Score the network on the held-out test split, which was not oversampled.
# NOTE(review): roughly 91% of all rows are class 0 (73471 of 80496 in the
# class counts shown earlier), so accuracy alone is a weak signal here —
# consider also reporting recall/precision or ROC-AUC for the positive class.
test_loss, test_acc = model.evaluate(X_test, y_test)



**SAVE MODEL** -- https://www.tensorflow.org/guide/keras/save_and_serialize

In [21]:
# Create the output directory first: on a fresh checkout a missing `models/`
# folder would make the save fail only after the whole training run has finished.
Path('models').mkdir(parents=True, exist_ok=True)
model.save('models/survival_prediction.h5')

## ML

Logistic regression

In [22]:
# Logistic-regression baseline fitted on the balanced training data.
# The labels are flattened to a 1-D vector before fitting.
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train_bal, y_train_bal.ravel())

Decision tree

In [23]:
# Decision-tree baseline fitted on the balanced training data.
# `random_state` is fixed because the tree permutes the candidate features at
# every split; without it, ties can be broken differently between runs and the
# saved model is not repeatable. Labels are flattened to 1-D, matching the
# logistic-regression cell.
clf_tree = DecisionTreeClassifier(random_state=1)
clf_tree.fit(X_train_bal, y_train_bal.ravel())

**SAVE MODELS**

In [24]:
# Persist both scikit-learn classifiers with pickle.
# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes; the bare `open(...)` calls never closed it.
# NOTE(review): the fitted `scaler` is not saved in any cell shown here, yet
# anything that loads these models must apply the same scaling to its inputs.
filepath = Path('models/survival_prediction_logistic.pkl')
filepath.parent.mkdir(parents=True, exist_ok=True)
with open(filepath, "wb") as model_file:
    pickle.dump(clf, model_file)

filepath = Path('models/survival_prediction_clf_tree.pkl')
with open(filepath, "wb") as model_file:
    pickle.dump(clf_tree, model_file)
