In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
from tensorflow import keras

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import wandb
from wandb.integration.keras import WandbCallback

# Authenticate with Weights & Biases; runs are started later via wandb.init.
wandb.login()
# Load the data from a CSV file located next to the notebook.
data = pd.read_csv("./MIMIC_data.csv")


In [None]:

from sklearn.utils import resample
# data_clean = data.dropna()
# y = data_clean['outcome']
# X = data_clean.drop(columns='outcome')
def balance_dataset(df, column, method='undersample', random_state=42):
    """
    Balance a dataset so every class in ``column`` has the same number of rows.

    The class distribution is printed before and after balancing.

    Parameters:
    df (pandas.DataFrame): Input DataFrame
    column (str): Column name to balance by
    method (str): 'undersample' draws ``min class size`` rows from every class
        without replacement; 'oversample' draws ``max class size`` rows with
        replacement from every class that is smaller than the largest one and
        keeps the largest class(es) unchanged
    random_state (int): Random state for reproducibility

    Returns:
    pandas.DataFrame: Balanced DataFrame with a fresh 0..n-1 index

    Raises:
    ValueError: If ``method`` is neither 'undersample' nor 'oversample'.
    """
    # Fail fast on a misspelled method. Without this check neither branch below
    # runs and the function dies later with an UnboundLocalError on balanced_df.
    if method not in ('undersample', 'oversample'):
        raise ValueError(
            f"method must be 'undersample' or 'oversample', got {method!r}"
        )

    # Get value counts (rows whose `column` value is NaN are not counted and
    # therefore do not appear in the balanced result)
    value_counts = df[column].value_counts()
    print(f"Original class distribution:\n{value_counts}\n")

    if method == 'undersample':
        # Every class is shrunk to the size of the rarest class
        min_count = value_counts.min()

        # Create balanced dataframe
        balanced_df = pd.concat([
            resample(df[df[column] == val],
                     replace=False,
                     n_samples=min_count,
                     random_state=random_state)
            for val in value_counts.index
        ])

    else:  # method == 'oversample'
        # Every smaller class is grown to the size of the most frequent class
        max_count = value_counts.max()

        # Create balanced dataframe; classes already at max_count are kept
        # as they are instead of being resampled
        balanced_df = pd.concat([
            resample(df[df[column] == val],
                     replace=True,
                     n_samples=max_count,
                     random_state=random_state)
            if count < max_count else df[df[column] == val]
            for val, count in value_counts.items()
        ])

    print(f"Balanced class distribution:\n{balanced_df[column].value_counts()}")
    return balanced_df.reset_index(drop=True)



# Build both balanced variants of the raw frame so their sizes can be compared.
balanced_data_under = balance_dataset(data, 'outcome', method='undersample', random_state=42)
balanced_data_over = balance_dataset(data, 'outcome', method='oversample', random_state=42)

# Shapes of the original, undersampled and oversampled frames, in that order.
print(data.shape)
print(balanced_data_under.shape)
print(balanced_data_over.shape)

# Later cells read the global `data`, so from here on they see the oversampled frame.
# NOTE(review): oversampling duplicates minority-class rows, and the train/validation
# split only happens afterwards inside get_train_and_val. Copies of the same row can
# therefore land on both sides of the split and inflate validation accuracy.
# NOTE(review): rebinding `data` makes this cell non-idempotent; re-running it feeds
# the already balanced frame back into balance_dataset.
data = balanced_data_over

In [None]:


# build input pipeline using tf.data


def get_train_and_val(data_feaure_and_prediction, BATCH_SIZE = 64):
    """Build the training and validation ``tf.data`` pipelines from one frame.

    The rows are split 80/20 with a fixed ``random_state=42``. The features are
    then standardised with statistics computed on the training rows only, and
    both parts are wrapped in batched ``tf.data.Dataset`` objects. Only the
    training dataset is shuffled.

    Missing values are not handled here; the caller is expected to drop them
    first (``run_with_feature`` calls ``.dropna()`` before calling this).

    Args:
        data_feaure_and_prediction (pandas.DataFrame): the last column is the
            target, all preceding columns are the feature(s).
        BATCH_SIZE (int, optional): batch size of both datasets. Defaults to 64.

    Returns:
        list: ``[train_dataset, val_dataset]``.
    """
    columns = data_feaure_and_prediction.keys()
    X = data_feaure_and_prediction[columns[:-1]]  # the feature(s)
    y = data_feaure_and_prediction[columns[-1]]   # the target

    # Split BEFORE scaling: fitting the scaler on all rows would leak the mean and
    # variance of the validation rows into the training features.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)  # learn mean/std from training rows only
    X_test = scaler.transform(X_test)        # reuse the training statistics

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(BATCH_SIZE)

    val_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
    val_dataset = val_dataset.batch(BATCH_SIZE)
    return [train_dataset, val_dataset]


In [None]:
def make_model(feature_num, input_name="input name", num_classes=10,
               hidden_units=64, dropout_rate=0.3):
    """Build a small fully connected classifier with the Keras functional API.

    Architecture: input -> [Dense(relu) -> Dropout] x 2 -> Dense producing one
    raw score (logit) per class. The output layer has no activation, so the
    model must be paired with a loss configured for logits.

    Args:
        feature_num (int): number of input features per sample.
        input_name (str, optional): name given to the input layer.
        num_classes (int, optional): number of output logits. Defaults to 10,
            the value that used to be hard-coded.
        hidden_units (int, optional): width of each hidden layer. Defaults to 64.
        dropout_rate (float, optional): dropout rate after each hidden layer.
            Defaults to 0.3.

    Returns:
        keras.Model: the uncompiled model.
    """
    inputs = keras.Input((feature_num,), name=input_name)

    x = inputs
    for _ in range(2):  # two identical hidden blocks
        x = keras.layers.Dense(hidden_units, activation="relu")(x)
        x = keras.layers.Dropout(dropout_rate)(x)

    # No activation here: the layer emits logits.
    outputs = keras.layers.Dense(num_classes, name="predictions")(x)

    return keras.Model(inputs=inputs, outputs=outputs)



In [None]:
def train_step(x, y, model, optimizer, loss_fn, train_acc_metric):
    """Run one optimisation step on a single batch.

    Runs the model in training mode under a gradient tape, computes the loss,
    applies the resulting gradients to the model's trainable weights through
    ``optimizer``, and folds the batch into ``train_acc_metric``.

    Returns:
        The loss value computed by ``loss_fn`` for this batch.
    """
    with tf.GradientTape() as recorder:
        batch_logits = model(x, training=True)
        batch_loss = loss_fn(y, batch_logits)

    # Back-propagate and let the optimizer update the network weights.
    weights = model.trainable_weights
    gradients = recorder.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))

    # Accumulate this batch into the running training-accuracy metric.
    train_acc_metric.update_state(y, batch_logits)

    return batch_loss

def test_step(x, y, model, loss_fn, val_acc_metric):
    val_logits = model(x, training=False)
    loss_value = loss_fn(y, val_logits)
    val_acc_metric.update_state(y, val_logits)

    return loss_value

def train(train_dataset, val_dataset,  model, optimizer,
          train_acc_metric, val_acc_metric, loss_fn,
          epochs=10, log_step=200, val_log_step=50):
    """Custom training loop that reports per-epoch metrics to wandb.

    For each epoch: run ``train_step`` on every training batch, run
    ``test_step`` on every validation batch, print both accuracies, reset the
    two metrics and log the epoch index, mean batch losses and accuracies with
    ``wandb.log``.

    ``log_step`` and ``val_log_step`` are accepted but not used by this function.
    """
    for epoch in range(epochs):
        print("\nStart of epoch %d" % (epoch,))

        # Training pass: one optimisation step per batch, keeping each batch loss.
        train_loss = [
            float(train_step(batch_x, batch_y,
                             model, optimizer,
                             loss_fn, train_acc_metric))
            for batch_x, batch_y in train_dataset
        ]

        # Validation pass at the end of the epoch.
        val_loss = [
            float(test_step(batch_x, batch_y,
                            model, loss_fn,
                            val_acc_metric))
            for batch_x, batch_y in val_dataset
        ]

        # Report the accuracies accumulated over the epoch.
        train_acc = float(train_acc_metric.result())
        print("Training acc over epoch: %.4f" % (train_acc,))

        val_acc = float(val_acc_metric.result())
        print("Validation acc: %.4f" % (val_acc,))

        # Start the next epoch with empty metric accumulators.
        train_acc_metric.reset_state()
        val_acc_metric.reset_state()

        # ⭐: log metrics using wandb.log
        epoch_summary = {
            'epochs': epoch,
            'loss': np.mean(train_loss),
            'acc': train_acc,
            'val_loss': np.mean(val_loss),
            'val_acc': val_acc,
        }
        wandb.log(epoch_summary)

In [None]:
# # Initialize wandb with your project name and, optionally, a configuration.
# # Play around with the config values and see the results on your wandb dashboard.
# feature = ["age"]

# config = {
#             "epochs": 10,
#             "batch_size": 32,
#             "log_step": 200,
#             "val_log_step": 50,
#             "architecture": "CNN",
#             "dataset": "MIMIC"
#     }

# run = wandb.init(project='my-tf-integration', config=config)
# config = wandb.config

# prediction = "outcome"

# model_data = data[feature+[prediction]].dropna()
# [train_dataset, val_dataset] = get_train_and_val(model_data)


# # Initialize model.
# model = make_model(len(feature))

# # Instantiate an optimizer to train the model.
# optimizer = keras.optimizers.Adam()
# # Instantiate a loss function.
# loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# # Prepare the metrics.
# train_acc_metric = keras.metrics.SparseCategoricalAccuracy()
# val_acc_metric = keras.metrics.SparseCategoricalAccuracy()


# train(train_dataset,
#     val_dataset, 
#     model,
#     optimizer,
#     train_acc_metric,
#     val_acc_metric,
#     epochs=config.epochs, 
#     log_step=config.log_step, 
#     val_log_step=config.val_log_step)

# run.finish()  # In Jupyter/Colab, let us know you're finished!




In [None]:
# Initialize wandb with your project name and, optionally, a configuration.
# Play around with the config values and see the results on your wandb dashboard.

def run_with_feature(feature):
    """Train one model on the given feature columns and log it as a wandb run.

    Selects ``feature`` plus the ``outcome`` column from the notebook-level
    ``data`` frame, drops rows with missing values, builds the datasets and the
    model, trains with the custom ``train`` loop, and closes the wandb run.

    Args:
        feature (str | sequence of str): column name(s) of ``data`` to use as
            model inputs. A single name may be passed as a plain string.
    """
    # Normalise the input so a bare string or a tuple/Index works like a list.
    feature = [feature] if isinstance(feature, str) else list(feature)

    config = {
        "epochs": 20,
        "batch_size": 32,
        "log_step": 200,
        "val_log_step": 50,
        "feature_used": str(feature)
    }

    run = wandb.init(project='my-tf-integration', config=config)
    config = wandb.config

    # Always close the run, even if data preparation or training raises;
    # otherwise the run stays open and the next wandb.init call inherits it.
    try:
        prediction = "outcome"

        model_data = data[feature + [prediction]].dropna()
        # Pass the logged batch size through so the run's config matches what
        # was actually used (the helper would otherwise fall back to 64).
        [train_dataset, val_dataset] = get_train_and_val(
            model_data, BATCH_SIZE=config.batch_size
        )

        # Initialize model.
        model = make_model(len(feature))

        # Instantiate an optimizer to train the model.
        optimizer = keras.optimizers.Adam()
        # Instantiate a loss function; the model outputs raw logits.
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

        # Prepare the metrics.
        train_acc_metric = keras.metrics.SparseCategoricalAccuracy()
        val_acc_metric = keras.metrics.SparseCategoricalAccuracy()

        train(train_dataset,
              val_dataset,
              model,
              optimizer,
              train_acc_metric,
              val_acc_metric,
              loss_fn,
              epochs=config.epochs,
              log_step=config.log_step,
              val_log_step=config.val_log_step)
    finally:
        run.finish()  # In Jupyter/Colab, let us know you're finished!


# feature = ["age"]
# run_with_feature(feature)

In [None]:
# Candidate input columns: every column of `data` except the target.
data_keys = data.drop(columns=['outcome']).keys()
# NOTE(review): this value is discarded; only a cell's last expression is displayed.
len(data_keys)
# data_keys = data_keys[ 

# Per-feature accuracy placeholders, initialised to 0. Nothing in this cell fills them in.
train_acc_dict = dict.fromkeys(data_keys, 0)
val_acc_dict = dict.fromkeys(data_keys, 0)

# Train one single-feature model per column; each call opens and closes its own wandb run.
for elem in data_keys:
    # NOTE(review): the disabled lines below would not work if re-enabled:
    # `train_acc_metric` is local to run_with_feature, `val_acc_dict[elem] = val_acc_dict`
    # stores the dict in itself, and the bare `except: pass` would hide every failure.
    # try:
    run_with_feature([elem])
    #     train_acc_dict[elem] = train_acc_metric
    #     val_acc_dict[elem] = val_acc_dict
    # except:
    #     pass

In [20]:


# feature0 = ['Urine output', 'Lymphocyte', 'Bicarbonate', 'Leucocyte', 'Urea nitrogen', 'Anion gap', 'Neutrophils', 'Blood calcium', 'Lactic acid','PH']
# run_with_feature(feature0)

# Columns used together as inputs for a single multi-feature run.
# NOTE(review): "ID" looks like a record identifier rather than a measurement;
# confirm whether it should be a model input at all.
feature1 =["Urine output", "Lymphocyte", "Bicarbonate", "Leucocyte", "Urea nitrogen",
    "Anion gap", "Neutrophils", "Blood calcium", "Lactic acid", "PH", "Basophils",
    "Respiratory rate", "Blood sodium", "RDW", "Blood potassium", "NT-proBNP", "PT",
    "Systolic blood pressure", "heart rate", "Renal failure", "Chloride", "Platelets",
    "INR", "atrialfibrillation", "deficiencyanemias", "Diastolic blood pressure",
    "hypertensive", "EF", "BMI", "Magnesium ion", "temperature", "Creatinine", "MCH",
    "PCO2", "SP O2", "hematocrit", "MCV", "diabetes", "Hyperlipemia", "age", "ID",
    "COPD", "CHD with no MI", "Creatine kinase", "glucose", "gendera", "depression"
]

# Train one model on all of the columns above and log it as one wandb run.
run_with_feature(feature1)




Start of epoch 0
Training acc over epoch: 0.1710
Validation acc: 0.4805

Start of epoch 1
Training acc over epoch: 0.4756
Validation acc: 0.6688

Start of epoch 2
Training acc over epoch: 0.6466
Validation acc: 0.7468

Start of epoch 3
Training acc over epoch: 0.6954
Validation acc: 0.7662

Start of epoch 4
Training acc over epoch: 0.7117
Validation acc: 0.7727

Start of epoch 5
Training acc over epoch: 0.7313
Validation acc: 0.8052

Start of epoch 6
Training acc over epoch: 0.7818
Validation acc: 0.8377

Start of epoch 7
Training acc over epoch: 0.7785
Validation acc: 0.8312

Start of epoch 8
Training acc over epoch: 0.7866
Validation acc: 0.8506

Start of epoch 9
Training acc over epoch: 0.8029
Validation acc: 0.8636

Start of epoch 10
Training acc over epoch: 0.8436
Validation acc: 0.8766

Start of epoch 11
Training acc over epoch: 0.8404
Validation acc: 0.8831

Start of epoch 12
Training acc over epoch: 0.8599
Validation acc: 0.8896

Start of epoch 13
Training acc over epoch: 0.84

0,1
acc,▁▄▆▆▆▆▇▇▇▇▇▇█▇██████
epochs,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
loss,█▅▄▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▅▆▆▆▇▇▇▇██████████
val_loss,█▅▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁

0,1
acc,0.90554
epochs,19.0
loss,0.24534
val_acc,0.9026
val_loss,0.25483


In [None]:
# For practice, the same kind of network written with the Keras Sequential API
# and the built-in compile/fit/evaluate workflow instead of a custom loop.

# `clean_data` was not defined anywhere in this notebook, so the cell failed on a
# fresh kernel. Derive it here from `data`: keep only the columns used below and
# drop rows with missing values.
clean_data = data[['age', 'Blood sodium', 'outcome']].dropna()
X = clean_data[['age','Blood sodium']]
y = clean_data[['outcome']]

# 1 Split first, so the scaler never sees the held-out rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2 Standardize the data using statistics from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 3 Construct the DNN. Layers are referenced through the already imported `keras`
# module; the bare names Sequential/Dense/Dropout were never imported.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')  # single probability output
])

# 4 Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 5 Train the model; validation_split holds out the last 20% of the training
# rows for per-epoch validation
history = model.fit(X_train, y_train, epochs=20, validation_split=0.2, verbose=1, batch_size=32)

# 6 Evaluate on the held-out test rows. evaluate() returns [loss, accuracy]
# because compile() requested metrics=['accuracy'], so index 1 is the accuracy.
eval_ = model.evaluate(X_test, y_test, verbose=1)[1]
print(eval_)
