In [1]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

In [2]:
# Step 1: Load the Excel Sheet
file_path = ('Canada_Hosp1_COVID_InpatientData.xlsx')

# Read all sheets into separate DataFrames
admission_data = pd.read_excel(file_path, sheet_name='Data-at-admission')
days_breakdown_data = pd.read_excel(file_path, sheet_name='Days-breakdown')
hlos_data = pd.read_excel(file_path, sheet_name='Hospital-length-of-stay')
msl_data = pd.read_excel(file_path, sheet_name='Medication-Static-List')

In [3]:
merged_data = pd.merge(days_breakdown_data, admission_data[['id', 'age', 'sex', 'height', 'weight', 'comorbidities']], 
                       left_on='parent_id', right_on='id', how='inner')
merged_data = pd.merge(merged_data, hlos_data[['parent_id', 'hospital_length_of_stay']], 
                       on='parent_id', how='inner')

In [4]:
# Create the adjusted HLOS (HLOS - day)
merged_data['adjusted_hlos'] = merged_data['hospital_length_of_stay'] - merged_data['day']

In [5]:
merged_data_new = merged_data[merged_data['adjusted_hlos'] > 0]

In [6]:
col_to_keep = [
    'age', 'sex', 'height', 'weight', 'comorbidities',
    'systolic_blood_pressure', 'diastolic_blood_pressure', 'heart_rate', 
    'respiratory_rate', 'oxygen_saturation', 'temperature',
    'wbc', 'rbc', 'hemoglobin', 'hematocrit', 'mcv', 'mch', 'mchc', 'rdw', 
    'platelet_count', 'aptt_aptr', 'pt', 'alt', 'ast', 'serum_creatinine', 
    'sodium', 'potassium', 'total_serum_bilirubin', 'lactate', 'pao2', 'pao2_fio2', 
    'ph', 'ferritin', 'd_dimer', 'crp', 'hs_crp', 'intubated', 
    'adjusted_hlos'
]
merged_data_new = merged_data_new[col_to_keep]
merged_data_new.head()

Unnamed: 0,age,sex,height,weight,comorbidities,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,...,lactate,pao2,pao2_fio2,ph,ferritin,d_dimer,crp,hs_crp,intubated,adjusted_hlos
0,74,Male,,,"""[\""Hypertension\"",\""Diabetes\"",\""Other\""]""",119.0,54.0,79.0,18.0,94.0,...,,,,,921.0,,,,No,20
1,74,Male,,,"""[\""Hypertension\"",\""Diabetes\"",\""Other\""]""",133.0,64.0,73.0,18.0,98.0,...,,,,,,,,,No,19
2,74,Male,,,"""[\""Hypertension\"",\""Diabetes\"",\""Other\""]""",140.0,74.0,70.0,20.0,95.0,...,,,,,,,,,No,18
3,74,Male,,,"""[\""Hypertension\"",\""Diabetes\"",\""Other\""]""",154.0,78.0,77.0,18.0,95.0,...,,,,,,,,,No,17
4,74,Male,,,"""[\""Hypertension\"",\""Diabetes\"",\""Other\""]""",155.0,61.0,64.0,16.0,92.0,...,,,,,,,,,No,16


In [7]:
dropped_data = merged_data_new
threshold = 0.90

missing_percentage = dropped_data.isnull().mean()

columns_to_drop = missing_percentage[missing_percentage > threshold].index

print(f"Columns to drop due to high missing values: {list(columns_to_drop)}")

dropped_data = dropped_data.drop(columns=columns_to_drop)

print(f"Remaining columns after dropping: {dropped_data.columns}")

Columns to drop due to high missing values: ['aptt_aptr', 'pt', 'lactate', 'pao2_fio2', 'ferritin', 'crp', 'hs_crp']
Remaining columns after dropping: Index(['age', 'sex', 'height', 'weight', 'comorbidities',
       'systolic_blood_pressure', 'diastolic_blood_pressure', 'heart_rate',
       'respiratory_rate', 'oxygen_saturation', 'temperature', 'wbc', 'rbc',
       'hemoglobin', 'hematocrit', 'mcv', 'mch', 'mchc', 'rdw',
       'platelet_count', 'alt', 'ast', 'serum_creatinine', 'sodium',
       'potassium', 'total_serum_bilirubin', 'pao2', 'ph', 'd_dimer',
       'intubated', 'adjusted_hlos'],
      dtype='object')


In [8]:
dropped_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2895 entries, 0 to 4059
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       2895 non-null   int64  
 1   sex                       2895 non-null   object 
 2   height                    1691 non-null   float64
 3   weight                    2182 non-null   float64
 4   comorbidities             2895 non-null   object 
 5   systolic_blood_pressure   2872 non-null   float64
 6   diastolic_blood_pressure  2872 non-null   float64
 7   heart_rate                2888 non-null   float64
 8   respiratory_rate          2887 non-null   float64
 9   oxygen_saturation         2887 non-null   float64
 10  temperature               2888 non-null   float64
 11  wbc                       1655 non-null   float64
 12  rbc                       1655 non-null   float64
 13  hemoglobin                1655 non-null   float64
 14  hematocrit   

In [9]:
dropped_data = dropped_data[dropped_data['adjusted_hlos']>=0]

In [10]:
dropped_data['d_dimer'] = pd.to_numeric(dropped_data['d_dimer'], errors='coerce')

In [11]:
dropped_data['sex'] = dropped_data['sex'].map({'Male': 1, 'Female': 0})

# Convert 'intubated' column: map yes to 1, no to 0
dropped_data['intubated'] = dropped_data['intubated'].map({'Yes': 1, 'No': 0})

# Check if the conversion was successful
dropped_data[['sex', 'intubated']].head()

Unnamed: 0,sex,intubated
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [12]:
dropped_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2895 entries, 0 to 4059
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       2895 non-null   int64  
 1   sex                       2895 non-null   int64  
 2   height                    1691 non-null   float64
 3   weight                    2182 non-null   float64
 4   comorbidities             2895 non-null   object 
 5   systolic_blood_pressure   2872 non-null   float64
 6   diastolic_blood_pressure  2872 non-null   float64
 7   heart_rate                2888 non-null   float64
 8   respiratory_rate          2887 non-null   float64
 9   oxygen_saturation         2887 non-null   float64
 10  temperature               2888 non-null   float64
 11  wbc                       1655 non-null   float64
 12  rbc                       1655 non-null   float64
 13  hemoglobin                1655 non-null   float64
 14  hematocrit   

In [13]:
num_cols = dropped_data.select_dtypes(include=[np.number]).columns
dropped_data[num_cols] = dropped_data[num_cols].apply(lambda col: col.fillna(col.median()), axis=0)

# For categorical columns, fill with the most frequent value
cat_cols = dropped_data.select_dtypes(include=[object]).columns
dropped_data[cat_cols] = dropped_data[cat_cols].apply(lambda col: col.fillna(col.mode()[0]), axis=0)

In [14]:
dropped_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2895 entries, 0 to 4059
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       2895 non-null   int64  
 1   sex                       2895 non-null   int64  
 2   height                    2895 non-null   float64
 3   weight                    2895 non-null   float64
 4   comorbidities             2895 non-null   object 
 5   systolic_blood_pressure   2895 non-null   float64
 6   diastolic_blood_pressure  2895 non-null   float64
 7   heart_rate                2895 non-null   float64
 8   respiratory_rate          2895 non-null   float64
 9   oxygen_saturation         2895 non-null   float64
 10  temperature               2895 non-null   float64
 11  wbc                       2895 non-null   float64
 12  rbc                       2895 non-null   float64
 13  hemoglobin                2895 non-null   float64
 14  hematocrit   

In [15]:
# Apply label encoding to categorical columns
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
categorical_columns = ['comorbidities']
for col in categorical_columns:
    dropped_data[col] = dropped_data[col].astype(str)
    
    le = LabelEncoder()
    dropped_data[col] = le.fit_transform(dropped_data[col])
    label_encoders[col] = le  
dropped_data.head()

Unnamed: 0,age,sex,height,weight,comorbidities,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,...,ast,serum_creatinine,sodium,potassium,total_serum_bilirubin,pao2,ph,d_dimer,intubated,adjusted_hlos
0,74,1,167.6,77.0,102,119.0,54.0,79.0,18.0,94.0,...,46.0,102.0,142.0,4.1,8.0,88.0,7.4,1072.0,0,20
1,74,1,167.6,77.0,102,133.0,64.0,73.0,18.0,98.0,...,46.0,88.0,144.0,3.7,8.0,88.0,7.4,1072.0,0,19
2,74,1,167.6,77.0,102,140.0,74.0,70.0,20.0,95.0,...,46.0,73.0,142.0,4.0,8.0,88.0,7.4,1072.0,0,18
3,74,1,167.6,77.0,102,154.0,78.0,77.0,18.0,95.0,...,46.0,85.0,141.0,3.9,8.0,88.0,7.4,1072.0,0,17
4,74,1,167.6,77.0,102,155.0,61.0,64.0,16.0,92.0,...,46.0,80.5,138.0,4.0,8.0,88.0,7.4,1072.0,0,16


In [17]:
#Statistics
 
# Number of rows and columns
print(f"Number of rows (patients): {dropped_data.shape[0]}")
print(f"Number of columns (features): {dropped_data.shape[1]}")

# Mean of each column
print("Mean of each column:")
print(dropped_data.mean())

# Check for columns with all blank values
all_blank_cols = dropped_data.columns[dropped_data.isna().all()]
print(f"Columns with all blank values: {all_blank_cols}")


Number of rows (patients): 2895
Number of columns (features): 31
Mean of each column:
age                           67.559585
sex                            0.587910
height                       166.880035
weight                        79.392021
comorbidities                 89.194128
systolic_blood_pressure      127.577547
diastolic_blood_pressure      70.934370
heart_rate                    78.793782
respiratory_rate              20.868048
oxygen_saturation             94.071848
temperature                   36.803627
wbc                            8.995233
rbc                            4.352763
hemoglobin                   123.051813
hematocrit                     0.377097
mcv                           87.235682
mch                           28.499344
mchc                         325.695682
rdw                           13.869085
platelet_count               268.952332
alt                           58.645941
ast                           51.612090
serum_creatinine              93.4

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
X = dropped_data.drop(columns=['adjusted_hlos'])  # Replace 'HLOS' with your target column name
y = dropped_data['adjusted_hlos'].values

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reshape X for RNN: (samples, time_steps, features)
sequence_length = 10  # Number of time steps in each sequence
num_features = X_scaled.shape[1]  # Number of features

# Create sequences
def create_sequences(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(len(X) - seq_length):
        X_seq.append(X[i:i + seq_length])
        y_seq.append(y[i + seq_length])
    return np.array(X_seq), np.array(y_seq)

X_seq, y_seq = create_sequences(X_scaled, y, sequence_length)

# Step 2: Split into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

# Step 3: Build an RNN model
model = Sequential()
model.add(SimpleRNN(50, activation='relu', input_shape=(sequence_length, num_features)))
model.add(Dense(1))  # Output layer for regression (predicting HLOS)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Step 4: Train the RNN model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Step 5: Evaluate the model
train_loss = model.evaluate(X_train, y_train)
test_loss = model.evaluate(X_test, y_test)

print(f"Train Loss (MSE): {train_loss}")
print(f"Test Loss (MSE): {test_loss}")

# Step 6: Predictions
y_pred = model.predict(X_test)

Epoch 1/20


  super().__init__(**kwargs)


[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 221.9540 - val_loss: 139.5515
Epoch 2/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 138.7345 - val_loss: 135.5587
Epoch 3/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 137.2391 - val_loss: 131.8927
Epoch 4/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 129.5045 - val_loss: 127.6649
Epoch 5/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 117.1318 - val_loss: 124.7534
Epoch 6/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 110.8196 - val_loss: 116.3249
Epoch 7/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 101.9509 - val_loss: 113.2551
Epoch 8/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 88.7095 - val_loss: 110.0249
Epoch 9/20
[1m73/73[0m [32m━━━━━━

In [19]:
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Function to build the RNN model
def build_rnn_model(hp):
    model = Sequential()
    # Tune the number of units in the RNN layer
    hp_units = hp.Int('units', min_value=32, max_value=128, step=16)
    model.add(SimpleRNN(hp_units, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
    
    # Tune the number of Dense layers and their units
    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(Dense(units=hp.Int(f'layer_{i}_units', min_value=32, max_value=128, step=16), activation='relu'))
    
    model.add(Dense(1))  # Output layer

    # Compile the model
    model.compile(optimizer='adam', loss='mse')
    
    return model

# Initialize Keras Tuner
tuner = kt.Hyperband(
    build_rnn_model,
    objective='val_loss',
    max_epochs=20,
    factor=3,
    directory='rnn_tuning',
    project_name='rnn_model'
)


# Search for the best hyperparameters
tuner.search(X_train, y_train, epochs=20, validation_split=0.2)

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Train the best model
best_model.fit(X_train, y_train, epochs=20, validation_split=0.2)

# Evaluate on the test set
test_loss = best_model.evaluate(X_test, y_test)
print(f"Best Test MSE: {test_loss}")


Reloading Tuner from rnn_tuning\rnn_model\tuner0.json

Epoch 1/20


  saveable.load_own_variables(weights_store.get(inner_path))


[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 21.9516 - val_loss: 54.3265
Epoch 2/20
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 12.4862 - val_loss: 56.5562
Epoch 3/20
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 12.6969 - val_loss: 59.8245
Epoch 4/20
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 11.7763 - val_loss: 56.1958
Epoch 5/20
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 10.3868 - val_loss: 53.7212
Epoch 6/20
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9.4157 - val_loss: 53.6272
Epoch 7/20
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 8.1962 - val_loss: 55.5709
Epoch 8/20
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 7.7701 - val_loss: 56.1245
Epoch 9/20
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [20]:
from sklearn.model_selection import KFold
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error

# Number of folds for cross-validation
k_folds = 5

# Initialize KFold cross-validation
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# To store the results of each fold
fold_train_losses = []
fold_test_losses = []

# Loop over each fold
for train_index, test_index in kf.split(X_seq):
    # Split the data into training and testing sets for the fold
    X_train_fold, X_test_fold = X_seq[train_index], X_seq[test_index]
    y_train_fold, y_test_fold = y_seq[train_index], y_seq[test_index]
    
    # Build the RNN model for each fold
    model = Sequential()
    model.add(SimpleRNN(50, activation='relu', input_shape=(sequence_length, num_features)))
    model.add(Dense(1))  # Output layer
    
    # Compile the model
    model.compile(optimizer=Adam(), loss='mean_squared_error')

    # Train the model for the current fold
    model.fit(X_train_fold, y_train_fold, epochs=20, batch_size=32, validation_data=(X_test_fold, y_test_fold), verbose=0)
    
    # Evaluate the model on the train and test sets
    train_loss = model.evaluate(X_train_fold, y_train_fold, verbose=0)
    test_loss = model.evaluate(X_test_fold, y_test_fold, verbose=0)
    
    # Store the results for this fold
    fold_train_losses.append(train_loss)
    fold_test_losses.append(test_loss)

    print(f"Fold {len(fold_train_losses)} - Train Loss (MSE): {train_loss}, Test Loss (MSE): {test_loss}")

# Calculate the average and standard deviation for the training and testing losses across all folds
avg_train_loss = np.mean(fold_train_losses)
avg_test_loss = np.mean(fold_test_losses)
std_train_loss = np.std(fold_train_losses)
std_test_loss = np.std(fold_test_losses)

print(f"Average Train Loss (MSE): {avg_train_loss}, Std: {std_train_loss}")
print(f"Average Test Loss (MSE): {avg_test_loss}, Std: {std_test_loss}")


In [None]:
import matplotlib.pyplot as plt

# Assuming `y_test` is the true values and `y_pred` is the predicted values from your RNN model
y_pred_rnn = model.predict(X_test)

# Plot actual vs predicted values for RNN
plt.figure(figsize=(10, 6))
plt.plot(y_test, label='Actual Values', color='blue', marker='o')
plt.plot(y_pred_rnn, label='Predicted Values (RNN)', color='red', marker='x')
plt.title('RNN: Actual vs Predicted Values')
plt.xlabel('Samples')
plt.ylabel('Adjusted HLOS')
plt.legend()
plt.grid(True)
plt.show()