In [None]:
from detectdd import config
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
from tensorflow import keras
# import tensorflow.keras

import root_config as rc
import pandas as pd
from detectdd.serializer import Serializer

from sklearn.model_selection import train_test_split

# Run the project's root configuration hook before touching any project paths.
rc.configure()

# Read the exported vitals table and show the dtypes pandas inferred for it.
vitals_path = config.out_dir / 'vitals_data-10.csv'
bp_df = pd.read_csv(vitals_path)
print(bp_df.dtypes)

# Parse both time columns to second-resolution datetimes, order the readings by
# stay, dose time and chart time, then label every row of this file has_icd = 1.
bp_df = (
    bp_df
    .assign(
        charttime=lambda frame: frame["charttime"].astype("datetime64[s]"),
        dose_b_time=lambda frame: frame["dose_b_time"].astype("datetime64[s]"),
    )
    .sort_values(by=["stay_id", "dose_b_time", "charttime"])
    .assign(has_icd=1)
)
bp_df.head(50)

In [None]:
# `Serializer` is imported at the top of the notebook, but no instance was ever
# created, so `serializer.read_bp_results(...)` raised NameError on a fresh
# kernel (Restart & Run All).
# NOTE(review): assumes Serializer() takes no required constructor arguments - confirm.
serializer = Serializer()

# Load the "10-no-icd" results and label every row of that frame has_icd = 0.
bp_df_no_icd = serializer.read_bp_results("10-no-icd")
bp_df_no_icd["has_icd"] = 0

# Only the has_icd = 1 frame is carried forward; the concat that would add the
# has_icd = 0 rows is still disabled, exactly as before.
# NOTE(review): with the concat disabled every label downstream is 1 - confirm
# this is intentional before training.
df = bp_df
# df= pd.concat([bp_df, bp_df_no_icd])
df.head(50)

In [None]:
# Make sure both time columns on the working frame are second-resolution datetimes.
for time_column in ("charttime", "dose_b_time"):
    df[time_column] = df[time_column].astype("datetime64[s]")

df.dtypes

In [None]:

## offset of each reading from its dose b time, kept as a timedelta
df["timestep"] = df["charttime"].sub(df["dose_b_time"])
## the same offset expressed in seconds, as float64 for tensor flow
seconds_from_dose = df["timestep"].dt.total_seconds()
df['timestep_float'] = seconds_from_dose.astype('float64')

In [None]:
# Keep only the columns used downstream, in a fixed order, and order the rows
# by stay, dose time and offset from the dose.
kept_columns = [
    "stay_id", "timestep", "timestep_float",
    "heart_rate", "sbp", "dbp", "mbp",
    "dose_b_time", "charttime", "has_icd",
]
row_order = ["stay_id", "dose_b_time", "timestep"]
df = df[kept_columns].sort_values(row_order)
df.head(50)

In [None]:
import pandas as pd
# Collapse the long-format readings to one row per (stay, label, dose): each
# remaining column becomes a Python list of that dose's values, in row order.
grouping_keys = ["stay_id", "has_icd", "dose_b_time"]
group_df = df[["stay_id", "dose_b_time", "timestep_float", "heart_rate", "sbp", "dbp", "mbp", "has_icd"]]
grouped_df = (
    group_df
    .groupby(grouping_keys)
    .agg(list)
    .reset_index()
)

print(grouped_df)
print(grouped_df.shape)

In [None]:
# Same object as grouped_df (no copy): the truncation below edits it in place.
final_bp = grouped_df

# Distribution of per-dose sequence lengths, printed to justify the cap chosen below.
length = pd.DataFrame({"val": [len(values) for values in final_bp["mbp"]]})
print(length.describe(percentiles=[0.01, 0.025, 0.25, 0.5, 0.75, 0.975, 0.99, 0.999]))

# Cap every per-dose list at the 99.9th-percentile length to drop extreme outliers.
max_size = 98 # p999

data_arrays = ["timestep_float", "heart_rate", "sbp", "dbp", "mbp"]
for data_array in data_arrays:
    final_bp[data_array] = final_bp[data_array].map(lambda arr: arr[:max_size])

# Longest sequence that remains after truncation.
max_sequence_length = max(map(len, final_bp["sbp"]))

In [None]:
# first fill blank values through interpolation
from scipy.interpolate import interp1d

def interpolate_row(row, horizon_seconds=12 * 60 * 60, step_seconds=10 * 60):
    """Resample one dose window's vitals onto a fixed, shared time grid.

    For each of "sbp", "mbp", "dbp" and "heart_rate" the readings in ``row`` are
    resampled at ``range(0, horizon_seconds, step_seconds)`` with nearest-up
    interpolation; grid points outside the observed range take the value of the
    closest observed reading.

    Two fixes over the previous version:

    * Missing readings (NaN) are dropped before interpolating. Previously a NaN
      reading was the "nearest" value for part of the grid and was copied into
      the output instead of being filled.
    * The grid is expressed in seconds, the same unit as ``timestep_float``
      (built with ``.dt.total_seconds()``). The old grid ``range(0, 12*60, 10)``
      therefore only covered the first 12 minutes after the dose.
      NOTE(review): 12 hours at 10-minute steps is inferred from ``12*60`` /
      ``10``; the grid still has 72 points. Confirm the intended window.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "timestep_float" plus the four vital-sign sequences, all of
        equal length.
    horizon_seconds : int
        Exclusive upper end of the grid, in seconds after the dose.
    step_seconds : int
        Spacing of the grid, in seconds.

    Returns
    -------
    The same ``row`` with "intr-<name>" arrays (one per vital sign, all NaN when
    that sign has no usable reading) and "common_timestep" (the grid) added.
    """
    time_steps = np.asarray(row["timestep_float"], dtype="float64")
    common_time_interval = range(0, horizon_seconds, step_seconds)
    grid_size = len(common_time_interval)

    for data_array in ["sbp", "mbp", "dbp", "heart_rate"]:
        readings = np.asarray(row[data_array], dtype="float64")

        # Only samples with both a timestamp and a value can anchor the interpolation.
        usable = ~(np.isnan(readings) | np.isnan(time_steps))
        usable_count = int(np.count_nonzero(usable))

        if usable_count == 0:
            # Nothing to interpolate from: keep the gap explicit.
            resampled = np.full(grid_size, np.nan)
        elif usable_count == 1:
            # A single reading is the nearest value for every grid point.
            resampled = np.full(grid_size, readings[usable][0])
        else:
            interp_func = interp1d(
                time_steps[usable],
                readings[usable],
                kind='nearest-up',
                fill_value='extrapolate',
            )
            resampled = interp_func(common_time_interval)

        row["intr-" + data_array] = resampled

    row["common_timestep"] = common_time_interval
    return row

# Resample every row of final_bp (one row per stay/label/dose) with
# interpolate_row, applied row-wise (axis=1); the result replaces final_bp.
final_bp = final_bp.apply(interpolate_row, axis=1)
final_bp.head(50)

In [None]:

# Sanity check on the first row: lengths of two raw sequences, then of the
# resampled heart rate and of the common grid.
first_row = final_bp.loc[0]
for column in ("timestep_float", "heart_rate", "intr-heart_rate", "common_timestep"):
    print(len(first_row[column]))


In [None]:
from numpy import shape
from keras_preprocessing.sequence import pad_sequences

target = 'has_icd'

# Per-dose feature columns (raw, variable-length lists) and the shared time grid.
features = ['heart_rate', 'sbp', 'dbp', 'mbp']
timesteps = final_bp['common_timestep'].tolist()

# Every padded sequence is cut or extended to the length of the common time grid.
# NOTE(review): the length comes from the resampled grid while `features` names
# the raw columns - confirm whether the "intr-" columns were meant here.
max_sequence_length = max(len(seq) for seq in timesteps)
print(max_sequence_length)

# Largest number of dose rows belonging to one stay. Iterating a groupby yields
# (key, group) tuples, so the previous len(...) over them always printed 2.
print(final_bp.groupby('stay_id').size().max())

# Organize data by stay_id: one list of padded per-dose arrays and one label per stay.
stay_ids = final_bp['stay_id'].unique()
X_sequences = []
y_sequences = []

# sort=False keeps the stays in order of first appearance, i.e. the same order
# as `stay_ids`.
for stay_id, stay_data in final_bp.groupby('stay_id', sort=False):
    stay_X = []
    for index, row in stay_data.iterrows():
        dose_features = row[features].to_numpy()

        # Pad or truncate each of the four feature sequences to the maximum length.
        # The previous code passed `dose_features_padded` here before it had been
        # assigned, which raised NameError.
        dose_features_padded = pad_sequences(
            dose_features,
            maxlen=max_sequence_length,
            dtype='float32',
            padding='post',
            truncating='post',
        )
        stay_X.append(dose_features_padded)

    X_sequences.append(stay_X)
    # The label of a stay is the target value of its first dose row.
    labels = stay_data[target].tolist()
    y_sequences.append(labels[0])


X = X_sequences
y = y_sequences
print(y)

In [None]:
# Number of samples and number of labels; the two counts should match.
for collection in (X, y):
    print(len(collection))

# Hold out 10% of the data for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)

# Size of the training split, then the container types of the inputs and labels.
print(len(X_train))
for collection in (X, y):
    print(type(collection))
y

In [None]:
print(max_sequence_length)

# Number of features per timestep: the four resampled vitals plus the time grid
# value. The previous cell declared 4 here but reshaped to 5, and declared a
# 4-D input that an LSTM cannot consume.
max_feature_dim = 5

# Binary classifier over a zero-padded sequence of shape (time, max_feature_dim).
# The time axis is left as None so stays of different total length can be
# batched once padded to a common length.
model = keras.Sequential()
model.add(keras.Input(shape=(None, max_feature_dim)))
# Timesteps whose features are all 0 (the padding) are skipped by the LSTM.
# Masking now feeds the LSTM directly: the Reshape layer that used to sit in
# between does not propagate masks (depending on the Keras version it raises or
# silently drops the mask).
model.add(keras.layers.Masking(mask_value=0.0))
model.add(keras.layers.LSTM(64))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification set-up: sigmoid output scored with binary cross-entropy,
# optimised with Adam, reporting accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Columns that make up one timestep, in the order they appear on the feature axis.
feature_columns = ['intr-heart_rate', "intr-sbp", "intr-mbp", "intr-dbp", 'common_timestep']

# Running dose number within each stay. It has to be counted on final_bp itself:
# the previous code counted rows of `df` (long format, different index), so the
# assignment aligned on unrelated index labels.
final_bp['dose_num'] = final_bp.groupby('stay_id').cumcount() + 1

# One (time, feature) array per stay: each dose contributes its resampled
# window, and a stay's doses are concatenated along the time axis in row order.
ragged_tensors = []
# sort=False iterates the stays in order of first appearance, the same order
# final_bp['stay_id'].unique() produces.
for _, group in final_bp.groupby('stay_id', sort=False):
    dose_windows = [
        np.stack(
            [np.asarray(row[column], dtype='float32') for column in feature_columns],
            axis=-1,  # features last, so each window is (time, feature)
        )
        for _, row in group.iterrows()
    ]
    ragged_tensors.append(np.concatenate(dose_windows, axis=0))

# Zero-pad every stay to the longest one so the batch is a regular 3-D tensor
# of shape (stay, time, feature).
longest_stay = max((len(sequence) for sequence in ragged_tensors), default=0)
padded = np.zeros((len(ragged_tensors), longest_stay, len(feature_columns)), dtype='float32')
for position, sequence in enumerate(ragged_tensors):
    padded[position, :len(sequence)] = sequence

# The name is kept because the fit cell refers to it; the tensor is now dense
# and zero-padded rather than ragged.
sequences_ragged = tf.constant(padded)


In [None]:
# Turn the per-stay labels into float32 tensors. `y_tensor` is built here but
# not passed to fit; `simple` is the label tensor used for training.
# X_tensor = tf.convert_to_tensor(X_train, dtype='float32')
y_tensor = tf.ragged.constant(y_sequences, dtype='float32')
simple = tf.constant(y, dtype='float32')

# Train on `sequences_ragged`; verbose=1 shows the fitting progress per epoch.
classifier = model.fit(
    sequences_ragged,
    simple,
    batch_size=100,
    epochs=50,
    verbose=1,
)

In [None]:
# Plot each recorded training metric over the epochs, one figure per metric:
# accuracy first, then loss. Each entry is (history key / y-axis label, title).
training_curves = (
    ('accuracy', 'model accuracy'),
    ('loss', 'model loss'),
)
for metric, title in training_curves:
    plt.plot(classifier.history[metric])
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.title(title)
    plt.legend(['train'], loc='upper left')
    plt.show()

In [None]:
# Inspection only: show the container type of the training split.
type(X_train)

In [None]:
# Inspection only: displays the whole training split as the cell output.
X_train