Define the Environment

In [1]:
!pip install tensorflow==2.15.0

Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.15.0)
  Downloading wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting tensorboard<2.16,>=2.15 (from tensorflow==2.15.0)
  Downloading tensorboard-2.15.2-py3-none-any.whl.metadata (1.7 kB)
Collecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow==2.15.0)
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting keras<2.16,>=2.15.0 (from tensorflow==2.15.0)
  Downloading keras-2.15.0-py3-none-any.whl.metadata (2.4 kB)
Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Sepsis Environment**

In [3]:
import gym
from gym import spaces
import numpy as np
import pandas as pd
import tensorflow as tf

# Define the columns for state features and action features
state_cols = ['SOFA']  # Example state columns
action_cols = ['MaxVaso', 'Input4H']  # Medication columns

# Function to predict medication effects based on the model
def predict_medication_effects(model, state, iv_fluid_dosage, vp_dosage, history):
    current_state = state[state_cols].values.reshape(1, -1)  # Shape: (1, 1) if 'SOFA' is the only state feature
    action = np.array([vp_dosage, iv_fluid_dosage]).reshape(1, -1)  # Shape: (1, 2)

    # Concatenate current state and action into a single input array
    model_input = np.concatenate([current_state, action], axis=1)  # Shape: (1, 3)

    # Concatenate historical cases with current input
    model_input = np.concatenate([model_input, history.reshape(1, -1)], axis=1)  # Shape: (1, 12)

    # Predict the next state changes
    state_change = model.predict(model_input)

    # Update the state with the predicted changes
    next_state = state.copy()
    next_state[state_cols] += state_change[0]

    return next_state, state_change[0][0]

# Environment Setup
class SepsisEnv(gym.Env):
    def __init__(self, dataset_path, model_path):
        super(SepsisEnv, self).__init__()
        self.dataset = pd.read_csv(dataset_path)  # Load the dataset
        self.model = tf.keras.models.load_model(model_path)  # Load the trained model
        self.current_index = 0
        self.history_size = 3  # Size of history to maintain
        self.action_history = np.array([[2, 0, 0], [3, 0, 0], [9, 0, 0]])  # Initialize with example history
        self.action_space = spaces.Box(low=np.array([0, 0]), high=np.array([1, 200]), dtype=np.float32)  # Continuous action space

        # Calculate observation space size
        self.observation_size = len(state_cols) + len(action_cols) + len(state_cols) * self.history_size
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.observation_size,), dtype=np.float32)

    def reset(self):
        self.current_index = np.random.randint(0, len(self.dataset))
        self.action_history = np.array([[2, 0, 0], [3, 0, 0], [9, 0, 0]])  # Reset to example history
        return self._get_observation()

    def _get_observation(self):
        state = self.dataset.iloc[self.current_index][state_cols].values
        current_action = self.dataset.iloc[self.current_index][action_cols].values
        recent_history = self.action_history.flatten()  # Use action history

        return np.concatenate((state, current_action, recent_history))

    def step(self, action):
        state = self.dataset.iloc[self.current_index].copy()
        next_state = state.copy()

        iv_fluid_dosage = action[1]
        vp_dosage = action[0]

        next_state, sofa_change = predict_medication_effects(self.model, next_state, iv_fluid_dosage, vp_dosage, self.action_history)

        current_sofa = self.action_history[2][0]
        next_sofa = sofa_change

        done = next_sofa >= 25 or next_sofa <= 5

        reward = self.calculate_reward(current_sofa, next_sofa)

        # Update action history with current action (vp_dosage, iv_fluid_dosage)
        self.action_history = np.roll(self.action_history, -1, axis=0)  # Remove the oldest entry
        self.action_history[-1] = [sofa_change, vp_dosage, iv_fluid_dosage]

        self.current_index += 1

        # Prepare the observation to return
        observation = self._get_observation()

        # Return observation, reward, done status, and info dictionary
        info = {'predicted_sofa_state': next_state, 'action_applied': action}
        return observation, reward, done, info

    def calculate_reward(self, current_sofa, next_sofa):
        # Immediate reward based on SOFA score change
        if next_sofa < current_sofa:
            sofa_reward = (current_sofa - next_sofa) * 2
        elif next_sofa > current_sofa:
            sofa_reward = (next_sofa - current_sofa) * -2
        else:
            sofa_reward = 1  # Small reward for maintaining the SOFA score

        # Terminal reward based on episode end and SOFA score
        if next_sofa <= 5:
            terminal_reward = 20  # High reward for achieving a low SOFA score at the end
        else:
            terminal_reward = -10  # Penalty for high SOFA score at the end

        return sofa_reward + terminal_reward


**DDPG Agent**

In [4]:
import numpy as np
import random
from tensorflow.keras.layers import Dense, Input, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# DDPG Agent Class
class DDPGAgent:
    def __init__(self, env, state_dim, action_dim, action_high):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_high = action_high
        self.action_low = 0

        self.actor = self.build_actor()
        self.actor_target = self.build_actor()
        self.actor_target.set_weights(self.actor.get_weights())

        self.critic = self.build_critic()
        self.critic_target = self.build_critic()
        self.critic_target.set_weights(self.critic.get_weights())

        self.actor_optimizer = Adam(learning_rate=0.001)
        self.critic_optimizer = Adam(learning_rate=0.002)

        self.replay_buffer = ReplayBuffer()

    def build_actor(self):
        state_input = Input(shape=(self.state_dim,))
        x = Dense(32, activation='relu')(state_input)
        x = Dense(16, activation='relu')(x)
        action_output = Dense(self.action_dim, activation='tanh')(x)
        scaled_output = tf.multiply(action_output, self.action_high)
        model = Model(inputs=state_input, outputs=scaled_output)
        return model

    def build_critic(self):
        state_input = Input(shape=(self.state_dim,))
        action_input = Input(shape=(self.action_dim,))
        x = concatenate([state_input, action_input])
        x = Dense(32, activation='relu')(x)
        x = Dense(16, activation='relu')(x)
        q_value = Dense(1)(x)
        model = Model(inputs=[state_input, action_input], outputs=q_value)
        return model

    def update_target_networks(self, tau=0.005):
        self.update_target(self.actor_target.variables, self.actor.variables, tau)
        self.update_target(self.critic_target.variables, self.critic.variables, tau)

    def update_target(self, target_vars, source_vars, tau):
        for target_var, source_var in zip(target_vars, source_vars):
            target_var.assign(tau * source_var + (1 - tau) * target_var)
    def get_action(self, state):
        state = np.reshape(state, [2, self.state_dim])
        return self.actor.predict(state)[0]


    def train(self, batch_size):
        if len(self.replay_buffer) < batch_size:
            return

        minibatch = self.replay_buffer.sample(batch_size)


    # Convert all elements to numpy arrays and ensure correct shapes
        states = np.array([np.array(m[0]).reshape(-1) for m in minibatch])
        actions = np.array([np.array(m[1]).reshape(-1) for m in minibatch])
        rewards = np.array([np.array(m[2]).reshape(1) for m in minibatch])
        next_states = np.array([np.array(m[3]).reshape(-1) for m in minibatch])
        dones = np.array([np.array(m[4]).reshape(1) for m in minibatch])

    # Ensure correct final shapes
        states = states.reshape(batch_size, -1)  # (batch_size, state_dim)
        actions = actions.reshape(batch_size, -1)  # (batch_size, action_dim)
        rewards = rewards.reshape(batch_size, 1)  # (batch_size, 1)
        next_states = next_states.reshape(batch_size, -1)  # (batch_size, state_dim)
        dones = dones.reshape(batch_size, 1)  # (batch_size, 1)

    # Predict the next actions and Q-values
        next_actions = self.actor_target.predict(next_states)
        q_values = self.critic_target.predict([next_states, next_actions])

    # Compute the target Q-values
        targets = rewards + (1 - dones) * 0.99 * q_values  #Bell man Equation,here Discount factor is 0.99

    # Debugging print statements to check final shapes before training
        print(f"States shape: {states.shape}")
        print(f"Actions shape: {actions.shape}")
        print(f"Rewards shape: {rewards.shape}")
        print(f"Next states shape: {next_states.shape}")
        print(f"Dones shape: {dones.shape}")
        print(f"Next actions shape: {next_actions.shape}")
        print(f"Q-values shape: {q_values.shape}")
        print(f"Targets shape: {targets.shape}")

    # Train the critic network
        self.critic.train_on_batch([states, actions], targets)

        with tf.GradientTape() as tape:
            predicted_actions = self.actor(states)
            critic_value = self.critic([states, predicted_actions])
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))

        self.update_target_networks()

class ReplayBuffer:
    def __init__(self, buffer_size=10000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, state, action, reward, next_state, done):
        if len(self.buffer) >= self.buffer_size:
            self.buffer.pop(0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)


Training loop and Plotting

In [5]:
import matplotlib.pyplot as plt
import numpy as np

# Assuming you have defined the SepsisEnv and DDPGAgent classes
# and they are imported or defined elsewhere in your code.

# Load dataset and model path
dataset_path = "/content/drive/MyDrive/RL project/Dataset.csv"
model_path = "/content/drive/MyDrive/RL project/predict_state_model.keras"

# Initialize environment and DDPG agent
env = SepsisEnv(dataset_path, model_path)
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
action_bound = 1  # Define according to your action space scaling

agent = DDPGAgent(env,state_size, action_size, action_bound)


# Train the agent
batch_size = 32
episodes = 170  # Use the number of rows in the subset as episodes
rewards = []
avg_rewards = []
print('state_size',state_size)

for episode in range(episodes):
    state = env.reset()
    state = np.reshape(state, [2, state_size])
    episode_reward = 0

    for time in range(40):
        action = agent.get_action(state)
        discrete_action = np.argmax(action)  # Assuming 'action' is a probability distribution over actions
        next_state, reward, done, info = env.step(action)
         # Reshape next_state to be consistent with state
        # next_state = np.reshape(next_state, (1, state_size))
        agent.replay_buffer.add(state[0], action, reward, next_state[0], done)
        state = next_state



        if done:
            print(f"episode: {episode}/{episodes}, score: {time}, e: {agent.actor_optimizer.learning_rate.numpy():.2}")
            break
        if len(agent.replay_buffer.buffer) > batch_size:
            agent.train(batch_size)

        state = next_state
        episode_reward += reward

        if done:
            break

    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-10:]))

    print(f"Episode {episode}: Reward = {episode_reward}, Avg Reward = {np.mean(rewards[-10:])}")

# Plotting
plt.plot(rewards)
plt.plot(avg_rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('DDPG Training Rewards')
plt.legend(['Reward', 'Avg Reward (last 10 episodes)'])
plt.show()

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


state_size 6
episode: 0/170, score: 0, e: 0.001
Episode 0: Reward = 0, Avg Reward = 0.0
episode: 1/170, score: 0, e: 0.001
Episode 1: Reward = 0, Avg Reward = 0.0
episode: 2/170, score: 0, e: 0.001
Episode 2: Reward = 0, Avg Reward = 0.0
episode: 3/170, score: 1, e: 0.001
Episode 3: Reward = -2.0566234588623047, Avg Reward = -0.5141558647155762
episode: 4/170, score: 1, e: 0.001
Episode 4: Reward = -2.436351776123047, Avg Reward = -0.8985950469970703
episode: 5/170, score: 0, e: 0.001
Episode 5: Reward = 0, Avg Reward = -0.748829205830892
episode: 6/170, score: 1, e: 0.001
Episode 6: Reward = -2.0666255950927734, Avg Reward = -0.9370858328683036
episode: 7/170, score: 0, e: 0.001
Episode 7: Reward = 0, Avg Reward = -0.8199501037597656
episode: 8/170, score: 0, e: 0.001
Episode 8: Reward = 0, Avg Reward = -0.7288445366753472
episode: 9/170, score: 1, e: 0.001
Episode 9: Reward = -2.273861885070801, Avg Reward = -0.8833462715148925
episode: 10/170, score: 1, e: 0.001
Episode 10: Reward =

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (32,) + inhomogeneous part.