In [33]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow.keras import layers


In [76]:
# Create the CartPole-v1 environment. `env` is a module-level global that
# create_model(), choose_action() and main() all read directly.
env = gym.make('CartPole-v1')

# Q-network factory (TensorFlow / Keras)
def create_model():
    """Build the Q-value network.

    The network is a Keras ``Sequential`` stack of two 24-unit ReLU ``Dense``
    layers followed by a linear ``Dense`` head with one output per discrete
    action (``env.action_space.n``, read from the module-level ``env``).

    Returns:
        The (not yet built) ``tf.keras.Sequential`` model.
    """
    q_network = tf.keras.Sequential()
    for hidden_units in (24, 24):
        q_network.add(layers.Dense(hidden_units, activation='relu'))
    # Linear head: raw Q-value estimates, one per action.
    q_network.add(layers.Dense(env.action_space.n, activation='linear'))
    return q_network

# Module-level training objects shared by choose_action() and train_step().
model = create_model()
loss_function = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Epsilon-greedy policy
def choose_action(state, epsilon):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    Args:
        state: A single observation; it is wrapped in a list to form a batch
            of one before being fed to the module-level ``model``.
        epsilon: Probability threshold for exploring. A uniform draw from
            ``np.random.rand()`` below this value selects a random action.

    Returns:
        A sample from ``env.action_space`` when exploring, otherwise the
        ``np.argmax`` of the model's Q-values for ``state``.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return env.action_space.sample()

    # Greedy branch: add a leading batch axis, run the network, take the best action.
    batched_state = tf.convert_to_tensor([state], dtype=tf.float32)
    greedy_q = model(batched_state).numpy()
    return np.argmax(greedy_q)

# One Q-learning gradient step on a sampled minibatch
@tf.function
def train_step(gamma, state_batch, action_batch, reward_batch, next_state_batch, done_batch):
    """Run one optimizer step on the module-level ``model`` and return the loss.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        state_batch: Batch of observations fed to ``model``.
        action_batch: Integer action index taken in each row of ``state_batch``;
            used to pick the matching Q-value out of the model output.
        reward_batch: Reward received for each transition.
        next_state_batch: Batch of successor observations fed to ``model``.
        done_batch: Float mask; rows equal to 1.0 get no bootstrapped value
            (the target is just the reward).

    Returns:
        The scalar loss from ``loss_function`` between the targets and the
        predicted Q-values of the taken actions.
    """
    # The bootstrap target must act as a constant label. It is computed outside
    # the tape and wrapped in stop_gradient so the update only differentiates
    # through Q(state, action), not through max Q(next_state, .).
    future_q = model(next_state_batch)
    target_q = tf.stop_gradient(
        reward_batch + (1. - done_batch) * gamma * tf.reduce_max(future_q, axis=1)
    )

    with tf.GradientTape() as tape:
        q_values = model(state_batch, training=True)
        # tf.shape (dynamic) instead of .shape[0] (static) so tracing also works
        # when the batch dimension is unknown at graph-construction time.
        row_indices = tf.range(tf.shape(action_batch)[0])
        action_indices = tf.stack([row_indices, action_batch], axis=1)
        predicted_q = tf.gather_nd(q_values, indices=action_indices)

        loss = loss_function(target_q, predicted_q)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Main training loop
def main(episodes=500, gamma=0.99, epsilon=0.1, min_epsilon=0.01,
         epsilon_decay=0.995, batch_size=32):
    """Train the module-level ``model`` on ``env`` with epsilon-greedy Q-learning.

    Every environment step is appended to an in-memory replay list; once the
    list holds more than ``batch_size`` transitions, a random minibatch is
    sampled without replacement and passed to ``train_step``. After each
    episode, epsilon is multiplied by ``epsilon_decay`` (floored at
    ``min_epsilon``) and a summary line is printed.

    Args:
        episodes: Number of episodes to run.
        gamma: Discount factor forwarded to ``train_step``.
        epsilon: Initial exploration probability for ``choose_action``.
        min_epsilon: Lower bound for epsilon after decay.
        epsilon_decay: Multiplicative per-episode decay applied to epsilon.
        batch_size: Number of transitions sampled per training step.
    """
    buffer = []

    for episode in range(episodes):
        # reset() returns (observation, info); only the observation is needed.
        state, _ = env.reset()
        total_reward = 0
        done = False
        while not done:
            action = choose_action(state, epsilon)
            # step() returns (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = env.step(action)
            # The episode ends on either flag, but only a true termination
            # should zero out the bootstrapped value in the TD target, so
            # `terminated` (not `done`) is what gets stored.
            done = terminated or truncated
            buffer.append((state, action, reward, next_state, terminated))
            state = next_state
            total_reward += reward

            if len(buffer) > batch_size:
                sampled = np.random.choice(len(buffer), batch_size, replace=False)
                states, actions, rewards, next_states, terminals = zip(
                    *(buffer[i] for i in sampled)
                )

                train_step(
                    gamma,
                    tf.convert_to_tensor(np.asarray(states, dtype=np.float32), dtype=tf.float32),
                    tf.convert_to_tensor(np.asarray(actions, dtype=np.int32), dtype=tf.int32),
                    tf.convert_to_tensor(np.asarray(rewards, dtype=np.float32), dtype=tf.float32),
                    tf.convert_to_tensor(np.asarray(next_states, dtype=np.float32), dtype=tf.float32),
                    # Float mask: 1.0 where the transition terminated the episode.
                    tf.convert_to_tensor(np.asarray(terminals, dtype=np.float32), dtype=tf.float32),
                )

        epsilon = max(min_epsilon, epsilon * epsilon_decay)
        print(f"Episode {episode} - Total Reward: {total_reward}, Epsilon: {epsilon}")

# Start training when this cell/script is executed directly; main() prints
# one summary line per episode.
if __name__ == "__main__":
    main()



Episode 0 - Total Reward: 49.0, Epsilon: 0.0995
Episode 1 - Total Reward: 73.0, Epsilon: 0.09900250000000001
Episode 2 - Total Reward: 11.0, Epsilon: 0.0985074875
Episode 3 - Total Reward: 43.0, Epsilon: 0.09801495006250001
Episode 4 - Total Reward: 34.0, Epsilon: 0.09752487531218751
Episode 5 - Total Reward: 12.0, Epsilon: 0.09703725093562657
Episode 6 - Total Reward: 19.0, Epsilon: 0.09655206468094843
Episode 7 - Total Reward: 48.0, Epsilon: 0.09606930435754368
Episode 8 - Total Reward: 21.0, Epsilon: 0.09558895783575597
Episode 9 - Total Reward: 41.0, Epsilon: 0.09511101304657718
Episode 10 - Total Reward: 28.0, Epsilon: 0.09463545798134429
Episode 11 - Total Reward: 36.0, Epsilon: 0.09416228069143756
Episode 12 - Total Reward: 40.0, Epsilon: 0.09369146928798038
Episode 13 - Total Reward: 10.0, Epsilon: 0.09322301194154048
Episode 14 - Total Reward: 51.0, Epsilon: 0.09275689688183278
Episode 15 - Total Reward: 27.0, Epsilon: 0.09229311239742362
Episode 16 - Total Reward: 50.0, Epsil