In [9]:
import gym
import warnings
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import random
import os
import tensorflow as tf

# Suppress all DeprecationWarning messages process-wide; the filter is global,
# so it also stays active for every cell executed after this one.
warnings.filterwarnings("ignore", category=DeprecationWarning)

def check_gpu_availability():
    """Print whether TensorFlow can see at least one GPU.

    When one or more GPU devices are reported, each device is printed on its
    own line. Nothing is returned; the function only writes to stdout.
    """
    detected = tf.config.list_physical_devices('GPU')

    # Guard clause: nothing to enumerate when TensorFlow reports no GPU.
    if not detected:
        print("GPU is not available")
        return

    print("GPU is available")
    for device in detected:
        print(f"GPU device: {device}")

# Report GPU visibility once when this cell runs; the call only prints and
# returns nothing.
check_gpu_availability()

GPU is available
GPU device: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
GPU device: PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')


In [2]:
# Build Q-Network
def build_q_network(input_shape=(4,), num_actions=2):
    """Build the fully connected network that maps a state to per-action Q-values.

    Args:
        input_shape: Shape of a single state vector, without the batch axis.
        num_actions: Number of output units, one Q-value per action.

    Returns:
        An uncompiled ``tf.keras.Sequential`` with two 32-unit ReLU hidden
        layers and a linear output layer of ``num_actions`` units.
    """
    model = tf.keras.Sequential([
        # Only the first layer declares the input shape; later layers infer
        # theirs from the previous layer's output.
        tf.keras.layers.Dense(32, activation='relu', input_shape=input_shape),
        tf.keras.layers.Dense(32, activation='relu'),
        # No activation: Q-values are unbounded regression outputs.
        tf.keras.layers.Dense(num_actions)
    ])
    return model

In [3]:
def adjust_weights(model, target_model, optimizer, states, actions, rewards, next_states, dones, discount_factor):
    """Run one Q-learning gradient step on a mini-batch of transitions.

    The regression target for each transition is
    ``reward + (1 - done) * discount_factor * max_a Q_target(next_state, a)``
    and the loss is the mean squared error between that target and the online
    network's Q-value for the action that was actually taken.

    Args:
        model: Online Q-network whose trainable variables are updated.
        target_model: Network used to evaluate the next-state Q-values.
        optimizer: Optimizer exposing ``apply_gradients``.
        states: Batch of states; converted to a float32 tensor.
        actions: Batch of action indices; converted to an int32 tensor.
        rewards: Batch of rewards; converted to a float32 tensor.
        next_states: Batch of successor states; converted to a float32 tensor.
        dones: Batch of 0/1 flags; 1 removes the bootstrapped term.
        discount_factor: Multiplier applied to the next-state value.

    Returns:
        None. The loss is printed to stdout and the model is updated in place.
    """
    # Convert to TensorFlow tensors and ensure float32 type for compatibility with TensorFlow
    states = tf.convert_to_tensor(states, dtype=tf.float32)
    next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
    dones = tf.convert_to_tensor(dones, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.int32)

    # The targets are treated as constants: gradients are only taken with
    # respect to `model`, so the target-network pass is kept off the tape.
    next_q_values = target_model(next_states)
    max_next_q_values = tf.reduce_max(next_q_values, axis=1)
    target_q_values = rewards + (1 - dones) * discount_factor * max_next_q_values

    with tf.GradientTape() as tape:
        q_values = model(states)

        # One-hot mask selecting the Q-value of the action taken. The depth is
        # read from the network output instead of being hard-coded, so the
        # function works for any number of actions.
        num_actions = q_values.shape[-1]
        mask = tf.one_hot(actions, num_actions, dtype=tf.float32)
        predicted_q_values = tf.reduce_sum(q_values * mask, axis=1)

        # Mean squared error between target and predicted Q-values
        loss = tf.reduce_mean(tf.square(target_q_values - predicted_q_values))

    print(f"loss: {loss}")

    # Compute the gradients and apply them to update the online network
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [4]:
def select_action(model, state, epsilon):
    """Choose an action with an epsilon-greedy policy.

    A uniform random draw is compared against ``epsilon``. Below it, one of
    the two actions (0 or 1) is picked at random; otherwise the action with
    the highest Q-value predicted by ``model`` for ``state`` is returned.
    """
    explore = random.random() < epsilon
    if not explore:
        # Add a leading batch axis for the model, then drop it from the output.
        batched_state = tf.expand_dims(state, axis=0)
        greedy_action = tf.argmax(model(batched_state)[0])
        return greedy_action.numpy()
    return random.choice([0, 1])

In [5]:
def plot_rewards(episode_rewards, window_size=100):
    """Plot per-episode rewards and save the figure to the artifacts folder.

    Args:
        episode_rewards: Sequence of total rewards, one entry per episode.
        window_size: Length of the moving-average window. The moving average
            is drawn only once at least ``window_size`` episodes exist.

    Returns:
        None. The figure is written to ``artifacts/CartPole_Progress.png``
        (overwriting any previous file) and then closed.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(episode_rewards, label='Episode rewards')
        if len(episode_rewards) >= window_size:
            moving_average = np.convolve(episode_rewards, np.ones(window_size)/window_size, mode='valid')
            # 'valid' mode yields one value per full window, so the first
            # point belongs to episode index window_size - 1.
            ax.plot(range(window_size-1, len(episode_rewards)), moving_average, label=f'{window_size}-episode moving average')
        ax.set_xlabel('Episodes')
        ax.set_ylabel('Total Reward')
        ax.legend()

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs('artifacts', exist_ok=True)
        fig.savefig('artifacts/CartPole_Progress.png')
    finally:
        # Always release the figure, even when plotting or saving fails;
        # this function is called repeatedly during training.
        plt.close(fig)

In [6]:
def CartPole_RL():
    """Train a Q-network on CartPole-v0 until the moving-average reward target is met.

    Each iteration of the outer loop plays one episode with epsilon-greedy
    actions, stores every transition in a bounded replay buffer, decays
    epsilon, and then performs at most one gradient step on a random
    mini-batch. The target network is synchronised with the online network
    every ``update_target_every`` episodes.

    The loop ends when the mean reward over the last ``window`` episodes
    exceeds ``moving_num``; the model is then saved to
    ``saved_model/cartpole_model``. Independently, after 50 consecutive
    episodes with a reward of 200 the gradient steps are switched off and the
    model is saved, but episodes keep running until the moving-average
    criterion is reached.

    Side effects: prints progress to stdout, writes a reward plot through
    ``plot_rewards`` every 25 episodes and once more after training, and
    saves a checkpoint to ``saved_model/cartpole_model_{episode}`` every 100
    episodes. The environment is closed even if training is interrupted.

    Returns:
        None.
    """
    # Hyperparameters and schedule constants, set once before the loop.
    discount_factor = 0.95
    batch_size = 200
    buffer_size = 10000
    epsilon = 1.0
    epsilon_decay = 0.997
    epsilon_min = 0.01
    moving_num, window = 195, 100   # stop when the `window`-episode mean exceeds `moving_num`
    update_target_every = 25        # episodes between target-network syncs

    episode_rewards = []

    # Pass render_mode="human" to gym.make to watch the agent while it trains.
    env = gym.make("CartPole-v0")
    try:
        # Initialize optimizer, online network and an identical target network
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        model = build_q_network()
        target_model = build_q_network()
        target_model.set_weights(model.get_weights())

        # Training control variables
        replay_buffer = []
        episode = 0
        consecutive_200_rewards = 0
        should_train = True

        # Training loop
        while True:
            observation, info = env.reset()
            reward_history = []
            terminated = False
            truncated = False

            # Episode loop
            while not terminated and not truncated:
                # Epsilon-greedy action selection
                action = select_action(model, observation, epsilon)

                # Step to next environment
                next_observation, reward, terminated, truncated, info = env.step(action)
                reward_history.append(reward)

                # The done flag is set only for `terminated`; an episode that
                # ends through `truncated` is stored with done_flag 0.
                done_flag = 1 if terminated else 0
                replay_buffer.append((observation, action, reward, next_observation, done_flag))
                if len(replay_buffer) > buffer_size:
                    replay_buffer.pop(0)  # Remove the oldest experience if the buffer is full

                observation = next_observation

            # Post-episode updates
            total_reward = sum(reward_history)
            episode_rewards.append(total_reward)

            # Print episode rewards; the moving average exists once `window` episodes are recorded
            if episode >= window-1:
                moving_avg = np.mean(episode_rewards[-window:])
                print(f"CartPole-v0 episode {episode}, reward sum: {total_reward}, last {window} avg: {moving_avg:.2f}")

                if moving_avg > moving_num:
                    print(f"Stopping as the last {window}-episode moving average is greater than {moving_num}")
                    os.makedirs("saved_model", exist_ok=True)
                    model.save("saved_model/cartpole_model")
                    break
            else:
                print(f"CartPole-v0 episode {episode}, reward sum: {total_reward}")

            # Plot and save functionality
            if episode % 25 == 0:
                plot_rewards(episode_rewards)
            if episode % 100 == 0:
                model.save(f"saved_model/cartpole_model_{episode}")

            # Training stop
            if total_reward == 200:  # Check for consecutive rewards of 200 (Max for CartPole)
                consecutive_200_rewards += 1
                if consecutive_200_rewards >= 50 and should_train:
                    print("Stopping training as the reward has been 200 for 50 episodes in a row")
                    should_train = False  # Gradient steps stop; episodes continue

                    os.makedirs("saved_model", exist_ok=True)
                    model.save("saved_model/cartpole_model")
            else:
                consecutive_200_rewards = 0  # Reset the counter if the reward is not 200

            # Decay the exploration rate, never dropping below epsilon_min
            epsilon = max(epsilon_min, epsilon * epsilon_decay)
            print(f"Epsilon: {epsilon}")

            # One gradient step per episode, once enough data exists and training is still enabled
            if should_train and len(replay_buffer) >= batch_size:
                mini_batch = random.sample(replay_buffer, batch_size)
                states, actions, rewards, next_states, dones = zip(*mini_batch)
                adjust_weights(model, target_model, optimizer, states, actions, rewards, next_states, dones, discount_factor)

            # Update the target network if the episode number is a multiple of update_target_every
            if episode % update_target_every == 0:
                target_model.set_weights(model.get_weights())

            episode += 1
    finally:
        # Release the environment on every exit path, including interrupts.
        env.close()

    plot_rewards(episode_rewards)

In [7]:
# Start training; the call returns only after the stopping criterion inside
# CartPole_RL is met, and it prints per-episode progress while it runs.
CartPole_RL()

  logger.warn(


CartPole-v0 episode 0, reward sum: 19.0
INFO:tensorflow:Assets written to: saved_model/cartpole_model_0\assets
Epsilon: 0.997
CartPole-v0 episode 1, reward sum: 12.0
Epsilon: 0.994009
CartPole-v0 episode 2, reward sum: 39.0
Epsilon: 0.991026973
CartPole-v0 episode 3, reward sum: 26.0
Epsilon: 0.988053892081
CartPole-v0 episode 4, reward sum: 13.0
Epsilon: 0.985089730404757
CartPole-v0 episode 5, reward sum: 24.0
Epsilon: 0.9821344612135428
CartPole-v0 episode 6, reward sum: 41.0
Epsilon: 0.9791880578299021
CartPole-v0 episode 7, reward sum: 18.0
Epsilon: 0.9762504936564125
CartPole-v0 episode 8, reward sum: 20.0
Epsilon: 0.9733217421754432
loss: 1.0309171676635742
CartPole-v0 episode 9, reward sum: 20.0
Epsilon: 0.9704017769489168
loss: 0.9851466417312622
CartPole-v0 episode 10, reward sum: 15.0
Epsilon: 0.9674905716180701
loss: 0.9452403783798218
CartPole-v0 episode 11, reward sum: 33.0
Epsilon: 0.9645880999032158
loss: 0.9057417511940002
CartPole-v0 episode 12, reward sum: 19.0
Epsil