In [1]:
import gym
import warnings
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Suppress DeprecationWarnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow as tf

# Ask TensorFlow which GPU devices it can see.
gpus = tf.config.list_physical_devices('GPU')

# Report the result: a single notice when nothing was found, otherwise a
# header line followed by one line per detected device.
if not gpus:
    print("GPU is not available")
else:
    print("GPU is available")
    for gpu in gpus:
        print(f"GPU device: {gpu}")

2023-10-18 17:42:39.620823: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-18 17:42:41.964994: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


GPU is available
GPU device: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
GPU device: PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')
GPU device: PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')
GPU device: PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')
GPU device: PhysicalDevice(name='/physical_device:GPU:4', device_type='GPU')


In [4]:
# Question 1: CartPole-v0
# `env` and `optimizer` are module-level objects that CartPole_RL reads directly.
env = gym.make("CartPole-v0")  # add render_mode="human" here to watch the episodes

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)


def CartPole_RL(model, discount_factor=0.95, num_choices=2, num_episodes=1000):
    """Train `model` on CartPole with a REINFORCE-style policy gradient.

    Each episode is rolled out by sampling actions from the model's output
    distribution. Discounted returns are then computed for every step and a
    single optimizer step is taken on ``mean(-log p(a_t | s_t) * G_t)``.
    The per-episode total reward is printed as training progresses and
    plotted once all episodes are done.

    This function reads the module-level ``env`` and ``optimizer``. ``env`` is
    closed after the last episode.

    Args:
        model: Keras model mapping a batch of observations to a batch of
            action probabilities (one row per observation).
        discount_factor: Multiplier applied to future rewards when computing
            the return of each step.
        num_choices: Number of discrete actions to sample from; must match
            the width of the model's output.
        num_episodes: Number of episodes to train for. Defaults to 1000, the
            value that used to be hard-coded.
    """

    def discount_rewards(reward_history):
        # Accumulate from the last step backwards, then restore time order.
        # append + reverse keeps this linear; insert(0, ...) is quadratic.
        discounted = []
        cumulative_reward = 0
        for reward in reversed(reward_history):
            cumulative_reward = reward + discount_factor * cumulative_reward
            discounted.append(cumulative_reward)
        discounted.reverse()
        return discounted

    def adjust_weights(obs_history, action_history, discounted_rewards):
        # (step, action) index pairs used to pick the probability of the
        # action that was actually taken at every step.
        step_index = tf.range(len(action_history), dtype=tf.int32)
        taken = tf.convert_to_tensor(action_history, dtype=tf.int32)
        indices = tf.stack([step_index, taken], axis=1)

        with tf.GradientTape() as tape:
            action_probabilities = model(np.array(obs_history))
            chosen_action_probs = tf.gather_nd(action_probabilities, indices)

            # Clip before the log so a saturated softmax cannot produce
            # log(0) = -inf and poison the weights with NaNs.
            safe_probs = tf.clip_by_value(chosen_action_probs, 1e-8, 1.0)
            returns = tf.cast(discounted_rewards, safe_probs.dtype)
            loss = tf.reduce_mean(-tf.math.log(safe_probs) * returns)

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Saved for the reward plot at the end.
    episode_rewards = []

    for episode in range(num_episodes):
        observation, info = env.reset()

        obs_history = []
        reward_history = []
        action_history = []

        terminated = False
        truncated = False

        # Roll out one episode.
        while not (terminated or truncated):
            # Call the model directly for this single observation;
            # Model.predict carries per-call overhead intended for large
            # batches and is slow inside a step loop.
            probabilities = np.asarray(
                model(np.array([observation]), training=False)
            )[0]

            # Stochastically sample an action from the policy distribution.
            action = np.random.choice(num_choices, p=probabilities)

            obs_history.append(observation)
            action_history.append(action)

            observation, reward, terminated, truncated, info = env.step(action)
            reward_history.append(reward)

        discounted_rewards = discount_rewards(reward_history)
        adjust_weights(
            obs_history=obs_history,
            action_history=action_history,
            discounted_rewards=discounted_rewards,
        )

        total_reward = sum(reward_history)
        episode_rewards.append(total_reward)
        print(f"CartPole-v0 episode {episode}, reward sum: {total_reward}")

    env.close()

    plt.plot(episode_rewards)
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.show()

In [None]:
# Policy network for CartPole: 4 observation features in, a probability for
# each of the 2 actions out.
policy_layers = [
    tf.keras.layers.Dense(256, activation="relu", input_shape=(4,)),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(2, activation="softmax"),
]
model = tf.keras.Sequential(policy_layers)

# Kick off training with the default hyperparameters.
CartPole_RL(model)

### Question 2 (Pong)

In [1]:
import gym
import os
import warnings
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import time  # Debug

# Suppress DeprecationWarnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

def check_gpu_availability():
    """Report the GPUs TensorFlow can see and restrict it to the first one.

    Prints one line per detected GPU, enables memory growth on each of them,
    and then makes only the first GPU (index 0) visible to TensorFlow.
    Prints a notice instead when no GPU is found.
    """
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        print("GPU is not available", flush=True)
        return

    print("GPU is available", flush=True)
    for gpu in gpus:
        print(f"GPU device: {gpu}", flush=True)

    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Restrict TensorFlow to the first GPU through its runtime API.
        # NOTE(review): assigning CUDA_VISIBLE_DEVICES below, after the
        # devices have already been listed, is not expected to change what
        # this process sees - confirm.
        tf.config.set_visible_devices(gpus[0], 'GPU')
    except RuntimeError as err:
        # TensorFlow refuses to change device settings once the GPUs have
        # been initialized (e.g. when this cell is re-run after training).
        print(f"Could not reconfigure GPUs: {err}", flush=True)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # First GPU (index 0)
    # Filter out INFO and WARNING log lines.
    # NOTE(review): usually has to be set before `import tensorflow` to
    # affect this process - confirm.
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

check_gpu_availability()

GPU is available
GPU device: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [3]:
import gym
import os
import warnings
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import time  # Debug

# Suppress DeprecationWarnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

def check_gpu_availability():
    """Report the GPUs TensorFlow can see and restrict it to the first one.

    Prints one line per detected GPU, enables memory growth on each of them,
    and then makes only the first GPU (index 0) visible to TensorFlow.
    Prints a notice instead when no GPU is found.
    """
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        print("GPU is not available", flush=True)
        return

    print("GPU is available", flush=True)
    for gpu in gpus:
        print(f"GPU device: {gpu}", flush=True)

    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Restrict TensorFlow to the first GPU through its runtime API.
        # NOTE(review): assigning CUDA_VISIBLE_DEVICES below, after the
        # devices have already been listed, is not expected to change what
        # this process sees - confirm.
        tf.config.set_visible_devices(gpus[0], 'GPU')
    except RuntimeError as err:
        # TensorFlow refuses to change device settings once the GPUs have
        # been initialized (e.g. when this cell is re-run after training).
        print(f"Could not reconfigure GPUs: {err}", flush=True)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # First GPU (index 0)
    # Filter out INFO and WARNING log lines.
    # NOTE(review): usually has to be set before `import tensorflow` to
    # affect this process - confirm.
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

check_gpu_availability()

def preprocess(image, downsample_factor=2):
    """Turn a raw Pong frame into a binary, single-channel float tensor.

    The frame is cropped to rows 35..194, subsampled by `downsample_factor`
    along both spatial axes (keeping channel 0 only) and binarized: pixels
    equal to 0, 144 or 109 (the two background values) become 0.0 and every
    other pixel (paddles, ball) becomes 1.0.

    The input array is left untouched; the result is computed into fresh
    arrays instead of writing through views of the caller's frame.

    Args:
        image: Frame indexable as ``[row, column, channel]``. A 210x160x3
            frame with the default factor yields an 80x80x1 result.
        downsample_factor: Step used when subsampling rows and columns. The
            output shape follows from it rather than being fixed at 80x80.

    Returns:
        A float32 ``tf.Tensor`` of shape ``(rows, cols, 1)``.
    """
    # Crop and subsample in one slicing step (equivalent to cropping first).
    cropped = np.asarray(image)[35:195:downsample_factor, ::downsample_factor, 0]

    # Foreground = anything that is neither already 0 nor a background value.
    foreground = (cropped != 0) & (cropped != 144) & (cropped != 109)

    return tf.convert_to_tensor(foreground[..., np.newaxis].astype(np.float32))


def build_model(input_shape=(80, 80, 1), num_choices=2, reg=0.0001):
    """Build the policy network: flatten -> 256 ReLU units -> softmax.

    Args:
        input_shape: Shape of a single observation fed to the network.
        num_choices: Width of the softmax output layer.
        reg: Accepted but not used anywhere in this function.

    Returns:
        The assembled ``tf.keras.Sequential`` model.
    """
    hidden_units = 256

    policy = tf.keras.Sequential()
    # The input shape is declared on the first layer.
    policy.add(tf.keras.layers.Flatten(input_shape=input_shape))
    policy.add(tf.keras.layers.Dense(hidden_units, activation='relu'))
    policy.add(tf.keras.layers.Dense(num_choices, activation="softmax"))
    return policy


def select_action(model, observation):
    """Sample an action index from the policy's output for one observation.

    Args:
        model: Callable returning a batch of action-probability rows.
        observation: A single observation; a leading batch axis is added
            before it is passed to the model.

    Returns:
        The sampled index, in ``range(len(probabilities))``.
    """
    batch = tf.expand_dims(observation, axis=0)
    probabilities = model(batch)[0].numpy()

    # Sample over however many actions the model outputs instead of a fixed
    # [0, 1], so models built with a different num_choices work too.
    action_idx = np.random.choice(len(probabilities), p=probabilities)
    return action_idx


def compute_discounted_rewards(reward_history, discount_factor=0.99):
    """Compute normalized discounted returns for one episode.

    The return of step ``t`` is ``r_t + discount_factor * return_{t+1}``,
    accumulated from the last step backwards. The returns are then shifted
    to zero mean and scaled to (approximately) unit standard deviation.

    Args:
        reward_history: Sequence of per-step rewards in time order.
        discount_factor: Multiplier applied to the return of the next step.

    Returns:
        A 1-D float64 NumPy array with one normalized return per step. An
        empty input yields an empty array.
    """
    rewards = np.asarray(reward_history, dtype=np.float64)
    if rewards.size == 0:
        # Nothing to normalize; avoids NumPy's mean-of-empty warnings.
        return np.zeros(0, dtype=np.float64)

    # Fill a preallocated array back-to-front: linear time, whereas
    # list.insert(0, ...) per step is quadratic in the episode length.
    returns = np.empty_like(rewards)
    cumulative_reward = 0.0
    for step in range(rewards.size - 1, -1, -1):
        cumulative_reward = rewards[step] + discount_factor * cumulative_reward
        returns[step] = cumulative_reward

    # Epsilon keeps the division finite when every return is identical.
    return (returns - returns.mean()) / (returns.std() + 1e-8)

def adjust_weights(model, optimizer, obs_history, action_history, discounted_rewards):
    """Take one policy-gradient step on the data of a finished episode.

    The loss is ``sum(-log p(a_t | s_t) * G_t)`` over all steps, where
    ``p(a_t | s_t)`` is the probability the model assigns to the action
    taken at step ``t`` and ``G_t`` is the matching entry of
    `discounted_rewards`. The loss value is printed on every call.

    Args:
        model: Model mapping a batch of observations to action probabilities.
        optimizer: Optimizer used to apply the gradients to the model.
        obs_history: Per-step observations, in time order.
        action_history: Per-step action indices, aligned with `obs_history`.
        discounted_rewards: Per-step returns, aligned with `obs_history`.
    """
    if len(action_history) == 0:
        # No steps were recorded, so there is nothing to learn from.
        return

    discounted_rewards = tf.convert_to_tensor(discounted_rewards, dtype=tf.float32)

    # (step, action) pairs used to pick the probability of the action that
    # was taken at every step.
    indices = tf.stack(
        [
            tf.range(len(action_history), dtype=tf.int32),
            tf.convert_to_tensor(action_history, dtype=tf.int32),
        ],
        axis=1,
    )

    with tf.GradientTape() as tape:
        probs = model(tf.convert_to_tensor(obs_history, dtype=tf.float32))
        chosen_probs = tf.gather_nd(probs, indices)

        # Clip before the log: a saturated softmax can output exactly 0,
        # and log(0) = -inf would turn the gradients (and weights) into NaN.
        safe_probs = tf.clip_by_value(chosen_probs, 1e-8, 1.0)

        loss = -tf.math.log(safe_probs) * discounted_rewards
        loss = tf.reduce_sum(loss)

    print(f"Loss: {loss}")

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))



def save_plot(episode_rewards, episode):
    """Save a reward-curve plot to ``artifacts/pong_rewards_<episode>.png``.

    The plot shows the raw per-episode rewards together with their trailing
    100-episode moving average (shorter at the start, where fewer episodes
    are available).

    Args:
        episode_rewards: Total reward of every episode so far, in order.
        episode: Episode number used in the output file name.
    """
    window = 100  # Size of the window for calculating moving average
    moving_avgs = [
        np.mean(episode_rewards[max(0, i - window + 1):i + 1])
        for i in range(len(episode_rewards))
    ]

    # Create the output directory on demand; without this the first call
    # fails with FileNotFoundError when artifacts/ does not exist yet.
    os.makedirs('artifacts', exist_ok=True)

    # Use an explicit figure so nothing is drawn onto (or closed from) a
    # figure that some other cell left open.
    fig, ax = plt.subplots()
    ax.plot(episode_rewards, label='Total Reward')
    ax.plot(moving_avgs, label=f'{window}-Episode Moving Average')
    ax.set_xlabel("Episode")
    ax.set_ylabel("Total Reward")
    ax.legend()
    fig.savefig(f'artifacts/pong_rewards_{episode}.png')
    plt.close(fig)

def Pong_RL():
    """Train a policy-gradient agent on Pong-v0 until it wins on average.

    Every episode is played by sampling actions from the policy. Its rewards
    are then converted to normalized discounted returns and one gradient
    step is applied. The loop ends once the mean total reward of the last
    100 episodes is greater than 0. Weight updates (but not the loop) stop
    after ten consecutive episodes with a total reward of 21, at which point
    the model is saved to ``saved_model/pong_model``.

    Side effects: prints progress for every episode, saves a model
    checkpoint every 100 episodes under ``saved_model/``, saves a reward
    plot after every episode under ``artifacts/`` and a final
    ``artifacts/pong_rewards.png`` when training ends.
    """
    # Stop once the `window`-episode mean reward exceeds `moving_num`.
    moving_num, window = 0, 100

    # save_plot() writes into artifacts/ after every episode, so the
    # directory has to exist before the loop starts, not after it ends.
    os.makedirs('artifacts', exist_ok=True)

    env = gym.make("Pong-v0")
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model = build_model()
    episode_rewards = []
    episode = 0

    consecutive_21_rewards = 0  # Number of consecutive episodes with reward 21
    should_train = True  # Cleared once the agent is considered good enough

    try:
        while True:
            observation, info = env.reset()
            obs_history, reward_history, action_history = [], [], []

            terminated = False
            truncated = False

            while not terminated and not truncated:
                processed_observation = preprocess(observation)
                action = select_action(model, processed_observation)
                obs_history.append(processed_observation)
                action_history.append(action)

                # Remap the policy's action index to the Pong-v0 action id.
                action_to_take = 2 if action == 0 else 3
                observation, reward, terminated, truncated, info = env.step(action_to_take)

                reward_history.append(reward)

            discounted_rewards = compute_discounted_rewards(reward_history)

            total_reward = sum(reward_history)
            episode_rewards.append(total_reward)

            if episode >= window - 1:
                moving_avg = np.mean(episode_rewards[-window:])
                print(f"Pong-v0 episode {episode}, reward sum: {total_reward}, last {window} avg: {moving_avg:.2f}", flush=True)

                if moving_avg > moving_num:
                    print(f"Stopping as the last {window}-episode moving average is greater than {moving_num}", flush=True)
                    break
            else:
                print(f"Pong-v0 episode {episode}, reward sum: {total_reward}", flush=True)

            if total_reward == 21:
                consecutive_21_rewards += 1
                if consecutive_21_rewards >= 10 and should_train:
                    print("Stopping training as the reward has been 21 for 10 episodes in a row", flush=True)
                    should_train = False

                    os.makedirs("saved_model", exist_ok=True)
                    model.save("saved_model/pong_model")
            else:
                consecutive_21_rewards = 0  # Streak broken

            # Weight updates are skipped once should_train has been cleared.
            if should_train:
                adjust_weights(model, optimizer, obs_history, action_history, discounted_rewards)

            # Save the model every 100 episodes.
            if episode % 100 == 0:
                model.save(f"saved_model/pong_model_{episode}")

            # Save a reward plot every episode.
            save_plot(episode_rewards, episode)

            episode += 1
    finally:
        # Release the environment even when training is interrupted or fails.
        env.close()

    # Final reward curve with its trailing moving average.
    moving_avgs = [
        np.mean(episode_rewards[max(0, i - window + 1):i + 1])
        for i in range(len(episode_rewards))
    ]

    # The figure is not closed here, matching the previous behavior.
    fig, ax = plt.subplots()
    ax.plot(episode_rewards, label='Total Reward')
    ax.plot(moving_avgs, label=f'{window}-Episode Moving Average')
    ax.set_xlabel("Episode")
    ax.set_ylabel("Total Reward")
    ax.legend()
    fig.savefig('artifacts/pong_rewards.png')

Pong_RL()

GPU is available


GPU device: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
[0.47736987 0.5226301 ]
[0.46762794 0.53237206]
[0.46762794 0.53237206]
[0.47179815 0.5282018 ]
[0.50426775 0.49573222]
[0.45801297 0.541987  ]
[0.48948845 0.5105116 ]
[0.50601554 0.49398446]
[0.4540295  0.54597044]
[0.50601554 0.49398446]
[0.50367093 0.4963291 ]
[0.46762794 0.53237206]
[0.4822126 0.5177874]
[0.46190742 0.53809255]
[0.4596942 0.5403058]
[0.5024996  0.49750045]
[0.49006975 0.50993025]
[0.46733487 0.53266513]
[0.4848396  0.51516044]
[0.46079984 0.5392001 ]
[0.4750202  0.52497977]
[0.46949825 0.5305018 ]
[0.47237423 0.52762574]
[0.50104576 0.49895424]


  logger.warn(


[0.483718   0.51628196]
[0.4836779  0.51632214]
[0.47734857 0.52265143]
[0.4925385 0.5074615]
[0.454592 0.545408]
[0.42982697 0.570173  ]
[0.43751302 0.562487  ]
[0.44315654 0.55684346]
[0.4036499  0.59635013]
[0.47609618 0.52390385]
[0.47793692 0.522063  ]
[0.48411217 0.51588786]
[0.47768104 0.5223189 ]
[0.46652487 0.5334751 ]
[0.45119208 0.5488079 ]
[0.4152834 0.5847166]
[0.4111179 0.5888821]
[0.39615878 0.6038412 ]
[0.40471745 0.5952826 ]
[0.43687016 0.5631299 ]
[0.49822924 0.5017708 ]
[0.50638324 0.4936168 ]
[0.5222902  0.47770977]
[0.4915882  0.50841177]
[0.45894527 0.5410547 ]
[0.47916833 0.5208317 ]
[0.46964502 0.530355  ]
[0.49838832 0.5016117 ]
[0.49080625 0.5091937 ]
[0.50852984 0.49147016]
[0.49678195 0.503218  ]
[0.44029877 0.5597012 ]
[0.47099957 0.5290004 ]
[0.5208753  0.47912475]
[0.548115   0.45188496]
[0.51690674 0.48309323]
[0.507652   0.49234802]
[0.50883806 0.4911619 ]
[0.5196303 0.4803697]
[0.49777153 0.50222844]
[0.4742186 0.5257814]
[0.46362424 0.53637576]
[0.445



INFO:tensorflow:Assets written to: saved_model/pong_model_0\assets


INFO:tensorflow:Assets written to: saved_model/pong_model_0\assets


[0.52904505 0.47095495]
[0.50849706 0.49150294]
[0.46538392 0.53461605]
[0.46010044 0.5398996 ]
[0.4655777  0.53442234]
[0.4655777  0.53442234]
[0.49774456 0.50225544]
[0.56317556 0.43682447]
[0.49170372 0.50829625]
[0.5122949 0.4877051]
[0.51637185 0.48362815]
[0.51637185 0.48362815]
[0.49170372 0.50829625]
[0.5397587  0.46024135]
[0.5122949 0.4877051]
[0.51637185 0.48362815]
[0.51637185 0.48362815]
[0.51637185 0.48362815]
[0.51637185 0.48362815]
[0.4528839 0.5471161]
[0.47609138 0.5239086 ]
[0.49674657 0.5032534 ]
[0.5281489  0.47185108]
[0.5356314  0.46436855]
[0.5544329 0.4455671]
[0.543155   0.45684496]
[0.5588268 0.4411732]
[0.5450754  0.45492455]
[0.5291898  0.47081017]
[0.49582168 0.5041783 ]
[0.51231647 0.48768356]
[0.44423422 0.5557658 ]
[0.40110645 0.5988936 ]
[0.4270311  0.57296896]
[0.41528895 0.5847111 ]
[0.45537266 0.54462737]
[0.43768534 0.5623147 ]
[0.52216774 0.47783226]
[0.5620244  0.43797553]
[0.5279279 0.4720721]
[0.4691023  0.53089774]
[0.44394293 0.5560571 ]
[0.4

KeyboardInterrupt: 