In [28]:
import gymnasium as gym
import matplotlib.pyplot as plt

from src.Agent import PPOAgent

## 1) Testing the architecture with individual Policy and Value networks

In [29]:
# Seed and environment used by every experiment in this notebook.
rng_seed = 0
env = gym.make('CartPole-v1')

# Network input/output sizes are read directly from the environment's spaces.
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

# PPO settings forwarded to the PPOAgent constructor.
# NOTE(review): presumably the usual PPO notation (discount factor, GAE lambda,
# clipping range) -- confirm against src.Agent.
gamma, lmbda, epsilon = 0.99, 0.925, 0.2
smoothing_const = 1e-8
normalize_advantages = True

# Batching / rollout settings, also forwarded to the constructor.
batch_size = 32
shuffle_batches = False
max_game_length = 500

# Optimisation settings; these are consumed later by Agent.train, not by the constructor.
policy_lr = value_lr = multihead_lr = 0.001
num_policy_epochs = num_value_epochs = num_multihead_epochs = 5
num_episodes = 1000

# Selects which branch the training and plotting cells take.
architecture = "Individual Networks"

# Constructor arguments gathered in one mapping so they can be inspected before use.
# NOTE(review): the keyword really is spelled `smooting_const` here; it has to match
# PPOAgent's signature, so do not "fix" the spelling without changing src.Agent too.
agent_settings = dict(
    env=env,
    state_space_size=input_dim,
    action_space_size=output_dim,
    gamma=gamma,
    lmbda=lmbda,
    epsilon=epsilon,
    smooting_const=smoothing_const,
    normalize_advantages=normalize_advantages,
    batch_size=batch_size,
    max_game_length=max_game_length,
    shuffle_batches=shuffle_batches,
    architecture=architecture,
    seed=rng_seed,
)
Agent = PPOAgent(**agent_settings)

In [30]:
# Baseline: let the freshly constructed (not yet trained) agent interact with the
# environment, with rendering disabled.
Agent.play(render=False, max_game_length=max_game_length)

####################################
# --- Survived for: 11 episodes --- #
####################################


In [31]:
# Train the agent for the selected architecture and keep the per-episode curves
# that the plotting cell below reads. The two architectures take different
# keyword arguments and return a different number of curves.
if architecture == "Individual Networks":
    avg_accumulated_rewards, avg_value_net_loss, avg_policy_net_loss = Agent.train(
        episodes=num_episodes,
        policy_lr=policy_lr,
        value_lr=value_lr,
        num_policy_epochs=num_policy_epochs,
        num_value_epochs=num_value_epochs,
    )
elif architecture == "Multi Head Network":
    avg_accumulated_rewards, avg_multihead_net_loss = Agent.train(
        episodes=num_episodes,
        multihead_lr=multihead_lr,
        num_multihead_epochs=num_multihead_epochs,
    )
else:
    # Fail loudly: silently skipping training would leave the result variables
    # undefined (or stale from an earlier run) for the cells that follow.
    raise ValueError(
        f"Unknown architecture {architecture!r}; expected "
        "'Individual Networks' or 'Multi Head Network'."
    )
    

  2%|▎         | 25/1000 [00:05<03:27,  4.70it/s]


KeyboardInterrupt: 

In [None]:
# Choose the curves to draw for the active architecture as (series, y-label) pairs.
# Each pair becomes one figure row: linear y-axis on the left, log y-axis on the right.
if architecture == "Individual Networks":
    panels = [
        (avg_accumulated_rewards, 'Avg. Accumulated Reward'),
        (avg_value_net_loss, 'Avg. Value net loss'),
        (avg_policy_net_loss, 'Avg. Policy net loss'),
    ]
elif architecture == "Multi Head Network":
    panels = [
        (avg_accumulated_rewards, 'Avg. Accumulated Reward'),
        (avg_multihead_net_loss, 'Avg. Multi-head net loss'),
    ]
else:
    # Any other architecture value: nothing is drawn.
    panels = []

if panels:
    fig, ax = plt.subplots(len(panels), 2, figsize=(12, 10))

    for row, (series, label) in enumerate(panels):
        # x-axis is simply the position of each entry in the series.
        episode_idx = list(range(len(series)))

        linear_ax = ax[row][0]
        linear_ax.plot(episode_idx, series)
        linear_ax.set_xlabel('Episodes')
        linear_ax.set_ylabel(label)

        # NOTE(review): a log y-axis cannot display non-positive values; if a loss
        # curve dips to zero or below, those points will not appear -- verify.
        log_ax = ax[row][1]
        log_ax.plot(episode_idx, series)
        log_ax.set_xlabel('Episodes')
        log_ax.set_ylabel(label + ' [log]')
        log_ax.set_yscale('log')

    plt.tight_layout()
    plt.show()
    
    

In [None]:
# Let the agent interact with the environment again, now after the training cell
# above, with rendering disabled.
Agent.play(render=False, max_game_length=max_game_length)

## 2) Testing the architecture with a common backbone network and separate Policy and Value heads

In [None]:
# Switch the notebook to the multi-head variant; the training and plotting cells
# below branch on this value.
architecture = "Multi Head Network"

# Build a fresh agent with the same hyperparameters as before. This rebinds `Agent`,
# so the previously constructed agent is no longer reachable under that name.
# NOTE(review): the keyword is spelled `smooting_const`; it must match PPOAgent's
# signature, so leave the spelling alone unless src.Agent changes too.
multihead_agent_settings = dict(
    env=env,
    state_space_size=input_dim,
    action_space_size=output_dim,
    gamma=gamma,
    lmbda=lmbda,
    epsilon=epsilon,
    smooting_const=smoothing_const,
    normalize_advantages=normalize_advantages,
    batch_size=batch_size,
    max_game_length=max_game_length,
    shuffle_batches=shuffle_batches,
    architecture=architecture,
    seed=rng_seed,
)
Agent = PPOAgent(**multihead_agent_settings)

In [None]:
# Baseline: let the freshly constructed multi-head agent interact with the
# environment before it is trained, with rendering disabled.
Agent.play(render=False, max_game_length=max_game_length)

In [None]:
# Train the agent for the selected architecture and keep the per-episode curves
# that the plotting cell below reads. The two architectures take different
# keyword arguments and return a different number of curves.
if architecture == "Individual Networks":
    avg_accumulated_rewards, avg_value_net_loss, avg_policy_net_loss = Agent.train(
        episodes=num_episodes,
        policy_lr=policy_lr,
        value_lr=value_lr,
        num_policy_epochs=num_policy_epochs,
        num_value_epochs=num_value_epochs,
    )
elif architecture == "Multi Head Network":
    avg_accumulated_rewards, avg_multihead_net_loss = Agent.train(
        episodes=num_episodes,
        multihead_lr=multihead_lr,
        num_multihead_epochs=num_multihead_epochs,
    )
else:
    # Fail loudly: without this branch an unrecognised value skips training
    # silently and the plotting cell would show stale or missing curves.
    raise ValueError(
        f"Unknown architecture {architecture!r}; expected "
        "'Individual Networks' or 'Multi Head Network'."
    )
    

In [None]:
# Choose the curves to draw for the active architecture as (series, y-label) pairs.
# Each pair becomes one figure row: linear y-axis on the left, log y-axis on the right.
if architecture == "Individual Networks":
    panels = [
        (avg_accumulated_rewards, 'Avg. Accumulated Reward'),
        (avg_value_net_loss, 'Avg. Value net loss'),
        (avg_policy_net_loss, 'Avg. Policy net loss'),
    ]
elif architecture == "Multi Head Network":
    panels = [
        (avg_accumulated_rewards, 'Avg. Accumulated Reward'),
        (avg_multihead_net_loss, 'Avg. Multi-head net loss'),
    ]
else:
    # Any other architecture value: nothing is drawn.
    panels = []

if panels:
    fig, ax = plt.subplots(len(panels), 2, figsize=(12, 10))

    for row, (series, label) in enumerate(panels):
        # x-axis is simply the position of each entry in the series.
        episode_idx = list(range(len(series)))

        linear_ax = ax[row][0]
        linear_ax.plot(episode_idx, series)
        linear_ax.set_xlabel('Episodes')
        linear_ax.set_ylabel(label)

        # NOTE(review): a log y-axis cannot display non-positive values; if a loss
        # curve dips to zero or below, those points will not appear -- verify.
        log_ax = ax[row][1]
        log_ax.plot(episode_idx, series)
        log_ax.set_xlabel('Episodes')
        log_ax.set_ylabel(label + ' [log]')
        log_ax.set_yscale('log')

    plt.tight_layout()
    plt.show()

In [None]:
# Let the multi-head agent interact with the environment again, now after the
# training cell above, with rendering disabled.
Agent.play(render=False, max_game_length=max_game_length)