In [26]:
import gymnasium as gym
import matplotlib.pyplot as plt

from src.Agent import PPOAgent
from src.OptimizerParameters import AdamOptimizerParameters

## 1) Testing the architecture with separate policy and value networks

In [27]:
# ---------------------------------------------------------------------------
# Experiment configuration for the "Individual Networks" run.
# Every name defined here is a notebook-level variable reused by later cells.
# ---------------------------------------------------------------------------
architecture = "Individual Networks"
rng_seed = 0
num_episodes = 40
max_game_length = 500  # forwarded to gym.make as max_episode_steps

# Algorithm settings handed to PPOAgent below.
# NOTE(review): gamma / lmbda / epsilon are presumably the discount factor,
# the GAE lambda and the PPO clipping range -- confirm against src.Agent.
gamma, lmbda, epsilon = 0.99, 0.925, 0.2
smoothing_const = 1e-8
normalize_advantages = True
batch_size, shuffle_batches = 32, False

# Number of update epochs; consumed by the training cells, not by the constructor.
num_policy_epochs = num_value_epochs = num_multihead_epochs = 5

# All three optimizers start from identical Adam settings, but each one gets
# its own parameter object.
adam_settings = dict(lr=0.0025, betas=(0.9, 0.999), weight_decay=0)
policy_optimizer_parameters = AdamOptimizerParameters(**adam_settings)
value_optimizer_parameters = AdamOptimizerParameters(**adam_settings)
multihead_optimizer_parameters = AdamOptimizerParameters(**adam_settings)

# Environment, plus the network input/output sizes read from its spaces.
env = gym.make(id='CartPole-v1', max_episode_steps=max_game_length)
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

# NOTE(review): the keyword really is spelled "smooting_const" in this call;
# presumably it mirrors PPOAgent's signature -- confirm before renaming it.
agent_settings = {
    "env": env,
    "state_space_size": input_dim,
    "action_space_size": output_dim,
    "gamma": gamma,
    "lmbda": lmbda,
    "epsilon": epsilon,
    "smooting_const": smoothing_const,
    "normalize_advantages": normalize_advantages,
    "batch_size": batch_size,
    "shuffle_batches": shuffle_batches,
    "architecture": architecture,
    "seed": rng_seed,
}
Agent = PPOAgent(**agent_settings)


In [28]:
# Baseline check: let the freshly constructed agent interact with the
# environment before any training has been run, for comparison with the
# post-training run further down.
Agent.play()

#########################################################################
# --- Survived for: 11 episodes, and earned a total reward of: 11.0 --- #
#########################################################################


In [None]:
# Train the agent using the call signature that matches the selected
# architecture. Agent.train is unpacked into a different number of result
# series per architecture, so each branch handles its own return value.
if architecture == "Individual Networks":
    avg_accumulated_rewards, avg_value_net_loss, avg_policy_net_loss = Agent.train(
        episodes=num_episodes,
        num_policy_epochs=num_policy_epochs,
        num_value_epochs=num_value_epochs,
        policy_optimizer_params=policy_optimizer_parameters,
        value_optimizer_params=value_optimizer_parameters,
    )
elif architecture == "Multi Head Network":
    avg_accumulated_rewards, avg_multihead_net_loss = Agent.train(
        episodes=num_episodes,
        num_multihead_epochs=num_multihead_epochs,
        multihead_optimizer_params=multihead_optimizer_parameters,
    )
else:
    # Fail loudly: silently skipping training would leave
    # avg_accumulated_rewards undefined (or stale from an earlier run) by the
    # time the plotting cell executes.
    raise ValueError(
        f"Unknown architecture {architecture!r}; expected "
        "'Individual Networks' or 'Multi Head Network'."
    )
    

 38%|███▊      | 15/40 [00:02<00:07,  3.26it/s]

In [None]:
# Draw the training reward curve twice, side by side: linear y-axis on the
# left, logarithmic y-axis on the right.
episode_axis = list(range(len(avg_accumulated_rewards)))
panel_labels = ('Avg. Accumulated Reward', 'Avg. Accumulated Reward [log]')

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
for panel, y_label in zip(ax, panel_labels):
    panel.plot(episode_axis, avg_accumulated_rewards)
    panel.set_xlabel('Episodes')
    panel.set_ylabel(y_label)

# Only the right-hand panel switches to a log scale.
ax[1].set_yscale('log')
    

In [None]:
# Repeat the interaction now that the training cell has run, so the result
# can be compared with the pre-training run.
Agent.play()

## 2) Testing the architecture with a shared backbone network and separate policy and value heads

In [None]:
# Switch to the shared-backbone variant. Every other setting (environment,
# hyperparameters, seed) is reused unchanged from the configuration cell, and
# `Agent` is rebound to a newly constructed instance.
architecture = "Multi Head Network"

# NOTE(review): the keyword really is spelled "smooting_const" in this call;
# presumably it mirrors PPOAgent's signature -- confirm before renaming it.
multihead_agent_settings = {
    "env": env,
    "state_space_size": input_dim,
    "action_space_size": output_dim,
    "gamma": gamma,
    "lmbda": lmbda,
    "epsilon": epsilon,
    "smooting_const": smoothing_const,
    "normalize_advantages": normalize_advantages,
    "batch_size": batch_size,
    "shuffle_batches": shuffle_batches,
    "architecture": architecture,
    "seed": rng_seed,
}
Agent = PPOAgent(**multihead_agent_settings)

In [None]:
# Baseline check for the multi-head agent: interact with the environment
# before this agent has been trained.
Agent.play()

In [None]:
# Train the agent using the call signature that matches the selected
# architecture. Agent.train is unpacked into a different number of result
# series per architecture, so each branch handles its own return value.
if architecture == "Individual Networks":
    avg_accumulated_rewards, avg_value_net_loss, avg_policy_net_loss = Agent.train(
        episodes=num_episodes,
        num_policy_epochs=num_policy_epochs,
        num_value_epochs=num_value_epochs,
        policy_optimizer_params=policy_optimizer_parameters,
        value_optimizer_params=value_optimizer_parameters,
    )
elif architecture == "Multi Head Network":
    avg_accumulated_rewards, avg_multihead_net_loss = Agent.train(
        episodes=num_episodes,
        num_multihead_epochs=num_multihead_epochs,
        multihead_optimizer_params=multihead_optimizer_parameters,
    )
else:
    # Fail loudly: silently skipping training would leave
    # avg_accumulated_rewards holding the previous section's results (or
    # undefined) by the time the plotting cell executes.
    raise ValueError(
        f"Unknown architecture {architecture!r}; expected "
        "'Individual Networks' or 'Multi Head Network'."
    )
    

In [None]:
# Reward curve of the most recent training run, shown on a linear y-axis
# (left panel) and on a logarithmic y-axis (right panel).
episode_axis = list(range(len(avg_accumulated_rewards)))
y_axis_labels = ('Avg. Accumulated Reward', 'Avg. Accumulated Reward [log]')

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
for panel_index, y_label in enumerate(y_axis_labels):
    panel = ax[panel_index]
    panel.plot(episode_axis, avg_accumulated_rewards)
    panel.set_xlabel('Episodes')
    panel.set_ylabel(y_label)

# The second panel is the only one drawn on a log scale.
ax[1].set_yscale('log')
    

In [None]:
# Repeat the interaction with the multi-head agent now that its training cell
# has run, for comparison with its pre-training run.
Agent.play()