In [7]:
import gymnasium as gym
import ale_py
import torch
import numpy as np
import random
import copy
from collections import deque
import multiprocessing as mp

from a2c import A2C

Setting up variables

In [8]:
RANDOM_STATE = 42
torch.seed = RANDOM_STATE
torch.manual_seed(torch.seed)
np.random.seed(torch.seed)
random.seed(torch.seed)

ENV_NAME = 'ALE/Bowling-ram-v5'

Testing environnement

In [9]:
env = gym.make(ENV_NAME)

state, _ = env.reset()
print(f"Initial state: \n{state}")
print(f"Observation space: \n{env.observation_space}")
print(f"Action space: {env.action_space}")

a = env.action_space.sample()
event = env.step(a)
print('Output from applying action {} on environment:\nstate:'.format(a) \
      + '{}\nreward: {}\ndone: {}\ntruncated: {}\ninfo: {}'.format(*event))

Initial state: 
[ 71 255   0   0   0   0   0   0   0   0   0   0 255   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   8  15   0   0   0 170   0
   1   0   0   0   1  12 184 247   0   0   6  16  19  13  22  16  10  25
  19  13   7   7   5   5   3   3   3   1   1   1   1   0   0   0   0   0
   0   0   0   0   0 136 216 132  38  88   0   0   1 255   0 255 128 255
   0   0   0   0   0   0   0   2   2   0   8   8   0  34  34   0 136 136
   0  34  34   0   8   8   0   2   2   0   0   0   0   0   0   0   0   0
  66 243]
Observation space: 
Box(0, 255, (128,), uint8)
Action space: Discrete(6)
Output from applying action 1 on environment:
state:[ 75 255   0   0   0   0   4   4   4   4   4   4 255   1   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0  12  20   0   2   0 170   0
   1   0   0   0   1   8 216 247   0   0   0  16  19  13  22  16  10  25
  19  13   7   7   5   5   3   3   3   1   1   1   1   0   0   0   0   0
   0   0   0   0   0 136 216 132  38  88   0  

In [10]:
state = env.reset()           # Start a new episode and get the initial state

done = False
total_reward = 0

while not done:
    action = env.action_space.sample()  # Random action (could be replaced by a learned policy)
    next_state, reward, done, truncated, _ = env.step(action)  # Take the action in the environment
    print(f"action: {action}, reward: {reward}")
    total_reward += reward
    state = next_state                   # Move to the next state

print(f"Episode finished with total reward: {total_reward}")

action: 1, reward: 0.0
action: 3, reward: 0.0
action: 2, reward: 0.0
action: 3, reward: 0.0
action: 1, reward: 0.0
action: 1, reward: 0.0
action: 4, reward: 0.0
action: 5, reward: 0.0
action: 2, reward: 0.0
action: 1, reward: 0.0
action: 2, reward: 0.0
action: 4, reward: 0.0
action: 5, reward: 0.0
action: 4, reward: 0.0
action: 3, reward: 0.0
action: 5, reward: 0.0
action: 3, reward: 0.0
action: 4, reward: 0.0
action: 1, reward: 0.0
action: 1, reward: 0.0
action: 2, reward: 0.0
action: 1, reward: 0.0
action: 4, reward: 0.0
action: 4, reward: 0.0
action: 1, reward: 0.0
action: 4, reward: 0.0
action: 2, reward: 0.0
action: 0, reward: 0.0
action: 2, reward: 0.0
action: 3, reward: 0.0
action: 2, reward: 0.0
action: 5, reward: 0.0
action: 4, reward: 0.0
action: 5, reward: 0.0
action: 0, reward: 0.0
action: 0, reward: 0.0
action: 4, reward: 0.0
action: 0, reward: 0.0
action: 3, reward: 0.0
action: 0, reward: 0.0
action: 0, reward: 0.0
action: 3, reward: 0.0
action: 2, reward: 0.0
action: 2, 

Defining A2CBatch

In [None]:
class A2CBatch(A2C):
    def __init__(self, env):
        super().__init__(env)
        # Overrides the loss function for the actor. Since we are trying batch processing, we need to sum up the losses
        self.actorLossFun = lambda probs, advantage: -1 * torch.sum(torch.log(probs) * advantage)

    # Trains the agent            
    def train(self, epochs=200, gamma=0.99, memory=218, batch=200):
        # Stores the info from the training process
        info = {}
        # Stores the total reward or scores per epoch
        scores = []
        # The replay buffer for Q-learning with experience replay
        replay = deque(maxlen=memory)
        for e in range(epochs):
            # Resets the environment per epoch
            state_, _ = self.env.reset()

            epsilon = 1 - (e / epochs)

            # A flag to check if an episode has ended
            done = False
            # Stores the score per episode
            score = 0
            # Maximum number of allowed moves per episode
            maxMoves = 218
            # Stores the information per move in an episode
            states = []
            actions = []
            rewards = []
            values = []          
            # Continue the episode until it ends or the maximum number of episodes has expired
            while not done and maxMoves > 0:
                # Decrement the number of allowed moves
                maxMoves -= 1
                # Calculate the probs. for the actions for a given state
                #print(f"state: {state_}")
                policy = self.actor(torch.from_numpy(
                        state_).float())
                #print(f"policy: {policy}")
                # Choose an action based on their probs.
                action = np.random.choice(self.numActionSpace, p=policy.detach().numpy())
                    
                # Executes an action to the environment
                nextState_, reward, done, truncated, _ = self.env.step(action)

                """ if maxMoves == 0 and score == 0:
                    reward = -10 """

                # Updates the rewards for the episode
                score += reward
                # Calculates the value of the current state
                value = self.critic(torch.from_numpy(state_).float())
                #print(f"value: {value}")
                # Calculate the value of the next state
                nextValue = torch.Tensor([0.0]) if done else self.critic(
                    torch.from_numpy(nextState_).float())
                #print(f"nextValue: {nextValue}")

                # Add the experience to the buffer
                replay.append((state_, reward, nextValue.item()))
                
                # Add the episode step info
                #print(state_, action, reward, nextValue.item())
                states.append(state_)
                actions.append(action)
                #rewards.append(reward)
                values.append(nextValue.item() - value.item())
                #print(nextValue.item() - value.item())
                # Assign next state as current state
                state_ = nextState_

                epsilon /= 2

            for i in range(len(states)):
                if score > 0:
                    rewards.append(score/10 * np.power(0.99, i))
                else:
                    rewards.append(-10 * np.power(0.99, i))

            # After each episode update the actor model
            # Policy Loss
            # Calculate the advantage
            advantages = torch.Tensor(list(rewards)).float() + torch.pow(gamma, torch.arange(
                len(values)).float()) * torch.Tensor(list(values)).float()

            # Store the state info as a batch of states
            stateBatch = torch.stack([torch.from_numpy(s).float()
                                      for s in states])
            print(f"type stateBatch: {stateBatch.type()}")
            # Store the action info as a batch of actions    
            actionBatch = torch.Tensor(list(actions))
            # Feed the state batch to the actor model to calculate the probs of actions for each state in the batch
            policy = self.actor(stateBatch)
            # Gets the probs of actions actually performed for each state
            probs = policy.gather(
                dim=1, index=actionBatch.long().unsqueeze(dim=1)).squeeze()
            #print(f"state: {stateBatch}, action: {actionBatch}, probs: {probs}")

            #print(f"advantage: {advantages}")

            entropy_values = -torch.sum(probs * torch.log(probs))
            entropy_bonus = torch.mean(entropy_values)
            #print(f"entropy: {entropy_bonus}")
            # Policy Loss
            actorLoss = self.actorLossFun(probs, advantages) + 1e-2 * entropy_bonus
            #print(f"actorLoss: {actorLoss}")
            #Backpropagate policy
            self.actorOptim.zero_grad()
            actorLoss.backward()
            self.actorOptim.step()

            # Update the value function if the size of the replay buffer is larger than the specified batch size
            if (len(replay) > batch):
                # Select a set of random indices to be chosen from the replay buffer 
                indices = np.random.choice(len(replay), size=batch)
                # Extract the experiences from the replay buffer
                replay_ = np.asarray(replay, dtype=object)[indices, :]  
                # Create a state batch with the excted experiences
                stateBatch = torch.stack([torch.from_numpy(s).float()
                                          for s in replay_[:, 0]])
                # Calculate the value for the extracted states                                        
                value = self.critic(stateBatch)                                                
                
                # Value Loss
                print(f"value: {torch.Tensor(list(replay_[:, 1] + gamma * replay_[:, 2])).float()}")
                criticLoss = self.criticLossFun(
                    value, torch.Tensor(list(replay_[:, 1] + gamma * replay_[:, 2])).float())
                                
                #Backpropagate value
                self.criticOptim.zero_grad()
                criticLoss.backward()
                self.criticOptim.step()            
                
            # Store the total score for the episode
            scores.append(score)

            # Print the progress
            if e % np.round(epochs/10) == 0:
                print('episode: {:d}, score: {:.2f}'.format(e, scores[e]))
            info[e] = scores[e]

        return info

In [26]:
# Instantiate the agent class.
agent = A2CBatch(env)
# Train the agent
info = agent.train(epochs=1000, gamma=0.99)
# Convert to an numpy array with epochs at axis=0 and scores at axis=1
info_ = np.array(list(info.items()))
print(info_)
# Plot the scores
agent.plot(info_)
# Run a test
agent.test(render=False)

type stateBatch: torch.FloatTensor
replay: tensor([-4.0103, -3.2884, -3.9696, -3.2719, -2.9156, -3.5601, -3.3626, -2.4777,
        -2.6708, -0.5689, -3.4428, -3.3618, -2.7849, -3.8416, -3.0882, -2.9172,
        -3.1132, -2.7686, -2.7856, -3.0279, -3.1594, -3.8312, -3.4798, -3.2195,
        -2.6084, -3.8416, -4.0820, -2.5919, -2.6501, -2.6769, -3.4506, -4.2016,
        -3.9059, -2.6084, -3.7781, -3.8208, -3.4013, -3.8288, -1.0064, -0.5103,
        -4.2016, -3.1353, -2.9468, -2.5438, -2.8162, -3.9336, -2.9107, -3.0437,
        -2.9957, -2.9369, -2.7165, -2.4212, -2.9920, -2.5569, -2.5438, -2.7849,
        -3.5601, -2.9422, -3.1992, -2.7777, -2.9133, -0.7272, -2.6938, -2.7856,
        -2.9957, -3.7393, -3.2414, -3.5709, -2.9957, -0.6126, -2.9369, -2.7966,
        -3.6017, -3.6382, -2.7095, -3.0882, -2.7009, -3.0746, -3.6922, -4.0275,
        -3.1504, -2.9172, -2.9133, -3.1224, -3.1594, -3.0753, -4.0468, -2.8839,
        -3.8870, -1.0064, -3.0746, -3.5753, -3.4798, -2.7009, -2.8839, -3.601

KeyboardInterrupt: 