In [1]:
from utilities.Network import Network
from utilities.ReplayBuffer import ReplayBuffer

import wandb
import json
import matplotlib.pyplot as plt
import torch
from Discrete_SAC_Agent import SACAgent

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.base_env import ActionTuple

from debug_side_channel import DebugSideChannel
from gym import spaces

import torch.nn.functional as F
from torch.distributions import Normal
import sys
import numpy as np
import pandas as pd
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
seed_value = 1
torch.manual_seed(seed_value)
np.random.seed(seed_value)
random.seed(seed_value)

In [3]:
#ALPHA_INITIAL = 1.
#DISCOUNT_RATE = 0.99
#SOFT_UPDATE_INTERPOLATION_FACTOR = 0.01
ALPHA_INITIAL = 1.
DISCOUNT_RATE = 0.99
LEARNING_RATE = 10 ** -4
SOFT_UPDATE_INTERPOLATION_FACTOR = 0.01
TRAINING_EVALUATION_RATIO = 4
RUNS = 2
EPISODES_PER_RUN = 400
STEPS_PER_EPISODE = 200

REPLAY_BUFFER_SIZE = 1000
REPLAY_BUFFER_BATCH_SIZE = 500
BUFFER_SAMPLE_SIZE = 100
WANDB = True

In [4]:
if WANDB:
    wandb.init(
        project="visibility-game",
    )

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mr-marr747[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
class SACAgent:
    def __init__(self, environment):
        self.environment = environment
        self.LEARNING_RATE = LEARNING_RATE
        self.ALPHA_INITIAL = ALPHA_INITIAL
        self.REPLAY_BUFFER_BATCH_SIZE = REPLAY_BUFFER_BATCH_SIZE
        self.DISCOUNT_RATE = DISCOUNT_RATE
        self.state_dim = 2#self.environment.observation_space.shape[0]
        self.action_dim = self.environment.action_space.n
        self.critic_local = Network(input_dimension=self.state_dim,
                                    output_dimension=self.action_dim)
        self.critic_local2 = Network(input_dimension=self.state_dim,
                                     output_dimension=self.action_dim)
        self.critic_optimiser = torch.optim.Adam(self.critic_local.parameters(), lr=self.LEARNING_RATE)
        self.critic_optimiser2 = torch.optim.Adam(self.critic_local2.parameters(), lr=self.LEARNING_RATE)

        self.critic_target = Network(input_dimension=self.state_dim,
                                     output_dimension=self.action_dim)
        self.critic_target2 = Network(input_dimension=self.state_dim,
                                      output_dimension=self.action_dim)

        self.soft_update_target_networks(tau=1.)

        self.actor_local = Network(
            input_dimension=self.state_dim,
            output_dimension=self.action_dim,
            output_activation=torch.nn.Softmax(dim=1)
        )
        self.actor_optimiser = torch.optim.Adam(self.actor_local.parameters(), lr=self.LEARNING_RATE)

        self.replay_buffer = ReplayBuffer(self.environment, REPLAY_BUFFER_SIZE)

        #self.target_entropy = 0.98 * -np.log(1 / self.environment.action_space.n)
        #self.log_alpha = torch.tensor(np.log(self.ALPHA_INITIAL), requires_grad=True)
        self.log_alpha = torch.tensor(np.log(self.ALPHA_INITIAL), requires_grad=False)
        self.alpha = self.log_alpha
        self.alpha_optimiser = torch.optim.Adam([self.log_alpha], lr=self.LEARNING_RATE)

    def eval_policy(self):
        pass
    
    def get_action_from_q(self, state):
        i = state[0]
        j = state[1]
        pos = torch.tensor([i,j], dtype=torch.float32)
        action = torch.argmax(self.critic_local(pos) + self.critic_local2(pos))
        return action

    def get_next_action(self, state, evaluation_episode=False):
        if evaluation_episode:
            discrete_action = self.get_action_deterministically(state)
        else:
            discrete_action = self.get_action_nondeterministically(state)
        return discrete_action

    def get_action_nondeterministically(self, state):
        action_probabilities = self.get_action_probabilities(state)
        discrete_action = np.random.choice(range(self.action_dim), p=action_probabilities)
        return discrete_action

    def get_action_deterministically(self, state):
        action_probabilities = self.get_action_probabilities(state)
        discrete_action = np.argmax(action_probabilities)
        return discrete_action

    def train_on_transition(self, state, discrete_action, next_state, reward, done):
        transition = (state, discrete_action, reward, next_state, done)
        self.train_networks(transition)

    def test_on_transition(self, state, discrete_action, next_state, reward, done):
        transition = (state, discrete_action, reward, next_state, done)
        return self.test_networks(transition)

    def print_critic(self):
        critic = dict()
        critic2 = dict()
        for i in range(5):
            for j in range(5):
                pos = torch.tensor([i,j], dtype=torch.float32)
                critic[(i,j)] = self.critic_local(pos)
                critic2[(i, j)] = self.critic_local2(pos)
        return {
            "critic": critic,
            "critic2": critic2
        }
    
    def test_networks(self, transition):
        # Set all the gradients stored in the optimisers to zero.
        self.critic_optimiser.zero_grad()
        self.critic_optimiser2.zero_grad()
        self.actor_optimiser.zero_grad()
        self.alpha_optimiser.zero_grad()
        self.replay_buffer.add_transition(transition)
        # Calculate the loss for this transition.
        # Compute the gradients based on this loss, i.e. the gradients of the loss with respect to the Q-network
        # parameters.
        if self.replay_buffer.get_size() >= self.REPLAY_BUFFER_BATCH_SIZE:
            # get minibatch of 100 transitions from replay buffer
            minibatch = self.replay_buffer.sample_minibatch(BUFFER_SAMPLE_SIZE)
            
            minibatch_separated = list(map(list, zip(*minibatch)))

            states_tensor = torch.tensor(np.array(minibatch_separated[0]), dtype=torch.float32)
            actions_tensor = torch.tensor(np.array(minibatch_separated[1]).astype(np.int64))
            rewards_tensor = torch.tensor(np.array(minibatch_separated[2])).float()
            next_states_tensor = torch.tensor(np.array(minibatch_separated[3]), dtype=torch.float32)
            done_tensor = torch.tensor(np.array(minibatch_separated[4]))

            critic_loss, critic2_loss = \
                self.critic_loss_test(states_tensor, actions_tensor, rewards_tensor, next_states_tensor, done_tensor)
            if WANDB:
                wandb.log({"critic_loss": critic_loss})
                wandb.log({"critic2_loss": critic2_loss})

            critic_loss.backward()
            critic2_loss.backward()
            self.critic_optimiser.step()
            self.critic_optimiser2.step()

            actor_loss, log_action_probabilities = self.actor_loss(states_tensor)

            if WANDB:
                wandb.log({"actor_loss": actor_loss})

            actor_loss.backward()
            self.actor_optimiser.step()

            #alpha_loss = self.temperature_loss(log_action_probabilities)

            #alpha_loss.backward()
            #self.alpha_optimiser.step()
            #self.alpha = self.log_alpha.exp()

            #self.soft_update_target_networks()

            return True
        
        return False
            
    def train_networks(self, transition):
        # Set all the gradients stored in the optimisers to zero.
        self.critic_optimiser.zero_grad()
        self.critic_optimiser2.zero_grad()
        self.actor_optimiser.zero_grad()
        self.alpha_optimiser.zero_grad()
        # Calculate the loss for this transition.
        self.replay_buffer.add_transition(transition)
        # Compute the gradients based on this loss, i.e. the gradients of the loss with respect to the Q-network
        # parameters.
        if self.replay_buffer.get_size() >= self.REPLAY_BUFFER_BATCH_SIZE:
            # get minibatch of 100 transitions from replay buffer
            minibatch = self.replay_buffer.sample_minibatch(self.REPLAY_BUFFER_BATCH_SIZE)
            mb = [(
                transition[0].tolist(),
                transition[1],
                transition[2],
                transition[3].tolist(), 
                transition[4]) for transition in minibatch.tolist()]
            with open('minibatch_debug.json', 'w') as file:
                json.dump(mb, file)
            
            minibatch_separated = list(map(list, zip(*minibatch)))

            # unravel transitions to get states, actions, rewards and next states
            states_tensor = torch.tensor(np.array(minibatch_separated[0]))
            actions_tensor = torch.tensor(np.array(minibatch_separated[1]).astype(np.int64))
            rewards_tensor = torch.tensor(np.array(minibatch_separated[2])).float() * 100
            next_states_tensor = torch.tensor(np.array(minibatch_separated[3]), dtype=torch.float32)
            done_tensor = torch.tensor(np.array(minibatch_separated[4]))

            critic_loss, critic2_loss = \
                self.critic_loss(states_tensor, actions_tensor, rewards_tensor, next_states_tensor, done_tensor)
            if WANDB:
                wandb.log({"critic_loss": critic_loss})
                wandb.log({"critic2_loss": critic2_loss})

            critic_loss.backward()
            critic2_loss.backward()
            self.critic_optimiser.step()
            self.critic_optimiser2.step()

            actor_loss, log_action_probabilities = self.actor_loss(states_tensor)

            if WANDB:
                wandb.log({"actor_loss": actor_loss})

            actor_loss.backward()
            self.actor_optimiser.step()

            alpha_loss = self.temperature_loss(log_action_probabilities)

            alpha_loss.backward()
            self.alpha_optimiser.step()
            self.alpha = self.log_alpha.exp()

            self.soft_update_target_networks()

    def critic_loss(self, states_tensor, actions_tensor, rewards_tensor, next_states_tensor, done_tensor):
        with torch.no_grad():
            action_probabilities, log_action_probabilities = self.get_action_info(next_states_tensor)
            next_q_values_target = self.critic_target.forward(next_states_tensor)
            next_q_values_target2 = self.critic_target2.forward(next_states_tensor)
            soft_state_values = (action_probabilities * (
                    torch.min(next_q_values_target, next_q_values_target2) - self.alpha * log_action_probabilities
            )).sum(dim=1)

            next_q_values = rewards_tensor + ~done_tensor * self.DISCOUNT_RATE*soft_state_values

        soft_q_values = self.critic_local(states_tensor).gather(1, actions_tensor.unsqueeze(-1)).squeeze(-1)
        soft_q_values2 = self.critic_local2(states_tensor).gather(1, actions_tensor.unsqueeze(-1)).squeeze(-1)
        critic_square_error = torch.nn.MSELoss(reduction="none")(soft_q_values, next_q_values)
        critic2_square_error = torch.nn.MSELoss(reduction="none")(soft_q_values2, next_q_values)
        weight_update = [min(l1.item(), l2.item()) for l1, l2 in zip(critic_square_error, critic2_square_error)]
        self.replay_buffer.update_weights(weight_update)
        critic_loss = critic_square_error.mean()
        critic2_loss = critic2_square_error.mean()
        return critic_loss, critic2_loss
    
    def critic_loss_test(self, states_tensor, actions_tensor, rewards_tensor, next_states_tensor, done_tensor):
        with torch.no_grad():
            action_probabilities, log_action_probabilities = self.get_action_info(next_states_tensor)
            #action_probabilities = torch.tensor([1/self.action_dim] * self.action_dim)
            #log_action_probabilities = torch.log(action_probabilities)

            next_q_values_target = self.critic_target.forward(next_states_tensor)
            next_q_values_target2 = self.critic_target2.forward(next_states_tensor)
            soft_state_values = (action_probabilities * (
                    torch.min(next_q_values_target, next_q_values_target2) - self.alpha * log_action_probabilities
            )).sum(dim=1)

            next_q_values = rewards_tensor + ~done_tensor * self.DISCOUNT_RATE*soft_state_values

        soft_q_values = self.critic_local(states_tensor).gather(1, actions_tensor.unsqueeze(-1)).squeeze(-1)
        soft_q_values2 = self.critic_local2(states_tensor).gather(1, actions_tensor.unsqueeze(-1)).squeeze(-1)
        critic_square_error = torch.nn.MSELoss(reduction="none")(soft_q_values, next_q_values)
        critic2_square_error = torch.nn.MSELoss(reduction="none")(soft_q_values2, next_q_values)
        weight_update = [min(l1.item(), l2.item()) for l1, l2 in zip(critic_square_error, critic2_square_error)]
        self.replay_buffer.update_weights(weight_update)
        critic_loss = critic_square_error.mean()
        critic2_loss = critic2_square_error.mean()
        return critic_loss, critic2_loss

    def actor_loss(self, states_tensor):
        action_probabilities, log_action_probabilities = self.get_action_info(states_tensor)
        q_values_local = self.critic_local(states_tensor)
        q_values_local2 = self.critic_local2(states_tensor)
        inside_term = self.alpha * log_action_probabilities - torch.min(q_values_local, q_values_local2)
        policy_loss = (action_probabilities * inside_term).sum(dim=1).mean()
        return policy_loss, log_action_probabilities

    def actor_loss_max_entropy(self, states_tensor):
        action_probabilities, log_action_probabilities = self.get_action_info(states_tensor)
        inside_term = self.alpha * log_action_probabilities
        policy_loss = (action_probabilities * inside_term).sum(dim=1).mean()
        return policy_loss

    def temperature_loss(self, log_action_probabilities):
        alpha_loss = -(self.log_alpha * (log_action_probabilities + self.target_entropy).detach()).mean()
        return alpha_loss

    def get_action_info(self, states_tensor):
        action_probabilities = self.actor_local.forward(states_tensor)
        z = action_probabilities == 0.0
        z = z.float() * 1e-8
        log_action_probabilities = torch.log(action_probabilities + z)
        return action_probabilities, log_action_probabilities

    def get_action_probabilities(self, state):
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        action_probabilities = self.actor_local.forward(state_tensor)
        return action_probabilities.squeeze(0).detach().numpy()

    def soft_update_target_networks(self, tau=SOFT_UPDATE_INTERPOLATION_FACTOR):
        self.soft_update(self.critic_target, self.critic_local, tau)
        self.soft_update(self.critic_target2, self.critic_local2, tau)

    def soft_update(self, target_model, origin_model, tau):
        for target_param, local_param in zip(target_model.parameters(), origin_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)

    def predict_q_values(self, state):
        q_values = self.critic_local(state)
        q_values2 = self.critic_local2(state)
        return torch.min(q_values, q_values2)

In [6]:
class Env():
    def __init__(self, config):
        self.observation_space = spaces.Tuple((spaces.Discrete(10), spaces.Discrete(1), spaces.Discrete(10))) 
        self.action_space = spaces.Discrete(5) 
        self.engine_channel = EngineConfigurationChannel()
        self.debug_channel = DebugSideChannel()
        self.env = UnityEnvironment(file_name=config['unity_environment'], 
                                    side_channels=[self.engine_channel, self.debug_channel])
        self.env.reset()
        self.engine_channel.set_configuration_parameters(time_scale=config['time_scale'])
        self.behavior_registry = []
        self.behavior_registry.append(list(self.env.behavior_specs.keys())[0])

    def get_state(self):
        # need to figure out negative reward later
        behavior_name = self.behavior_registry[0]
        decision_steps, terminal_steps = self.env.get_steps(behavior_name)
        state = decision_steps.obs[0][0]
        return state
        
    # revisit and make correct
    def step(self, action):
        behavior_name = self.behavior_registry[0]
        action_tuple = ActionTuple()
        action_tuple.add_discrete(action.reshape(1, 1))
        self.env.set_actions(behavior_name, action_tuple)
        self.env.step()
        reward = 0.
        done = False

        # decision_steps, terminal_steps = self.env.get_steps(behavior_name)
        # if len(terminal_steps.reward) > 0:
        #     if terminal_steps.reward[0] > 0:
        #         print('win')
        #         reward = 1.

        decision_steps, terminal_steps = self.env.get_steps(behavior_name)
        next_state = decision_steps.obs[0][0]
        if len(terminal_steps.reward) > 0:
            if terminal_steps.reward[0] > 0:
                reward = 1.
                next_state = self.debug_channel.get_last_state()

        if len(terminal_steps) > 0:
            done = True

        return reward, next_state, done

In [7]:
class EnvSimple():
    def __init__(self):
        self.cur_pos = (2., 2.)
        self.observation_space = spaces.Tuple((spaces.Discrete(5), spaces.Discrete(5) )) 
        self.action_space = spaces.Discrete(5) 
    def in_bounds(self, pos):
        return (pos[0] >= 0 and pos[0] <= 4 and pos[1] >= 0 and pos[1] <= 4)
    def reset(self):
        self.cur_pos = (2.,2.)
    def move(self, action):
        new_pos = self.cur_pos
        if action == 1: # down
            new_pos = (self.cur_pos[0] + 1, self.cur_pos[1])
        elif action == 2: # up
            new_pos = (self.cur_pos[0] - 1, self.cur_pos[1])
        elif action == 3: # left
            new_pos = (self.cur_pos[0], self.cur_pos[1] - 1)
        elif action == 4: # right
            new_pos = (self.cur_pos[0], self.cur_pos[1] + 1)
        
        if self.in_bounds(new_pos):
            self.cur_pos = new_pos
        return (self.cur_pos[0] == 0 and self.cur_pos[1] == 4)
    def print(self):
        x = int(self.cur_pos[0])
        y = int(self.cur_pos[1])
        for i in range(5):
            for j in range(5):
                if i == x and j == y:
                    print("O", end="")
                else:
                    print("#", end="")
            print("\n")

In [8]:
driver_config = {'unity_environment': 'C:\\Users\\rmarr\\Documents\\ml-agents-dodgeball-env-ICT',
                'time_scale': 2.0}
environment = EnvSimple()
agent = SACAgent(environment)

In [24]:
i = 0
run_reward = 0
while True:
            
    environment.reset()

    done = False

    while not done:
        i += 1
        state = environment.cur_pos
        action = agent.get_next_action(state, evaluation_episode=0)
        #action = random.choice(range(5))
        done = environment.move(action)

        if done:
            run_reward = run_reward + 1
            
        reward = done * 10
        next_state = environment.cur_pos

        agent.test_on_transition(state, action, next_state, reward, done)

    if ((i+1) % 100) == 0:
        wandb.log({"runs wins": run_reward})
        run_reward = 0

KeyboardInterrupt: 

In [58]:
environment.reset()
environment.print()

#####

#####

##O##

#####

#####



In [63]:
cur_pos = environment.cur_pos
action = agent.get_action_from_q(cur_pos)
done = environment.move(action)

environment.print()

#####

#####

#####

O####

#####



In [None]:
cur_pos = environment.cur_pos
action = agent.get_next_action(cur_pos)
done = environment.move(action)

environment.print()

####O

#####

#####

#####

#####



In [None]:
agent.print_critic()

{'critic': {(0,
   0): tensor([4.3812, 2.1779, 6.8511, 7.3393, 1.8892], grad_fn=<AddBackward0>),
  (0,
   1): tensor([4.1277, 2.1808, 4.6076, 9.3529, 1.2534], grad_fn=<AddBackward0>),
  (0,
   2): tensor([1.9563, 0.6384, 1.7872, 4.8937, 0.0322], grad_fn=<AddBackward0>),
  (0,
   3): tensor([ 0.3654, -0.4135, -0.0469,  2.0107, -0.7610], grad_fn=<AddBackward0>),
  (0,
   4): tensor([-0.7055, -1.1531, -1.2661,  0.3965, -1.4057], grad_fn=<AddBackward0>),
  (1,
   0): tensor([4.0006, 1.5593, 9.0101, 4.2566, 2.1651], grad_fn=<AddBackward0>),
  (1,
   1): tensor([2.2416, 0.6235, 4.4566, 3.7954, 0.6844], grad_fn=<AddBackward0>),
  (1,
   2): tensor([ 1.0989, -0.1655,  1.9588,  2.6228, -0.2744], grad_fn=<AddBackward0>),
  (1,
   3): tensor([-0.0282, -0.8723,  0.3250,  1.1840, -0.9978], grad_fn=<AddBackward0>),
  (1,
   4): tensor([-1.3168, -1.7177, -1.2419, -0.8110, -1.7165], grad_fn=<AddBackward0>),
  (2,
   0): tensor([1.7854, 0.2664, 4.6914, 1.8014, 0.5645], grad_fn=<AddBackward0>),
  (2,
  

In [None]:
df = pd.DataFrame({
    'TupleArray': [str(tup) for tup in agent.replay_buffer.buffer],
    'FloatArray': agent.replay_buffer.weights
})
pd.set_option('display.max_rows', 1000)  # Show all rows
print(df)

                                TupleArray    FloatArray
0    ((2.0, 0.0), 3, 0, (2.0, 0.0), False)  6.315411e-03
1    ((2.0, 0.0), 1, 0, (3.0, 0.0), False)  1.443802e-03
2    ((3.0, 0.0), 0, 0, (3.0, 0.0), False)  4.462130e-03
3    ((3.0, 0.0), 2, 0, (2.0, 0.0), False)  9.939890e-02
4    ((2.0, 0.0), 4, 0, (2.0, 1.0), False)  2.092810e-02
5    ((2.0, 1.0), 2, 0, (1.0, 1.0), False)  2.165461e-01
6    ((1.0, 1.0), 1, 0, (2.0, 1.0), False)  2.347920e-02
7    ((2.0, 1.0), 4, 0, (2.0, 2.0), False)  7.004913e-02
8    ((2.0, 2.0), 2, 0, (1.0, 2.0), False)  1.382135e-01
9    ((1.0, 2.0), 3, 0, (1.0, 1.0), False)  9.430752e-02
10   ((1.0, 1.0), 1, 0, (2.0, 1.0), False)  1.093166e-02
11   ((2.0, 1.0), 2, 0, (1.0, 1.0), False)  1.907932e-01
12   ((1.0, 1.0), 2, 0, (0.0, 1.0), False)  2.492300e-03
13   ((0.0, 1.0), 0, 0, (0.0, 1.0), False)  8.034585e-05
14   ((0.0, 1.0), 2, 0, (0.0, 1.0), False)  1.413989e-01
15   ((0.0, 1.0), 3, 10, (0.0, 0.0), True)  2.229804e-01
16   ((2.0, 2.0), 0, 0, (2.0, 2

In [None]:
# for episode_number in range(EPISODES_PER_RUN):
            
#     environment.env.reset()
#     state = environment.get_state()

#     i = 0
#     done = False
#     run_reward = 0
#     while not done and i < REPLAY_BUFFER_BATCH_SIZE:
#         i += 1
#         action = agent.get_next_action(state, evaluation_episode=0)
#         next_state = state
#         while np.sum(next_state - state) < .5: 
#             reward, next_state, done = environment.step(action)
#             action = agent.get_next_action(state, evaluation_episode=0)
            
#         print(f'next state {next_state}')
#         if reward:
#             run_reward = run_reward + 1 
#         sample_batch = agent.test_on_transition(state, action, next_state, reward, done)
#         state = next_state

#     wandb.log({"runs wins": run_reward})
#     run_reward = 0

next state [6.5        0.49999988 1.4999999 ]
test net
size 1
next state [7.5        0.49999988 1.4999996 ]
test net
size 2
next state [7.5, 0.5, 2.5]
test net
size 3
next state [4.5 0.5 3.5]
test net
size 4
next state [7.5 0.5 1.5]
test net
size 5
next state [5.5, 0.5, 4.5]
test net
size 6
next state [7.5        0.49999985 0.49999994]
test net
size 7
next state [7.5 0.5 1.5]
test net
size 8
next state [8.5        0.49999988 1.4999999 ]
test net
size 9
next state [9.5       0.4999999 1.4999998]
test net
size 10
next state [9.5       0.5       2.4999998]
test net
size 11
next state [9.5       0.5       3.4999998]
test net
size 12


KeyboardInterrupt: 

In [None]:
states = torch.tensor([
    [5.5, .5, 1.5], #1
    [6.5, .5, 1.5]]) #3
print(agent.critic_local(states))
print(agent.critic_local2(states))

tensor([[ 0.4050, -0.5796,  0.1590,  0.1732,  0.0471],
        [ 0.4679, -0.6717,  0.2335,  0.2081,  0.0784]],
       grad_fn=<AddmmBackward>)
tensor([[ 0.0737,  0.2095,  0.4959,  0.1785, -0.1343],
        [ 0.0968,  0.1782,  0.5813,  0.1786, -0.1678]],
       grad_fn=<AddmmBackward>)


In [None]:
agent.replay_buffer.sample_minibatch(BUFFER_SAMPLE_SIZE)

array([([8.5       , 0.5       , 5.4999986 ], 1, 0., [9.5       , 0.5       , 5.4999986 ], False),
       ([4.5       , 0.5       , 3.5       ], 3, 0., [7.5       , 0.5       , 1.5       ], False),
       ([6.5       , 0.49999988, 1.4999999 ], 3, 0., [7.5       , 0.49999988, 1.4999996 ], False),
       ([4.5       , 0.5       , 3.5       ], 3, 0., [7.5       , 0.5       , 1.5       ], False),
       ([9.5       , 0.4999999 , 1.4999998 ], 1, 0., [9.5       , 0.5       , 2.4999998 ], False),
       ([5.5       , 0.5       , 1.5       ], 3, 0., [7.5       , 0.49999985, 0.49999994], False),
       ([7.5       , 0.49999985, 0.49999994], 2, 0., [7.5       , 0.5       , 1.5       ], False),
       ([9.5       , 0.5       , 2.4999998 ], 3, 0., [9.5       , 0.5       , 3.4999998 ], False),
       ([5.5       , 0.5       , 1.5       ], 3, 0., [6.5       , 0.49999988, 1.4999999 ], False),
       ([9.5       , 0.5       , 3.4999998 ], 2, 0., [8.5       , 0.5       , 5.4999986 ], False),
       ([7

In [None]:
states_tensor = torch.tensor([[6.5, 0.49999988, 1.499999]])
actions_tensor = torch.tensor([3])
rewards_tensor = torch.tensor([1.])
next_states_tensor = torch.tensor([[6.5, 0.5, 2.5]])
done_tensor = torch.tensor([True])
l = agent.critic_loss(states_tensor, actions_tensor, rewards_tensor, next_states_tensor, done_tensor)
print(l)

states_tensor = torch.tensor([[5.5, 0.49999988, 1.499999]])
actions_tensor = torch.tensor([1])
rewards_tensor = torch.tensor([0.])
next_states_tensor = torch.tensor([[6.5, 0.5, 1.5]])
done_tensor = torch.tensor([False])
l = agent.critic_loss(states_tensor, actions_tensor, rewards_tensor, next_states_tensor, done_tensor)
print(l)

(tensor(0.6271, grad_fn=<MeanBackward0>), tensor(0.6747, grad_fn=<MeanBackward0>))
(tensor(0.2597, grad_fn=<MeanBackward0>), tensor(0.0781, grad_fn=<MeanBackward0>))


In [None]:
df = pd.DataFrame({
    'TupleArray': [str(tup) for tup in agent.replay_buffer.buffer],
    'FloatArray': agent.replay_buffer.weights
})
pd.set_option('display.max_rows', 1000)  # Show all rows

In [None]:
print(df)

                                            TupleArray  FloatArray
0    ([5.5, 0.5, 1.5], 3, 0., [6.5       , 0.499999...    0.078098
1    ([6.5       , 0.49999988, 1.4999999 ], 3, 0., ...    0.078098
2    ([7.5       , 0.49999988, 1.4999996 ], 1, 1., ...    0.078098
3     ([5.5, 0.5, 1.5], 2, 0., [4.5, 0.5, 3.5], False)    0.078098
4     ([4.5, 0.5, 3.5], 3, 0., [7.5, 0.5, 1.5], False)    0.078098
5      ([7.5, 0.5, 1.5], 4, 1., [5.5, 0.5, 4.5], True)    0.078098
6    ([5.5, 0.5, 1.5], 3, 0., [7.5       , 0.499999...    0.078098
7    ([7.5       , 0.49999985, 0.49999994], 2, 0., ...    0.078098
8    ([7.5, 0.5, 1.5], 2, 0., [8.5       , 0.499999...    0.078098
9    ([8.5       , 0.49999988, 1.4999999 ], 2, 0., ...    0.078098
10   ([9.5      , 0.4999999, 1.4999998], 1, 0., [9....    0.078098
11   ([9.5      , 0.5      , 2.4999998], 3, 0., [9....    0.078098
12   ([9.5      , 0.5      , 3.4999998], 2, 0., [8....    0.078098
13   ([8.5      , 0.5      , 5.4999986], 1, 0., [9....    0.07

In [None]:
driver_config = {'unity_environment': 'C:\\Users\\rmarr\\Documents\\ml-agents-dodgeball-env-ICT',
                'time_scale': 1.0}
environment = Env(driver_config)
for run in range(RUNS):
    agent = SACAgent(environment)
    run_results = []
    run_reward = 0
    for episode_number in range(EPISODES_PER_RUN):
        #print('\r', f'Run: {run + 1}/{RUNS} | Episode: {episode_number + 1}/{EPISODES_PER_RUN}', end=' ')
        evaluation_episode = episode_number % TRAINING_EVALUATION_RATIO == 0
            
        environment.env.reset()
        state = environment.get_state()

        done = False
        i = 0
        while not done and i < STEPS_PER_EPISODE:
            i += 1
            action = agent.get_next_action(state, evaluation_episode=evaluation_episode)
            reward, next_state, done = environment.step(action)
            if reward:
                run_reward = run_reward + 1 
            if not evaluation_episode:
                agent.train_on_transition(state, action, next_state, reward, done)
            state = next_state

        wandb.log({"runs wins": run_reward})
        run_reward = 0

UnityWorkerInUseException: Couldn't start socket communication because worker number 0 is still in use. You may need to manually close a previously opened environment or use a different worker number.

In [None]:
driver_config = {'unity_environment': 'C:\\Users\\rmarr\\Documents\\ml-agents-dodgeball-env-ICT',
                'time_scale': 1.0}
environment = Env(driver_config)
agent = SACAgent(environment)
while True:
    agent.test_networks(transition)

indicies None
prediction errros [0.002217013854533434, 0.016281459480524063, 0.009729351848363876, 0.009729351848363876, 0.009729360230267048, 0.009729351848363876, 0.016255538910627365, 0.025095757097005844, 0.000764631200581789, 0.05348189175128937, 0.0097293546423316, 0.016281459480524063, 9.231052899849601e-06, 0.009729360230267048, 0.03596899285912514, 0.009729360230267048, 0.016255538910627365, 0.000764631200581789, 0.025502944365143776, 0.05348189175128937, 0.016255538910627365, 0.025502944365143776, 0.009729351848363876, 0.016281459480524063, 0.0097293546423316, 0.009729351848363876, 0.03650720417499542, 0.03596899285912514, 4.748810169985518e-05, 0.009729351848363876, 0.003392907092347741, 0.03596899285912514, 0.000764631200581789, 0.016281459480524063, 0.009729351848363876, 0.0097293546423316, 0.005932152271270752, 9.231052899849601e-06, 0.03596899285912514, 0.009729360230267048, 0.022140543907880783, 4.748810169985518e-05, 0.009729360230267048, 0.016255538910627365, 0.003392

ValueError: could not broadcast input array from shape (108,) into shape (1,5000)

In [None]:
while True:
    agent.test_networks(transition)

NameError: name 'agent' is not defined

In [None]:
print(agent.critic_local(torch.tensor([6.5, 0.5, 1.5])))
print(agent.critic_local2(torch.tensor([6.5, 0.5, 1.5])))

tensor([-5.8553, -4.0970,  1.4242, 74.7129,  0.9605], grad_fn=<AddBackward0>)
tensor([ 3.9447,  5.5563,  6.1680, 44.5385,  4.8056], grad_fn=<AddBackward0>)
