## Test Tennis Notebook

The essential steps in this notebook are:

* Define Unity environment
* Get the default brain
* Import DDPG Agent
* Load in checkpoint
* Run testing loop

In [1]:
from unityagents import UnityEnvironment
import numpy as np
from ddpg_agent import Agents
import torch
from collections import deque
import matplotlib.pyplot as plt

In [2]:
# Launch the Tennis environment from the local build; both arguments are
# passed straight through to UnityEnvironment.
env = UnityEnvironment(
    file_name="Tennis_Linux_NoVis/Tennis.x86_64",
    worker_id=40,
)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
# Get the default brain: take the first brain name the environment exposes
# and look up the matching brain object (used later for the action-space size).
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [5]:
# --- Replay buffer and sampling ---
BUFFER_SIZE = 1_000_000   # replay buffer size
BATCH_SIZE = 128          # minibatch size

# --- Learning dynamics ---
GAMMA = 0.99              # discount factor
TAU = 0.001               # for soft update of target parameters
LR_ACTOR = 0.0001         # learning rate of the actor
LR_CRITIC = 0.001         # learning rate of the critic
WEIGHT_DECAY = 0.0        # L2 weight decay

# --- Update schedule ---
UPDATE_EVERY = 40         # at every multiple of this value, the actor and critic are updated
NUM_ITERS_LEARN = 20      # number of learning passes run each time an update is triggered

# --- Network shape (shared by the Actor and Critic networks) ---
FC1_UNITS = 600           # hidden units in the first hidden layer
FC2_UNITS = 400           # hidden units in the second hidden layer

# Bundle every hyperparameter into the single dict handed to Agents(config=...).
config = dict(
    batch_size=BATCH_SIZE,
    buffer_size=BUFFER_SIZE,
    gamma=GAMMA,
    tau=TAU,
    lr_actor=LR_ACTOR,
    lr_critic=LR_CRITIC,
    weight_decay=WEIGHT_DECAY,
    update_every=UPDATE_EVERY,
    num_iters_learn=NUM_ITERS_LEARN,
    fc1_units=FC1_UNITS,
    fc2_units=FC2_UNITS,
)

In [7]:
import os

# Reset the environment (train_mode=False) so its dimensions can be read
env_info = env.reset(train_mode=False)[brain_name]

# number of agents
num_agents = len(env_info.agents)

# size of each action
action_size = brain.vector_action_space_size

# size of each observation (second axis of the observation array)
states = env_info.vector_observations
state_size = states.shape[1]

# Initialise Agents
agent = Agents(state_size=state_size, action_size=action_size, num_agents=num_agents, random_seed=1234, config=config)

# Also load in checkpoints.
# map_location='cpu' lets checkpoints that were saved from a CUDA device be
# deserialized on a machine without a GPU; load_state_dict then copies the
# tensors into the networks' existing parameters.
checkpoint_dir = 'checkpoints_tennis'
actor_state = torch.load(os.path.join(checkpoint_dir, 'checkpoint_actor.pth'), map_location='cpu')
critic_state = torch.load(os.path.join(checkpoint_dir, 'checkpoint_critic.pth'), map_location='cpu')
agent.actor_local.load_state_dict(actor_state)
agent.critic_local.load_state_dict(critic_state)

Using user-defined parameters
{'batch_size': 128,
 'buffer_size': 1000000,
 'fc1_units': 600,
 'fc2_units': 400,
 'gamma': 0.99,
 'lr_actor': 0.0001,
 'lr_critic': 0.001,
 'num_iters_learn': 20,
 'tau': 0.001,
 'update_every': 40,
 'weight_decay': 0.0}


<All keys matched successfully>

In [8]:
## Loop to iterate over a single episode
# Plays one episode with the loaded agent, accumulating each agent's reward,
# and stops as soon as any agent reports that the episode is done.
# NOTE(review): if Agents.act adds exploration noise by default, it should be
# disabled for evaluation - confirm against ddpg_agent.
env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
num_iter = 0
while True:
    num_iter += 1
    actions = agent.act(states)                        # select an action
    actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
    env_info = env.step(actions)[brain_name]           # send all actions to the environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = np.array(env_info.rewards)               # get reward (for each agent)
    # Builtin `bool` instead of the `np.bool` alias, which was removed in NumPy 1.24
    dones = np.array(env_info.local_done, dtype=bool)  # see if episode finished
    scores += rewards                                  # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if np.any(dones):                                  # exit loop if episode finished
        break

print('Total score (maximum over the two agents) this episode: {}'.format(np.max(scores)))
print(f'Number of iterations: {num_iter}')

Total score (maximum over the two agents) this episode: 2.600000038743019
Number of iterations: 1001
