# Training an intelligent agent
---
This notebook shows how to train an intelligent agent in the Tennis environment.

In [1]:
from unityagents import UnityEnvironment
import numpy as np
import random

In [2]:
from collections import deque
import json
import os
import sys

Load the tennis environment.

In [3]:
# launch the Unity Tennis environment from the executable at src/exec/Tennis.app
env = UnityEnvironment(file_name="src/exec/Tennis.app")

# the first (default) brain is the one responsible for deciding agent actions
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# sizes of the action and observation vectors as reported by the brain
# NOTE(review): the value printed for the state size is 8, while the
# environment log reports 3 stacked observations and agentParams uses a
# state_size of 24 - presumably 8 x 3; confirm before reusing `state_size`.
action_size = brain.vector_action_space_size
state_size = brain.vector_observation_space_size

# reset in training mode and count the agents present in the environment
env_info = env.reset(train_mode=True)[brain_name]
n_agents = len(env_info.agents)

print("Number of agents:", n_agents)
print("Action size:", action_size)
print("State size:", state_size)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Number of agents: 2
Action size: 2
State size: 8


## Train the agents using DDPG

In [4]:
from src.ddpg import AgentDDPG

Set up the model with hyperparameters and the folder in which to save model files (scores, plots, parameter checkpoints).

In [5]:
# Hyperparameters of the run; the dictionary is unpacked into keyword
# arguments of AgentDDPG and written to params.json.
agentParams = dict(
    # architectures of the actor and the critic
    actor_arch=[256, 128],
    critic_arch=[256, 128],
    # problem dimensions
    # NOTE(review): 24 is presumably 8 observations x 3 stacked frames, as
    # listed in the environment log - confirm
    action_size=2,
    state_size=24,
    # buffer and batch sizes
    buffer_size=int(1e5),
    batch_size=512,
    # learning rates
    lr_actor=1e-4,
    lr_critic=1e-3,
    # gamma and tau
    gamma=0.99,
    tau=1e-3,
    # noise settings
    noise_mu=0.0,
    noise_sigma=0.2,
    noise_decay=1.0,
    noise_min_sigma=0.01,
    noise_theta=0.15,
    # weight decay
    weight_decay_critic=0.0,
    weight_decay_actor=0.0,
    # update scheduling and gradient clipping
    soft_update_freq=1,
    hard_update_at_t=-1,
    gradient_clipping=False,
)
# directory that receives the parameter file, the scores and the checkpoints
folder = "01_standard_ddpg"

First, create all necessary folders and save the model parameters.

In [6]:
print("Setting up model in folder:", folder)
# Create the run folder together with its "solved" and "end" sub-folders.
# exist_ok=True keeps the cell re-runnable and also creates a missing
# sub-folder when the run folder itself is already present.
for run_dir in (folder,
                os.path.join(folder, "solved"),
                os.path.join(folder, "end")):
    os.makedirs(run_dir, exist_ok=True)
# save parameter file (serialised before the file is opened, so a failure to
# serialise does not truncate an existing params.json)
d = json.dumps(agentParams, indent=2)
with open(os.path.join(folder, "params.json"), "w") as f:
    f.write(d)

Setting up model in folder: 01_standard_ddpg


Create and set up the DDPG agent.

In [7]:
# seed handed to AgentDDPG and the number of training episodes to run
# NOTE: the solved check in the training loop only fires after at least 100
# episodes, so a 20-episode run can never report the environment as solved.
seed, n_episodes = 123, 20

In [8]:
# name of the default brain, used to index the environment's responses
brain_name = env.brain_names[0]
# build the DDPG agent from the environment, the seed and the hyperparameters
agent = AgentDDPG(env, seed, **agentParams)

Run the training!

In [9]:
# Training loop.
# `scores` keeps the score of every episode; `scores_window` keeps only the
# most recent 100, which is the window the "solved" check averages over.
scores = []
scores_window = deque(maxlen=100)
solved_env = False
for i_episode in range(1, n_episodes+1):
    # per-episode hook of the agent (resets its noise process - TODO confirm
    # against AgentDDPG.episode_step)
    agent.episode_step()
    # reset environment
    env_info = env.reset(train_mode=True)[brain_name]
    # get current state (for each agent)
    states = env_info.vector_observations
    # initialize score (for each agent)
    i_scores = np.zeros(n_agents)
    while True:
        # select action (for each agent)
        actions = agent.act(states)
        # execute actions
        env_info = env.step(actions)[brain_name]
        # get next state, reward, done (for each agent)
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        # learning step: hand one transition per agent to the shared learner
        for i_agent in range(n_agents):
            agent.step(states[i_agent], actions[i_agent], rewards[i_agent],
                       next_states[i_agent], dones[i_agent])
        # update scores and states (for each agent)
        i_scores += rewards
        states = next_states
        # the episode ends as soon as any agent reports done
        if np.any(dones):
            break
    # the episode score is the higher of the agents' accumulated rewards
    score = np.max(i_scores)
    # save this episode's score
    scores.append(score)
    scores_window.append(score)
    # save the scores up until now; scores.npy is rewritten every episode,
    # scores.txt is opened in append mode and therefore also keeps lines
    # written by earlier runs into the same folder
    np.save(os.path.join(folder, "scores.npy"), scores)
    with open(os.path.join(folder, "scores.txt"), "a") as f:
        f.write("{:03} {}\n".format(i_episode, score))
    # print episode (the line is overwritten in place, and kept every 100 episodes)
    print('\repisode {}\t score: {:.4f}\taverage: {:.4f}'.format(
        i_episode, score, np.mean(scores_window)
    ), end="\n" if i_episode % 100 == 0 else "")
    sys.stdout.flush()

    # save the model every 100 episodes
    if i_episode % 100 == 0:
        agent.save(os.path.join(folder, "params_{}".format(i_episode)))

    # check if solved: the average over a full window of 100 consecutive
    # episodes must be at least 0.5; only the first time is recorded
    window_full = len(scores_window) == scores_window.maxlen
    if not solved_env and window_full and np.mean(scores_window) >= 0.5:
        print("\nsolved environment!")
        solved_env = True
        agent.save(os.path.join(folder, "solved"))

# always store the final parameters, whether or not the environment was solved
agent.save(os.path.join(folder, "end"))

episode 20	 score: 0.0000	average: 0.0100

Close the environment when done.

In [10]:
# shut down the Unity environment that was started at the top of the notebook
env.close()