In [1]:
import numpy as np
import time
import pickle
import torch

from unityagents import UnityEnvironment
from collections import deque
from ddpg_agent import Agent

In [2]:
env = UnityEnvironment(file_name='unity/Reacher_Linux_NoVis/Reacher.x86_64')
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [4]:
def save(param, agent, score_list, avg_score_list, i_episode, current_score):
    BATCH_SIZE = param["BATCH_SIZE"]
    LR_ACTOR = param["LR_ACTOR"] / 1e-4
    LR_CRITIC = param["LR_CRITIC"] / 1e-4
    with open('score_log/score_{}_{:.0f}_{:.0f}_{}_{:.0f}.pk'.format(BATCH_SIZE,LR_ACTOR,LR_CRITIC,i_episode, current_score), 'wb') as f:
        pickle.dump(score_list, f)
    with open('score_log/avg_score_{}_{:.0f}_{:.0f}_{}_{:.0f}.pk'.format(BATCH_SIZE,LR_ACTOR,LR_CRITIC,i_episode, current_score), 'wb') as f:
        pickle.dump(score_list, f)
    torch.save(agent.actor_local.state_dict(), 'pytorch_weight/checkpoint_actor_{}_{:.0f}_{:.0f}_{}_{:.0f}.pt'.format(BATCH_SIZE,LR_ACTOR,LR_CRITIC,i_episode, current_score))
    torch.save(agent.critic_local.state_dict(), 'pytorch_weight/checkpoint_critic_{}_{:.0f}_{:.0f}_{}_{:.0f}.pt'.format(BATCH_SIZE,LR_ACTOR,LR_CRITIC,i_episode, current_score))

In [5]:
def ddpg(env, env_info, agent, num_agents, param, n_episodes=250, max_t=700):  
    scores_deque = deque(maxlen=100)
    score_list = []
    avg_score_list = []
    max_score = -np.Inf
    for i_episode in range(1, n_episodes+1):
        starting_time = time.time()
        env_info = env.reset(train_mode=True)[brain_name]     # reset the environment    
        states = env_info.vector_observations                  # get the current state (for each agent)
        agent.reset()
        scores = np.zeros(num_agents) 
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]           # send all actions to tne environment
            next_states = env_info.vector_observations         # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            dones = env_info.local_done                        # see if episode finished
            scores += env_info.rewards                         # update the score (for each agent)
            agent.step(states, actions, rewards, next_states, dones, t)
            states = next_states
            scores += rewards
            if np.any(dones):                                  # exit loop if episode finished
                break

        proc_time = (time.time()-starting_time)/60

        score = np.mean(scores)
        scores_deque.append(score)
        score_list.append(score)
        avg_score_list.append(np.mean(scores_deque))
        print('\rEpisode {}\tAverage Score: {:.2f} proc_time {:.2f}m'.format(i_episode, np.mean(scores_deque),proc_time), end="")
        if(i_episode % 10 == 0):
            #save(param, agent, score_list, avg_score_list, i_episode, np.mean(scores_deque))
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))   
        if(np.mean(scores_deque) >= 30):
            save(param, agent, score_list, avg_score_list, i_episode, np.mean(scores_deque))
            print('\rProblem is solved at Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            break
    return score_list, avg_score_list

In [6]:
LR = 1e-4
param = {
    "BUFFER_SIZE" : int(1e6),  # replay buffer size
    "BATCH_SIZE" : 128,        # minibatch size
    "GAMMA" : 0.99,            # discount factor
    "TAU" : 1e-3,              # for soft update of target parameters
    "LR_ACTOR" : LR,         # learning rate of the actor 
    "LR_CRITIC" : LR,        # learning rate of the critic
    "WEIGHT_DECAY" : 0.0000,   # L2 weight decay
    "LEARNING_REPEAT" : 20,     # updating the actor and critic networks LEARNING_REPEAT times
    "LEARNING_STEP" : 20        # at every LEARNING_STEP
}

In [None]:
for BATCH_SIZE in [128, 256, 512]:
    for LEARNING_REPEAT in [5, 10, 20]:
        for i in range(-1,2):
            param["BATCH_SIZE"] = BATCH_SIZE
            param["LEARNING_REPEAT"] = LEARNING_REPEAT
            param["LR_ACTOR"] = LR+(LR*(10**i))
            param["LR_CRITIC"] = LR+(LR*(10**i))
            agent = Agent(state_size, action_size, num_agents, param, 10)
            ddpg(env, env_info, agent, num_agents, param)
            del agent

Episode 10	Average Score: 0.18 proc_time 0.08m
Episode 14	Average Score: 0.14 proc_time 0.08m

In [None]:
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))

env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
while True:
    actions = agent.act(states)                        # select an action (for each agent)
    env_info = env.step(actions)[brain_name]           # send all actions to tne environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    scores += env_info.rewards                         # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if np.any(dones):                                  # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

In [4]:
env.close()