In [1]:
import sys

# NOTE(review): this is a machine-specific absolute Windows path. Presumably it
# is what makes the `unityagents` import below resolvable; anyone else running
# the notebook must point it at their own ml-agents `python` directory.
sys.path.append(r'D:\UserData\Z003XD5A\dev\deep_rl_projects\ml-agents\python')

from unityagents import UnityEnvironment
import numpy as np

In [2]:
# Create the Unity environment from the Reacher executable, located relative to
# the notebook's working directory. Later cells reuse this `env` object.
env = UnityEnvironment(file_name='../Reacher_Windows_x86_64/Reacher.exe')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# Use the first brain the environment exposes: `brain_name` indexes the
# per-brain results of env.reset()/env.step(), and `brain` holds its parameters.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [4]:
# Start a fresh episode in training mode and keep only our brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the dimensions that later cells rely on ...
num_agents = len(env_info.agents)               # number of agents
action_size = brain.vector_action_space_size    # size of each action
states = env_info.vector_observations           # one observation row per agent
state_size = states.shape[1]                    # length of a single observation

# ... then report them.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


## DDPG Agent

In [5]:
from collections import deque
from itertools import count
import time
import torch
import matplotlib.pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2

from agent import *

In [9]:
def ddpg(n_episodes=2000, target_score=30.0, score_window=100, checkpoint_every=10):
    """Run the DDPG training loop and return the per-episode mean scores.

    Relies on the notebook-level names ``env``, ``brain_name``, ``num_agents``
    and ``agent`` that earlier cells define.

    Args:
        n_episodes: Maximum number of episodes to run.
        target_score: Training stops as soon as the rolling mean of the
            per-episode scores reaches this value.
        score_window: Number of most recent episodes in the rolling mean.
        checkpoint_every: Save the actor/critic weights every this many
            episodes; a falsy value disables periodic checkpoints.

    Returns:
        list: the mean score over all agents for every episode that was run.
    """
    scores_deque = deque(maxlen=score_window)
    scores_global = []

    for i_episode in range(1, n_episodes + 1):

        env_info = env.reset(train_mode=True)[brain_name]      # reset the environment
        states = env_info.vector_observations                  # get the current state (for each agent)
        scores = np.zeros(num_agents)                          # initialize the score (for each agent)
        agent.reset()

        start_time = time.time()
        for t in count():
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]           # send all actions to the environment
            next_states = env_info.vector_observations         # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
            dones = env_info.local_done                        # see if episode finished
            agent.step(states, actions, rewards, next_states, dones, t)
            states = next_states                               # roll over states to next time step
            scores += rewards                                  # update the score (for each agent)
            if np.any(dones):                                  # exit loop if episode finished
                break

        # Compute each statistic once and reuse it below.
        episode_score = np.mean(scores)
        scores_deque.append(episode_score)
        scores_global.append(episode_score)
        rolling_mean = np.mean(scores_deque)

        print('\rEpisode {}, Average Score: {:.2f}, Runtime: {:.2f}'\
              .format(i_episode, rolling_mean, time.time() - start_time), end="\n")

        if checkpoint_every and i_episode % checkpoint_every == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

        # NOTE: during the first `score_window` episodes the rolling mean covers
        # fewer episodes, so the threshold can be met before a full window exists.
        if rolling_mean >= target_score:
            torch.save(agent.actor_local.state_dict(), 'final_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'final_critic.pth')
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, rolling_mean))
            break

    return scores_global

In [10]:
random_seed = 1234

# Prefer the first CUDA device when one is available, otherwise run on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Hyperparameters, forwarded verbatim to Agent as keyword arguments.
settings = dict(
    buffer_size=int(1e6),
    batch_size=256,
    gamma=0.99,
    tau=1e-3,
    lr_actor=1e-3,
    lr_critic=1e-3,
    weight_decay=0.,
    epsilon=1.,
    epsilon_decay=1e-6,
    num_batch_permute=10,
)

agent = Agent(device, state_size, action_size, random_seed, **settings)

In [11]:
# Train with the default settings and keep the per-episode score history.
scores = ddpg()

# Plot score against 1-based episode number on an explicit axes object.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

Episode 1, Average Score: 0.01, Runtime: 14.42
Episode 2, Average Score: 0.11, Runtime: 14.64
Episode 3, Average Score: 0.23, Runtime: 14.60
Episode 4, Average Score: 0.37, Runtime: 14.40
Episode 5, Average Score: 0.50, Runtime: 14.85
Episode 6, Average Score: 0.54, Runtime: 14.89
Episode 7, Average Score: 0.59, Runtime: 15.88
Episode 8, Average Score: 0.60, Runtime: 15.45
Episode 9, Average Score: 0.63, Runtime: 15.53
Episode 10, Average Score: 0.63, Runtime: 15.85
Episode 11, Average Score: 0.64, Runtime: 16.02
Episode 12, Average Score: 0.67, Runtime: 16.01
Episode 13, Average Score: 0.70, Runtime: 16.17
Episode 14, Average Score: 0.74, Runtime: 16.65
Episode 15, Average Score: 0.76, Runtime: 16.48
Episode 16, Average Score: 0.79, Runtime: 16.98
Episode 17, Average Score: 0.79, Runtime: 16.95
Episode 18, Average Score: 0.83, Runtime: 18.10
Episode 19, Average Score: 0.88, Runtime: 17.42
Episode 20, Average Score: 0.90, Runtime: 18.16
Episode 21, Average Score: 0.94, Runtime: 17.85
E

KeyboardInterrupt: 

In [12]:
# Close the Unity environment now that the notebook is done with it.
env.close()