### Continuous Control

---

You are welcome to use this coding environment to train your agent for the project.  Follow the instructions below to get started!

### 1. Start the Environment

Run the next code cell to install the required packages. The cell may take a few minutes to finish.

In [1]:
# Shell escape: quietly (-q) install the Python package located in the local ./python directory.
!pip -q install ./python

In [2]:
from unityagents import UnityEnvironment
import numpy as np
# Location of the Reacher executable (the "NoVis" build, presumably the
# headless variant - confirm against the workspace docs).
reacher_binary = '/data/Reacher_Linux_NoVis/Reacher.x86_64'

# Launch the Unity environment from that executable.
env = UnityEnvironment(file_name=reacher_binary)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# Use the first brain listed by the environment as the default one and
# keep both its name and its parameter object for later cells.
default_brain_index = 0
brain_name = env.brain_names[default_brain_index]
brain = env.brains[brain_name]

In [4]:
# Reset the environment exactly once (training mode) and keep the info for the
# default brain.  A second, discarded reset would leave `env_info`/`states`
# describing an episode the environment is no longer in.
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space: one row of observations per agent
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [  0.00000000e+00  -4.00000000e+00   0.00000000e+00   1.00000000e+00
  -0.00000000e+00  -0.00000000e+00  -4.37113883e-08   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00  -1.00000000e+01   0.00000000e+00
   1.00000000e+00  -0.00000000e+00  -0.00000000e+00  -4.37113883e-08
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   5.75471878e+00  -1.00000000e+00
   5.55726624e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
  -1.68164849e-01]


In [7]:
# Re-import the local Agent_1 module so that edits made to Agent_1.py after
# its first import are picked up without restarting the kernel.
import importlib
import Agent_1
# reload() is the last expression of the cell, so the reloaded module object is
# echoed as the cell output.  Names previously bound with
# `from Agent_1 import ...` are not rebound by reload(); re-run that import
# afterwards to get the updated definitions.
importlib.reload(Agent_1)

<module 'Agent_1' from '/home/workspace/Agent_1.py'>

In [5]:
from Agent_1 import Agent
# Build the single agent object used by the training loop, sized from the
# state/action dimensions and agent count reported by the environment.
agent = Agent(
    state_size=state_size,
    action_size=action_size,
    seed=2,
    num_agents=num_agents,
)

In [None]:
from collections import deque

import torch

# Threshold that the rolling average score is compared against in ddpg()
# to decide that the environment has been solved.
pass_score = 30
def ddpg(n_episodes = 10000, t_max = 2000, print_every = 100,num_agents=20,
         learn_every = 20, learn_updates = 10):
    """Train the module-level `agent` on the module-level Unity `env`.

    Every episode resets the environment, lets the agent act for up to
    `t_max` steps, stores one experience tuple per parallel agent in the
    agent's replay buffer, and periodically triggers learning.

    Relies on these notebook globals: `env`, `brain_name`, `agent`,
    `pass_score`, `np`, `torch` and `deque`.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        t_max (int): maximum number of environment steps per episode
        print_every (int): kept for backward compatibility; currently unused,
            progress is printed after every episode
        num_agents (int): number of parallel agents in the environment
        learn_every (int): call the learning routine on every step `t` where
            `t % learn_every == 0` (must be a positive integer)
        learn_updates (int): number of `agent.step()` calls made each time
            learning is triggered

    Returns
    =======
        list: per-episode score, i.e. the mean over all agents of the summed
        rewards collected in that episode
    """
    scores_deque = deque(maxlen=100)   # rolling window for the "solved" criterion
    scores = []
    for i_episode in range(1, n_episodes+1):

        env_info = env.reset(train_mode = True)[brain_name]
        states = env_info.vector_observations

        agent.reset()
        agent_scores = np.zeros(num_agents)

        for t in range(t_max):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            # Add every agent's transition to the common replay buffer (Attempt #1)
            for i in range(num_agents):
                agent.add_experience(states[i], actions[i], rewards[i], next_states[i], dones[i])
            # Every `learn_every` time steps, perform `learn_updates` learning steps (Attempt #3)
            if t % learn_every == 0:
                for _ in range(learn_updates):
                    agent.step()
            states = next_states
            agent_scores += rewards
            # The episode ends as soon as any agent reports done.
            if np.any(dones):
                break

        episode_score = np.mean(agent_scores)
        scores_deque.append(episode_score)
        scores.append(episode_score)
        average_score = np.mean(scores_deque)
        print('\nEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score), end="")

        # Periodic checkpoint of both networks.
        if i_episode %10 == 0:
            torch.save(agent.actor_local.state_dict(),  'checkpoint_actor.pth' )
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

        # Only declare success once the rolling window is full: before that the
        # average covers fewer than 100 episodes and `i_episode-100` would be
        # negative.
        window_full = len(scores_deque) == scores_deque.maxlen
        if window_full and average_score >= pass_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, average_score))
            # Save the final actor/critic weights.  The previous code saved
            # `agent.qnetwork_local`, an attribute this agent is never seen to
            # expose anywhere else in the notebook.
            torch.save(agent.actor_local.state_dict(),  'checkpoint_actor.pth' )
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            break
    return scores

# matplotlib is imported here because no other cell of the notebook imports
# it; without this the plotting code fails with NameError on a fresh kernel.
import matplotlib.pyplot as plt

# Run training; `scores` holds one mean-over-agents score per episode.
scores = ddpg(num_agents = num_agents)

# plot the scores, numbering episodes from 1 to match the training log
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores) + 1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()


Episode 1	Average Score: 0.76
Episode 2	Average Score: 0.79
Episode 3	Average Score: 0.67
Episode 4	Average Score: 0.78
Episode 5	Average Score: 0.76
Episode 6	Average Score: 0.78
Episode 7	Average Score: 0.82
Episode 8	Average Score: 0.83
Episode 9	Average Score: 0.79
Episode 10	Average Score: 0.73
Episode 11	Average Score: 0.69
Episode 12	Average Score: 0.65
Episode 13	Average Score: 0.68
Episode 14	Average Score: 0.70
Episode 15	Average Score: 0.71
Episode 16	Average Score: 0.72
Episode 17	Average Score: 0.74
Episode 18	Average Score: 0.75
Episode 19	Average Score: 0.73
Episode 20	Average Score: 0.72
Episode 21	Average Score: 0.70
Episode 22	Average Score: 0.71
Episode 23	Average Score: 0.73

In [12]:
# Close the Unity environment now that the notebook is done with it.
env.close()