# Continuous Control


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch

from unityagents import UnityEnvironment
from agent import Agent
from ddpg_learning import ddpg


In [None]:
# --- Environment setup ---
# Start the Unity environment from the "Reacher_20.app" build.
env = UnityEnvironment(file_name="Reacher_20.app")

# Use the first brain exposed by the environment as the default one.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset in training mode and keep the info object for the selected brain.
env_info = env.reset(train_mode=True)[brain_name]

# Problem dimensions, read from the reset info and from the brain.
num_agents = len(env_info.agents)                   # number of agents reported
state_size = len(env_info.vector_observations[0])   # length of one observation
action_size = brain.vector_action_space_size        # size of the action vector


In [None]:
# --- Hyperparameters ---
BUFFER_SIZE = int(1e5)   # capacity of the replay buffer
BATCH_SIZE = 1024        # transitions per minibatch
GAMMA = 0.99             # discount factor
TAU = 1e-3               # rate of the soft update of the target parameters
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
UPDATE_EVERY = 2         # presumably: learn once every N steps -- confirm in agent.py

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Build the agent from the environment dimensions and the settings above.
agent = Agent(
    state_size=state_size,
    action_size=action_size,
    num_agents=num_agents,
    buffer_size=BUFFER_SIZE,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    tau=TAU,
    learning_rate_actor=LR_ACTOR,
    learning_rate_critic=LR_CRITIC,
    device=device,
    update_every=UPDATE_EVERY,
    random_seed=42,
)


In [None]:
# Threshold handed to the training routine as `average_score_solved`.
AVERAGE_SCORE_SOLVED = 30.0

# Run DDPG training. Judging by the names, the call returns the score history
# and the number of episodes needed to solve the task -- confirm in
# ddpg_learning.py.
scores, num_episodes_solved = ddpg(
    env=env,
    agent=agent,
    num_agents=num_agents,
    average_score_solved=AVERAGE_SCORE_SOLVED,
)


In [None]:
# Plot the training session (scores per episode).

def plot_scores(scores):
    """Draw `scores` as a line chart on the current axes and show the figure.

    Args:
        scores: Sequence of values plotted in order against their index;
            the x-axis is labelled as the episode number.
    """
    axes = plt.gca()
    axes.plot(scores, color='royalblue')
    axes.set_title('Scores per episode')
    axes.set_ylabel('score')
    axes.set_xlabel('episode #')
    plt.show()


plot_scores(scores)


In [None]:
# Show the current contents of `scores` as this cell's output.
scores

In [None]:
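# Tuning ideas:
# - skip learning on some timesteps instead of learning at every timestep
# - change the batch size (increase it)
# - use a smaller replay buffer
# - change the size of the networks (smallest possible)
# - change the noise: reduce sigma to 0.2 | use random noise for each set of actions
# - add dropout | BatchNorm1d
# - change the learning rates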

# TODO: add comments to the code
# TODO: add a report and a README

In [None]:
# See the trained agent in action: run one episode without exploration noise
# and report the mean of the per-agent totals.
env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
states = env_info.vector_observations               # get the current state

# Uncomment these lines to use a saved checkpoint.
# `map_location=device` lets a checkpoint saved on a GPU load on a CPU-only machine.
#agent = Agent(state_size=state_size,
#              action_size=action_size,
#              num_agents=num_agents,
#              buffer_size=BUFFER_SIZE,
#              batch_size=BATCH_SIZE,
#              gamma=GAMMA,
#              tau=TAU,
#              learning_rate_actor=LR_ACTOR,
#              learning_rate_critic=LR_CRITIC,
#              device=device,
#              update_every=UPDATE_EVERY,
#              random_seed=42)
#agent.actor_local.load_state_dict(torch.load('solved_checkpoint_actor.pth', map_location=device))

steps = 0  # counts iterations of the loop below; inspected in a later cell

# Accumulate into a dedicated array rather than rebinding `scores`: the
# training cell stores its score history under that name, and the plotting
# cell reads it, so reusing the name here would silently replace that history
# with one total per agent.
eval_scores = np.zeros(num_agents)
while True:
    steps += 1
    actions = agent.act(states, add_noise=False)   # select an action (no noise)
    env_info = env.step(actions)[brain_name]       # send the action to the environment
    next_states = env_info.vector_observations     # get the next state
    rewards = env_info.rewards                     # get the reward
    dones = env_info.local_done                    # see if episode has finished
    states = next_states                           # roll over the state to next time step
    eval_scores += rewards                         # update the running totals
    if np.any(dones):                              # stop as soon as any agent is done
        break

print("Average Score: {}".format(np.mean(eval_scores)))

In [None]:
# Show how many iterations the evaluation loop ran.
steps  # original note: 1000 per episode per agent

In [None]:
# Close the Unity environment now that the notebook is done with it.
env.close()

In [None]:
#Network (256) (256, 256, 128)

#BUFFER_SIZE = int(1e6)  # replay buffer size
#BATCH_SIZE = 64        # minibatch size
#GAMMA = 0.99            # discount factor
#TAU = 1e-3              # for soft update of target parameters
#LR_ACTOR = 1e-4         # learning rate of the actor
#LR_CRITIC = 1e-3        # learning rate of the critic

#ONE AGENT
#Episode 100	Average Score: 1.8539999585598708	Score: 1.92999995686113833
#Episode 200	Average Score: 4.243399905152619	Score: 7.33999983593821585
#Episode 300	Average Score: 6.629699852038175	Score: 8.46999981068074755
#Episode 323	Average Score: 7.2578998377732935	Score: 8.1299998182803445

#20 AGENTS
#Episode 100	Average Score: 6.294999859295785	Score: 12.4614997214637755
#Episode 101	Average Score: 6.44864985586144	Score: 15.647999650239944

In [None]:
#Network (128) (128, 64, 32)

#BUFFER_SIZE = int(1e6)  # replay buffer size
#BATCH_SIZE = 256        # minibatch size
#GAMMA = 0.99            # discount factor
#TAU = 1e-3              # for soft update of target parameters
#LR_ACTOR = 1e-4         # learning rate of the actor
#LR_CRITIC = 1e-3        # learning rate of the critic

#Episode 100	Average Score: 12.235059726748619	Score: 25.371499432902784
#Episode 104	Average Score: 13.194844705072235	Score: 23.469499475415795

In [None]:
#Network (128) (128, 64, 32)

#BUFFER_SIZE = int(1e6)  # replay buffer size
#BATCH_SIZE = 256        # minibatch size
#GAMMA = 0.99            # discount factor
#TAU = 1e-3              # for soft update of target parameters
#LR_ACTOR = 1e-4         # learning rate of the actor
#LR_CRITIC = 1e-3        # learning rate of the critic

#update_every=2
#different noise for each agent

#Episode 100	Average Score: 9.29	Score: 24.03
#Episode 178	Average Score: 30.01	Score: 36.45
#Environment solved in 78 episodes!	Average Score: 30.01


In [None]:
#Network (128) (128, 64, 32)

#BUFFER_SIZE = int(1e6)  # replay buffer size
#BATCH_SIZE = 512        # minibatch size
#GAMMA = 0.99            # discount factor
#TAU = 1e-3              # for soft update of target parameters
#LR_ACTOR = 1e-4         # learning rate of the actor
#LR_CRITIC = 1e-3        # learning rate of the critic

#update_every=2
#different noise for each agent

#Episode 50	Average Score: 3.01	Score: 8.46
#Episode 100	Average Score: 8.71	Score: 16.27
#Episode 150	Average Score: 18.03	Score: 26.12
#Episode 200	Average Score: 24.41	Score: 31.73
#Episode 249	Average Score: 30.02	Score: 33.05
#Environment solved in 149 episodes!	Average Score: 30.02


In [None]:
#Network (128) (128, 64, 32)

#BUFFER_SIZE = int(1e6)  # replay buffer size
#BATCH_SIZE = 1024        # minibatch size
#GAMMA = 0.99            # discount factor
#TAU = 1e-3              # for soft update of target parameters
#LR_ACTOR = 1e-4         # learning rate of the actor
#LR_CRITIC = 1e-3        # learning rate of the critic

#update_every=2
#different noise for each agent

#Episode 50	Average Score: 6.35	Score: 16.11
#Episode 100	Average Score: 16.67	Score: 34.72
#Episode 142	Average Score: 30.06	Score: 37.12
#Environment solved in 42 episodes!	Average Score: 30.06

In [None]:
#Network (128) (128, 64, 32)

#BUFFER_SIZE = int(1e5)  # replay buffer size
#BATCH_SIZE = 1024        # minibatch size
#GAMMA = 0.99            # discount factor
#TAU = 1e-3              # for soft update of target parameters
#LR_ACTOR = 1e-4         # learning rate of the actor
#LR_CRITIC = 1e-3        # learning rate of the critic

#update_every=2
#different noise for each agent

#Episode 50	Average Score: 14.23	Score: 36.97
#Episode 100	Average Score: 25.06	Score: 37.03
#Episode 115	Average Score: 30.26	Score: 35.87
#Environment solved in 15 episodes!	Average Score: 30.26

In [None]:
#Network (128) (128, 64, 32)

#BUFFER_SIZE = int(1e5)  # replay buffer size
#BATCH_SIZE = 1024        # minibatch size
#GAMMA = 0.99            # discount factor
#TAU = 1e-3              # for soft update of target parameters
#LR_ACTOR = 1e-4         # learning rate of the actor
#LR_CRITIC = 1e-3        # learning rate of the critic

#update_every=2
#different noise for each agent
#add batch norm (more stable?)

#Episode 50	Average Score: 9.10	Score: 26.58
#Episode 100	Average Score: 19.40	Score: 26.75
#Episode 140	Average Score: 30.01	Score: 32.48
#Environment solved in 40 episodes!	Average Score: 30.01

In [None]:
#BEST?
#Network (128, 64) (128, 64, 32)

#BUFFER_SIZE = int(1e5)  # replay buffer size
#BATCH_SIZE = 1024        # minibatch size
#GAMMA = 0.99            # discount factor
#TAU = 1e-3              # for soft update of target parameters
#LR_ACTOR = 1e-4         # learning rate of the actor
#LR_CRITIC = 1e-3        # learning rate of the critic

#update_every=2
#different noise for each agent
#add batch norm (more stable?)

#Episode 50	Average Score: 15.79	Score: 34.08
#Episode 100	Average Score: 23.49	Score: 27.50
#Episode 122	Average Score: 30.18	Score: 31.47
#Environment solved in 22 episodes!	Average Score: 30.18

#Episode 50	Average Score: 16.11	Score: 34.15
#Episode 100	Average Score: 24.06	Score: 33.36
#Episode 118	Average Score: 30.07	Score: 35.69
#Environment solved in 18 episodes!	Average Score: 30.07

In [None]:
#Network (64, 32) (64, 32, 16)
#score went down - unstable 

#BUFFER_SIZE = int(1e5)  # replay buffer size
#BATCH_SIZE = 1024        # minibatch size
#GAMMA = 0.99            # discount factor
#TAU = 1e-3              # for soft update of target parameters
#LR_ACTOR = 1e-4         # learning rate of the actor
#LR_CRITIC = 1e-3        # learning rate of the critic

#update_every=2
#different noise for each agent
#add batch norm (more stable?)

#unstable

In [None]:
#Network (128, 64) (128, 64, 32)

#BUFFER_SIZE = int(1e5)  # replay buffer size
#BATCH_SIZE = 1024        # minibatch size
#GAMMA = 0.99            # discount factor
#TAU = 1e-3              # for soft update of target parameters
#LR_ACTOR = 1e-3         # learning rate of the actor
#LR_CRITIC = 1e-3        # learning rate of the critic

#update_every=2
#different noise for each agent
#add batch norm (more stable?)

#Episode 50	Average Score: 11.90	Score: 35.13
#Episode 100	Average Score: 21.94	Score: 33.25
#Episode 131	Average Score: 30.05	Score: 20.80
#Environment solved in 31 episodes!	Average Score: 30.05