# Crawler

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch

from mlagents_envs.environment import UnityEnvironment
from gym_unity.envs import UnityToGymWrapper
import numpy as np

from agent import Agent
from ddpg_learning import ddpg

In [2]:
# Initialize the Environment
unity_env = UnityEnvironment(file_name="crawler_static.app")
env = UnityToGymWrapper(unity_env)

# Get the action size
action_size = 20

# Get the state size
state_size = 172

# Get number of agents
num_agents = 1


2021-02-05 17:43:05 INFO [environment.py:111] Connected to Unity environment with package version 1.7.2-preview and communication version 1.3.0
2021-02-05 17:43:05 INFO [environment.py:271] Connected new brain:
CrawlerStatic?team=0


In [3]:
#Initialize the Agent with given hyperparameters

BUFFER_SIZE = int(2e4)  # replay buffer size
BATCH_SIZE = 256       # batch size
GAMMA = 0.99            # discount factor
TAU = 5e-2              # for soft update of target parameters
LR_ACTOR = 5e-4         # learning rate of the actor
LR_CRITIC = 5e-3        # learning rate of the critic
UPDATE_EVERY = 1        # how often to update the network

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #Torch device to use

agent = Agent(state_size=state_size,
              action_size=action_size,
              num_agents=num_agents,
              buffer_size=BUFFER_SIZE,
              batch_size=BATCH_SIZE,
              gamma=GAMMA,
              tau=TAU,
              learning_rate_actor=LR_ACTOR,
              learning_rate_critic=LR_CRITIC,
              device=device,
              update_every=UPDATE_EVERY,
              random_seed=42)


In [None]:
# Train the agent

AVERAGE_SCORE_SOLVED=2000
EPSILON = 1.0
EPSILON_DECAY = .999
EPSILON_MIN = 0.01

scores, num_episodes_solved = ddpg(env=env,
                                   agent=agent,
                                   num_agents=num_agents,
                                   average_score_solved=AVERAGE_SCORE_SOLVED,
                                   epsilon=EPSILON,
                                   epsilon_decay=EPSILON_DECAY,
                                   epsilon_min=EPSILON_MIN)


Episode 100	Average Score: -0.77	Score: -1.00
total timesteps: 1800
epsilon: 0.9047921471137096
Episode 200	Average Score: -0.50	Score: -0.30
total timesteps: 4234
epsilon: 0.818648829478636
Episode 300	Average Score: -0.37	Score: -1.00
total timesteps: 10550
epsilon: 0.7407070321560997
Episode 400	Average Score: -0.54	Score: -0.99
total timesteps: 15558
epsilon: 0.6701859060067403
Episode 424	Average Score: -0.52	Score: -0.98

In [None]:
#Plot the training session (scores per episode averaged across all agents)

def plot_scores(scores):
    plt.plot(scores, color='royalblue')
    plt.title('Scores per episode')
    plt.ylabel('score')
    plt.xlabel('episode #')
    plt.show()
    
plot_scores(scores)


In [None]:
#See the trained agent in action.

env_info = env.reset(train_mode=False)[brain_name] # reset the environment
states = env_info.vector_observations              # get the current state

#Uncomment these lines to use a saved checkpoint:
#agent = Agent(state_size=state_size,
#              action_size=action_size,
#              num_agents=num_agents,
#              buffer_size=BUFFER_SIZE,
#              batch_size=BATCH_SIZE,
#              gamma=GAMMA,
#              tau=TAU,
#              learning_rate_actor=LR_ACTOR,
#              learning_rate_critic=LR_CRITIC,
#              device=device,
#              update_every=UPDATE_EVERY,
#              random_seed=42)
#agent.actor_local.load_state_dict(torch.load('solved_checkpoint_actor.pth'))

scores = np.zeros(num_agents) 
while True:
    actions = agent.act(states, add_noise=False)   # select an action
    env_info = env.step(actions)[brain_name]       # send the action to the environment
    next_states = env_info.vector_observations     # get the next state
    rewards = env_info.rewards                     # get the reward
    dones = env_info.local_done                    # see if episode has finished
    states = next_states                           # roll over the state to next time step
    scores += rewards                              # update the score
    if np.any(dones):                              # exit loop if episode finished
        break
    
print("Average Score: {}".format(np.mean(scores)))


In [None]:
env.close()