In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch

from unityagents import UnityEnvironment
from agent import Agent
from deep_q_learning import dqn

In [None]:
# ---- Environment setup ----
# Start the Unity environment from the local "Banana.app" build.
env = UnityEnvironment(file_name="Banana.app")

# Use the first brain the environment lists as the one to control.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Begin an episode in training mode and keep the info for our brain.
env_info = env.reset(train_mode=True)[brain_name]

# Problem dimensions: action count comes from the brain, state length
# is measured on the first observation returned by the reset.
action_size = brain.vector_action_space_size
first_observation = env_info.vector_observations[0]
state_size = len(first_observation)

In [3]:
# ---- Agent hyperparameters ----
# Replay buffer size and minibatch size.
BUFFER_SIZE = 100_000
BATCH_SIZE = 64
# Discount factor.
GAMMA = 0.999
# Interpolation factor for the soft update of the target parameters.
TAU = 0.001
# Learning rate.
LR = 0.0005
# How often to update the network.
UPDATE_EVERY = 4
# Random seed.
seed = 42

# Torch device to use: the first CUDA device when available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Build the agent from the environment dimensions and the settings above.
agent = Agent(
    state_size=state_size, action_size=action_size,
    buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE,
    gamma=GAMMA, tau=TAU,
    learning_rate=LR, update_every=UPDATE_EVERY,
    device=device, seed=seed,
)

In [4]:
# ---- Training run ----
# Average score (over the last 100 episodes) at which the environment
# counts as solved.
AVERAGE_SCORE_SOLVED = 13.0

# Epsilon-greedy exploration settings.
EPSILON_START = 0.9        # initial epsilon
EPSILON_MIN = 0.01         # lower bound for epsilon
EPSILON_DECAY = 0.9        # decay factor applied to epsilon
EPSILON_DECAY_DELAY = 10   # number of episodes by which the decay is delayed

# Group the exploration settings so the call below reads as
# "what to train" followed by "how to explore".
epsilon_schedule = dict(
    epsilon_start=EPSILON_START,
    epsilon_min=EPSILON_MIN,
    epsilon_decay=EPSILON_DECAY,
    epsilon_decay_delay=EPSILON_DECAY_DELAY,
)

scores, num_episodes_solved = dqn(
    env=env,
    agent=agent,
    average_score_solved=AVERAGE_SCORE_SOLVED,
    **epsilon_schedule
)


In [None]:
#Plot the training session (scores per episode)

def plot_scores(scores):
    """Plot the score obtained in each episode.

    Args:
        scores: Sequence of per-episode scores. Each value's position in
            the sequence is its x coordinate (labelled "episode #").

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    # Draw on a dedicated figure/axes pair instead of pyplot's implicit
    # "current" axes, so the curve cannot end up on a figure that an
    # earlier call left open.
    _fig, ax = plt.subplots()
    ax.plot(scores, color='royalblue')
    ax.set_title('Scores per episode')
    ax.set_ylabel('score')
    ax.set_xlabel('episode #')
    plt.show()
    
# Draw the per-episode scores returned by the dqn(...) training call.
plot_scores(scores)


In [None]:
# ---- Watch the agent play one episode ----

# Reset with train_mode=False and read the first observation.
env_info = env.reset(train_mode=False)[brain_name]
state = env_info.vector_observations[0]

# Optional: to watch saved weights rather than the agent currently in
# memory, rebuild the agent and load a checkpoint first, e.g.
#   agent = Agent(state_size=state_size, action_size=action_size, seed=0)
#   agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
# (assumes the agent exposes `qnetwork_local` and that 'checkpoint.pth'
# exists -- TODO confirm before enabling).

score = 0        # running total of rewards for this episode
done = False     # becomes true once the environment reports the episode end
while not done:
    # Pick an action for the current state and apply it.
    action = agent.act(state)
    env_info = env.step(action)[brain_name]

    # Read back what the step produced.
    next_state = env_info.vector_observations[0]
    reward = env_info.rewards[0]
    done = env_info.local_done[0]

    # Accumulate the reward and move on to the next state.
    score += reward
    state = next_state

print("Score: {}".format(score))

In [None]:
# Close the Unity environment. Any cell that still needs `env`
# (for example one that calls dqn) has to run before this one.
env.close()

In [5]:
# Function to test various hyperparameters (set above) for the Agent, as well as epsilon
def test_hyperparameters(repeat=10):
    average_episodes_solved = 0
    for i in range(repeat):
        scores, num_episodes_solved = dqn(env=env,
                                  agent=agent,
                                  average_score_solved=AVERAGE_SCORE_SOLVED,
                                  epsilon_start=EPSILON_START,
                                  epsilon_min=EPSILON_MIN,
                                  epsilon_decay=EPSILON_DECAY,
                                  epsilon_decay_delay=EPSILON_DECAY_DELAY)
        
        average_episodes_solved += num_episodes_solved
        
    print('\n\nAverage number of episodes to solve: {}'.format(average_episodes_solved/repeat))
    
#test_hyperparameters()

Episode 100	Average Score: 2.76
Episode 200	Average Score: 7.87
Episode 300	Average Score: 10.91
Episode 392	Average Score: 13.03
Environment solved in 392 episodes!	Average Score: 13.03
Episode 16	Average Score: 1.06

KeyboardInterrupt: 