# Collaboration and Competition


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch

from unityagents import UnityEnvironment
from agent import MultiAgent, Agent
from maddpg_learning import maddpg

In [None]:
# Launch the Unity Tennis environment from the local build
env = UnityEnvironment(file_name="Tennis.app")

# Use the first listed brain as the default one
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset in training mode and keep the info reported for the default brain
env_info = env.reset(train_mode=True)[brain_name]

# Problem dimensions, read from the brain and from the freshly reset environment
num_agents = len(env_info.agents)                  # number of agents
state_size = len(env_info.vector_observations[0])  # state size (length of the first observation)
action_size = brain.vector_action_space_size       # action size


In [None]:
# Hyperparameters and construction of the multi-agent learner

BUFFER_SIZE = int(1e5)  # capacity of the replay buffer
BATCH_SIZE = 256        # size of each sampled batch
GAMMA = 0.99            # discount factor
TAU = 1e-3              # soft-update factor for the target parameters
LR_ACTOR = 1e-4         # actor learning rate
LR_CRITIC = 1e-4        # critic learning rate
UPDATE_EVERY = 1        # how often the network is updated

# Torch device to use: the first CUDA device when available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

multi_agent = MultiAgent(
    state_size=state_size,
    action_size=action_size,
    num_agents=num_agents,
    buffer_size=BUFFER_SIZE,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    tau=TAU,
    learning_rate_actor=LR_ACTOR,
    learning_rate_critic=LR_CRITIC,
    device=device,
    update_every=UPDATE_EVERY,
    random_seed=42,
)

In [None]:
# Run the MADDPG training loop on the environment with the agent built above

AVERAGE_SCORE_SOLVED = 0.5  # handed to maddpg() as its average_score_solved argument

# maddpg() returns a pair: the recorded scores and num_episodes_solved
scores, num_episodes_solved = maddpg(
    env=env,
    agent=multi_agent,
    num_agents=num_agents,
    average_score_solved=AVERAGE_SCORE_SOLVED,
)


In [None]:
# Plot the training session (per the original note: scores per episode,
# averaged across all agents)

def plot_scores(scores):
    """Draw `scores` as a line chart, one point per entry, and display it."""
    plt.plot(scores, color='royalblue')
    plt.xlabel('episode #')
    plt.ylabel('score')
    plt.title('Scores per episode')
    plt.show()


plot_scores(scores)


In [None]:
# See the trained agent in action: play a single episode with train_mode=False
# and with exploration noise switched off (add_noise=False).
agent = multi_agent

env_info = env.reset(train_mode=False)[brain_name] # reset the environment
states = env_info.vector_observations              # get the current state

#Uncomment these lines to use a saved checkpoint:
#agent = Agent(state_size=state_size,
#              action_size=action_size,
#              num_agents=num_agents,
#              buffer_size=BUFFER_SIZE,
#              batch_size=BATCH_SIZE,
#              gamma=GAMMA,
#              tau=TAU,
#              learning_rate_actor=LR_ACTOR,
#              learning_rate_critic=LR_CRITIC,
#              device=device,
#              update_every=UPDATE_EVERY,
#              random_seed=42)
#agent.actor_local.load_state_dict(torch.load('solved_checkpoint_actor.pth'))

# Accumulate this episode's rewards under a dedicated name. Reusing `scores`
# here would overwrite the training history returned by maddpg(), so re-running
# the plotting cell afterwards would chart these per-agent totals instead.
episode_scores = np.zeros(num_agents)
while True:
    actions = agent.act(states, add_noise=False)   # select an action
    env_info = env.step(actions)[brain_name]       # send the action to the environment
    next_states = env_info.vector_observations     # get the next state
    rewards = env_info.rewards                     # get the reward
    dones = env_info.local_done                    # see if episode has finished
    states = next_states                           # roll over the state to next time step
    episode_scores += rewards                      # update the score
    if np.any(dones):                              # exit loop if episode finished
        break

print("Average Score: {}".format(np.mean(episode_scores)))


In [None]:
# Close the Unity environment that was opened at the top of the notebook.
env.close()

In [None]:
##
#batch size (smaller?) - (remove batch norm?)
#buffer size 20-30k?
#update less

#weight decay?
#smaller model - no falloff?
#reduce model size more
#learning rate 
#noise

#use 2 agents (maddpg)