# Collaboration and Competition

---

### 1. Start the Environment

In [1]:
from unityagents import UnityEnvironment
import numpy as np

# Launch the Tennis build from the local `apps/` folder and connect to it.
# On success the unityagents logger prints the academy/brain summary.
env = UnityEnvironment(file_name='apps/Tennis.app')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Environments contain **_brains_**, which are responsible for deciding the actions of their associated agents. Here we take the first available brain and set it as the default brain that we will control from Python.

In [2]:
# Use the first brain the environment exposes as the default one.
# `brain_name` is the key used to index the results of env.reset()/env.step();
# `brain` is the matching entry of `env.brains`, read later for the action size.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [3]:
# Reset in training mode and keep only the info belonging to our brain.
env_info = env.reset(train_mode=True)[brain_name]

# Gather the problem dimensions first, then report them.
num_agents = len(env_info.agents)                # one entry per agent
action_size = brain.vector_action_space_size     # width of one agent's action
states = env_info.vector_observations            # one row per agent
state_size = states.shape[1]                     # width of one agent's observation

print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import random
import copy
from collections import deque, namedtuple

In [5]:
# Models
def hidden_init(layer):
    """Return the symmetric range ``(-lim, lim)`` used to initialise a hidden layer.

    ``lim`` is ``1 / sqrt(fan_in)``, where ``fan_in`` is the number of inputs
    feeding each unit of ``layer``.

    Args:
        layer: a module exposing a 2-D ``weight`` tensor laid out as
            ``(out_features, in_features)``, as ``nn.Linear`` does.

    Returns:
        tuple: ``(-lim, lim)``, ready to be unpacked into ``Tensor.uniform_``.
    """
    # nn.Linear stores its weight as (out_features, in_features), so the fan-in
    # is dimension 1. Reading dimension 0 would use the number of *output*
    # units and make the range depend on the wrong side of the layer.
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Critic(nn.Module):
    """Value network over flattened states and actions.

    ``forward`` flattens its inputs to 48 state features and 4 action features
    per row (with 24-wide observations and 2-wide actions that is two agents
    per row) and returns 2 values per row.
    """

    def __init__(self, seed):
        """Call ``torch.manual_seed(seed)``, then build and initialise the layers."""
        super().__init__()
        self.seed = torch.manual_seed(seed)
        # Only the states pass through fc1; the 4 action features are appended
        # to its 400 activations, which is why fc2 takes 404 inputs.
        self.fc1 = nn.Linear(48, 400)
        self.fc2 = nn.Linear(400 + 4, 300)
        self.fc3 = nn.Linear(300, 2)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weights: hidden layers via ``hidden_init``, output layer in +/-3e-3."""
        for hidden_layer in (self.fc1, self.fc2):
            low, high = hidden_init(hidden_layer)
            hidden_layer.weight.data.uniform_(low, high)
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, states, actions):
        """Return a ``(rows, 2)`` tensor of values for the given states and actions.

        Args:
            states: tensor that can be reshaped to ``(-1, 48)``.
            actions: tensor that can be reshaped to ``(-1, 4)``.
        """
        flat_states = states.reshape(-1, 48)
        flat_actions = actions.reshape(-1, 4)
        state_features = F.relu(self.fc1(flat_states))
        joined = torch.cat((state_features, flat_actions), dim=-1)
        return self.fc3(F.relu(self.fc2(joined)))
    
class Actor(nn.Module):
    """Policy network: 24 input features -> 400 -> 300 -> 2 outputs squashed by tanh."""

    def __init__(self, seed):
        """Call ``torch.manual_seed(seed)``, then build and initialise the layers."""
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(24, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, 2)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weights: hidden layers via ``hidden_init``, output layer in +/-3e-3."""
        for hidden_layer in (self.fc1, self.fc2):
            hidden_layer.weight.data.uniform_(*hidden_init(hidden_layer))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, states):
        """Map ``states`` (last dimension 24) to actions in ``[-1, 1]`` (last dimension 2)."""
        hidden = F.relu(self.fc1(states))
        hidden = F.relu(self.fc2(hidden))
        # tanh bounds every action component to [-1, 1].
        return F.tanh(self.fc3(hidden))

In [6]:
class Agent(nn.Module):
    """Actor-critic learner with a built-in circular replay buffer.

    A single ``Actor`` produces each player's action from that player's own
    observation, while a single ``Critic`` (plus a slowly tracking target
    copy) evaluates the observations and actions of all players together.
    No target actor is kept.

    Attributes:
        actor, critic, critic_target: the networks (registered sub-modules).
        optimizer_actor, optimizer_critic: Adam optimisers, both with lr 1e-4.
        states, actions, rewards, next_states, dones: pre-allocated replay
            storage shaped ``(BUFFER_SIZE, num_agents, width)``. They are
            plain tensor attributes, so they are not part of ``state_dict()``.
        idx: slot the next transition will be written to.
        count: total number of transitions pushed so far.
        scale: multiplier of the exploration noise; decays on every ``act``.
    """

    BUFFER_SIZE = 100000   # number of replay slots
    BATCH_SIZE = 128       # transitions per learning step
    GAMMA = 0.99           # discount factor
    TAU = 0.001            # soft-update rate of the target critic

    def __init__(self):
        """Build the networks, optimisers and an empty replay buffer.

        Reads the module-level ``num_agents`` to size the replay storage.
        """
        super(Agent, self).__init__()
        self.actor = Actor(1)
        self.critic = Critic(1)
        self.critic_target = Critic(1)
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr = 1e-4)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr = 1e-4)
        self.states = torch.empty(self.BUFFER_SIZE, num_agents, 24)
        self.actions = torch.empty(self.BUFFER_SIZE, num_agents, 2)
        self.rewards = torch.empty(self.BUFFER_SIZE, num_agents, 1)
        self.next_states = torch.empty(self.BUFFER_SIZE, num_agents, 24)
        self.dones = torch.empty(self.BUFFER_SIZE, num_agents, 1)
        self.idx = 0
        self.count = 0
        self.scale = 2

    def act(self, states, add_noise = True):
        """Return the policy's actions for ``states``, clamped to ``[-1, 1]``.

        Args:
            states: float tensor whose last dimension is the actor's input
                width (24).
            add_noise: when true, add Gaussian exploration noise scaled by
                ``self.scale``.

        Returns:
            torch.Tensor: actions with the same leading dimensions as
            ``states`` and every component inside ``[-1, 1]``.
        """
        with torch.no_grad():
            actions = self.actor(states)
        # The noise scale decays on every call, with or without noise.
        self.scale *= 0.9999
        if add_noise:
            # One noise vector per call, broadcast over the leading dimensions
            # (every agent gets the same perturbation). It is converted to a
            # tensor explicitly instead of relying on implicit NumPy/torch
            # interop for an in-place add.
            noise = np.random.standard_normal(actions.shape[-1]) * self.scale
            actions = actions + torch.as_tensor(noise, dtype=actions.dtype, device=actions.device)
        # torch.clamp keeps the result a tensor, which callers rely on.
        return torch.clamp(actions, -1, 1)

    def push(self, states, actions, rewards, next_states, dones):
        """Store one transition (all agents at once) in the circular buffer.

        Args:
            states, actions, next_states: tensors assignable to one
                ``(num_agents, width)`` slot.
            rewards, dones: sequences with one entry per agent; they are
                converted to float column tensors.
        """
        self.states[self.idx] = states
        self.actions[self.idx] = actions
        self.rewards[self.idx] = torch.tensor(rewards, dtype = torch.float).unsqueeze(1)
        self.next_states[self.idx] = next_states
        self.dones[self.idx] = torch.tensor(dones, dtype = torch.float).unsqueeze(1)

        self.count += 1
        # Wrap around so the oldest transition is overwritten once full.
        self.idx = self.count % self.BUFFER_SIZE

    def sample(self):
        """Draw ``BATCH_SIZE`` distinct stored transitions uniformly at random.

        Returns:
            tuple: ``(states, actions, rewards, next_states, dones)`` batches.

        Raises:
            ValueError: if fewer than ``BATCH_SIZE`` transitions are stored.
        """
        # Every slot below min(count, BUFFER_SIZE) holds real data. Capping at
        # BUFFER_SIZE - 1 would leave the last slot unsampled once full.
        sample_range = min(self.count, self.BUFFER_SIZE)
        batch_idxs = random.sample(range(sample_range), self.BATCH_SIZE)
        return (self.states[batch_idxs], self.actions[batch_idxs], self.rewards[batch_idxs],
                self.next_states[batch_idxs], self.dones[batch_idxs])

    def step(self):
        """Run one critic update, one target soft-update and one actor update.

        Does nothing until at least ``BATCH_SIZE`` transitions were pushed.
        """
        if self.count < self.BATCH_SIZE:
            return

        states, actions, rewards, next_states, dones = self.sample()

        # The bootstrap target is a constant for this update. Building it
        # without autograd avoids computing gradients for the target critic
        # and the actor that no optimiser step ever used.
        with torch.no_grad():
            next_actions = self.actor(next_states)
            next_values = self.critic_target(next_states, next_actions).unsqueeze(2)
            td_target = rewards + self.GAMMA * next_values * (1 - dones)

        current_values = self.critic(states, actions).unsqueeze(2)
        td_error = torch.mean((td_target - current_values) ** 2)

        self.optimizer_critic.zero_grad()
        td_error.backward()
        self.optimizer_critic.step()

        # Polyak averaging: move the target critic a fraction TAU towards the critic.
        for target_param, local_param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(self.TAU * local_param.data + (1.0 - self.TAU) * target_param.data)

        # Actor update: ascend the critic's value of the actor's own actions.
        actions_pred = self.actor(states)
        rewards_pred = self.critic(states, actions_pred).unsqueeze(2)
        rewards_calc = -torch.mean(rewards_pred)

        self.optimizer_actor.zero_grad()
        rewards_calc.backward()
        self.optimizer_actor.step()

# Single shared agent instance; its networks start from seed-1 initial weights.
agent = Agent()

In [7]:
# Restore the trained weights for evaluation.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# map_location='cpu' lets this work on CPU-only machines even if the checkpoint
# was saved from a GPU; nothing in this notebook moves the agent off the CPU.
# critic_target is not restored here, which only matters if training resumes.
agent.critic.load_state_dict(torch.load('maddpg/checkpoint/trained_weights_ddpg_v2_critic.pth', map_location='cpu'))
agent.actor.load_state_dict(torch.load('maddpg/checkpoint/trained_weights_ddpg_v2_policy.pth', map_location='cpu'))

<All keys matched successfully>

In [8]:
# Play five evaluation episodes with the loaded policy and no exploration
# noise, recording each agent's undiscounted return per episode.
scores = []
for i in range(5):
    env_info = env.reset(train_mode=False)[brain_name]
    states = torch.tensor(env_info.vector_observations, dtype=torch.float)
    score = np.zeros(num_agents)

    # Always take at least one step, then continue until any agent is done.
    dones = [False]
    while not np.any(dones):
        actions = agent.act(states, add_noise=False)
        env_info = env.step(actions.cpu().data.numpy())[brain_name]
        rewards = env_info.rewards
        dones = env_info.local_done
        score += rewards
        next_states = torch.tensor(env_info.vector_observations, dtype=torch.float)
        states = next_states

    scores.append(score)
    # Report the mean of the agents' returns for this episode.
    print('\rEpisode {}\tReward: {:.5f}'.format(i, np.mean(score)))



Episode 0	Reward: 2.60000
Episode 1	Reward: 2.65000
Episode 2	Reward: 2.60000
Episode 3	Reward: 2.65000
Episode 4	Reward: 2.65000


In [9]:
# Shut the Unity environment down; `env` cannot be used after this call.
env.close()