In [2]:
import copy
import random

import torch
import torch.optim as optim
import torch.nn.functional as F

from buffer import *
from network import *
from ou_noise import *

class DDPG():
    """DDPG agent: local/target actor and critic networks, exploration noise and a replay buffer.

    The agent stores transitions in ``self.memory``, periodically samples
    batches from it and updates the local networks, then blends the local
    weights into the target networks with ``soft_update``.
    """

    def __init__(self, device, state_size, action_size, random_seed, buffer_size, batch_size,
                 hidden_in_dim, hidden_out_dim, activation, gamma, tau, lr_actor, lr_critic,
                 weight_decay, epsilon, epsilon_decay, num_batch_permute=10, update_every=20):
        """Initialize an Agent object.

        Params
        ======
            device: torch device every network is moved to
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): seed for Python's ``random`` module; also
                forwarded to the noise process and the replay buffer
            buffer_size: forwarded to ``ReplayBuffer``
            batch_size (int): forwarded to ``ReplayBuffer``; learning starts
                once the buffer holds more than this many entries
            hidden_in_dim, hidden_out_dim, activation: forwarded to ``Network``
            gamma (float): discount factor used by ``step`` when calling ``learn``
            tau (float): interpolation factor for the soft target update
            lr_actor (float): learning rate of the actor's Adam optimizer
            lr_critic (float): learning rate of the critic's Adam optimizer
            weight_decay (float): weight decay of the critic's Adam optimizer
            epsilon (float): stored on the agent and decreased by
                ``epsilon_decay`` after every ``learn`` call (not otherwise
                read by this class)
            epsilon_decay (float): amount subtracted from ``epsilon`` per ``learn`` call
            num_batch_permute (int): number of batches sampled and learned
                from each time learning is triggered
            update_every (int): learning is triggered only on calls to
                ``step`` whose ``time`` is a multiple of this value

        Raises
        ======
            ValueError: if ``update_every`` is smaller than 1
        """
        super(DDPG, self).__init__()

        # Guard early: a zero interval would make `time % update_every` raise
        # ZeroDivisionError deep inside the training loop.
        if update_every < 1:
            raise ValueError("update_every must be >= 1")

        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG and keep the seed value
        # itself on the instance.
        random.seed(random_seed)
        self.seed = random_seed

        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.num_batch_permute = num_batch_permute
        self.update_every = update_every

        # Actor Network (w/ Target Network)
        self.actor_local = Network(self.state_size, self.action_size, hidden_in_dim, hidden_out_dim, activation=activation, is_actor=True).to(self.device)
        self.actor_target = Network(self.state_size, self.action_size, hidden_in_dim, hidden_out_dim, activation=activation, is_actor=True).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Network(self.state_size, self.action_size, hidden_in_dim, hidden_out_dim, activation=activation, is_actor=False).to(self.device)
        self.critic_target = Network(self.state_size, self.action_size, hidden_in_dim, hidden_out_dim, activation=activation, is_actor=False).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay)

        # Start the target networks from the same parameters as the local ones
        self.__copy__(self.actor_local, self.actor_target)
        self.__copy__(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, scale=1.0)

        # Replay memory
        self.memory = ReplayBuffer(self.device, action_size, buffer_size, self.batch_size, random_seed)

    def step(self, states, actions, rewards, next_states, dones, time):
        """Save experiences in replay memory and, periodically, learn from random samples.

        Params
        ======
            states, actions, rewards, next_states, dones: parallel iterables;
                each aligned 5-tuple is added to the replay buffer
            time (int): step counter; learning only runs when it is a
                multiple of ``update_every``
        """
        # Save experience / reward
        for experience in zip(states, actions, rewards, next_states, dones):
            self.memory.add(*experience)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size and time % self.update_every == 0:
            for _ in range(self.num_batch_permute):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    @staticmethod
    def _to_tensor(state):
        """Return ``state`` unchanged if it is a tensor, else convert it to a float32 tensor."""
        if torch.is_tensor(state):
            return state
        # Handles numpy arrays (and plain sequences) without relying on a
        # module-level `np` name being in scope.
        return torch.as_tensor(state, dtype=torch.float32)

    def act(self, state, noise_scale=0.0):
        """Returns actions for given state as per the current (local) policy.

        Params
        ======
            state: tensor, numpy array or sequence of numbers
            noise_scale (float): multiplier applied to the exploration noise sample

        The result is produced with autograd enabled.
        """
        state = self._to_tensor(state)

        # NOTE(review): the noise sample is added as-is; this assumes
        # self.noise.noise() returns something that broadcasts with the actor
        # output and lives on the same device -- confirm against OUNoise.
        action = self.actor_local(state.to(self.device)) + noise_scale*self.noise.noise()
        return action

    def target_act(self, state, noise_scale=0.0):
        """Returns actions for given state as per the target policy.

        Params
        ======
            state: tensor, numpy array or sequence of numbers
            noise_scale (float): multiplier applied to the exploration noise sample
        """
        state = self._to_tensor(state)

        # NOTE(review): same device/shape assumption about the noise sample as in act().
        action = self.actor_target(state.to(self.device)) + noise_scale*self.noise.noise()
        return action

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state)) * (1 - done)
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The TD target is a constant w.r.t. the local networks, so build it
        # without recording an autograd graph through the target networks.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i); (1 - dones) zeroes
            # the bootstrap term for entries flagged as done.
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets)

        # Minimize the loss (critic gradient norm clipped to 1)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss: maximise the critic's value of the actor's actions
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # ---------------------------- update noise ---------------------------- #
        # NOTE(review): epsilon is never clamped, so it can become negative
        # after enough updates -- confirm how callers consume it.
        self.epsilon -= self.epsilon_decay
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        theta_target = tau*theta_local + (1 - tau)*theta_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def __copy__(self, source, target):
        """Copy every parameter of ``source`` into ``target`` in place (hard update).

        NOTE(review): this method name is also the hook used by ``copy.copy``;
        with this signature ``copy.copy(agent)`` raises TypeError. The name is
        kept so existing callers continue to work.
        """
        for src_param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(src_param.data)



In [5]:
import torch
import numpy as np

from ddpg import *

class MADDPG:
    """Container for two DDPG agents built from the same settings."""

    def __init__(self, gamma, tau, ddpg_settings):
        """Create the agents.

        Params
        ======
            gamma (float): stored on the instance as ``self.gamma``
            tau (float): stored on the instance as ``self.tau``
            ddpg_settings (dict): keyword arguments forwarded unchanged to
                each ``DDPG`` constructor
        """
        super(MADDPG, self).__init__()
        # Two independent agents constructed from identical settings.
        self.marl = [DDPG(**ddpg_settings), DDPG(**ddpg_settings)]
        self.gamma = gamma
        self.tau = tau

    def act(self, obs_per_agent, noise_scale=0.0):
        """Get actions from all agents in the MADDPG object.

        ``obs_per_agent`` is paired with the agents positionally; the i-th
        observation is passed to the i-th agent's ``act``.
        """
        actions = [agent.act(obs, noise_scale) for agent, obs in zip(self.marl, obs_per_agent)]
        return actions

    def target_act(self, obs_per_agent, noise_scale=0.0):
        """Get target network actions from all the agents in the MADDPG object.

        ``obs_per_agent`` is paired with the agents positionally; the i-th
        observation is passed to the i-th agent's ``target_act``.
        """
        # Forward the caller's noise_scale; the previous code referenced an
        # undefined name here and raised NameError on every call.
        target_actions = [agent.target_act(obs, noise_scale) for agent, obs in zip(self.marl, obs_per_agent)]
        return target_actions