In [7]:
# Import libraries
import math
import random
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time

from torch.distributions import Normal
from torch.distributions import MultivariateNormal
from mlagents_envs.environment import UnityEnvironment
from IPython.display import clear_output

In [None]:
# Select the compute device: use the GPU when CUDA is usable, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Neural Network

In [None]:
# Function to initialize weights of NN from normal distribution
def init_weights(m):
    """Initialise an ``nn.Linear`` layer in place; any other module is left untouched.

    Weights are drawn from a normal distribution (mean 0.0, std 0.1) and the
    bias, when the layer has one, is set to the constant 0.1.

    Intended to be passed to ``nn.Module.apply``, which calls it once per
    sub-module.

    Args:
        m: the module to (possibly) initialise.
    """
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0.0, std=0.1)
        # A layer built with bias=False has `bias is None`; filling it would raise.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.1)

In [None]:
# Actor-Critic Neural Network
class ActorCritic(nn.Module):
    """Two independent one-hidden-layer Tanh networks: a critic and a Gaussian actor.

    Args:
        inputs: size of the input vector fed to both networks.
        outputs: size of the actor output (one mean per action component).
        hidden_size: width of the single hidden layer in each network.
        std: initial value of every entry of the learnable ``log_std``
            parameter (it is exponentiated in ``forward``).
    """

    def __init__(self, inputs, outputs, hidden_size, std=0.0):
        super().__init__()

        def build_head(out_features):
            # Linear -> Tanh -> Linear -> Tanh; the final Tanh bounds the output to (-1, 1)
            return nn.Sequential(
                nn.Linear(inputs, hidden_size),
                nn.Tanh(),
                nn.Linear(hidden_size, out_features),
                nn.Tanh(),
            )

        # Critic is created before the actor so module registration order is unchanged
        self.critic = build_head(1)
        self.actor = build_head(outputs)

        # State-independent log standard deviation, shared by every input
        self.log_std = nn.Parameter(std * torch.ones(outputs))

        # Re-initialise every Linear layer with the notebook's custom scheme
        self.apply(init_weights)

    def forward(self, x):
        """Return ``(dist, value)``: a ``Normal`` over actions and the critic output for ``x``."""
        value = self.critic(x)
        mean = self.actor(x)
        scale = torch.exp(self.log_std).expand_as(mean)
        return Normal(mean, scale), value

# Environment

In [None]:
class Environment():
    """Small wrapper around a Unity ML-Agents environment.

    The number of agents seen right after connecting is stored in
    ``self.agents``; observations, rewards and done flags returned by
    ``reset``/``step`` are limited to that many leading entries whenever the
    environment later reports a different agent count.
    """

    def __init__(self):
        super(Environment, self).__init__()
        self.env = None          # UnityEnvironment handle; None while disconnected
        self.group_name = None   # first agent group reported by the environment
        self.group_spec = None   # spec object of that group
        self.agents = 0          # agent count recorded when the connection was made
        self.observations = 0    # filled in by get_shapes()
        self.actions = 0         # filled in by get_shapes()

    # Connect to Unity VE
    def connect_env(self, attempts=11, retry_delay=1, base_port=5004):
        """Try to connect to the Unity environment, retrying on failure.

        Args:
            attempts: maximum number of connection attempts.
            retry_delay: seconds to wait after a failed attempt.
            base_port: port handed to ``UnityEnvironment``.

        Returns:
            ``(group_name, group_spec)``; both are ``None`` when every attempt failed.
        """
        for _ in range(attempts):
            try:
                self.env = UnityEnvironment(file_name=None, base_port=base_port)
                self.env.reset()
                self.group_name = self.env.get_agent_groups()[0]
                self.group_spec = self.env.get_agent_group_spec(self.group_name)
                step_result = self.env.get_step_result(self.group_name)
                self.agents = step_result.n_agents()
                print('Connected...')
                return self.group_name, self.group_spec
            except Exception:
                # `Exception` (not a bare except) so Ctrl-C can still abort the retries.
                # Close a half-opened environment so it does not keep the port busy
                # for the next attempt.
                self.close_env()
                self.env = None
                self.group_name = None
                self.group_spec = None
                time.sleep(retry_delay)
        return self.group_name, self.group_spec

    # Close environment
    def close_env(self):
        """Close the environment if one is open; failures while closing are ignored."""
        if self.env is None:
            return
        try:
            self.env.close()
        except Exception:
            # Best effort: a failing close must not break notebook teardown.
            pass
        finally:
            # Drop the handle so a second close is a no-op.
            self.env = None
        return

    # Get space dimensions
    def get_shapes(self):
        """Read observation and action shapes from the group spec.

        Returns:
            ``(observations, actions)``; both are ``None`` when the spec is
            missing (e.g. not connected) or lacks the expected attributes.
        """
        try:
            self.observations = self.group_spec.observation_shapes
            self.actions = self.group_spec.action_shape
        except AttributeError:
            self.observations = None
            self.actions = None
        return self.observations, self.actions

    def _limit(self, values, n_agents):
        """Return ``values`` as-is when ``n_agents`` matches ``self.agents``, else its first ``self.agents`` entries."""
        if n_agents == self.agents:
            return values
        return values[0:self.agents]

    # Get observations
    def __result(self):
        """Return the first observation array of the current step result."""
        step_result = self.env.get_step_result(self.group_name)
        return self._limit(step_result.obs[0], step_result.n_agents())

    # Get reward
    def __reward(self):
        """Return the rewards of the current step result."""
        step_result = self.env.get_step_result(self.group_name)
        return self._limit(step_result.reward, step_result.n_agents())

    # Get done
    def __done(self):
        """Return the done flags of the current step result."""
        step_result = self.env.get_step_result(self.group_name)
        return self._limit(step_result.done, step_result.n_agents())

    # Reset
    def reset(self):
        """Reset the environment and return ``(observations, rewards, dones)``."""
        self.env.reset()
        result, reward, done = self.__result(), self.__reward(), self.__done()
        return result, reward, done

    # Step
    def step(self, action):
        """Send ``action`` to the environment, advance one step and return ``(observations, rewards, dones)``.

        Args:
            action: a torch tensor (on any device, with or without grad) or
                anything ``np.array`` accepts.
        """
        step_result = self.env.get_step_result(self.group_name)
        live_agents = step_result.n_agents()
        if hasattr(action, "detach"):
            # Tensors that require grad cannot be converted to numpy directly.
            action = action.detach().cpu()
        action = np.array(action)
        if live_agents == self.agents:
            self.env.set_actions(self.group_name, action)
        else:
            # NOTE(review): fills a (live_agents, action.shape[0]) array by broadcasting
            # `action`; this presumably expects a 1-D action vector - confirm against callers.
            filled = np.full((live_agents, action.shape[0]), action)
            self.env.set_actions(self.group_name, filled)
        self.env.step()
        result, reward, done = self.__result(), self.__reward(), self.__done()
        return result, reward, done

# Proximal Policy Optimization

In [31]:
class PPO():
    """Proximal Policy Optimization trainer holding an ``ActorCritic`` model and its Adam optimizer."""

    def __init__(self, inputs, outputs, neurons, lr_):
        """Build the model and optimizer.

        Args:
            inputs: input size passed to ``ActorCritic``.
            outputs: output (action) size passed to ``ActorCritic``.
            neurons: hidden-layer width passed to ``ActorCritic``.
            lr_: learning rate for Adam.
        """
        super(PPO, self).__init__()
        self.model = ActorCritic(inputs, outputs, neurons)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr_)

    # Generalized Advantage Estimator
    def gae(self, next_value, rewards, masks, values, gamma=0.99, tau=0.95):
        """Compute per-step return targets with Generalized Advantage Estimation.

        Args:
            next_value: value estimate for the state after the last step.
            rewards: per-step rewards.
            masks: per-step multipliers applied to the bootstrapped terms
                (a 0 entry cuts the bootstrap at that step).
            values: per-step value estimates, same length as ``rewards``.
            gamma: discount factor.
            tau: GAE smoothing factor.

        Returns:
            A list, in time order, of ``advantage + value`` for every step.
        """
        # Copy so the caller's sequence is never mutated; the bootstrap value goes last.
        values = list(values) + [next_value]
        g = 0
        returns = []

        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            g = delta + gamma * tau * g * masks[step]
            returns.append(g + values[step])
        # Built back-to-front; one reverse is linear, unlike repeated insert(0, ...).
        returns.reverse()
        return returns

    # Mini-batch iteration PPO
    def batch(self, mini_batch, states, actions,
                       log_probs, returns, advantage):
        """Yield ``states.size(0) // mini_batch`` random mini-batches.

        Row indices are drawn uniformly with replacement, so a row can appear
        more than once or not at all. Each yielded item is the tuple
        ``(states, actions, log_probs, returns, advantage)`` restricted to the
        sampled rows.
        """
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch):
            ids = np.random.randint(0, batch_size, mini_batch)
            yield states[ids, :], actions[ids, :], log_probs[ids, :], returns[ids, :], advantage[ids, :]

    # Update PPO
    def update(self, epochs, mini_batch, states, actions, log_probs,
              returns, advantages, epsilon=0.2, value_coef=0.5, entropy_coef=0.001):
        """Run the clipped-surrogate PPO update on the stored model.

        Args:
            epochs: number of passes over the sampled mini-batches.
            mini_batch: rows per mini-batch.
            states, actions, log_probs, returns, advantages: 2-D tensors with
                one row per collected step (they are indexed as ``t[ids, :]``).
            epsilon: clipping range for the probability ratio.
            value_coef: weight of the critic (squared-error) loss.
            entropy_coef: weight of the entropy bonus.
        """
        for _ in range(epochs):
            for state, action, old_probs, ret, adv in self.batch(mini_batch, states, actions, log_probs, returns, advantages):
                dist, value = self.model(state)
                entropy = dist.entropy().mean()
                new_probs = dist.log_prob(action)

                # Probability ratio between the new and the old policy
                ratio = (new_probs - old_probs).exp()
                surr1 = ratio * adv
                surr2 = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * adv

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (ret - value).pow(2).mean()

                loss = value_coef * critic_loss + actor_loss - entropy_coef * entropy

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

In [48]:
# Smoke-test gae() on a constant toy trajectory.
# PPO requires (inputs, outputs, neurons, lr_); gae() never touches the network,
# so the smallest placeholder sizes are enough here.
p = PPO(inputs=1, outputs=1, neurons=1, lr_=1e-3)
x = [1] * 10
r = p.gae(10.0, x, x, x)