In [82]:
import os
import numpy as np
import pandas as pd
import time
from IPython.display import clear_output

import torch as T
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

# PPO agent: rollout memory, actor/critic networks, and the Agent class

In [83]:
class PPOMemory:
    """Rollout buffer holding the transitions collected between two PPO updates.

    Each transition is stored column-wise in six parallel Python lists
    (states, actions, log-probs, value estimates, rewards, done flags).
    """

    def __init__(self, batch_size):
        """Create an empty buffer that will be split into chunks of ``batch_size``."""
        self.batch_size = batch_size
        # Start from the same empty state that clear_memory() produces.
        self.clear_memory()

    def generate_batches(self):
        """Return the stored columns as arrays plus shuffled mini-batch indices.

        Returns:
            A 7-tuple ``(states, actions, probs, vals, rewards, dones, batches)``
            where the first six entries are ``np.ndarray`` views of the stored
            lists and ``batches`` is a list of index arrays, each at most
            ``batch_size`` long, that together cover every stored transition
            exactly once in random order.
        """
        total = len(self.states)
        chunk_starts = np.arange(0, total, self.batch_size)

        # Shuffle a permutation of all positions, then cut it into chunks.
        order = np.arange(total, dtype=np.int64)
        np.random.shuffle(order)
        batches = []
        for start in chunk_starts:
            batches.append(order[start:start + self.batch_size])

        columns = (
            self.states,
            self.actions,
            self.probs,
            self.vals,
            self.rewards,
            self.dones,
        )
        return tuple(np.array(column) for column in columns) + (batches,)

    def store_memory(self, state, action, probs, vals, reward, done):
        """Append one transition to the buffer."""
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(probs)
        self.vals.append(vals)
        self.rewards.append(reward)
        self.dones.append(done)

    def clear_memory(self):
        """Drop every stored transition (fresh empty lists for all columns)."""
        self.states, self.actions, self.probs = [], [], []
        self.vals, self.rewards, self.dones = [], [], []

class ActorNetwork(nn.Module):
    """Policy network: a two-hidden-layer MLP producing a categorical policy.

    Args:
        n_actions: number of discrete actions (size of the softmax output).
        input_dims: one-element sequence holding the flat observation size;
            it is unpacked into the first ``nn.Linear``.
        alpha: learning rate for the Adam optimizer owned by this network.
        fc1_dims, fc2_dims: widths of the two hidden layers.
        chkpt_dir: directory in which the checkpoint file is stored.
    """

    def __init__(self, n_actions, input_dims, alpha,
            fc1_dims=256, fc2_dims=256, chkpt_dir='tmp/ppo'):
        super(ActorNetwork, self).__init__()

        self.checkpoint_file = os.path.join(chkpt_dir, 'actor_torch_ppo')
        self.actor = nn.Sequential(
                nn.Linear(*input_dims, fc1_dims),
                nn.ReLU(),
                nn.Linear(fc1_dims, fc2_dims),
                nn.ReLU(),
                nn.Linear(fc2_dims, n_actions),
                nn.Softmax(dim=-1)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return a ``Categorical`` distribution over actions for ``state``."""
        action_probs = self.actor(state)
        return Categorical(action_probs)

    def save_checkpoint(self):
        """Write the network weights to ``self.checkpoint_file``.

        The parent directory is created on demand; without this, saving to
        the default ``tmp/ppo`` location fails on a fresh checkout.
        """
        directory = os.path.dirname(self.checkpoint_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load weights from ``self.checkpoint_file`` onto this network's device."""
        # map_location lets a checkpoint written on GPU be restored on a CPU-only host.
        state = T.load(self.checkpoint_file, map_location=self.device)
        self.load_state_dict(state)

class CriticNetwork(nn.Module):
    """State-value network: a two-hidden-layer MLP with a single scalar output.

    Args:
        input_dims: one-element sequence holding the flat observation size;
            it is unpacked into the first ``nn.Linear``.
        alpha: learning rate for the Adam optimizer owned by this network.
        fc1_dims, fc2_dims: widths of the two hidden layers.
        chkpt_dir: directory in which the checkpoint file is stored.
    """

    def __init__(self, input_dims, alpha, fc1_dims=256, fc2_dims=256,
            chkpt_dir='tmp/ppo'):
        super(CriticNetwork, self).__init__()

        self.checkpoint_file = os.path.join(chkpt_dir, 'critic_torch_ppo')
        self.critic = nn.Sequential(
                nn.Linear(*input_dims, fc1_dims),
                nn.ReLU(),
                nn.Linear(fc1_dims, fc2_dims),
                nn.ReLU(),
                nn.Linear(fc2_dims, 1)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the value estimate for ``state`` with a trailing dimension of 1."""
        return self.critic(state)

    def save_checkpoint(self):
        """Write the network weights to ``self.checkpoint_file``.

        The parent directory is created on demand; without this, saving to
        the default ``tmp/ppo`` location fails on a fresh checkout.
        """
        directory = os.path.dirname(self.checkpoint_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load weights from ``self.checkpoint_file`` onto this network's device."""
        # map_location lets a checkpoint written on GPU be restored on a CPU-only host.
        state = T.load(self.checkpoint_file, map_location=self.device)
        self.load_state_dict(state)

class Agent:
    """PPO agent with a clipped surrogate objective and GAE advantages.

    Args:
        n_actions: number of discrete actions.
        input_dims: one-element sequence with the flat observation size.
        gamma: reward discount factor.
        alpha: learning rate shared by the actor and critic optimizers.
        gae_lambda: lambda of generalized advantage estimation.
        policy_clip: clipping range epsilon of the probability ratio.
        batch_size: mini-batch size used when replaying the rollout.
        n_epochs: number of passes over the rollout per ``learn()`` call.
    """

    def __init__(self, n_actions, input_dims, gamma=0.99, alpha=0.0003, gae_lambda=0.95,
            policy_clip=0.2, batch_size=64, n_epochs=10):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda

        self.actor = ActorNetwork(n_actions, input_dims, alpha)
        self.critic = CriticNetwork(input_dims, alpha)
        self.memory = PPOMemory(batch_size)

    def remember(self, state, action, probs, vals, reward, done):
        """Store one transition in the rollout memory."""
        self.memory.store_memory(state, action, probs, vals, reward, done)

    def save_models(self):
        """Checkpoint both networks."""
        print('... saving models ...')
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()

    def load_models(self):
        """Restore both networks from their checkpoints."""
        print('... loading models ...')
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()

    def choose_action(self, observation):
        """Sample an action for a single observation.

        Returns:
            ``(action, log_prob, value)`` as plain Python scalars: the sampled
            action index, its log-probability under the current policy, and
            the critic's value estimate for the observation.
        """
        # Convert through one ndarray instead of a list of ndarrays (slow path
        # in torch), then add the batch dimension.
        state = T.as_tensor(np.asarray(observation), dtype=T.float)
        state = state.unsqueeze(0).to(self.actor.device)

        # Only scalars leave this method, so no autograd graph is needed.
        with T.no_grad():
            dist = self.actor(state)
            value = self.critic(state)
            action = dist.sample()
            log_prob = dist.log_prob(action)

        return action.item(), log_prob.item(), value.item()

    def _compute_advantages(self, rewards, values, dones):
        """Generalized advantage estimates for one stored rollout.

        Uses the backward recursion
        ``A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}`` with
        ``delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)``, so the
        estimate never accumulates across an episode boundary.

        The final stored transition has no ``V(s_{t+1})`` available: if it is
        terminal its TD error is ``r_t - V(s_t)``; otherwise its advantage is
        left at 0 because the bootstrap value is unknown.

        Returns:
            ``np.ndarray`` of dtype float32 with one advantage per transition.
        """
        n_transitions = len(rewards)
        advantage = np.zeros(n_transitions, dtype=np.float32)

        next_advantage = 0.0
        for t in reversed(range(n_transitions)):
            not_done = 1.0 - float(dones[t])
            if t + 1 < n_transitions:
                next_value = values[t + 1]
            elif not_done:
                # Non-terminal tail: cannot bootstrap, keep advantage[t] == 0.
                next_advantage = 0.0
                continue
            else:
                next_value = 0.0

            delta = rewards[t] + self.gamma * next_value * not_done - values[t]
            next_advantage = delta + self.gamma * self.gae_lambda * not_done * next_advantage
            advantage[t] = next_advantage

        return advantage

    def learn(self):
        """Run ``n_epochs`` of PPO updates on the stored rollout, then clear it."""
        device = self.actor.device

        (state_arr, action_arr, old_prob_arr, vals_arr,
         reward_arr, dones_arr, _) = self.memory.generate_batches()

        # Advantages and value targets depend only on the stored rollout, so
        # they are computed once instead of once per epoch.
        advantage = T.as_tensor(
            self._compute_advantages(reward_arr, vals_arr, dones_arr),
            dtype=T.float).to(device)
        values = T.as_tensor(vals_arr, dtype=T.float).to(device)

        for _ in range(self.n_epochs):
            # Only the shuffled index chunks change between epochs.
            batches = self.memory.generate_batches()[-1]

            for batch in batches:
                states = T.tensor(state_arr[batch], dtype=T.float).to(device)
                old_probs = T.tensor(old_prob_arr[batch], dtype=T.float).to(device)
                actions = T.tensor(action_arr[batch]).to(device)

                dist = self.actor(states)
                # Squeeze only the feature axis so a batch of one keeps shape (1,).
                critic_value = self.critic(states).squeeze(-1)

                new_probs = dist.log_prob(actions)
                # Ratio computed in log space: same value as exp(new)/exp(old)
                # but without the intermediate under/overflow.
                prob_ratio = (new_probs - old_probs).exp()
                batch_advantage = advantage[batch]
                weighted_probs = batch_advantage * prob_ratio
                weighted_clipped_probs = T.clamp(
                    prob_ratio, 1 - self.policy_clip, 1 + self.policy_clip
                ) * batch_advantage
                actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()

                returns = batch_advantage + values[batch]
                critic_loss = ((returns - critic_value) ** 2).mean()

                total_loss = actor_loss + 0.5 * critic_loss
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                self.actor.optimizer.step()
                self.critic.optimizer.step()

        self.memory.clear_memory()

In [84]:
class Environment:
    """3-D grid in which containers ('#') are placed one per step.

    Cells hold the string '0' when empty and '#' when occupied. An action is
    a flat index into the grid; a legal placement earns ``posreward``, an
    illegal one earns ``negreward`` and ends the episode.

    Args:
        size: 3-tuple giving the grid shape.
        posreward: reward for a legal placement.
        negreward: reward for an illegal placement.
    """

    def __init__(self, size, posreward = 2, negreward = -2):
        self.size = size
        self.posreward = posreward
        self.negreward = negreward
        self.resetField()
        self.score = 0
        self.done = False
        # One action and one observation entry per grid cell.
        n_cells = int(self.environment.size)
        self.action_space = n_cells
        self.observation_space = [n_cells]

    def resetField(self):
        """Replace the grid with a fresh, completely empty one."""
        self.environment = np.full(tuple(self.size), '0', dtype='<U1')

    def resetEnvironment(self):
        """Start a new episode: empty grid, zero score, not done."""
        self.resetField()
        self.score = 0
        self.done = False

    def genObs(self):
        """Return the flat float32 observation: 1.0 for empty cells, 0.0 for occupied."""
        return np.array(self.environment.flatten() == '0', dtype=np.float32)

    def step(self, action):
        """Try to place a container at flat index ``action``.

        Returns:
            ``(old_obs, new_obs, action, reward, done)`` where the observations
            are taken before and after the attempted placement.
        """
        old_obs = self.genObs()

        placed = self.placeContainer(np.unravel_index(action, self.environment.shape))
        reward = self.posreward if placed else self.negreward
        self.score += reward

        # End the episode on an illegal move or a full grid. The legality flag
        # is used directly (not `reward == negreward`) so that equal reward
        # values cannot be mistaken for an illegal move.
        if not placed or np.all(self.environment == '#'):
            self.done = True

        return old_obs, self.genObs(), action, reward, self.done

    def placeContainer(self, pos):
        """Occupy ``pos`` if that is legal and the episode is running; return success."""
        if self.isLegal(pos) and not self.done:
            self.environment[tuple(pos)] = '#'
            return True
        return False

    def isLegal(self, pos):
        """Return True if a container may be placed at ``pos``.

        A position is legal when it lies inside the grid, is empty, has no
        empty cell below it along the last axis, and is not enclosed by the
        neighbours checked in ``hasNorthAndSouth``.
        """
        # Bounds first: the remaining checks index the grid and would raise
        # IndexError for an out-of-range position.
        if not self.posIsInEnv(pos):
            return False
        return not (self.isOccupied(pos)
                    or self.isFloating(pos)
                    or self.hasNorthAndSouth(pos))

    def isOccupied(self, pos):
        """Return True if ``pos`` is inside the grid and holds a container."""
        if self.posIsInEnv(pos):
            return bool(self.environment[tuple(pos)] == '#')
        return False

    def hasNorthAndSouth(self, pos):
        """Return True if both axis-1 neighbours of ``pos`` are occupied at level 0."""
        # NOTE(review): the neighbours are always probed at index 0 of the last
        # axis rather than at pos[2] -- confirm this is the intended rule.
        north = self.isOccupied((pos[0], pos[1] - 1, 0))
        south = self.isOccupied((pos[0], pos[1] + 1, 0))
        return north and south

    def posIsInEnv(self, pos):
        """Return True if every coordinate of ``pos`` is a valid non-negative index."""
        shape = self.environment.shape
        return all(0 <= pos[axis] < shape[axis] for axis in range(3))

    def isFloating(self, pos):
        """Return True if any cell before ``pos[2]`` in the same column is empty."""
        return bool(np.any(self.environment[pos[0], pos[1], :pos[2]] == '0'))

In [86]:
# Environment: 3x3x3 grid, illegal placements cost 5 points.
e = Environment((3, 3, 3), negreward=-5)

# Hyperparameters. N is both the number of environment steps between PPO
# updates and the number of episodes between progress printouts.
N = 10
batch_size = 32
n_epochs = 4
alpha = 0.0003

agent = Agent(
    n_actions=e.action_space,
    input_dims=e.observation_space,
    alpha=alpha,
    batch_size=batch_size,
    n_epochs=n_epochs,
)

n_games = 5000

# Running bookkeeping across all episodes.
score_history = []
n_steps = 0
learn_iters = 0

for i in range(n_games):
    e.resetEnvironment()

    # Play one episode, updating the agent every N environment steps.
    while not e.done:
        action, prob, val = agent.choose_action(e.genObs())
        oldState, newState, action, reward, done = e.step(action)
        agent.remember(oldState, action, prob, val, reward, done)
        n_steps += 1

        if n_steps % N == 0:
            agent.learn()
            learn_iters += 1

    score_history.append(e.score)

    # Overwrite the previous progress line every N episodes.
    if i % N == 0:
        clear_output(wait=True)
        print('episode', i, 'score %.1f' % e.score, 'time_steps', n_steps, 'learning_steps', learn_iters)

episode 4990 score 15.0 time_steps 32697 learning_steps 3269


import matplotlib.pyplot as plt

# Learning curve: final score of every training episode, in play order.
fig, ax = plt.subplots()
ax.plot(score_history)
ax.set(title='PPO training: score per episode', xlabel='Episode', ylabel='Score')
# plt.show() renders the figure without echoing the Line2D repr as cell output.
plt.show()


In [None]:
# Scratch check: total number of cells in the environment grid array.
e.environment.size