In [6]:
import os
import numpy as np
import pandas as pd
import time
from IPython.display import clear_output

import torch as T
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

# Agent class definition

In [28]:
class PPOMemory:
    """Rollout buffer holding the transitions gathered between PPO updates.

    A transition is spread over six parallel lists (state, action,
    log-probability, value estimate, reward, done flag); the lists always
    have the same length.
    """

    def __init__(self, batch_size):
        """Create an empty buffer that hands out mini-batches of ``batch_size``."""
        self.batch_size = batch_size
        self.clear_memory()

    def generate_batches(self):
        """Return the stored columns as arrays plus shuffled mini-batch indices.

        Returns:
            A 7-tuple ``(states, actions, probs, vals, rewards, dones,
            batches)``. The first six entries are ``np.ndarray`` copies of
            the stored lists in insertion order; ``batches`` is a list of
            int64 index arrays that together form a random permutation of
            all stored positions, each holding at most ``batch_size``
            indices.
        """
        total = len(self.states)
        order = np.arange(total, dtype=np.int64)
        np.random.shuffle(order)
        batches = [
            order[start:start + self.batch_size]
            for start in np.arange(0, total, self.batch_size)
        ]

        columns = (self.states, self.actions, self.probs,
                   self.vals, self.rewards, self.dones)
        return (*(np.array(column) for column in columns), batches)

    def store_memory(self, state, action, probs, vals, reward, done):
        """Append one transition to the end of every column."""
        pairs = (
            (self.states, state),
            (self.actions, action),
            (self.probs, probs),
            (self.vals, vals),
            (self.rewards, reward),
            (self.dones, done),
        )
        for column, item in pairs:
            column.append(item)

    def clear_memory(self):
        """Drop every stored transition by rebinding each column to a new list."""
        self.states, self.probs, self.actions = [], [], []
        self.rewards, self.dones, self.vals = [], [], []

class ActorNetwork(nn.Module):
    """Policy network: maps a state to a categorical distribution over actions.

    Args:
        n_actions: Number of discrete actions (size of the softmax output).
        input_dims: Sequence whose elements are unpacked as the leading
            arguments of the first ``nn.Linear`` (e.g. ``[27]``).
        alpha: Learning rate for the Adam optimizer.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        chkpt_dir: Directory in which the checkpoint file is stored.
    """

    def __init__(self, n_actions, input_dims, alpha,
            fc1_dims=256, fc2_dims=256, chkpt_dir='tmp/ppo'):
        super(ActorNetwork, self).__init__()

        self.checkpoint_file = os.path.join(chkpt_dir, 'actor_torch_ppo')
        self.actor = nn.Sequential(
                nn.Linear(*input_dims, fc1_dims),
                nn.ReLU(),
                nn.Linear(fc1_dims, fc2_dims),
                nn.ReLU(),
                nn.Linear(fc2_dims, n_actions),
                nn.Softmax(dim=-1)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return a ``Categorical`` built from the softmax output for ``state``."""
        action_probs = self.actor(state)
        return Categorical(action_probs)

    def save_checkpoint(self):
        """Write the state dict to ``self.checkpoint_file``.

        The parent directory is created first; without this the default
        ``tmp/ppo`` location makes ``T.save`` fail on a fresh checkout.
        """
        checkpoint_dir = os.path.dirname(self.checkpoint_file)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state dict from ``self.checkpoint_file``.

        ``map_location`` remaps stored tensors onto this network's device so
        a checkpoint written on a GPU can be restored on a CPU-only machine.
        """
        self.load_state_dict(
            T.load(self.checkpoint_file, map_location=self.device))

class CriticNetwork(nn.Module):
    """Value network: maps a state to a single scalar value estimate.

    Args:
        input_dims: Sequence whose elements are unpacked as the leading
            arguments of the first ``nn.Linear`` (e.g. ``[27]``).
        alpha: Learning rate for the Adam optimizer.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        chkpt_dir: Directory in which the checkpoint file is stored.
    """

    def __init__(self, input_dims, alpha, fc1_dims=256, fc2_dims=256,
            chkpt_dir='tmp/ppo'):
        super(CriticNetwork, self).__init__()

        self.checkpoint_file = os.path.join(chkpt_dir, 'critic_torch_ppo')
        self.critic = nn.Sequential(
                nn.Linear(*input_dims, fc1_dims),
                nn.ReLU(),
                nn.Linear(fc1_dims, fc2_dims),
                nn.ReLU(),
                nn.Linear(fc2_dims, 1)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the value estimate for ``state``; the last dimension has size 1."""
        return self.critic(state)

    def save_checkpoint(self):
        """Write the state dict to ``self.checkpoint_file``.

        The parent directory is created first; without this the default
        ``tmp/ppo`` location makes ``T.save`` fail on a fresh checkout.
        """
        checkpoint_dir = os.path.dirname(self.checkpoint_file)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state dict from ``self.checkpoint_file``.

        ``map_location`` remaps stored tensors onto this network's device so
        a checkpoint written on a GPU can be restored on a CPU-only machine.
        """
        self.load_state_dict(
            T.load(self.checkpoint_file, map_location=self.device))

class Agent:
    """PPO agent with separate actor and critic networks and a rollout buffer.

    Args:
        n_actions: Number of discrete actions the actor chooses between.
        input_dims: Observation shape, forwarded to both networks.
        gamma: Reward discount factor.
        alpha: Learning rate shared by both optimizers.
        gae_lambda: Lambda of generalized advantage estimation (GAE).
        policy_clip: Clipping range epsilon of the PPO surrogate objective.
        batch_size: Mini-batch size used by the rollout buffer.
        n_epochs: Number of passes over the buffer per call to ``learn``.
    """

    def __init__(self, n_actions, input_dims, gamma=0.99, alpha=0.0003, gae_lambda=0.95,
            policy_clip=0.2, batch_size=64, n_epochs=10):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda

        self.actor = ActorNetwork(n_actions, input_dims, alpha)
        self.critic = CriticNetwork(input_dims, alpha)
        self.memory = PPOMemory(batch_size)

    def remember(self, state, action, probs, vals, reward, done):
        """Store one transition in the rollout buffer."""
        self.memory.store_memory(state, action, probs, vals, reward, done)

    def save_models(self):
        """Checkpoint both networks."""
        print('... saving models ...')
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()

    def load_models(self):
        """Restore both networks from their checkpoints."""
        print('... loading models ...')
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()

    def choose_action(self, observation):
        """Sample an action for a single observation.

        Returns:
            ``(action, log_prob, value)`` as plain Python numbers: the
            sampled action index, its log-probability under the current
            policy, and the critic's value estimate for the observation.
        """
        # Convert through one ndarray (building a tensor from a list of
        # ndarrays is slow and warns), then add the batch dimension.
        state = T.as_tensor(np.asarray(observation, dtype=np.float32),
                            dtype=T.float).unsqueeze(0).to(self.actor.device)

        # Only numbers leave this method, so no autograd graph is needed.
        with T.no_grad():
            dist = self.actor(state)
            value = self.critic(state)
            action = dist.sample()
            log_prob = dist.log_prob(action)

        probs = T.squeeze(log_prob).item()
        action = T.squeeze(action).item()
        value = T.squeeze(value).item()

        return action, probs, value

    def _compute_advantages(self, rewards, values, dones):
        """Compute GAE advantages for the buffered transitions in O(n).

        Uses the backward recursion
        ``A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}`` with
        ``delta_t = r_t + gamma * V_{t+1} * (1 - done_t) - V_t``. The
        ``(1 - done_t)`` factor on the recursion stops advantages from
        leaking across episode boundaries.

        The last buffered transition has no stored successor value: when it
        ended its episode its advantage is ``r_t - V_t``; when it was merely
        cut off by the buffer it is left at 0.

        Returns:
            ``np.ndarray`` of float32 with one advantage per transition.
        """
        n = len(rewards)
        advantage = np.zeros(n, dtype=np.float32)
        running = 0.0
        for t in reversed(range(n)):
            not_done = 1.0 - float(dones[t])
            if t == n - 1:
                if not_done:
                    # Truncated rollout: no bootstrap value is available.
                    continue
                delta = rewards[t] - values[t]
            else:
                delta = rewards[t] + self.gamma * values[t + 1] * not_done - values[t]
            running = delta + self.gamma * self.gae_lambda * not_done * running
            advantage[t] = running
        return advantage

    def learn(self):
        """Run ``n_epochs`` of clipped-PPO updates on the buffer, then clear it."""
        device = self.actor.device

        # The advantages depend only on the stored rollout, so they are
        # computed once instead of once per epoch.
        advantage = T.tensor(
            self._compute_advantages(self.memory.rewards, self.memory.vals,
                                     self.memory.dones),
            dtype=T.float).to(device)

        for _ in range(self.n_epochs):
            # Re-drawn every epoch so each pass uses a fresh shuffle.
            state_arr, action_arr, old_prob_arr, vals_arr,\
            _, _, batches = self.memory.generate_batches()

            values = T.tensor(vals_arr, dtype=T.float).to(device)
            for batch in batches:
                states = T.tensor(state_arr[batch], dtype=T.float).to(device)
                old_probs = T.tensor(old_prob_arr[batch], dtype=T.float).to(device)
                actions = T.tensor(action_arr[batch]).to(device)

                dist = self.actor(states)
                # squeeze(-1) keeps the batch dimension for a batch of size 1.
                critic_value = self.critic(states).squeeze(-1)

                new_probs = dist.log_prob(actions)
                # exp(a - b) is more stable than exp(a) / exp(b), which can
                # overflow or divide by an underflowed zero.
                prob_ratio = (new_probs - old_probs).exp()
                weighted_probs = advantage[batch] * prob_ratio
                weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip,
                        1+self.policy_clip)*advantage[batch]
                actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()

                returns = advantage[batch] + values[batch]
                critic_loss = ((returns-critic_value)**2).mean()

                total_loss = actor_loss + 0.5*critic_loss
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                self.actor.optimizer.step()
                self.critic.optimizer.step()

        self.memory.clear_memory()

In [31]:
class Environment:
    """3-D grid in which containers are placed one cell at a time.

    Cells hold the string ``'0'`` (empty) or ``'#'`` (occupied). A legal
    placement earns ``posreward``; an illegal one earns ``negreward`` and
    ends the episode. The episode also ends once every cell is occupied.

    Args:
        size: Indexable of three ints giving the grid shape.
        posreward: Reward for a legal placement.
        negreward: Reward for an illegal placement.
    """

    def __init__(self, size, posreward = 2, negreward = -2):
        self.size = size
        self.environment = self._emptyField()
        self.posreward = posreward
        self.negreward = negreward
        self.score = 0
        self.done = False
        # Number of cells in the grid.
        self.envsize = self.environment.shape[0]*self.environment.shape[1]*self.environment.shape[2]
        # Length of the flat vector playGame() expects: envsize rows of
        # envsize scores. A single step() action is a flat cell index in
        # [0, envsize).
        self.action_space = self.envsize*self.envsize
        self.observation_space = [self.envsize]

    def _emptyField(self):
        """Return a new grid of shape ``size[:3]`` filled with ``'0'``."""
        return np.array([[['0'] * self.size[2]] * self.size[1]] * self.size[0])

    def resetField(self):
        """Replace the grid with an empty one."""
        self.environment = self._emptyField()

    def resetEnvironment(self):
        """Start a new episode: empty grid, zero score, not done."""
        self.resetField()
        self.score = 0
        self.done = False

    def genObs(self):
        """Return the flat float32 observation: 1.0 for empty cells, else 0.0."""
        return np.array(self.environment.flatten() == '0', dtype=np.float32)

    def playGame(self,agout):
        """Play up to ``envsize`` moves taken from one flat score vector.

        ``agout`` is reshaped to ``(envsize, envsize)``; the argmax of each
        row is played as one move, stopping as soon as the episode ends.

        Returns:
            ``(gameScore, done)``: the summed reward of the moves played and
            the episode's done flag.
        """
        action_rows = np.reshape(agout,(self.envsize,self.envsize))
        gameScore = 0
        for row in action_rows:
            _, _, _, reward, done = self.step(np.argmax(row))
            gameScore += reward
            if done:
                break

        #TODO: Add scoring moment for the entire board

        return gameScore, self.done

    def step(self, action):
        """Place a container at flat cell index ``action``.

        An index outside ``[0, envsize)`` is treated as an illegal move
        rather than raising, as is any move made after the episode ended.

        Returns:
            ``(old_obs, new_obs, action, reward, done)``. The observations
            are flat float32 arrays in the format of ``genObs``.
        """
        #Save Old State
        oldObs = self.genObs()

        #Make move (out-of-range indices never reach unravel_index)
        legal = bool(0 <= action < self.envsize) and \
                self.placeContainer(np.unravel_index(action,self.environment.shape))
        reward = self.posreward if legal else self.negreward

        #Add to total score
        self.score += reward

        #End game if field is all #s or if the user messed up.
        # Testing `legal` instead of comparing rewards stays correct even if
        # posreward happens to equal negreward.
        if not legal or np.all(self.environment == '#'):
            self.done = True
        return oldObs, self.genObs(), action, reward, self.done

    def placeContainer(self, pos):
        """Occupy ``pos`` if that is legal and the episode is running.

        Returns:
            True if the container was placed, False otherwise.
        """
        if self.isLegal(pos) and not self.done:
            self.environment[pos] = '#'
            return True
        return False

    def isLegal(self, pos):
        """Return True if a container may be placed at ``pos``.

        The bounds check runs first so that an out-of-grid position returns
        False instead of raising IndexError in the later checks.
        """
        if not self.posIsInEnv(pos):
            return False
        return not (self.isOccupied(pos)
                    or self.isFloating(pos)
                    or self.hasNorthAndSouth(pos))

    def isOccupied(self, pos):
        """Return True if ``pos`` is inside the grid and holds a container."""
        if self.posIsInEnv(pos):
            return bool(self.environment[pos[0], pos[1], pos[2]] == '#')
        return False

    def hasNorthAndSouth(self, pos):
        """Return True if both neighbours along axis 1 are occupied at level 0.

        NOTE(review): the neighbours are always checked at index 0 of the
        last axis, not at ``pos[2]`` -- confirm this is the intended rule.
        """
        NC = self.isOccupied((pos[0],pos[1]-1,0))
        SC = self.isOccupied((pos[0],pos[1]+1,0))
        return NC and SC

    def posIsInEnv(self, pos):
        """Return True if the first three entries of ``pos`` index a grid cell."""
        x = 0 <= pos[0] < self.environment.shape[0]
        y = 0 <= pos[1] < self.environment.shape[1]
        z = 0 <= pos[2] < self.environment.shape[2]
        return bool(x and y and z)

    def isFloating(self, pos):
        """Return True if any cell before ``pos[2]`` in the same column is empty."""
        return bool(np.any(self.environment[pos[0],pos[1],:pos[2]] == '0'))

In [40]:
# Scratch check: build a 3x3x3 environment and view a zero vector of length
# action_space as the (envsize, envsize) grid that playGame() reshapes to.
e = Environment((3, 3, 3), negreward=-5)
f = np.zeros(e.action_space)
f.reshape((e.envsize, e.envsize))


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],


In [30]:
e = Environment((3,3,3),negreward=-5)

# N is both the number of environment steps between learning updates and
# the interval (in episodes) at which progress is printed.
N = 10
batch_size = 32
n_epochs = 4
alpha = 0.0003

# Each step() action is a flat cell index in [0, envsize), so the policy
# must have exactly envsize outputs. e.action_space (envsize**2) is the
# size of the vector playGame() consumes; using it here makes the agent
# sample indices that step() cannot map onto the grid.
agent = Agent(n_actions=e.envsize, batch_size=batch_size,
                alpha=alpha, n_epochs=n_epochs,
                input_dims=e.observation_space)

n_games = 5000

score_history = []

n_steps = 0
learn_iters = 0
for i in range(n_games):
    e.resetEnvironment()
    while not e.done:
        action, prob, val = agent.choose_action(e.genObs())
        oldState, newState, action, reward, done = e.step(action)
        n_steps += 1
        agent.remember(oldState, action, prob, val, reward, done)

        # Update on a fixed step schedule; a rollout may span several episodes.
        if n_steps % N == 0:
            agent.learn()
            learn_iters += 1
    if i % N == 0:
        clear_output(wait=True)
        print('episode', i, 'score %.1f' % e.score, 'time_steps', n_steps, 'learning_steps', learn_iters)
    score_history.append(e.score)

episode 460 score -1.0 time_steps 1236 learning_steps 123
tensor([[0.1347, 0.0005, 0.0013, 0.0054, 0.0021, 0.0016, 0.1035, 0.0008, 0.0011,
         0.3861, 0.0007, 0.0006, 0.0019, 0.0029, 0.0006, 0.1652, 0.0006, 0.0009,
         0.1245, 0.0007, 0.0014, 0.0247, 0.0021, 0.0004, 0.0348, 0.0004, 0.0004]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.1341, 0.0008, 0.0017, 0.0068, 0.0027, 0.0022, 0.1041, 0.0011, 0.0016,
         0.3478, 0.0011, 0.0009, 0.0026, 0.0038, 0.0008, 0.1757, 0.0008, 0.0013,
         0.1365, 0.0011, 0.0020, 0.0290, 0.0027, 0.0006, 0.0370, 0.0006, 0.0006]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.1347, 0.0005, 0.0013, 0.0054, 0.0021, 0.0016, 0.1035, 0.0008, 0.0011,
         0.3861, 0.0007, 0.0006, 0.0019, 0.0029, 0.0006, 0.1652, 0.0006, 0.0009,
         0.1245, 0.0007, 0.0014, 0.0247, 0.0021, 0.0004, 0.0348, 0.0004, 0.0004]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.1284, 0.0008, 0.0017, 0.0066, 0.0026, 0.

tensor([[1.2557e-01, 7.7901e-04, 1.6969e-03, 6.2034e-03, 2.7676e-03, 1.9956e-03,
         9.5276e-02, 1.0861e-03, 1.5463e-03, 3.9309e-01, 9.8481e-04, 7.9708e-04,
         2.4705e-03, 3.9953e-03, 8.3976e-04, 1.9487e-01, 8.3757e-04, 1.2605e-03,
         9.2589e-02, 9.8327e-04, 1.8679e-03, 2.3030e-02, 2.6198e-03, 6.3768e-04,
         4.0983e-02, 6.1335e-04, 6.0624e-04],
        [1.2449e-01, 1.7107e-03, 3.1917e-03, 9.7430e-03, 4.6222e-03, 4.0224e-03,
         1.0510e-01, 2.2031e-03, 2.9220e-03, 3.4307e-01, 2.2740e-03, 1.8984e-03,
         4.7713e-03, 6.7711e-03, 1.8606e-03, 1.8831e-01, 1.9496e-03, 2.6615e-03,
         9.9425e-02, 2.2880e-03, 3.4388e-03, 3.2675e-02, 4.8963e-03, 1.4243e-03,
         4.1441e-02, 1.4194e-03, 1.4262e-03],
        [1.1544e-01, 2.2967e-03, 4.2275e-03, 1.1980e-02, 5.8864e-03, 5.1344e-03,
         1.0326e-01, 2.9845e-03, 3.8442e-03, 3.1377e-01, 3.1611e-03, 2.6549e-03,
         6.1115e-03, 8.4444e-03, 2.5242e-03, 1.9507e-01, 2.8201e-03, 3.5447e-03,
         1.0524e-

tensor([[1.1858e-01, 6.7026e-04, 1.4889e-03, 5.8360e-03, 2.3550e-03, 1.7833e-03,
         1.0677e-01, 9.8644e-04, 1.3357e-03, 4.0803e-01, 8.7725e-04, 7.2134e-04,
         2.2482e-03, 3.5596e-03, 7.1625e-04, 1.9582e-01, 7.4110e-04, 1.1279e-03,
         7.9965e-02, 9.0639e-04, 1.6143e-03, 2.3065e-02, 2.5063e-03, 5.7694e-04,
         3.6627e-02, 5.4648e-04, 5.4899e-04],
        [1.2690e-01, 6.9888e-04, 1.5163e-03, 5.9800e-03, 2.3377e-03, 1.9154e-03,
         1.0932e-01, 1.0031e-03, 1.3793e-03, 3.9078e-01, 9.3261e-04, 7.6324e-04,
         2.3373e-03, 3.5909e-03, 7.4727e-04, 1.9775e-01, 7.5173e-04, 1.1727e-03,
         8.3905e-02, 9.5733e-04, 1.7007e-03, 2.4024e-02, 2.5137e-03, 5.7974e-04,
         3.5350e-02, 5.4761e-04, 5.4238e-04],
        [1.0835e-01, 2.0644e-03, 4.0068e-03, 1.1074e-02, 5.5844e-03, 4.9171e-03,
         9.0800e-02, 2.7145e-03, 3.8030e-03, 3.3257e-01, 2.9801e-03, 2.3871e-03,
         5.8908e-03, 8.3409e-03, 2.3492e-03, 2.1574e-01, 2.7594e-03, 3.3821e-03,
         9.6466e-

tensor([[1.2941e-01, 4.5615e-04, 1.0129e-03, 4.3567e-03, 1.6246e-03, 1.2620e-03,
         1.1241e-01, 6.6068e-04, 9.0971e-04, 4.3301e-01, 5.6005e-04, 4.7205e-04,
         1.6033e-03, 2.6271e-03, 4.7427e-04, 1.9594e-01, 4.6276e-04, 7.5643e-04,
         5.7265e-02, 6.0763e-04, 1.1296e-03, 1.7897e-02, 1.7790e-03, 3.8081e-04,
         3.2238e-02, 3.4000e-04, 3.5326e-04]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([[1.2393e-01, 8.3255e-04, 1.7422e-03, 6.2910e-03, 2.7823e-03, 2.1950e-03,
         9.8434e-02, 1.1456e-03, 1.6602e-03, 3.9153e-01, 1.1101e-03, 8.5309e-04,
         2.7006e-03, 4.2528e-03, 8.6476e-04, 2.2069e-01, 9.0489e-04, 1.4009e-03,
         6.9224e-02, 1.1113e-03, 1.9427e-03, 2.1012e-02, 2.7463e-03, 7.0990e-04,
         3.8585e-02, 6.7971e-04, 6.6371e-04],
        [1.2408e-01, 1.1982e-03, 2.3728e-03, 8.0980e-03, 3.4355e-03, 2.7624e-03,
         1.1718e-01, 1.6432e-03, 2.1269e-03, 3.4746e-01, 1.5538e-03, 1.3072e-03,
         3.4637e-03, 5.3326e-03, 1.2660e-03, 

tensor([[1.3028e-01, 4.6142e-04, 9.9780e-04, 4.3078e-03, 1.5682e-03, 1.2481e-03,
         1.2089e-01, 6.5957e-04, 8.9344e-04, 4.1881e-01, 5.5141e-04, 4.6916e-04,
         1.5755e-03, 2.6228e-03, 4.6990e-04, 2.1109e-01, 4.5976e-04, 7.5376e-04,
         4.6944e-02, 6.0950e-04, 1.1163e-03, 1.7795e-02, 1.7845e-03, 3.8348e-04,
         3.2562e-02, 3.3731e-04, 3.5316e-04],
        [1.2464e-01, 8.4355e-04, 1.7226e-03, 6.2445e-03, 2.6967e-03, 2.1780e-03,
         1.0536e-01, 1.1473e-03, 1.6361e-03, 3.7887e-01, 1.0980e-03, 8.5110e-04,
         2.6616e-03, 4.2562e-03, 8.5984e-04, 2.3707e-01, 9.0266e-04, 1.3993e-03,
         5.7790e-02, 1.1184e-03, 1.9272e-03, 2.0914e-02, 2.7592e-03, 7.1640e-04,
         3.9002e-02, 6.7696e-04, 6.6492e-04],
        [1.2918e-01, 4.7396e-04, 1.0034e-03, 4.3428e-03, 1.6317e-03, 1.2683e-03,
         1.2003e-01, 6.6135e-04, 9.1024e-04, 4.2845e-01, 5.6948e-04, 4.6070e-04,
         1.6198e-03, 2.6504e-03, 4.6940e-04, 2.0401e-01, 4.5847e-04, 7.7376e-04,
         4.5993e-

tensor([[1.2465e-01, 6.9676e-04, 1.4095e-03, 5.6315e-03, 2.0554e-03, 1.7887e-03,
         1.3041e-01, 9.6551e-04, 1.2753e-03, 3.6287e-01, 8.6834e-04, 7.2509e-04,
         2.1610e-03, 3.5041e-03, 7.0630e-04, 2.4655e-01, 7.1756e-04, 1.1251e-03,
         4.7076e-02, 9.3787e-04, 1.5924e-03, 2.2456e-02, 2.4618e-03, 5.7520e-04,
         3.5756e-02, 5.2079e-04, 5.2484e-04],
        [1.2362e-01, 4.7463e-04, 1.0024e-03, 4.3393e-03, 1.5428e-03, 1.2528e-03,
         1.2911e-01, 6.7059e-04, 8.9501e-04, 4.0334e-01, 5.5963e-04, 4.7343e-04,
         1.5617e-03, 2.6398e-03, 4.7498e-04, 2.3059e-01, 4.6851e-04, 7.7072e-04,
         3.9765e-02, 6.2770e-04, 1.1206e-03, 1.8727e-02, 1.8281e-03, 3.9365e-04,
         3.3046e-02, 3.4548e-04, 3.5952e-04],
        [1.2465e-01, 6.9676e-04, 1.4095e-03, 5.6315e-03, 2.0554e-03, 1.7887e-03,
         1.3041e-01, 9.6551e-04, 1.2753e-03, 3.6287e-01, 8.6834e-04, 7.2509e-04,
         2.1610e-03, 3.5041e-03, 7.0630e-04, 2.4655e-01, 7.1756e-04, 1.1251e-03,
         4.7076e-

KeyboardInterrupt: 

import matplotlib.pyplot as plt

# Final score of every completed episode, in training order.
fig, ax = plt.subplots()
ax.plot(score_history)
ax.set(title='PPO training: score per episode',
       xlabel='episode', ylabel='final score')
plt.show()


In [None]:
# Total number of cells in the grid array (product of its three dimensions).
e.environment.size