In [2]:
import torch
import torch.nn as nn
from torch.distributions import Normal
import random
from AREgym import AREEnv

In [3]:
''' Temporarily defined here; to be moved out to a separate file. '''

class MLP(nn.Module):
    """Actor-critic network built from two independent MLPs.

    The actor maps an observation to the mean of a Normal action
    distribution; the critic maps the same observation to a single value
    estimate. The standard deviation of the action distribution is a learned,
    state-independent parameter stored in log space.

    Args:
        obs_dim: Number of input features per observation (default 72).
        action_dim: Number of action components produced by the actor
            (default 1).
    """

    def __init__(self, obs_dim=72, action_dim=1):
        # nn.Module must be initialised before any sub-module or parameter is
        # assigned, otherwise attribute registration raises.
        super().__init__()

        self.actor = nn.Sequential(
            nn.Linear(obs_dim, 256),
            nn.ELU(),
            nn.Linear(256, 128),
            nn.ELU(),
            nn.Linear(128, action_dim),
            nn.Tanh()
        )
        # NOTE(review): the final Tanh bounds the value estimate to [-1, 1];
        # confirm that the returns the critic is trained on fit that range.
        self.critic = nn.Sequential(
            nn.Linear(obs_dim, 256),
            nn.ELU(),
            nn.Linear(256, 128),
            nn.ELU(),
            nn.Linear(128, 1),
            nn.Tanh()
        )

        # Learnable log standard deviation; zeros give an initial std of 1.
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def act(self, x):
        """Return the action distribution and the value estimate for ``x``.

        Args:
            x: Observation tensor whose last dimension is ``obs_dim``.

        Returns:
            A ``(dist, value)`` tuple where ``dist`` is a ``Normal`` whose mean
            is the actor output and ``value`` is the critic output.
        """
        value = self.critic(x)
        mu = self.actor(x)
        std = self.log_std.exp().expand_as(mu)
        dist = Normal(mu, std)
        return dist, value

    # used for deployment
    def forward(self, x):
        """Return the actor output (the distribution mean) without sampling."""
        return self.actor(x)

In [None]:
class Worker:
    """Collects rollouts from an environment and trains a network with PPO.

    Args:
        env: Environment exposing ``reset() -> obs`` and
            ``step(action) -> (obs, reward, done)``.
        network: Module exposing ``act(obs) -> (distribution, value)``.
        optimizer: Optimizer over ``network``'s parameters.
        device: Device on which rollout tensors are created.
    """

    def __init__(self, env, network, optimizer, device=torch.device('cpu')):
        self.env = env
        self.network = network
        self.optimizer = optimizer
        self.device = device

    def work(self):
        """Run ``max_episodes`` episodes, updating the network along the way.

        Reads the module-level settings ``max_episodes``,
        ``max_episode_length``, ``TRAINING``, ``BATCH_SIZE``,
        ``MINIBATCH_SIZE``, ``K_EPOCHS`` and ``CLIP``.

        When ``TRAINING`` is true, an update runs whenever ``BATCH_SIZE``
        transitions have been collected, the episode terminates, or the step
        limit is reached. The transitions used for an update are then
        discarded so later updates only see data from the current policy.
        """
        global max_episodes, max_episode_length

        for _episode in range(max_episodes):
            # per-episode rollout storage
            observations_buffer = []
            actions_buffer = []
            rewards_buffer = []
            values_buffer = []
            log_probs_buffer = []

            # reset env
            obs_np = self.env.reset()
            for step in range(max_episode_length):
                # float32 matches the default dtype of nn.Linear weights
                obs = torch.as_tensor(obs_np, dtype=torch.float32, device=self.device)

                # Rollout quantities are constants for the later update, so
                # no autograd graph is recorded for them.
                with torch.no_grad():
                    dist, value = self.network.act(obs)
                    action = dist.sample()
                    log_prob = dist.log_prob(action)

                # step env and collect data; the next observation replaces
                # obs_np so the following iteration acts on the new state.
                # NOTE(review): assumes the env accepts a NumPy action array
                # (reset() appears to return NumPy) - confirm against AREEnv.
                obs_np, reward, done = self.env.step(action.detach().cpu().numpy())

                observations_buffer.append(obs)
                actions_buffer.append(action)
                rewards_buffer.append(
                    torch.as_tensor(reward, dtype=torch.float32, device=self.device))
                values_buffer.append(value)
                log_probs_buffer.append(log_prob)

                batch_ready = len(observations_buffer) >= BATCH_SIZE
                last_step = step == max_episode_length - 1
                if TRAINING and (batch_ready or done or last_step):
                    observations = torch.stack(observations_buffer)
                    actions = torch.stack(actions_buffer)
                    values = torch.stack(values_buffer)
                    log_probs = torch.stack(log_probs_buffer)

                    # A terminal state has no future reward; otherwise the
                    # critic's estimate of the next state stands in for it.
                    if done:
                        bootstrapped_value = torch.zeros(1, device=self.device)
                    else:
                        next_obs = torch.as_tensor(
                            obs_np, dtype=torch.float32, device=self.device)
                        with torch.no_grad():
                            _, bootstrapped_value = self.network.act(next_obs)

                    # reshape guards against silent broadcasting between
                    # returns and values
                    returns = self.calc_returns(
                        rewards_buffer, bootstrapped_value).reshape(values.shape)
                    advantages = returns - values

                    # K_EPOCHS gradient steps, each on a random contiguous
                    # window of the batch (the window shrinks when fewer than
                    # MINIBATCH_SIZE transitions are available).
                    batch_len = observations.shape[0]
                    minibatch_len = min(MINIBATCH_SIZE, batch_len)
                    for _epoch in range(K_EPOCHS):
                        # random.randint is inclusive on both ends
                        start = random.randint(0, batch_len - minibatch_len)
                        window = slice(start, start + minibatch_len)
                        self.ppo(observations[window],
                                 actions[window],
                                 returns[window],
                                 values[window],
                                 log_probs[window],
                                 advantages[window],
                                 clip=CLIP)

                    # Drop the consumed transitions.
                    for buffer in (observations_buffer, actions_buffer,
                                   rewards_buffer, values_buffer, log_probs_buffer):
                        buffer.clear()

                if done:
                    break

    def ppo(self, observations, actions, returns, values, log_probs, advantages, clip=0.3):
        """Perform one clipped-surrogate PPO gradient step on a minibatch.

        Args:
            observations: Batched observation tensor.
            actions: Actions taken during the rollout.
            returns: Return targets for the critic.
            values: Value estimates recorded during the rollout. Kept for
                interface compatibility; the value loss uses the critic's
                fresh output so gradients reach the critic.
            log_probs: Log-probabilities of ``actions`` under the policy that
                collected them.
            advantages: Advantage estimates for ``actions``.
            clip: Half-width of the clipping interval around a ratio of 1.

        Returns:
            The scalar loss of this step as a Python float.
        """
        dist, value = self.network.act(observations)
        entropy = -0.01 * dist.entropy().mean()
        new_log_prob = dist.log_prob(actions)

        # exp(new_log_prob - old_log_prob) == new_prob / old_prob; rollout
        # quantities are detached so they act as constants.
        ratio = torch.exp(new_log_prob - log_probs.detach())
        advantages = advantages.detach()
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - clip, 1 + clip) * advantages
        # reduce to a scalar so backward() can be called on the total loss
        actor_loss = -torch.min(surr1, surr2).mean()

        # mse of the current value estimate and the return
        value_loss = 0.5 * (returns.detach() - value).pow(2).mean()

        loss = actor_loss + value_loss + entropy
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def calc_returns(self, rewards, bootstrapped_value, gamma=0.99):
        """Compute discounted returns, bootstrapped from a final value.

        Works backwards through ``rewards`` with
        ``G_t = r_t + gamma * G_{t+1}``, seeding the recursion with
        ``bootstrapped_value``. This is the plain discounted return; a
        GAE-style estimator could replace it later.

        Args:
            rewards: Sequence of per-step rewards (scalars or one-element
                tensors), oldest first.
            bootstrapped_value: Value estimate for the state following the
                last reward (zero for a terminal state).
            gamma: Discount factor.

        Returns:
            Tensor of shape ``(len(rewards), k)`` where ``k`` is the number of
            elements in ``bootstrapped_value``.
        """
        running = torch.as_tensor(bootstrapped_value, dtype=torch.float32).reshape(-1)
        if len(rewards) == 0:
            return running.new_empty((0, running.numel()))

        returns = []
        for reward in reversed(rewards):
            reward = torch.as_tensor(
                reward, dtype=running.dtype, device=running.device).reshape(-1)
            running = reward + gamma * running
            returns.append(running)
        returns.reverse()
        return torch.stack(returns)


In [None]:
# ---------------------------------------------------------------------------
# Hyperparameters and run configuration, read as module-level globals by the
# training code.
# ---------------------------------------------------------------------------

# Optimisation
LR = 1e-4          # learning rate handed to the Adam optimizer
CLIP = 0.2         # clip value passed to the PPO update
K_EPOCHS = 50      # number of minibatch updates per collected batch

# Batching
BATCH_SIZE, MINIBATCH_SIZE = 50, 10

# Rollout limits
max_episodes, max_episode_length = 1000, 250

# Set to False to collect rollouts without running the PPO update.
TRAINING = True

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The worker creates its rollout tensors on `device`, so the network has to be
# moved there too. The optimizer is built afterwards so it tracks the
# parameters on their final device.
network = MLP().to(device)
optimizer = torch.optim.Adam(network.parameters(), lr=LR)

# NOTE(review): the meaning of the two AREEnv constructor arguments is not
# visible in this notebook - confirm against AREgym.
env = AREEnv(500, 20)

worker = Worker(env, network, optimizer, device)
worker.work()