In [None]:
import torch
import torch.nn as nn
from torch.distributions import Normal
import random
from AREgym import AREEnv
import gym

from torch.utils.tensorboard import SummaryWriter


In [None]:
''' Temporarily here; to be moved out to a different file. '''

class MLP(nn.Module):
    """Actor-critic network made of two independent MLP heads.

    The actor maps an observation to a tanh-squashed action mean; the critic
    maps the same observation to an unbounded scalar value estimate. The two
    heads share no parameters.

    Args:
        obs_dim: Number of input features per observation (default 3).
        action_dim: Number of action components produced by the actor (default 1).
        action_std: Fixed standard deviation of the Gaussian policy built in
            ``act`` (default 0.2). It is a constant, not a learned parameter.
    """

    def __init__(self, obs_dim=3, action_dim=1, action_std=0.2):
        super().__init__()
        # Layer order (and therefore the state_dict keys) is unchanged from the
        # original fixed-size network, so previously saved checkpoints still load
        # when the default sizes are used.
        self.actor = nn.Sequential(
            nn.Linear(obs_dim, 256),
            nn.ELU(),
            nn.Linear(256, 128),
            nn.ELU(),
            nn.Linear(128, action_dim),
            nn.Tanh(),
        )
        self.critic = nn.Sequential(
            nn.Linear(obs_dim, 256),
            nn.ELU(),
            nn.Linear(256, 128),
            nn.ELU(),
            nn.Linear(128, 1),
        )

        self.action_std = action_std
        # Not read by act() or forward(); kept only so existing code that
        # accesses the attribute keeps working.
        self.log_std = 0.6 * torch.ones(1, 1)

    def act(self, x):
        """Return the action distribution and the critic value for ``x``.

        Args:
            x: Observation tensor whose last dimension is ``obs_dim``.

        Returns:
            A ``(dist, value)`` tuple: ``dist`` is a ``Normal`` centred on the
            actor output with the fixed standard deviation, ``value`` is the
            critic output.
        """
        value = self.critic(x)
        mean = self.actor(x)

        # Build the std on the same device/dtype as the mean so the module keeps
        # working after ``.to(device)``. The leading 1 preserves the original
        # (1, action_dim) shape, which broadcasts against the mean.
        std = self.action_std * torch.ones(
            1, mean.shape[-1], dtype=mean.dtype, device=mean.device
        )
        dist = Normal(mean, std)
        return dist, value

    # used for deployment
    def forward(self, x):
        """Return the deterministic action (the actor mean) for ``x``."""
        return self.actor(x)

In [None]:
''' MAGIC NUMBERS'''

# Master switch for learning: Worker.work only runs PPO updates when this is True.
TRAINING = True

# Number of episodes to run, and the cap on environment steps within one episode.
max_episodes = 1000
max_episode_length = 200

# Number of most-recent transitions gathered into one training batch.
BATCH_SIZE = 100
# Length of the contiguous slice of the batch passed to each PPO gradient step.
MINIBATCH_SIZE = 50

# PPO clipping range: the probability ratio is clamped to [1 - CLIP, 1 + CLIP].
CLIP = 0.2
# Number of PPO gradient steps (one random minibatch each) per training batch.
K_EPOCHS = 10

# Adam learning rate, used for both the actor and the critic parameter groups.
LR = 0.00001 

gamma = 0.99 # discount factor applied to the next-state value in the GAE delta
lamda = 0.95 # GAE lambda: together with gamma, decays the accumulated advantage

In [None]:
class Worker:
    """Collects rollouts from an environment and trains a network with PPO.

    The environment is expected to follow the classic Gym API used in this
    file: ``reset()`` returns an observation and ``step(action)`` returns
    ``(observation, reward, done, info)``. The network must expose
    ``act(obs) -> (distribution, value)``.

    Training hyper-parameters (``max_episodes``, ``max_episode_length``,
    ``TRAINING``, ``BATCH_SIZE``, ``MINIBATCH_SIZE``, ``K_EPOCHS``, ``CLIP``,
    ``gamma``, ``lamda``) are read from module-level names.
    """

    # Raw observations are divided by this before they reach the network.
    OBS_SCALE = 8
    # Weight of the entropy bonus in the PPO loss.
    ENTROPY_COEF = 0.01

    def __init__(self, env, network, optimizer, device=torch.device('cpu')):
        """Store the collaborators and open a TensorBoard writer.

        Args:
            env: Environment to interact with.
            network: Actor-critic module exposing ``act``.
            optimizer: Optimizer over the network's parameters.
            device: Device on which rollout tensors are created.
        """
        self.env = env
        self.network = network
        self.optimizer = optimizer
        self.device = device
        self.writer = SummaryWriter()

        # Most recent loss terms, filled in by ppo(). They stay None until the
        # first update so logging can tell "no update yet" from a real value.
        self.v_l = None
        self.p_l = None
        self.e_l = None

    def _scale_obs(self, obs_np):
        """Convert a raw environment observation into the scaled network input."""
        return torch.tensor(obs_np, device=self.device, dtype=torch.float32) / self.OBS_SCALE

    def work(self):
        """Run the full training loop.

        For every episode: roll the current policy out, run a PPO update each
        time ``BATCH_SIZE`` fresh transitions are available (and once more on
        whatever is left when the episode ends), log to TensorBoard every 10
        episodes, and checkpoint to ``policy.pt`` and render every 50 episodes.
        """
        for i in range(max_episodes):
            observations_buffer = []  # scaled observations, as fed to the network
            actions_buffer = []       # actions sampled from the policy
            rewards_buffer = []       # rewards as returned by the environment
            values_buffer = []        # critic estimates at collection time
            log_probs_buffer = []     # log-probabilities under the collecting policy
            trained_upto = 0          # transitions before this index were already trained on

            obs = self._scale_obs(self.env.reset())
            for j in range(max_episode_length):
                # Rollout data are targets/constants for the update, so no graph is needed.
                with torch.no_grad():
                    dist, value = self.network.act(obs)
                    action = dist.sample()
                    log_prob = dist.log_prob(action)

                # Gym environments expect a NumPy action, not a torch tensor.
                obs_np, reward, done, _ = self.env.step(action.reshape(-1).cpu().numpy())
                reward = torch.tensor(reward, device=self.device, dtype=torch.float32)

                observations_buffer.append(obs)
                actions_buffer.append(action)
                rewards_buffer.append(reward)
                values_buffer.append(value)
                log_probs_buffer.append(log_prob)

                # Advance to the next state with the same scaling as every other
                # network input; this is also the state used for bootstrapping.
                obs = self._scale_obs(obs_np)

                pending = len(rewards_buffer) - trained_upto
                episode_over = done or j == max_episode_length - 1
                if TRAINING and (pending >= BATCH_SIZE or episode_over):
                    fresh = slice(trained_upto, None)
                    self._train_on_rollout(
                        observations_buffer[fresh],
                        actions_buffer[fresh],
                        rewards_buffer[fresh],
                        values_buffer[fresh],
                        log_probs_buffer[fresh],
                        next_obs=obs,
                        done=done,
                    )
                    trained_upto = len(rewards_buffer)

                if done:
                    break

            if i % 10 == 0 and rewards_buffer:
                mean_reward = torch.stack(rewards_buffer).mean().item()
                self.writer.add_scalar('reward', mean_reward, i)
                # Loss terms only exist once at least one PPO step has run.
                if self.p_l is not None:
                    self.writer.add_scalar('loss/actor', self.p_l, i)
                    self.writer.add_scalar('loss/critic', self.v_l, i)
                    self.writer.add_scalar('loss/entropy', self.e_l, i)

            if i % 50 == 0:
                torch.save(self.network.state_dict(), 'policy.pt')

                # render images
                self.env.render()

            print("done with episode ", i)

        self.writer.flush()

    def _train_on_rollout(self, observations, actions, rewards, values, log_probs, next_obs, done):
        """Run ``K_EPOCHS`` PPO steps on one batch of consecutive transitions.

        Args:
            observations, actions, rewards, values, log_probs: Equal-length
                lists of per-step tensors, oldest first.
            next_obs: Scaled observation that follows the last transition.
            done: Whether the last transition ended the episode.
        """
        observations = torch.stack(observations)
        actions = torch.stack(actions).squeeze(1)
        rewards = torch.stack(rewards)
        values = torch.stack(values)
        log_probs = torch.stack(log_probs).squeeze(1)

        # Bootstrap the tail of the rollout: zero after a terminal step,
        # otherwise the critic's estimate of the state we stopped in.
        if done:
            bootstrapped_value = torch.zeros(1, device=self.device)
        else:
            with torch.no_grad():
                _, bootstrapped_value = self.network.act(next_obs)
        advantages, returns = self.calc_returns_gae(rewards, values, bootstrapped_value)

        # Size the window from the real batch length: a rollout cut short by
        # episode end can hold fewer than BATCH_SIZE (or MINIBATCH_SIZE) steps.
        batch_len = observations.shape[0]
        minibatch_len = min(MINIBATCH_SIZE, batch_len)
        for _ in range(K_EPOCHS):
            start = random.randint(0, batch_len - minibatch_len)
            window = slice(start, start + minibatch_len)
            self.ppo(observations[window],
                     actions[window],
                     returns[window],
                     values[window],
                     log_probs[window],
                     advantages[window],
                     clip=CLIP)

    def ppo(self, observations, actions, returns, values, log_probs, advantages, clip=0.3):
        """Take one clipped-surrogate PPO gradient step on a minibatch.

        Args:
            observations: Network inputs for the minibatch.
            actions: Actions taken during collection.
            returns: Value targets for the critic.
            values: Critic estimates from collection time. Unused (the critic is
                re-evaluated here); kept for interface compatibility.
            log_probs: Log-probabilities of ``actions`` under the collecting policy.
            advantages: Advantage estimates for ``actions``.
            clip: Half-width of the allowed probability-ratio interval around 1.

        Side effects:
            Updates the network through the optimizer and stores the latest
            loss terms in ``self.p_l``, ``self.v_l`` and ``self.e_l``.
        """
        dist, new_values = self.network.act(observations)

        # Negative sign: adding this term to the loss rewards higher entropy.
        entropy = -self.ENTROPY_COEF * dist.entropy().mean()

        new_log_prob = dist.log_prob(actions)

        # exp(log p_new - log p_old) is the probability ratio p_new / p_old.
        ratio = torch.exp(new_log_prob - log_probs.detach())

        surr1 = ratio * advantages.detach()
        surr2 = torch.clamp(ratio, 1 - clip, 1 + clip) * advantages.detach()
        actor_loss = -1 * (torch.min(surr1, surr2)).mean()

        # mse of estimated value and return
        value_loss = 0.5 * (returns.detach() - new_values).pow(2).mean()

        loss = (actor_loss + value_loss + entropy)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # this is for tensorboard
        self.v_l = value_loss.detach().cpu().numpy()
        self.p_l = actor_loss.detach().cpu().numpy()
        self.e_l = entropy.detach().cpu().numpy()

    def calc_returns_gae(self, rewards, values, last_value):
        """Compute GAE advantages and the matching value targets.

        Uses the module-level ``gamma`` (discount) and ``lamda`` (GAE lambda).

        Args:
            rewards: Tensor holding one reward per step (N elements).
            values: Tensor holding one critic estimate per step (N elements).
            last_value: One-element tensor with the value of the state that
                follows the final step (zero for a terminal state).

        Returns:
            ``(advantages, returns)``, each of shape ``(N, 1)``, with
            ``returns = advantages + values``. No row is produced for the
            bootstrap state.
        """
        rewards = rewards.detach().reshape(-1)
        values = values.detach().reshape(-1)
        last_value = last_value.detach().reshape(-1).to(dtype=values.dtype, device=values.device)

        steps = rewards.shape[0]
        # extended[t] is V(s_t); the final entry is the bootstrap value.
        extended = torch.cat((values, last_value), 0)
        advantages = torch.zeros(steps, dtype=values.dtype, device=values.device)

        running = torch.zeros((), dtype=values.dtype, device=values.device)
        for t in reversed(range(steps)):
            delta = rewards[t] + (gamma * extended[t + 1]) - extended[t]
            running = delta + (gamma * lamda * running)
            advantages[t] = running

        advantages = torch.unsqueeze(advantages, 1)
        returns = advantages + torch.unsqueeze(values, 1)
        return advantages, returns


In [None]:
# Training entry point: build the network, optimizer and environment, then train.
device = torch.device('cpu')
network = MLP()

# One Adam parameter group per head, both at the shared learning rate LR.
optimizer = torch.optim.Adam(
    [
        {"params": head.parameters(), "lr": LR}
        for head in (network.actor, network.critic)
    ]
)

# Alternative environment from AREgym: env = AREEnv(500, 20)
env = gym.make('Pendulum-v0', g=3)

worker = Worker(env, network, optimizer, device)
worker.work()



In [None]:
# Rebuild the network and restore the weights from 'policy.pt', the checkpoint
# file that Worker.work writes every 50 episodes.
policy = MLP()
policy.load_state_dict(torch.load('policy.pt'))
# Bare expression as the last line of the cell, so the notebook shows the module's repr.
policy

In [None]:
# Roll the loaded policy out in the environment for 1000 rendered steps.
env = gym.make('Pendulum-v0', g=3)

try:
    obs = env.reset()
    episode_return = 0.0
    with torch.no_grad():
        for i in range(1000):
            # Apply the same /8 observation scaling that Worker.work uses when it
            # feeds the network, so the policy sees inputs in the range it was
            # trained on.
            obs_tensor = torch.tensor(obs, dtype=torch.float32) / 8
            action = policy(obs_tensor)

            # Gym environments expect a NumPy action, not a torch tensor.
            obs, reward, done, info = env.step(action.reshape(-1).cpu().numpy())
            episode_return += float(reward)
            env.render()

            # Start a fresh episode instead of stepping an environment that has
            # already reported the episode as finished.
            if done:
                print("episode return:", episode_return)
                obs = env.reset()
                episode_return = 0.0
finally:
    # Release the render window even if the rollout is interrupted.
    env.close()