<a href="https://colab.research.google.com/github/srinathjukanti/DeepRL/blob/master/Vanilla_Policy_Gradient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import gym
import numpy as np
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter
from pdb import set_trace

In [0]:
# Tensorboard Setup (Ngrok)
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

import os
LOG_DIR = 'runs'
os.makedirs(LOG_DIR, exist_ok=True)
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR))
get_ipython().system_raw('./ngrok http 6006 &')
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

--2020-03-25 00:24:15--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 52.206.78.89, 34.197.28.250, 52.4.177.151, ...
Connecting to bin.equinox.io (bin.equinox.io)|52.206.78.89|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13773305 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip.2’


2020-03-25 00:24:17 (14.4 MB/s) - ‘ngrok-stable-linux-amd64.zip.2’ saved [13773305/13773305]

Archive:  ngrok-stable-linux-amd64.zip
replace ngrok? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
https://5c112b3d.ngrok.io


In [0]:
class MLP(nn.Module):
  """Small fully connected network mapping observations to action scores.

  Architecture: Linear(input_dim, 32) -> act -> Linear(32, 64) -> act ->
  Linear(64, n_actions).

  Args:
    input_dim: Number of features in one observation.
    n_actions: Size of the output layer (one score per action).
    device: torch device the parameters and inputs are moved to.
    lr: Learning rate of the Adam optimizer stored on ``self.optimizer``.
    activation: Non-linearity used after each hidden layer. Either a
      zero-argument factory such as ``nn.Tanh`` (default) or an already
      constructed ``nn.Module`` instance.
  """

  def __init__(self, input_dim, n_actions, device, lr, activation=nn.Tanh):
    super(MLP, self).__init__()
    self.input_dim = input_dim
    self.n_actions = n_actions

    def make_activation():
      # Honour the `activation` argument (it used to be ignored in favour of a
      # hard-coded nn.Tanh). Instances are accepted as-is so callers that
      # passed e.g. nn.Tanh() keep working.
      return activation if isinstance(activation, nn.Module) else activation()

    self.layers = nn.Sequential(nn.Linear(input_dim, 32), make_activation(),
                                nn.Linear(32, 64), make_activation(),
                                nn.Linear(64, n_actions))
    self.device = device
    self.lr = lr
    self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
    self.to(self.device)

  def forward(self, obs):
    """Return the action scores for `obs`.

    Args:
      obs: Observation(s) as a tensor, numpy array or nested sequence whose
        last dimension is ``input_dim``.

    Returns:
      Float tensor of scores with one extra leading axis of size 1, i.e.
      shape ``(1, *obs.shape[:-1], n_actions)``.
    """
    # as_tensor converts arrays/lists and reuses existing tensors; unlike
    # torch.tensor(tensor) it does not raise the "copy construct" UserWarning.
    obs = torch.as_tensor(obs, dtype=torch.float32, device=self.device)
    # The leading axis is kept for backward compatibility with callers that
    # rely on the original output shape.
    scores = self.layers(obs.unsqueeze(0))
    return scores

class Agent():
  """Policy-gradient agent that wraps an MLP policy over discrete actions.

  Args:
    input_dim: Number of features in one observation.
    n_actions: Number of discrete actions.
    device: torch device handed to the policy network.
    lr: Learning rate of the Adam optimizer stored on ``self.optimizer``.
    gamma: Discount factor used by `compute_returns`.
  """

  def __init__(self, input_dim, n_actions, device, lr, gamma):
    self.policy = MLP(input_dim, n_actions, device, lr)
    self.device = device
    self.gamma = gamma
    self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

  def get_policy(self, obs):
    """Return a Categorical distribution built from the policy's logits."""
    logits = self.policy(obs)
    return Categorical(logits=logits)

  def choose_action(self, obs):
    """Sample one action for `obs` and return it as a Python int."""
    # Sampling needs no gradients; skipping graph construction keeps rollout
    # steps cheap and does not change the sampled value.
    with torch.no_grad():
      return self.get_policy(obs).sample().item()

  def compute_returns(self, episode_rewards):
    """Return the discounted reward-to-go for every step of one episode.

    Args:
      episode_rewards: Sequence of per-step rewards in time order.

    Returns:
      List of the same length where element ``t`` equals
      ``r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...``.
    """
    running = 0
    returns = []
    # Walk backwards so each step reuses the already discounted tail, then
    # restore time order once (append + reverse is O(n), unlike insert(0, ...)).
    for reward in episode_rewards[::-1]:
      running = reward + self.gamma * running
      returns.append(running)
    returns.reverse()

    return returns

  def compute_loss(self, obs, acts, returns):
    """Return the mean of ``-log_prob(acts) * returns`` under the current policy.

    Args:
      obs: Batch of observations passed to the policy network.
      acts: Tensor of the actions taken for those observations.
      returns: Tensor of weights (e.g. discounted returns), one per action.
    """
    logp = self.get_policy(obs).log_prob(acts)
    return (-logp * returns).mean()

In [0]:
def train(n_epochs=10, env_name='CartPole-v0', batch_size=5000, lr=1e-2, gamma=0.995):
  """Train a policy-gradient Agent on a gym environment and log to TensorBoard.

  Each epoch collects whole episodes until at least `batch_size` steps have
  been gathered, then performs a single optimizer step on the batch.

  Args:
    n_epochs: Number of collect-then-update epochs to run.
    env_name: Id passed to ``gym.make``.
    batch_size: Minimum number of environment steps gathered per epoch.
    lr: Learning rate handed to the Agent.
    gamma: Discount factor handed to the Agent.
  """
  env = gym.make(env_name)
  input_dim = env.observation_space.shape[0]
  n_actions = env.action_space.n
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  agent = Agent(input_dim, n_actions, device, lr, gamma)
  tb = SummaryWriter()

  def train_one_epoch():
    """Collect one batch of episodes, update the policy once.

    Returns:
      Tuple ``(loss, episode_totals)``: the batch loss as a Python float and
      the list of undiscounted reward sums of the episodes in the batch.
    """
    batch_obs = []
    batch_actions = []
    batch_returns = []
    batch_cumulative_rewards = []
    episode_rewards = []
    # NOTE(review): relies on the classic gym API where reset() returns only
    # the observation and step() returns a 4-tuple - confirm against the
    # installed gym version.
    obs = env.reset()

    while True:
      # The policy network converts the raw observation itself.
      action = agent.choose_action(obs)
      batch_obs.append(obs.copy())
      batch_actions.append(action)
      obs, reward, done, _ = env.step(action)
      episode_rewards.append(reward)

      if done:
        batch_returns += list(agent.compute_returns(episode_rewards))
        batch_cumulative_rewards.append(sum(episode_rewards))

        obs, episode_rewards = env.reset(), []
        # Only stop on an episode boundary so every stored step has a return.
        if len(batch_obs) >= batch_size:
          break

    agent.optimizer.zero_grad()
    # Stack the observations into one ndarray first: building a tensor from a
    # Python list of ndarrays is slow and triggers a warning.
    obs_batch = torch.as_tensor(np.asarray(batch_obs), dtype=torch.float32, device=device)
    # Actions are category indices, so keep them integral.
    act_batch = torch.as_tensor(batch_actions, dtype=torch.long, device=device)
    ret_batch = torch.as_tensor(batch_returns, dtype=torch.float32, device=device)
    batch_loss = agent.compute_loss(obs_batch, act_batch, ret_batch)
    batch_loss.backward()
    agent.optimizer.step()

    # Return a plain float so the autograd graph is not kept alive by callers.
    return batch_loss.item(), batch_cumulative_rewards

  try:
    for i_epoch in range(n_epochs):
      batch_loss, batch_rewards = train_one_epoch()
      avg_reward = np.mean(batch_rewards)
      # add_scalar takes (tag, scalar_value, global_step); the value and the
      # step used to be passed in swapped positions.
      tb.add_scalar('Epoch/Loss', batch_loss, i_epoch)
      tb.add_scalar('Epoch/Average Reward', avg_reward, i_epoch)
      print(f"Epoch {i_epoch} \t Loss {batch_loss} "
            f"          \t Average Reward {avg_reward}")
  finally:
    # Release the writer and the environment even if training fails midway.
    tb.close()
    env.close()

In [0]:
# Run training with the default arguments declared on train().
train()

  from ipykernel import kernelapp as app


Epoch 0 	 Loss 9.643745422363281           	 Average Reward 21.319148936170212
Epoch 1 	 Loss 10.385102272033691           	 Average Reward 24.27403846153846
Epoch 2 	 Loss 14.468417167663574           	 Average Reward 34.736111111111114
Epoch 3 	 Loss 16.205259323120117           	 Average Reward 41.68595041322314
Epoch 4 	 Loss 17.89388084411621           	 Average Reward 50.0
Epoch 5 	 Loss 18.041746139526367           	 Average Reward 56.266666666666666
Epoch 6 	 Loss 20.503896713256836           	 Average Reward 69.875
Epoch 7 	 Loss 25.511638641357422           	 Average Reward 94.66666666666667
Epoch 8 	 Loss 27.30763053894043           	 Average Reward 111.66666666666667
Epoch 9 	 Loss 27.780227661132812           	 Average Reward 124.58536585365853


In [0]:
# NOTE(review): in the cells visible here `env` is only bound inside train(),
# so this cell presumably raises NameError on a fresh kernel - confirm it is
# defined elsewhere, or remove this leftover inspection cell.
env