In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import copy
import einops
import gym

In [None]:
@gin.configurable
def mlp_model(obs_size, action_size, hidden_size):
    """Build a fully connected network with two hidden ReLU layers.

    Args:
        obs_size: number of input features of the first linear layer.
        action_size: output size handed to ``dqn_head``.
        hidden_size: width of both hidden linear layers.

    Returns:
        An ``nn.Sequential`` of: ``CastToFloat`` -> Linear -> ReLU ->
        Linear -> ReLU -> ``dqn_head(hidden_size, action_size)``.
    """
    # NOTE(review): CastToFloat and dqn_head are defined outside this cell;
    # presumably an input dtype cast and the Q-value output layer -- confirm.
    layers = [
        CastToFloat(),
        nn.Linear(obs_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        dqn_head(hidden_size, action_size),
    ]
    return nn.Sequential(*layers)

def atari_model(obs_n_channels, action_size):
    """Build a three-layer convolutional network followed by ``dqn_head``.

    Args:
        obs_n_channels: number of input channels of the first convolution.
        action_size: output size handed to ``dqn_head``.

    Returns:
        An ``nn.Sequential`` that rearranges "n h w c" input to "n c h w",
        applies ``PixelByteToFloat``, three Conv2d+ReLU stages, flattens, and
        finishes with ``dqn_head(3136, action_size)``.
    """
    # NOTE(review): Rearrange, PixelByteToFloat and dqn_head come from outside
    # this cell; presumably einops' Rearrange layer and a uint8 -> float
    # pixel scaling module -- confirm.
    preprocessing = [Rearrange("n h w c -> n c h w"), PixelByteToFloat()]
    conv_stack = [
        nn.Conv2d(obs_n_channels, 32, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, 4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, 3, stride=1),
        nn.ReLU(),
    ]
    # 3136 = 64 * 7 * 7: the flattened size these convolutions produce for
    # 84x84 frames. Other spatial sizes would need a different value here.
    head = [nn.Flatten(), dqn_head(3136, action_size)]
    return nn.Sequential(*preprocessing, *conv_stack, *head)


## Policy Gradient

In [1]:
from collections import namedtuple
from copy import deepcopy
import einops
import gym
from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from tqdm.notebook import tqdm

%matplotlib inline

# DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu:0"

In [None]:
class PGNet(nn.Module):
    """Policy network: a two-hidden-layer MLP that outputs action probabilities.

    Args:
        in_size: number of input features (size of the last input dimension).
        hidden_size: width of both hidden layers.
        out_size: number of outputs, i.e. the number of probabilities produced.
    """

    def __init__(self, in_size, hidden_size, out_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, out_size),
            # Normalise over the output dimension explicitly. Without `dim`,
            # Softmax emits a deprecation warning and picks dim 0 for 3-D
            # inputs, which would normalise across the batch instead.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return probabilities over the last dimension for input ``x``.

        ``x`` may be a single input of shape ``(in_size,)`` or carry any
        number of leading batch dimensions; the output sums to 1 along the
        last dimension.
        """
        return self.layers(x)

In [None]:
def action_probs(model, states):
    """Wrap the model's output for ``states`` in a Categorical distribution.

    Args:
        model: callable mapping ``states`` to probabilities over the last
            dimension.
        states: input passed straight through to ``model``.

    Returns:
        A ``torch.distributions.Categorical`` whose ``probs`` are
        ``model(states)``.
    """
    probabilities = model(states)
    return torch.distributions.Categorical(probs=probabilities)

def batch_loss(model, states, actions, rewards):
    """Policy-gradient surrogate loss for a batch of transitions.

    Args:
        model: policy passed to ``action_probs``.
        states: batch of states the actions were taken in.
        actions: the action taken for each state.
        rewards: per-step weight applied to each action's log-probability.

    Returns:
        Scalar tensor: the mean over the batch of ``-log_prob * reward``.
    """
    policy = action_probs(model, states)
    weighted_logprob = policy.log_prob(actions) * rewards
    # Negated so that minimising the loss increases reward-weighted log-likelihood.
    return (-weighted_logprob).mean()

In [None]:
def value_loss(states, rewards, value_fn, discount_factor, gae_lambda=1.0):
    """Exponentially weighted sum of one-step TD residuals along a trajectory.

    Args:
        states: trajectory states, passed straight to ``value_fn``.
        rewards: per-step rewards; ``rewards[t]`` is paired with the
            transition from step ``t`` to ``t + 1`` (the last entry is unused).
        value_fn: callable returning one value estimate per state.
            Assumes the result is 1-D (one value per step) -- the slicing
            below runs over dim 0 while the weights use the last dim, so
            the two only agree for 1-D values. TODO confirm.
        discount_factor: discount applied to the next-state value.
        gae_lambda: extra decay multiplied into the per-step weight. The
            default of 1.0 weights residual ``t`` by ``discount_factor ** t``.

    Returns:
        ``sum_t (discount_factor * gae_lambda) ** t * td[t]`` where
        ``td[t] = discount_factor * V[t + 1] + rewards[t] - V[t]``.

    NOTE(review): despite the name this is a signed sum of TD residuals, not
    a squared error -- confirm that is the intended training signal.
    """
    values = value_fn(states)
    # One-step TD residuals for every transition in the trajectory.
    tds = discount_factor * values[1:] + rewards[:-1] - values[:-1]
    # `gamma_lambda` was referenced but never defined (NameError on every
    # call); derive it from the discount and the new `gae_lambda` argument.
    gamma_lambda = discount_factor * gae_lambda
    exponents = torch.arange(values.shape[-1] - 1, device=values.device)
    weights = torch.pow(gamma_lambda, exponents)
    return torch.sum(weights * tds, dim=-1)

In [None]:
def evaluate(env_name, model):
    """Run one episode with actions sampled from ``model``; return total reward.

    Args:
        env_name: id passed to ``gym.make``.
        model: policy passed to ``action_probs`` to build the action
            distribution for the current state.

    Returns:
        The sum of the rewards received until the environment reports ``done``.
    """
    env = gym.make(env_name)
    try:
        ep_reward = 0.0
        done = False
        state = torch.tensor(env.reset(), dtype=torch.float32)
        # Evaluation never backpropagates, so skip building autograd graphs.
        with torch.no_grad():
            while not done:
                action = action_probs(model, state).sample().item()
                state_next, reward, done, _ = env.step(action)
                state = torch.tensor(state_next, dtype=torch.float32)
                ep_reward += reward
        return ep_reward
    finally:
        # A fresh environment is created on every call; release it even if
        # the rollout raises.
        env.close()

In [None]:
def _episode_returns(ep_rewards, rewards_to_go):
    """Compute the per-step policy-gradient weight for one finished episode.

    Args:
        ep_rewards: list of the rewards received at each step of the episode.
        rewards_to_go: if True, step ``t`` is weighted by
            ``sum(ep_rewards[t:])``; otherwise every step is weighted by the
            full episode return.

    Returns:
        A list with one weight per step of the episode.
    """
    if rewards_to_go:
        # A reversed cumulative sum yields suffix sums. ep_rewards[t] is the
        # reward env.step returned for the action at step t, so it belongs in
        # that action's reward-to-go and must not be subtracted out.
        return np.cumsum(ep_rewards[::-1])[::-1].tolist()
    return [sum(ep_rewards)] * len(ep_rewards)


def train(
    env_name, num_epochs, batch_size, lr, hidden_size=64, eval_episodes=10,
    rewards_to_go=False, gen_adv_est=False
):
    """Train a ``PGNet`` policy with ``batch_loss`` and plot training returns.

    Each epoch steps the environment ``batch_size`` times with actions
    sampled from the current policy, performs one Adam update on the
    transitions of all episodes that finished within those steps (a trailing
    unfinished episode is discarded), then prints the mean reward of
    ``eval_episodes`` calls to ``evaluate``. After the last epoch the returns
    of all training episodes are averaged into at most 200 groups and plotted.

    Args:
        env_name: id passed to ``gym.make``.
        num_epochs: number of collect/update/evaluate rounds.
        batch_size: environment steps collected per epoch.
        lr: Adam learning rate.
        hidden_size: hidden width of the ``PGNet`` policy.
        eval_episodes: evaluation episodes run after every epoch.
        rewards_to_go: weight each action by the rewards from its own step
            onwards instead of by the whole episode return.
        gen_adv_est: reserved for generalised advantage estimation.

    Raises:
        NotImplementedError: if ``gen_adv_est`` is True.
    """
    if gen_adv_est:
        # This branch used to be an unterminated `value_loss(` call, which
        # made the whole cell a SyntaxError. train() has no value function
        # to pass to value_loss, so fail loudly instead of guessing.
        raise NotImplementedError(
            "gen_adv_est=True is not implemented: train() has no value function"
        )

    env = gym.make(env_name)
    in_size = env.reset().shape[0]
    out_size = env.action_space.n
    model = PGNet(in_size, hidden_size, out_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    final_ep_rewards = []

    for epoch in range(num_epochs):
        batch_states = []
        batch_actions = []
        batch_rewards = []
        ep_states = []
        ep_actions = []
        ep_rewards = []
        state = torch.tensor(env.reset(), dtype=torch.float32)

        for _ in range(batch_size):
            action = action_probs(model, state).sample().item()
            state_next, reward, done, _info = env.step(action)
            ep_states.append(state)
            ep_actions.append(action)
            ep_rewards.append(reward)
            if done:
                # Episode finished: record its return, move its transitions
                # into the batch, and start a fresh episode.
                final_ep_rewards.append(sum(ep_rewards))
                batch_states.extend(ep_states)
                batch_actions.extend(ep_actions)
                batch_rewards.extend(_episode_returns(ep_rewards, rewards_to_go))
                ep_states = []
                ep_actions = []
                ep_rewards = []
                state = torch.tensor(env.reset(), dtype=torch.float32)
            else:
                state = torch.tensor(state_next, dtype=torch.float32)

        if batch_states:
            optimizer.zero_grad()
            loss = batch_loss(
                model,
                torch.stack(batch_states),
                # Actions are category indices, so keep them integral.
                torch.tensor(batch_actions, dtype=torch.long),
                torch.tensor(batch_rewards, dtype=torch.float32),
            )
            loss.backward()
            optimizer.step()
        else:
            # torch.stack([]) would raise; nothing to learn from this epoch.
            print(
                f"epoch {epoch}: no episode finished within {batch_size} steps; "
                "skipping update"
            )

        eval_rewards = [evaluate(env_name, model) for _ in range(eval_episodes)]
        mean_reward = sum(eval_rewards) / eval_episodes
        print(f"epoch {epoch}: reward {mean_reward}")

    env.close()

    # Smooth the training curve by averaging consecutive episodes into at
    # most 200 groups. Capping by the episode count keeps eps_per_group >= 1,
    # which previously divided by zero for runs with fewer than 200 episodes.
    num_groups = min(200, len(final_ep_rewards))
    if num_groups == 0:
        return
    eps_per_group = len(final_ep_rewards) // num_groups
    grouped_rewards = [
        sum(final_ep_rewards[i * eps_per_group:(i + 1) * eps_per_group]) / eps_per_group
        for i in range(num_groups)
    ]
    plt.plot(grouped_rewards)
    plt.xlabel("episode group")
    plt.ylabel("mean training episode reward")


- could predict returns
- estimating based on its own thing
- takes longer to train because it has high variance
- if you don't actually get the reward, you won't learn that you were close to it
- if you're trying to train a value function, why not use 2 steps rather than 1 step? 
  - TD-1 trains on one step
  - TD-2 trains on two steps
- value function is biased because you're just approximating it
- TD-N is N steps
  - higher Ns will learn faster about events later in the episode
  - unbiased because it's not an "estimate of the value function"
  - but it makes things harder to learn because you're not caching previous information
- monte carlo is just every future reward summed up
- if you use a long N, you're going to get a lot of noise (high variance)
  - but a 1-step estimator has low variance, at the cost of bias from the bootstrapped value estimate
- TD-N gets closer to TD-infinity, which is Monte Carlo, as N grows
- you could weight between many TD-n's using TD-lambda
  - exponential weighted sum

In [None]:
# Short policy-gradient run on CartPole: 10 epochs of 5,000 environment steps,
# each followed by 20 evaluation episodes. rewards_to_go and gen_adv_est keep
# their defaults (False).
train(
    env_name="CartPole-v1",
    num_epochs=10,
    batch_size=5000,
    lr=0.01,
    hidden_size=64,
    eval_episodes=20,
)

## Policy Gradient