# REINFORCE

<img src="https://miro.medium.com/max/335/1*YJ84XhbocwSpGb260PNt1A.png" 
     alt="UToronto Pseudo code"/>

In [16]:
import numpy as np
import random
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F

from itertools import count
from collections import deque
from typing import List, Tuple

In [10]:
def get_env_dim(env: gym.Env) -> Tuple[int, int]:
    """Look up the network input/output sizes for an environment.

    Args:
        env (gym.Env): environment whose observation space exposes ``shape``
            and whose action space exposes ``n``
    Returns:
        Tuple[int, int]: ``(input_dim, output_dim)`` -- the first entry of the
            observation shape and the number of actions
    """
    return env.observation_space.shape[0], env.action_space.n

In [102]:
class PolicyGradientNetwork(nn.Module):
    """Policy network: one hidden block followed by a softmax action head.

    The hidden block is ``Linear -> BatchNorm1d -> PReLU``. The head is a
    ``Linear`` layer whose output is passed through a softmax over dim 1, so
    every row of the result is a probability distribution over the actions.

    Args:
        input_dim (int): `state` dimension.
            A single `state` is a 1-D tensor of shape (input_dim)
        output_dim (int): Number of actions.
            The action distribution is a 1-D tensor of shape (output_dim)
        hidden_dim (int): Hidden dimension in fc layer
    """
    def __init__(self, input_dim: int, output_dim: int, hidden_dim: int) -> None:
        super().__init__()

        # Attribute names (and therefore state_dict keys) are kept as
        # `layer1.*` and `final.*`.
        self.layer1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.PReLU(),
        )

        self.final = nn.Linear(hidden_dim, output_dim)

    def forward(self, state: torch.Tensor) -> torch.Tensor:
        """Returns the action probabilities for a batch of states
        Args:
            state (torch.Tensor): `State` 2-D tensor of shape (n, input_dim).
                With n == 1 the module has to be in eval mode, because
                BatchNorm1d needs more than one sample while training.
        Returns:
            torch.Tensor: action probabilities, 2-D tensor of shape
                (n, output_dim); each row sums to 1
        """
        hidden = self.layer1(state)
        return F.softmax(self.final(hidden), dim=1)

In [103]:
# Scratch cell: take one fresh observation, cast it to float and add a leading
# batch dimension so it can be fed to the policy network.
# NOTE(review): relies on `env` already existing in the session -- confirm the
# cell execution order.
x = torch.from_numpy(env.reset()).float().unsqueeze(0)

In [104]:
# Sanity check: run an untrained policy network on the single observation `x`.
# The class defined in this file is `PolicyGradientNetwork`; `PolicyNetwork`
# does not exist and raised a NameError.
p_net = PolicyGradientNetwork(4, 2, 128)
p_net.eval()  # eval mode: BatchNorm1d cannot normalise a batch of one while training
probs = p_net(x)  # call the module rather than .forward() so hooks are honoured
# torch.log(probs) would give the matching log-probabilities.

In [121]:
class Agent:
    """REINFORCE agent: samples actions from a policy network and trains it."""

    # Added to the standard deviation when normalising returns so that the
    # division stays finite when all returns are equal.
    _EPS = float(np.finfo(np.float32).eps)

    def __init__(self, input_dim: int, output_dim: int, hidden_dim: int, learning_rate: float = 3e-4) -> None:
        """Agent class that choose action and train
        Args:
            input_dim (int): input dimension
            output_dim (int): output dimension (number of actions)
            hidden_dim (int): hidden dimension
            learning_rate (float): learning rate handed to the Adam optimizer
        """
        self.policy_net = PolicyGradientNetwork(input_dim, output_dim, hidden_dim)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.optim = torch.optim.Adam(self.policy_net.parameters(), lr=learning_rate)

    def get_action(self, state: np.ndarray) -> Tuple[int, torch.Tensor]:
        """Samples an action from the current policy
        Args:
            state (np.ndarray): 1-D tensor of shape (input_dim)
        Returns:
            int: action index
            torch.Tensor: log-probability of that action, still attached to
                the autograd graph so it can be used in the policy loss
        """
        # Eval mode, otherwise BatchNorm would need more than one sample.
        self.policy_net.eval()
        # The network expects a batch, so add a leading dimension of size 1.
        batch = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.policy_net(batch).squeeze(0)
        # Sampling uses NumPy's global RNG; seed np.random for reproducibility.
        action = int(np.random.choice(self.output_dim, p=probs.detach().numpy()))
        return action, torch.log(probs[action])

    def update_policy_gradient(self, log_probs: List[torch.Tensor], rewards: List[float], GAMMA: float) -> None:
        """Performs one REINFORCE update from a single finished episode
        Args:
            log_probs (List[torch.Tensor]): per-step log-probabilities as
                returned by `get_action`
            rewards (List[float]): per-step rewards of the same episode
            GAMMA (float): discount factor
        """
        if not log_probs or not rewards:
            # Nothing was collected: there is no loss to build.
            return

        # Discounted return for every step, accumulated from the last step
        # backwards and then flipped into chronological order.
        discounted = []
        running = 0.0
        for reward in reversed(rewards):
            running = reward + GAMMA * running
            discounted.append(running)
        discounted.reverse()

        # Force a float dtype so integer rewards do not yield an integer tensor.
        returns = torch.tensor(discounted, dtype=torch.float32)
        if returns.numel() > 1:
            # Normalising the returns tends to make training more stable.
            returns = (returns - returns.mean()) / (returns.std() + self._EPS)
        # With a single step std() is NaN, so the raw return is used instead.

        policy_loss = torch.stack(
            [-log_prob * ret for log_prob, ret in zip(log_probs, returns)]
        ).sum()

        self.optim.zero_grad()
        policy_loss.backward()
        self.optim.step()
        

In [126]:
GAME = 'CartPole-v1'
MAX_ITERATIONS = 10000
RENDER = False
SEED = 42
GAMMA = 0.99
HIDDEN_DIM = 128

# Create the environment before entering `try`, so that `finally` only ever
# closes an environment that was actually created by this cell.
env = gym.make(GAME)
try:
    # Optional: wrap with gym.wrappers.Monitor(env, directory='reinforce_monitors', force=True)
    # to write information about the agent's performance (and optional video) to disk.
    env.seed(SEED)
    torch.manual_seed(SEED)
    np.random.seed(SEED)  # Agent.get_action samples actions with np.random.choice

    input_dim, output_dim = get_env_dim(env)
    agent = Agent(input_dim, output_dim, HIDDEN_DIM)

    # Total reward of the most recent episodes, used for the convergence check.
    rewards = deque(maxlen=50)

    # Reinforce
    for i_episode in count(1):
        state = env.reset()

        log_probs = []
        ep_reward = []

        # Roll out one episode. NOTE: an episode that reaches the iteration
        # cap without `done` is not used for a policy update.
        for _ in range(1, MAX_ITERATIONS):
            if RENDER:
                env.render()
            action, log_prob = agent.get_action(state)
            state, reward, done, _info = env.step(action)
            log_probs.append(log_prob)
            ep_reward.append(reward)

            if done:
                agent.update_policy_gradient(log_probs, ep_reward, GAMMA)
                break

        episode_return = np.sum(ep_reward)
        print('[Episode: {:5}] Reward: {:5} '.format(i_episode, episode_return))

        # Convergence: stop once the window is full and its mean is high enough.
        rewards.append(episode_return)
        if len(rewards) == rewards.maxlen and np.mean(rewards) >= 200:
            # `i_episode` is 1-based, so it already equals the number of games played.
            print("Game cleared in {} games with {}".format(i_episode, np.mean(rewards)))
            break
finally:
    env.close()


[Episode:     1] Reward:  26.0 
[Episode:     2] Reward:  11.0 
[Episode:     3] Reward:  12.0 
[Episode:     4] Reward:  18.0 
[Episode:     5] Reward:  10.0 
[Episode:     6] Reward:  11.0 
[Episode:     7] Reward:  13.0 
[Episode:     8] Reward:  27.0 
[Episode:     9] Reward:  12.0 
[Episode:    10] Reward:   9.0 
[Episode:    11] Reward:  21.0 
[Episode:    12] Reward:  18.0 
[Episode:    13] Reward:  31.0 
[Episode:    14] Reward:  13.0 
[Episode:    15] Reward:  28.0 
[Episode:    16] Reward:  12.0 
[Episode:    17] Reward:  15.0 
[Episode:    18] Reward:  17.0 
[Episode:    19] Reward:  15.0 
[Episode:    20] Reward:  22.0 
[Episode:    21] Reward:  31.0 
[Episode:    22] Reward:  53.0 
[Episode:    23] Reward:  12.0 
[Episode:    24] Reward:  12.0 
[Episode:    25] Reward:  20.0 
[Episode:    26] Reward:  10.0 
[Episode:    27] Reward:  12.0 
[Episode:    28] Reward:  57.0 
[Episode:    29] Reward:  20.0 
[Episode:    30] Reward:  14.0 
[Episode:    31] Reward:  17.0 
[Episode