# Policy Gradient Pong

## Imports

In [6]:
%load_ext autoreload
%autoreload 2

import gym
import sys
import numpy as np

from config import D, H, resume, render, batch_size, learning_rate, decay_rate
from model import PolicyGradientPongAgent
from utils import discount_rewards
from data_preprocessing import preprocessing
from steve_rl.math import normalize

sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Model initialization

This model is a 2-layer Perceptron, if we write it in `Pytorch`, it would be:

```python
torch.Sequentail(
    torch.Linear(80*80, 200),
    torch.ReLU(),
    torch.Linear(200, 1),
    torch.Sigmoid()
)
```

In [7]:
# model initialization
model = PolicyGradientPongAgent(input_dim=D, hidden_dim=H)
if resume:
    model.load_model('save.p')

## Train loop

![](./imgs/computational_graph.png)

In [None]:
env = gym.make("Pong-v4")
observation = env.reset()
observation = env.reset()

prev_x = None # used in computing the difference frame in an episode
xs = []  # list of input x, x is actually the frame difference in an episode
hs = []  # list of hidden states in an episode
dlogps = []  # list of delta of log probabilities of taking action 2 in an episode
drs = []  # the list of reward in an episode
running_reward = None  # the running average of `reward_sum`s
reward_sum = 0  # the reward of one episode(21 games, should be in range (-21, 21))
episode_number = 0  # the number of episode


while True:
    # one loop will go through one step, i.e. one frame in the game
    if render:
        env.render()

    # 1. preprocess the observation, set input to network to be difference image
    cur_x = preprocessing(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # 2. forward the policy network and sample an action from the returned probability
    aprob, h = model.forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused), say: we take action 2, with aprob = 0.8, then dlogps.append(0.2), i.e. increase the prob of taking action 2,  if we take action 3 with aprob = 0.8, then dlogps.append(-0.8), increase the prob of taking action 3

    # 3. step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
        episode_number += 1

        # stack together all inputs, hidden states, action gradients, and rewards for this episode
        epx, eph, epdlogp, epr = np.vstack(xs), np.vstack(hs), np.vstack(dlogps), np.vstack(drs)
        xs, hs, dlogps, drs = [],[],[],[] # reset array memory for next episode

        # compute the discounted reward backwards through time and standardize
        discounted_epr = normalize(discount_rewards(epr))
        epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
        grad = model.backward(eph, epx, epdlogp)

        # perform rmsprop parameter update every batch_size episodes
        if episode_number % batch_size == 0:
            model.update_weights()

        # boring bookkeeping
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print(f'resetting env. episode reward total was {reward_sum:.1f}. running mean: {running_reward :.1f}')

        if episode_number % 100 == 0:
            print('saving model at: save.p')
            model.save_model_weights('save.p')

        reward_sum = 0
        observation = env.reset() # reset env
        prev_x = None

    if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
        print (f'ep {episode_number}: game finished, reward: {reward:.1f}' + ('' if reward == -1 else ' !!!!!!!!'))


ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
ep 0: game finished, reward: -1.0
resetting env. episode reward total was -21.0. running mean: -21.0
ep 1: game finished, reward: -1.0
ep 1: game finished, reward: -1.0
ep 1: game finished, reward: -1.0
ep 1: game finished, reward: -1.0
ep 1: game finished, reward: -1.0
ep 1: game finished, reward: -1.0
ep 1: game finished, reward: -1.0
ep 1: game fini