In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import gym
from collections import namedtuple
import numpy as np
import matplotlib
import random
import math
from itertools import count
import matplotlib.pyplot as plt
from torch.distributions import Categorical
# True when the active matplotlib backend name contains 'inline'
# (i.e. figures are rendered inside the notebook output area).
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    # Only imported for inline backends; not referenced by the cells visible here.
    from IPython import display


In [2]:
class policyNet(nn.Module):
    """Two-layer MLP policy that maps an observation to action probabilities.

    Args:
        n_states: size of the observation vector (default 4).
        n_hidden: width of the hidden layer (default 30).
        n_actions: number of discrete actions (default 2).

    The defaults reproduce the original fixed 4 -> 30 -> 2 architecture, so
    ``policyNet()`` keeps working unchanged.
    """

    def __init__(self, n_states=4, n_hidden=30, n_actions=2):
        super(policyNet, self).__init__()
        self.L1 = nn.Linear(n_states, n_hidden)
        self.out = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        """Return action probabilities for ``x``.

        ``x`` may be a single observation of shape ``(n_states,)`` or any
        batch shaped ``(..., n_states)``; the output has the same leading
        dimensions and sums to 1 along the last axis.
        """
        x = F.relu(self.L1(x))
        x = self.out(x)
        # Normalise explicitly over the action axis. Calling softmax without
        # `dim` is deprecated, emits a warning, and picks axis 0 for 3-D input.
        return F.softmax(x, dim=-1)

In [3]:
# REINFORCE (Monte-Carlo policy gradient) training on CartPole.
#
# Transitions from `batch_size` consecutive episodes are collected into
# SAR_list, turned into normalised discounted returns, and used for one
# gradient step on the policy network.
env = gym.make('CartPole-v0').unwrapped
policy_net = policyNet()
optimizer = torch.optim.RMSprop(policy_net.parameters(), lr=0.01)
batch_size = 16                          # episodes per policy update
num_episodes = 200
gamma = 0.99                             # discount factor
eps = float(np.finfo(np.float32).eps)    # keeps the return normalisation finite
SAR = namedtuple('sar', ['state', 'action', 'reward'])
steps = 0                                # transitions stored since the last update
duration = []                            # per episode: index of the final step (zero-based)
SAR_list = []
try:
    for e in range(num_episodes):
        state = torch.Tensor(env.reset())
        for t in count():
            env.render()
            # Acting does not need gradients; log-probabilities are recomputed
            # from the stored (state, action) pairs at update time.
            with torch.no_grad():
                probs = policy_net(state)
            action = Categorical(probs).sample()
            next_state, reward, done, _ = env.step(action.item())

            # A zero reward marks the terminal step; the return computation
            # below uses it as the episode boundary.
            if done:
                reward = 0
            SAR_list.append(SAR(state, action, reward))
            state = torch.Tensor(next_state)

            steps += 1
            if done:
                break

        duration.append(t)

        # Update after every `batch_size` completed episodes. The previous
        # `e > 0 and e % batch_size == 0` test let the first batch hold
        # batch_size + 1 episodes while the loss was still divided by batch_size.
        if (e + 1) % batch_size == 0:
            # Discounted return for every stored step, resetting the running
            # sum at each episode boundary (terminal steps keep a return of 0).
            rewards = np.zeros([steps])
            cur_reward = 0
            for i in reversed(range(steps)):
                r = SAR_list[i].reward
                if r == 0:
                    cur_reward = 0
                else:
                    cur_reward = gamma * cur_reward + r
                    rewards[i] = cur_reward

            # Standardise the returns. `eps` avoids a 0/0 -> NaN update when
            # every return in the batch is identical (std == 0).
            reward_mean = np.mean(rewards)
            reward_std = np.std(rewards)
            rewards = (rewards - reward_mean) / (reward_std + eps)

            # One batched forward pass instead of one pass per stored step.
            batch_states = torch.stack([sar.state for sar in SAR_list])
            batch_actions = torch.stack([sar.action for sar in SAR_list])
            batch_returns = torch.as_tensor(rewards, dtype=torch.float32)

            optimizer.zero_grad()
            log_probs = Categorical(policy_net(batch_states)).log_prob(batch_actions)
            loss = -(log_probs * batch_returns).sum() / batch_size
            loss.backward()
            optimizer.step()

            steps = 0
            SAR_list = []

finally:
    # Always release the render window, even if training is interrupted.
    env.close()

  # This is added back by InteractiveShellApp.init_path()


In [4]:
from PIL import Image

# Roll out one episode with the trained policy and save the rendered frames
# as an animated GIF.
# NOTE(review): the unwrapped env has no step limit, so `frames` grows for as
# long as the pole stays up - consider capping the number of recorded frames.
env = gym.make('CartPole-v0').unwrapped
state = torch.Tensor(env.reset())
frames = []
try:
    for t in count():
        env.render()
        frames.append(Image.fromarray(env.render(mode='rgb_array')))
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            probs = policy_net(state)
        action = Categorical(probs).sample()
        state, reward, done, _ = env.step(action.item())
        state = torch.Tensor(state)
        if done:
            break
finally:
    env.close()
    # Save whatever was captured, even after an interruption. Guard the empty
    # case so an early failure is not masked by an IndexError on frames[0].
    if frames:
        # Use the first real frame as the base image; the previous version
        # started the GIF with a blank (all-black) Image.new frame.
        frames[0].save('policy_gradient.gif', save_all=True, append_images=frames[1:])

  # This is added back by InteractiveShellApp.init_path()
