In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.autograd import Variable

import gym
import numpy as np
from itertools import count

import pdb 

import logger 

  from ._conv import register_converters as _register_converters


In [2]:
# --- Numeric constants --------------------------------------------------
# Discount factor applied to future rewards when returns are accumulated.
gamma = 0.99
# float32 machine epsilon; added to the std-dev denominator in
# finish_episode so the normalisation never divides by exactly zero.
eps = np.finfo(np.float32).eps.item()

# --- Environment and reproducibility -------------------------------------
# Both the environment and torch are seeded so sampling is repeatable.
env = gym.make('CartPole-v0')
env.seed(0)
torch.manual_seed(0)

# --- Logging ---------------------------------------------------------------
# global_step is the x-axis value passed to the logger; main() increments it
# once per finished episode.
global_step = 0
tb_folder = './reinforce_only'
tb = logger.Logger(tb_folder, name='freeloc')

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [3]:
class Policy(nn.Module):
    """Small feed-forward policy network producing action probabilities.

    The network is ``Linear -> ReLU -> Linear -> softmax``. Besides its
    weights, an instance carries two per-episode buffers that other code
    appends to while an episode is running and clears after each update.

    Args:
        obs_dim: Number of features in one observation (default 4).
        hidden_dim: Width of the hidden layer (default 8).
        n_actions: Number of discrete actions (default 2).

    Attributes:
        saved_log_probs: List of log-probabilities of the actions taken.
        rewards: List of rewards received, one per step.
    """

    def __init__(self, obs_dim=4, hidden_dim=8, n_actions=2):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(obs_dim, hidden_dim)
        self.affine2 = nn.Linear(hidden_dim, n_actions)

        # Episode buffers; start empty and are filled step by step.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Observation tensor whose last axis has ``obs_dim`` entries,
                either batched ``(N, obs_dim)`` or a single ``(obs_dim,)``.

        Returns:
            Tensor of the same leading shape with ``n_actions`` entries on the
            last axis; entries along that axis sum to 1.
        """
        x = F.relu(self.affine1(x))
        action_scores = self.affine2(x)
        # Normalise over the last (action) axis. For 2-D input this is the
        # same as dim=1, but it also works for an unbatched 1-D observation,
        # where dim=1 would be out of range.
        return F.softmax(action_scores, dim=-1)

In [4]:
# Build the policy network and the Adam optimiser that updates its weights.
policy = Policy()
optimizer = optim.Adam(params=policy.parameters(), lr=0.01)

In [5]:
def select_action(state):
    """Sample one action from the current policy for a single observation.

    Args:
        state: NumPy array holding one observation. It is converted to a
            float tensor and given a leading batch axis before being passed
            to the module-level ``policy``.

    Returns:
        The sampled action, read out as element 0 of the action's NumPy
        array.

    Side effects:
        Appends the log-probability of the sampled action to
        ``policy.saved_log_probs`` for use in the later policy update.
    """
    observation = Variable(torch.from_numpy(state).float().unsqueeze(0))
    distribution = Categorical(policy(observation))
    sampled = distribution.sample()
    policy.saved_log_probs.append(distribution.log_prob(sampled))

    # Convert to NumPy and unwrap the single batch entry.
    return sampled.data.numpy()[0]

In [6]:
def finish_episode():
    """Run one policy-gradient update from the buffered episode, then reset.

    Reads ``policy.rewards`` and ``policy.saved_log_probs``, computes the
    discounted return for every step (discount ``gamma``), standardises the
    returns, and takes one ``optimizer`` step on the loss
    ``sum(-log_prob * return)``. Both buffers are emptied afterwards.

    Edge cases:
        * No rewards buffered: nothing to learn from, so the buffers are
          cleared and no optimiser step is taken.
        * Exactly one reward buffered: the standard deviation of a single
          value is NaN, which would turn the loss (and then the weights) into
          NaN; the return is therefore used un-normalised.
    """
    if not policy.rewards:
        # Keep the two buffers in step with each other even on an empty episode.
        del policy.saved_log_probs[:]
        return

    # Accumulate discounted returns back-to-front, then flip once. Appending
    # and reversing is linear, unlike inserting at the head on every step.
    R = 0
    returns = []
    for r in reversed(policy.rewards):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()

    returns = torch.Tensor(returns)
    if len(policy.rewards) > 1:
        # eps keeps the division finite when all returns are equal.
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_loss = [
        -log_prob * ret
        for log_prob, ret in zip(policy.saved_log_probs, returns)
    ]

    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()

    # Reset the buffers in place so the next episode starts clean.
    del policy.rewards[:]
    del policy.saved_log_probs[:]

In [7]:
def main(max_episodes=None, solved_length=195, stable_target=20, log_every=50):
    """Train the policy episode by episode until the stopping rule is met.

    Each episode is played to completion with actions from ``select_action``,
    rewards are buffered on ``policy``, and ``finish_episode`` performs the
    update. The episode length is logged through ``tb`` and printed
    periodically.

    Args:
        max_episodes: Optional cap on the number of episodes. ``None`` (the
            default) keeps the original behaviour of running until solved.
        solved_length: Minimum episode length that counts towards the
            stopping streak (default 195).
        stable_target: Number of consecutive qualifying episodes required to
            stop (default 20).
        log_every: Print progress every this many episodes (default 50).

    Side effects:
        Increments the module-level ``global_step`` once per episode, writes
        a scalar to ``tb``, and prints progress to stdout.
    """
    global global_step
    stable = 0

    # count(1) never ends by itself; an explicit cap bounds the run when the
    # policy fails to converge.
    if max_episodes is None:
        episodes = count(1)
    else:
        episodes = range(1, max_episodes + 1)

    for i_episode in episodes:
        state = env.reset()
        done = False
        t = 0  # number of steps taken in this episode
        while not done:
            action = select_action(state)
            t += 1
            state, reward, done, _ = env.step(action)
            policy.rewards.append(reward)

        finish_episode()
        global_step += 1

        # The step count t is what gets logged under the 'Episode Reward' tag.
        tb.scalar_summary('Episode Reward', t, global_step)

        if i_episode % log_every == 0:
            print('Episode {}\t Length of episode {:5d}'.format(i_episode, t))

        # Any short episode resets the streak to zero.
        stable = stable + 1 if t >= solved_length else 0

        if stable >= stable_target:
            print(i_episode)
            print("Solved")
            break

In [8]:
# Run training; progress lines and the final "Solved" message are printed below.
main()

Episode 50	 Length of episode    27
Episode 100	 Length of episode    19
Episode 150	 Length of episode    15
Episode 200	 Length of episode    98
Episode 250	 Length of episode   191
Episode 300	 Length of episode   190
Episode 350	 Length of episode   200
Episode 400	 Length of episode   132
Episode 450	 Length of episode   189
Episode 500	 Length of episode   118
Episode 550	 Length of episode   200
559
Solved
