In [1]:
import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import math
import torch.nn.functional as F

MAX_EPISODES = 1500
MAX_TIMESTEPS = 200

alpha_policy = 1e-3
alpha_value = 1e-4
gamma = 0.99

def softmax(logits):
    max_logit = torch.max(logits)
    shifted_logits = logits - max_logit
    exp_shifted_logits = torch.exp(shifted_logits)
    softmax_probs = exp_shifted_logits / torch.sum(exp_shifted_logits)
    
    return softmax_probs


device = torch.device('cpu')



In [23]:
class reinforce(nn.Module):

    def __init__(self, baseline = True):
        super(reinforce, self).__init__()
        # policy network
        self.fc1 = nn.Linear(4, 128)
        self.relu = nn.ReLU(inplace=True)
        self.tanh = nn.Tanh()
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 2)


        self.baseline = baseline

        #Value network
        self.v1 = nn.Linear(4, 128)
        self.v2 = nn.Linear(128, 128)
        self.v3 = nn.Linear(128, 1)

        self.params_policy = [
          {"params": self.fc1.parameters()},  {"params": self.fc2.parameters()}, {"params": self.fc3.parameters()}
        ]

        self.params_value = [
          {"params": self.v1.parameters()},  {"params": self.v2.parameters()}, {"params": self.v3.parameters()}
        ]

    def forward(self, x):
        x = self.fc1(x)
        x = self.tanh(x)
        # x = self.fc2(x)
        # x = self.tanh(x)
        x = self.fc3(x)
        x = softmax(x)
        return x # [B, 2]

    def value_forward(self,x):
        x = self.v1(x)
        x = self.relu(x)
        # x = self.v2(x)
        # x = self.relu(x)
        x = self.v3(x)
        return x # [B, 1]

    def get_action(self, state):
        with torch.no_grad():
          state = torch.tensor(state).to(device) # [4,]
          state = torch.unsqueeze(state, 0) # [1, 4]
          
          probs = self.forward(state) # [1, 2]
          probs = torch.squeeze(probs, 0) # [2,]
          
          if math.isnan(probs[0]) or math.isnan(probs[1]):
            print(probs)
            print(state)
            print(self.fc1(state))
      
          action = torch.multinomial(probs, 1, replacement=True)

        return int(action.item())

    def pi(self, s, a):
        s = torch.tensor([s]).to(device)
        probs = self.forward(s)
        probs = torch.squeeze(probs, 0)
        
        return probs[a]

    def v(self,s):
        s = torch.tensor([s]).to(device)
        state_value = self.value_forward(s)

        return state_value

    def update_weight(self, states, actions, rewards, optimizer, optimizer_value = None):
        G = torch.tensor([0]).to(device)
        # for each step of the episode t = T - 1, ..., 0
        # r_tt represents r_{t+1}
        if self.baseline == False:
          for s_t, a_t, r_tt in zip(states[::-1], actions[::-1], rewards[::-1]):
              G = torch.tensor([r_tt]).to(device) + gamma * G
              loss = (-1.0) * G * torch.log(self.pi(s_t, a_t))
              # update policy parameter \theta

              optimizer.zero_grad()
              loss.backward()
              optimizer.step()
        
        else:
          for idx, (s_t, a_t, r_tt) in enumerate(zip(states, actions, rewards)):
              G = 0
              for t in range(idx, len(rewards)):
                G += rewards[t] + gamma * G
                
              delta =  gamma**idx * (G - self.v(s_t).detach())
              print(G)
              loss = (-1.0) * delta * torch.log(self.pi(s_t, a_t))
              
              y = r_tt + gamma * self.v(states[idx + 1]) if idx < len(states)-1 else r_tt + gamma * torch.zeros([1, 1])
              
              val_loss = F.mse_loss(self.v(s_t), y.detach())#-0.5*(G-self.v(s_t))**2
              # update policy and value parameters \theta and \w
              optimizer.zero_grad()
              optimizer_value.zero_grad()
              loss.backward()
              val_loss.backward()
              torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=0.5)
              optimizer.step()
              optimizer_value.step()


In [24]:
def main():

    env = gym.make('CartPole-v1')

    agent = reinforce().to(device)
    optimizer = optim.Adam(agent.params_policy, lr=alpha_policy)
    optimizer_value = optim.Adam(agent.params_value, lr=alpha_value)

    avg_scores = []
    episodic_returns = []
    scores_window = deque(maxlen=100)

    for i_episode in range(MAX_EPISODES):

        state = env.reset()

        states = []
        actions = []
        rewards = [0]
        return_episode = 0

        for timesteps in range(MAX_TIMESTEPS):
            action = agent.get_action(state)
            states.append(state)
            actions.append(action)
            state, reward, done, _ = env.step(action)

            rewards.append(reward)

            # return_episode = reward + gamma * return_episode
            return_episode = reward + 1 * return_episode

            if done and i_episode % 100 == 0:
                print("Episode {} finished after {} timesteps".format(i_episode, timesteps+1))
                break

        if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window)>=195.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            break

        episodic_returns.append(return_episode)

        scores_window.append(return_episode)
        avg_scores.append(np.mean(scores_window))
        if agent.baseline == False:
          agent.update_weight(states, actions, rewards, optimizer)
        else:
          agent.update_weight(states, actions, rewards, optimizer, optimizer_value)

    plt.plot(avg_scores, label = 'Running average')
    plt.plot(episodic_returns, label = 'Epsiodic returns')
    plt.xlabel('Episodes')
    plt.ylabel('Reward')

    env.close()


main()

  logger.warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  logger.warn(


Episode 0 finished after 18 timesteps
Episode 0	Average Score: nan
241945.95607968204
241945.95607968204
121580.37993953872
61095.16579876317
30700.585828524207
15426.927552022215
7751.722387950862
3894.835370829579
1956.7011913716478
982.7644177746973
493.3489536556268
247.41153450031499
123.824891708701
61.7210511099
30.513091010000004
14.830699000000003
6.950100000000001
2.99
5.956346741303047e+59
5.956346741303047e+59
2.9931382771312594e+59
1.5040887976479497e+59
7.558227275558329e+58
3.7980962700702996e+58
1.9085832019858714e+58
9.590791476720845e+57
4.8194143193651415e+57
2.421737355869816e+57
1.2168745601435177e+57
6.1141586882377196e+56
3.0716527017063357e+56
1.542755221037292e+56
7.744650297463192e+55
3.8838955673577734e+55
1.9438178135359656e+55
9.689043694044008e+54
4.7899811607195954e+54
2.3281406917123375e+54
1.0910349283921074e+54
4.693737407940035e+53
1.5698118421204148e+53
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.

RuntimeError: invalid multinomial distribution (encountering probability entry < 0)