In [1]:
import numpy as np
from tqdm import tqdm
import gym
import random
import time
import matplotlib.pyplot as plt
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import wandb
# Start the Weights & Biases run that later wandb.log / wandb.config calls attach to.
wandb.init(project='reinforce_cartpole')

# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
print('GPU', cuda_available)

# Environment the agent interacts with.
env = gym.make('CartPole-v0')

# Length of the observation vector
# (Cart Position, Cart Velocity, Pole Angle, Pole Angular Velocity).
obs_size = env.observation_space.shape[0]
print(f'Observation space: {obs_size}')

# Number of discrete actions offered by the environment (Left, Right).
n_actions = env.action_space.n
print(f'Action space: {n_actions}')

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33msradicwebster[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


GPU False
Observation space: 4
Action space: 2


In [2]:
class MLP(nn.Module):
    """Fully connected policy network with two hidden ReLU layers.

    The network maps an observation vector to one raw, un-normalised score
    per action; no activation is applied to the output layer.

    Args:
        in_features: Length of the input vector. When ``None`` the
            notebook-level ``obs_size`` is used, so ``MLP()`` keeps working.
        out_features: Number of outputs. When ``None`` the notebook-level
            ``n_actions`` is used.
        hidden_sizes: Widths of the first and second hidden layer.
    """

    def __init__(self, in_features=None, out_features=None, hidden_sizes=(64, 128)):
        super().__init__()
        # Fall back to the sizes read from the environment in the first cell.
        if in_features is None:
            in_features = obs_size
        if out_features is None:
            out_features = n_actions
        first_hidden, second_hidden = hidden_sizes

        # Attribute names (and therefore parameter order) are kept as
        # fc1/fc2/fc3 so code that inspects parameters() sees the same layout.
        self.fc1 = nn.Linear(in_features, first_hidden)
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.fc3 = nn.Linear(second_hidden, out_features)

    # Only the forward pass is written by hand; autograd derives the
    # backward pass (gradient computation) from it automatically.
    def forward(self, x):
        """Return the raw output scores for input tensor ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
def forward_prop(network, state):
    """Run ``network`` on a NumPy observation and return its output.

    ``state`` is wrapped as a tensor, cast to float32 and moved to the
    notebook-level ``device`` before being passed to ``network``.
    """
    observation = torch.from_numpy(state)
    observation = observation.float()
    observation = observation.to(device)
    return network(observation)

def softmax_action(state):
    """Sample an action from the softmax policy given by ``policy_net``.

    The network output for ``state`` is treated as un-normalised scores
    along dimension 0. One action index is drawn from the resulting
    categorical distribution.

    Returns:
        (action, log_prob): the sampled index as a Python int, and the
        log-softmax entry for that index as a tensor that stays attached
        to the autograd graph.
    """
    scores = forward_prop(policy_net, state)

    # Draw exactly one sample from the categorical distribution over actions.
    sampler = torch.distributions.Categorical(F.softmax(scores, dim=0))
    action = sampler.sample().item()

    # Log-probability is taken from log_softmax of the same scores.
    all_log_probs = F.log_softmax(scores, dim=0)
    return action, all_log_probs[action]

def optimise_model(log_probs, discounted_rewards):
    """Apply one policy-gradient update from a single finished episode.

    Args:
        log_probs: Per-step log-probability tensors of the chosen actions,
            still attached to the autograd graph.
        discounted_rewards: Per-step rewards, same length as ``log_probs``.
            The training loop in this notebook passes each reward already
            multiplied by ``GAMMA ** step``.

    The weight for step ``i`` is the sum of ``discounted_rewards[i:]``.
    The loss is the negated sum of weight times log-probability. It is
    logged to wandb under ``"loss"`` at the notebook-level ``episode``
    step, then back-propagated through the notebook-level ``optimizer``.

    Returns:
        None. An empty episode is skipped without touching the optimizer.
    """
    # torch.stack raises on an empty list and there is nothing to learn from.
    if len(log_probs) == 0:
        return

    # Suffix sums in one O(n) pass: cumulative sum of the reversed rewards,
    # reversed back. The copy is needed because torch cannot wrap a NumPy
    # view with negative strides.
    rewards = np.asarray(discounted_rewards, dtype=np.float64)
    returns = np.cumsum(rewards[::-1])[::-1].copy()

    stacked_log_probs = torch.stack(list(log_probs))
    returns_t = torch.as_tensor(
        returns, dtype=stacked_log_probs.dtype, device=stacked_log_probs.device
    )
    # Align the per-step weights with the leading (step) dimension even if
    # each log-probability carries extra trailing dimensions.
    returns_t = returns_t.reshape((-1,) + (1,) * (stacked_log_probs.dim() - 1))

    optimizer.zero_grad()
    loss = -(returns_t * stacked_log_probs).sum()
    # Log a plain Python float so the logger does not hold the autograd graph.
    wandb.log({"loss": loss.item()}, step=episode)
    loss.backward()
    optimizer.step()
    return

In [3]:
GAMMA = 0.99          # per-step discount factor applied to rewards
LEARNING_RATE = 1e-4  # step size handed to the Adam optimiser

num_episodes = 10000  # number of episodes the training loop runs

# Save model inputs and hyperparameters with the wandb run
wandb.config.learning_rate = LEARNING_RATE
wandb.config.gamma = GAMMA
wandb.config.num_episodes = num_episodes

# initialise parameterised policy function
policy_net = MLP().to(device)

# Record the output width of every layer. parameters() yields weight then
# bias for each Linear layer, so the odd positions are the bias vectors,
# whose length equals that layer's output width.
params = list(policy_net.parameters())
nodes = [bias.size()[0] for bias in params[1::2]]
wandb.config.nn_nodes = nodes

optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)

# Undiscounted total reward of each finished episode, in order.
episode_rewards = []

# REINFORCE training: play one full episode with the current policy, then
# perform a single policy update from the collected trajectory.
for episode in tqdm(range(num_episodes)):

    # per-episode bookkeeping
    step = 0
    episode_reward = 0
    discounted_rewards = []
    log_probs = []

    # get start state from env
    # NOTE(review): this loop assumes the gym API where reset() returns only
    # the observation and step() returns a 4-tuple - confirm the installed
    # gym version matches.
    state = env.reset()

    terminal = False
    # Test truthiness rather than `is False`: an identity check is never
    # true for a numpy.bool_ value, which would skip the episode silently.
    while not terminal:

        # choose next action and keep its log-probability for the update
        action, log_prob = softmax_action(state)
        log_probs.append(log_prob)

        # take next step and get reward from env
        next_state, reward, terminal, _ = env.step(action)

        # reward tracking: store the reward discounted back to episode start
        discounted_rewards.append(reward * np.power(GAMMA, step))

        # updates
        state = next_state
        step += 1
        episode_reward += reward

    episode_rewards.append(episode_reward)
    wandb.log({"reward": episode_reward}, step=episode)

    optimise_model(log_probs, discounted_rewards)
            

100%|██████████| 10000/10000 [24:45<00:00,  6.73it/s]
