In [None]:
# install necessary packages for rendering openAI gym environment
!apt-get update -qq
!pip install --upgrade pip --quiet

!apt-get install python-opengl swig cmake libopenmpi-dev zlib1g-dev xvfb x11-utils ffmpeg -qq
!pip install stable-baselines[mpi] box2d box2d-kengz pyvirtualdisplay pyglet==1.3.1 --quiet

In [None]:
# start a virtual display
import os
import pyvirtualdisplay

display = pyvirtualdisplay.Display(visible=0, size=(1024, 768))
display.start()

<pyvirtualdisplay.display.Display at 0x7f480988af60>

In [None]:
# we need GLX for rendering the episodes, so check if GLX is available
glxinfo = !xdpyinfo | grep GLX

for line in glxinfo:
    if line.strip() == 'GLX':
        print('GLX is available')
        break
else:
    print('GLX is unavailable')

GLX is available


In [None]:
# necessary imports

import time
from collections import namedtuple

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
# use cuda if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

if device.type == 'cuda':
    device_name = torch.cuda.get_device_name(0)
    # allocated, cached = torch.cuda.memory_allocated(0), torch.cuda.memory_cached(0)    # memory_cached is deprecated
    allocated, cached = torch.cuda.memory_allocated(0), torch.cuda.memory_reserved(0)
    allocated, cached = round(allocated / 1024**3, 1), round(cached / 1024**3, 1)

    print(f'Device name: {device_name}')
    print(f'Memory Usage: Allocated {allocated}GB, Cached {cached}GB')
else:
    print('CUDA is unavailable')

Using device: cpu
CUDA is unavailable


In [None]:
# simple FNN to learn mapping from state to action
class Net(nn.Module):
  def __init__(self, observation_size, no_of_actions):
    super(Net, self).__init__()
    self.fc1 = nn.Linear(observation_size, 128)
    self.fc2 = nn.Linear(128, no_of_actions)
    self.dropout = nn.Dropout(p=0.6)

  def forward(self, x):
    x = F.relu(self.dropout(self.fc1(x)))
    return F.softmax(self.fc2(x), dim=1)

In [None]:
# some hyperparameters
lr = 0.01
steps = 500
no_of_episodes, generations = 1000, 500
percentile, expected_reward_mean = 70, 200

# misc
seed = 2
required_reward_threshold = 200

In [None]:
# prepare gym environment
env = gym.make("CartPole-v1")
env_wrapper = gym.wrappers.Monitor(env, directory="CartPole_simple", force=True, video_callable=lambda episode_idx: True)

# get observation space and number of actions of the environment
observation_size, no_of_actions = env.observation_space.shape[0], env.action_space.n

In [None]:
# set seed to avoid randomness
env.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f480aabce88>

In [None]:
net = Net(observation_size, no_of_actions).to(device=device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=lr)

In [None]:
def generate_episode(env, net, max_steps=1000):
    states, actions, episode_reward = [], [], 0
    state = env.reset()
    for step in range(max_steps):
        tensor_state = torch.FloatTensor([state]).to(device=device)
        act_probs = net(tensor_state)
        act_probs = act_probs.data.cpu().numpy()[0]
        action = np.random.choice(act_probs.shape[0], p=act_probs)
        next_state, reward, is_done, info = env.step(action)
        
        states.append(state)
        actions.append(action)
        episode_reward += reward
        
        state = next_state
        if is_done:
            return True, (states, actions, episode_reward)
    return False, None


def generate_episodes(env, net, episode_size, max_steps=1000):
    e_states, e_actions, e_rewards = [], [], []
    for episode in range(episode_size):
        is_done, episode_data = generate_episode(env, net, max_steps)
        if is_done:
            states, actions, episode_reward = episode_data
            e_states.append(states)
            e_actions.append(actions)
            e_rewards.append(episode_reward)
    return e_states, e_actions, e_rewards


def filter_episodes(states, actions, rewards, percentile=70):
    filtered_states, filtered_actions = [], []
    reward_threshold = np.percentile(rewards, percentile)
    
    # add entire episode to (filtered_states, filtered_actions),
    # if its reward is higher than reward_threshold
    for idx,reward in enumerate(rewards):
        if reward > reward_threshold:
            filtered_states.append(np.array(states[idx], dtype=np.float32))
            filtered_actions.append(np.array(actions[idx], dtype=np.long))
    
    return np.concatenate(filtered_states), np.concatenate(filtered_actions), reward_threshold

In [None]:
def train_agent():
    for gen in range(generations):
        start_timer = time.time()
        if not gen % record_interval: generate_episode(env_wrapper, net, max_steps=steps)
        states, actions, rewards = generate_episodes(env, net, no_of_episodes, max_steps=steps)
        filtered_states, filtered_actions, reward_threshold = filter_episodes(states, actions, rewards, percentile)
        
        tensor_states = torch.from_numpy(filtered_states).to(device=device)
        tensor_actions = torch.from_numpy(filtered_actions).to(device=device)
        
        optimizer.zero_grad()
        action_probs = net(tensor_states)
        loss = criterion(action_probs, tensor_actions)
        loss.backward()
        optimizer.step()

        reward_mean = np.mean(rewards)
        time_per_gen = round(time.time() - start_timer)
        print('{:4d}[{:3d}sec]: loss={:.3f}, reward_mean={:.1f}, reward_threshold={:.1f}'.format(
            gen, time_per_gen, loss.item(), reward_mean, reward_threshold)
        )
        
        if reward_mean > expected_reward_mean:
            print('expected reward mean has been reached')
            return

In [13]:
start_timer = time.time()
record_interval = 2
train_agent()
print(f'Total time taken: {time.time() - start_timer}')

   0[  8sec]: loss=0.695, reward_mean=22.1, reward_threshold=25.0
   1[  8sec]: loss=0.686, reward_mean=27.7, reward_threshold=31.0
   2[ 11sec]: loss=0.673, reward_mean=34.4, reward_threshold=39.3
   3[ 13sec]: loss=0.660, reward_mean=42.9, reward_threshold=51.0
   4[ 16sec]: loss=0.651, reward_mean=49.9, reward_threshold=58.0
   5[ 17sec]: loss=0.640, reward_mean=55.0, reward_threshold=63.0
   6[ 20sec]: loss=0.630, reward_mean=61.2, reward_threshold=69.0
   7[ 19sec]: loss=0.620, reward_mean=63.6, reward_threshold=73.0
   8[ 22sec]: loss=0.610, reward_mean=66.9, reward_threshold=74.0
   9[ 20sec]: loss=0.600, reward_mean=68.1, reward_threshold=75.0
  10[ 23sec]: loss=0.590, reward_mean=69.8, reward_threshold=76.0
  11[ 23sec]: loss=0.582, reward_mean=70.6, reward_threshold=79.0
  12[ 22sec]: loss=0.572, reward_mean=72.0, reward_threshold=78.0
  13[ 22sec]: loss=0.561, reward_mean=74.8, reward_threshold=81.0
  14[ 23sec]: loss=0.553, reward_mean=73.0, reward_threshold=80.0
  15[ 23se

In [14]:
# finally, close gym environment
env_wrapper.close()
env.close()

In [15]:
# save the model parameters
torch.save(net, 'CartPole_simple.pth')