In [1]:
# install necessary packages for rendering openAI gym environment
# Refresh the apt package index and upgrade pip before installing anything.
!apt-get update -qq
!pip install --upgrade pip --quiet

# System packages: xvfb backs the virtual display started in the next cell and
# x11-utils supplies the `xdpyinfo` tool used by the GLX check further down.
# (ffmpeg is presumably needed for the episode videos -- TODO confirm.)
!apt-get install python-opengl swig cmake libopenmpi-dev zlib1g-dev xvfb x11-utils ffmpeg -qq
# NOTE(review): only pyglet is version-pinned; the other pip packages float to
# whatever is current, so a fresh run may resolve different versions.
!pip install stable-baselines[mpi] box2d box2d-kengz pyvirtualdisplay pyglet==1.3.1 --quiet

[K     |████████████████████████████████| 1.5MB 12.4MB/s 
[?25hSelecting previously unselected package libxxf86dga1:amd64.
(Reading database ... 145480 files and directories currently installed.)
Preparing to unpack .../0-libxxf86dga1_2%3a1.1.4-1_amd64.deb ...
Unpacking libxxf86dga1:amd64 (2:1.1.4-1) ...
Selecting previously unselected package python-opengl.
Preparing to unpack .../1-python-opengl_3.1.0+dfsg-1_all.deb ...
Unpacking python-opengl (3.1.0+dfsg-1) ...
Selecting previously unselected package swig3.0.
Preparing to unpack .../2-swig3.0_3.0.12-1_amd64.deb ...
Unpacking swig3.0 (3.0.12-1) ...
Selecting previously unselected package swig.
Preparing to unpack .../3-swig_3.0.12-1_amd64.deb ...
Unpacking swig (3.0.12-1) ...
Selecting previously unselected package x11-utils.
Preparing to unpack .../4-x11-utils_7.7+3build1_amd64.deb ...
Unpacking x11-utils (7.7+3build1) ...
Selecting previously unselected package xvfb.
Preparing to unpack .../5-xvfb_2%3a1.19.6-1ubuntu4.8_amd64.deb 

In [2]:
# start a virtual display
# The kernel has no physical screen, so a pyvirtualdisplay `Display` is created
# and started here; the GLX check in the following cell queries it.
# NOTE(review): `os` is imported but not referenced by any cell in view.
import os
import pyvirtualdisplay

# visible=0 asks for a non-visible display; size is presumably (width, height)
# in pixels -- confirm against the pyvirtualdisplay docs.
display = pyvirtualdisplay.Display(visible=0, size=(1024, 768))
display.start()

<pyvirtualdisplay.display.Display at 0x7f36dc851b70>

In [3]:
# Episode rendering depends on the GLX extension, so ask the X server (through
# xdpyinfo) whether it advertises it and report the result.
glxinfo = !xdpyinfo | grep GLX

# A line that is exactly 'GLX' once stripped means the extension is listed.
print(
    'GLX is available'
    if any(line.strip() == 'GLX' for line in glxinfo)
    else 'GLX is unavailable'
)

GLX is available


In [4]:
# necessary imports

import time
import itertools
from collections import namedtuple

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as dist

In [5]:
# Select the compute device: CUDA when torch can see a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

if device.type != 'cuda':
    print('CUDA is unavailable')
else:
    device_name = torch.cuda.get_device_name(0)
    # memory_reserved is used in place of the deprecated memory_cached
    allocated = torch.cuda.memory_allocated(0)
    cached = torch.cuda.memory_reserved(0)
    # convert bytes to GiB, rounded to one decimal place
    allocated = round(allocated / 1024**3, 1)
    cached = round(cached / 1024**3, 1)

    print(f'Device name: {device_name}')
    print(f'Memory Usage: Allocated {allocated}GB, Cached {cached}GB')

Using device: cuda
Device name: Tesla T4
Memory Usage: Allocated 0.0GB, Cached 0.0GB


In [6]:
# simple FNN to learn mapping from state to action
class Net(nn.Module):
    """Two-layer feed-forward policy network.

    Maps an observation vector to a probability distribution over the
    discrete actions: Linear -> Dropout -> ReLU -> Linear -> Softmax.

    Args:
        observation_size: number of features in one observation.
        no_of_actions: number of discrete actions (size of the output).
        hidden_size: width of the hidden layer (default 128).
        dropout_p: dropout probability applied to the hidden layer
            (default 0.6); dropout is only active in training mode.
    """

    def __init__(self, observation_size, no_of_actions, hidden_size=128, dropout_p=0.6):
        super().__init__()
        # Layers are created in the same order as before (fc1, fc2, dropout) so
        # seeded weight initialisation and state_dict keys are unchanged.
        self.fc1 = nn.Linear(observation_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, no_of_actions)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Return action probabilities for `x`.

        `x` may be a single observation of shape (observation_size,) or a
        batch of shape (..., observation_size); the output has the same
        leading shape with the last axis replaced by `no_of_actions`.
        """
        x = F.relu(self.dropout(self.fc1(x)))
        # Normalise over the action axis (the last one). Using dim=-1 instead
        # of dim=1 gives identical results for batched 2-D input and also
        # accepts an unbatched 1-D observation.
        return F.softmax(self.fc2(x), dim=-1)

In [7]:
# training hyperparameters
lr = 0.01       # learning rate handed to the optimizer
gamma = 0.99    # discount factor applied to future rewards
eps = float(np.finfo(np.float32).eps)   # float32 machine epsilon, guards the division in `learn`

# bookkeeping
seed = 2                # seed for the environment and torch
log_interval = 10       # print progress every N episodes
record_interval = 50    # record a video every N episodes

In [8]:
# Build the CartPole environment and wrap it in a Monitor that writes its
# output under "CartPole_Q_learning" (force=True is passed for that directory).
env = gym.make("CartPole-v1")
env_wrapper = gym.wrappers.Monitor(
    env,
    directory="CartPole_Q_learning",
    force=True,
    # record episode 0 and every `record_interval`-th episode after it
    video_callable=lambda episode_idx: episode_idx % record_interval == 0,
)

# Network dimensions come from the environment itself:
# length of the observation vector and number of discrete actions.
observation_size = env.observation_space.shape[0]
no_of_actions = env.action_space.n

In [9]:
# set seed to avoid randomness
# Seeds the environment and torch's global RNG with the shared `seed` so that
# environment resets and action sampling are intended to be repeatable.
# NOTE(review): seeding goes through `env`, the object that `env_wrapper`
# wraps; numpy's RNG is not seeded here -- confirm nothing relies on it.
env.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f36dda7de88>

In [10]:
# Loss object kept under its original name. The `learn` function in this
# notebook builds its own policy-gradient loss and does not reference it.
criterion = nn.CrossEntropyLoss()

# Policy network placed on the selected device, optimised with Adam.
net = Net(observation_size, no_of_actions).to(device)
optimizer = optim.Adam(net.parameters(), lr=lr)

In [11]:
def get_action(state, saved_log_probs):
    """Sample an action for `state` from the current policy.

    The numpy observation is converted to a float tensor with a leading batch
    axis, passed through the global `net`, and an action is drawn from the
    resulting categorical distribution.

    Args:
        state: numpy array holding one observation.
        saved_log_probs: list that receives the log-probability tensor of the
            sampled action (appended in place) for use by `learn`.

    Returns:
        The sampled action as a plain Python number.
    """
    batch = torch.from_numpy(state).float().unsqueeze(0)
    action_probs = net(batch.to(device))
    policy = dist.Categorical(action_probs)
    chosen = policy.sample()
    # keep the log-probability so the policy gradient can be formed later
    saved_log_probs.append(policy.log_prob(chosen))
    return chosen.item()

def run_episode(env, rewards, saved_log_probs, max_steps=10000):
    """Play one episode with the current policy and record its trajectory.

    Args:
        env: environment exposing `reset()` and `step(action)`, where `step`
            returns a `(state, reward, is_done, info)` tuple.
        rewards: list that receives the reward of every step (appended in place).
        saved_log_probs: list forwarded to `get_action`, which appends the
            log-probability of each sampled action.
        max_steps: upper bound on the number of steps taken in the episode.

    Returns:
        `(total_reward, steps_taken)` for the episode. When `max_steps` is 0
        no step is taken and `(0, 0)` is returned.
    """
    total_reward = 0
    # Counted explicitly rather than derived from the loop variable, which is
    # never bound when the loop body does not run (max_steps == 0).
    steps_taken = 0
    state = env.reset()
    for _ in range(max_steps):
        action = get_action(state, saved_log_probs)
        state, reward, is_done, _info = env.step(action)
        rewards.append(reward)
        total_reward += reward
        steps_taken += 1
        if is_done:
            break
    return total_reward, steps_taken

def learn(rewards, saved_log_probs):
    """Apply one REINFORCE policy-gradient update from a finished episode.

    Discounted returns are computed back to front with the global `gamma`,
    standardised, and combined with the stored log-probabilities into the
    loss `sum(-log_prob_t * return_t)`, which is minimised with one step of
    the global `optimizer`.

    Args:
        rewards: per-step rewards of the episode, in time order.
        saved_log_probs: log-probability tensors of the actions taken, in the
            same order (as appended by `get_action`).

    An empty episode is a no-op. An episode with a single step skips the
    standardisation, because the sample standard deviation of one value is
    NaN and would otherwise turn the loss and the network weights into NaN.
    """
    if not rewards or not saved_log_probs:
        # nothing to learn from, and torch.cat would raise on an empty list
        return

    # G_t = r_t + gamma * G_{t+1}, accumulated from the last step backwards
    R, returns = 0.0, []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.append(R)

    # Build the returns on the same device as the log-probabilities and with
    # an explicit float dtype so integer rewards do not yield an int tensor.
    returns = torch.tensor(
        returns[::-1],
        dtype=torch.float32,
        device=saved_log_probs[0].device,
    )
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_loss = [
        -log_prob * step_return
        for log_prob, step_return in zip(saved_log_probs, returns)
    ]
    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()

def main(max_episodes=None):
    """Train the policy until the environment's reward threshold is reached.

    Each iteration plays one episode on `env_wrapper`, performs one `learn`
    update, and folds the episode reward into an exponential moving average
    (`running_reward`, starting at 10, smoothing factor 0.05). Progress is
    printed every `log_interval` episodes. Training stops as soon as the
    running reward exceeds `env.spec.reward_threshold`.

    Args:
        max_episodes: optional cap on the number of episodes. With the default
            `None` the loop is unbounded, as before, and only ends when the
            threshold is reached.
    """
    running_reward = 10
    smoothing = 0.05
    if max_episodes is None:
        episode_indices = itertools.count(1)
    else:
        episode_indices = range(1, max_episodes + 1)

    for episode_idx in episode_indices:
        rewards, saved_log_probs = [], []
        reward, _ = run_episode(env_wrapper, rewards, saved_log_probs)
        learn(rewards, saved_log_probs)
        running_reward = smoothing * reward + (1 - smoothing) * running_reward
        if episode_idx % log_interval == 0:
            print(f'{episode_idx:4d}: reward={reward:4.2f}  running_reward={running_reward:4.2f}')
        if running_reward > env.spec.reward_threshold:
            print('reward threshold reached!')
            break
    else:
        # only reachable when max_episodes is set and the cap ran out
        print(f'stopped after {max_episodes} episodes without reaching the reward threshold')

In [12]:
# Run the training loop and report how long it took (time.time() difference, in seconds).
start_timer = time.time()
main()
elapsed_seconds = time.time() - start_timer
print(f'Total time taken: {elapsed_seconds}')

  10: reward=15.00  running_reward=15.00
  20: reward=49.00  running_reward=32.84
  30: reward=65.00  running_reward=40.67
  40: reward=43.00  running_reward=50.56
  50: reward=87.00  running_reward=51.61
  60: reward=79.00  running_reward=59.28
  70: reward=157.00  running_reward=87.11
  80: reward=161.00  running_reward=112.21
  90: reward=244.00  running_reward=131.09
 100: reward=199.00  running_reward=154.18
 110: reward=201.00  running_reward=173.51
 120: reward=178.00  running_reward=201.72
 130: reward=182.00  running_reward=187.51
 140: reward=138.00  running_reward=164.24
 150: reward=329.00  running_reward=171.90
 160: reward=108.00  running_reward=256.15
 170: reward=324.00  running_reward=325.75
 180: reward=186.00  running_reward=339.78
 190: reward=158.00  running_reward=287.15
 200: reward=121.00  running_reward=224.82
 210: reward=72.00  running_reward=176.36
 220: reward=35.00  running_reward=144.69
 230: reward=110.00  running_reward=131.25
 240: reward=105.00  runni

In [13]:
# finally, close gym environment
# The Monitor wrapper is closed first, then the underlying environment it wraps.
# (Closing the wrapper presumably finalises any pending recording -- TODO confirm.)
env_wrapper.close()
env.close()

In [14]:
# Persist the trained policy network to disk.
# NOTE(review): the `net` module object itself is handed to torch.save, not
# `net.state_dict()`, so the file presumably holds the whole pickled module
# rather than just its parameters, and loading it would need the `Net` class
# to be importable. Confirm which format downstream loaders expect before
# changing this.
# NOTE(review): the file name says "Q_learning", but the training code above
# is a policy-gradient update (see `learn`).
torch.save(net, 'CartPole_Q_learning.pth')