## Setup arguments

This automatically exposes the parsed arguments as Jupyter input widgets, making the notebook more user-friendly :)

In [45]:
import ipywidgets as widgets
from IPython.display import display
from distutils.util import strtobool

from jupyter_utils import JupyterArgumentParser


def get_args():
    """Declare the DQN experiment options and return the configured parser.

    The parser itself is returned rather than a parsed namespace; the caller
    invokes ``parse_args()`` on it later in the notebook.

    Returns:
        JupyterArgumentParser: parser with every experiment option registered,
        in the order listed below.
    """

    def _to_bool(text):
        # Accepts the usual spellings ("true", "0", "yes", ...) handled by strtobool.
        return bool(strtobool(text))

    parser = JupyterArgumentParser()
    parser.add_argument("--seed", type=int, default=1,
        help="seed of the experiment")
    # The value is forwarded unchanged to `seed_everything`, so True asks for determinism.
    parser.add_argument("--torch-deterministic", type=_to_bool, default=True, nargs="?", const=True,
        help="whether to request deterministic torch behaviour (forwarded to `seed_everything`)")
    parser.add_argument("--cuda", type=_to_bool, default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--capture-video", type=_to_bool, default=True, nargs="?", const=True,
        help="whether to capture videos of the agent performances (check out `videos` folder)")
    parser.add_argument("--env-id", type=str, default="BreakoutNoFrameskip-v4",
        help="the id of the environment")
    parser.add_argument("--total-timesteps", type=int, default=10000000,
        help="total timesteps of the experiments")
    parser.add_argument("--learning-rate", type=float, default=1e-4,
        help="the learning rate of the optimizer")
    parser.add_argument("--buffer-size", type=int, default=1000000,
        help="the replay memory buffer size")
    parser.add_argument("--gamma", type=float, default=0.99,
        help="the discount factor gamma")
    parser.add_argument("--target-network-frequency", type=int, default=1000,
        help="the timesteps it takes to update the target network")
    parser.add_argument("--batch-size", type=int, default=32,
        help="the batch size of sample from the replay memory")
    parser.add_argument("--start-e", type=float, default=1,
        help="the starting epsilon for exploration")
    parser.add_argument("--end-e", type=float, default=0.01,
        help="the ending epsilon for exploration")
    parser.add_argument("--exploration-fraction", type=float, default=0.10,
        help="the fraction of `total-timesteps` it takes from start-e to go end-e")
    parser.add_argument("--learning-starts", type=int, default=8000,
        help="timestep to start learning")
    parser.add_argument("--train-frequency", type=int, default=4,
        help="the frequency of training")
    return parser


# Build the parser once for the notebook; `parse_args()` is called on it in a later cell.
parser = get_args()

IntText(value=1, description='seed')

Checkbox(value=True, description='torch_deterministic', indent=False)

Checkbox(value=True, description='cuda', indent=False)

Checkbox(value=True, description='capture_video', indent=False)

Text(value='BreakoutNoFrameskip-v4', description='env_id')

IntText(value=10000000, description='total_timesteps')

FloatText(value=0.0001, description='learning_rate')

IntText(value=1000000, description='buffer_size')

FloatText(value=0.99, description='gamma')

IntText(value=1000, description='target_network_frequency')

IntText(value=32, description='batch_size')

FloatText(value=1.0, description='start_e')

FloatText(value=0.01, description='end_e')

FloatText(value=0.1, description='exploration_fraction')

IntText(value=8000, description='learning_starts')

IntText(value=4, description='train_frequency')

## Setup wrappers

Use preprocessing wrappers to transform the inputs according to baselines.

In [74]:
import time
import gym
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from atari import make_atari_env
from utils import seed_everything
from buffers import ReplayBuffer

Parse the experiment arguments.

In [59]:
# Resolve the option values and echo them as one `|name=value|` row per option.
args = parser.parse_args()
print(
    "|param=value|\n|-|-|\n"
    + "\n".join(f"|{key}={value}|" for key, value in vars(args).items())
)

|param=value|
|-|-|
|seed=1|
|torch_deterministic=True|
|cuda=True|
|capture_video=True|
|env_id=BreakoutNoFrameskip-v4|
|total_timesteps=10000000|
|learning_rate=0.0001|
|buffer_size=1000000|
|gamma=0.99|
|target_network_frequency=1000|
|batch_size=32|
|start_e=1.0|
|end_e=0.01|
|exploration_fraction=0.1|
|learning_starts=8000|
|train_frequency=4|


In [46]:
# Run identifier: environment id, algorithm tag, seed and launch time (epoch seconds).
run_name = "__".join(str(part) for part in (args.env_id, "dqn", args.seed, int(time.time())))
# Use the GPU only when one is available and the `cuda` option is enabled.
device = torch.device("cuda") if torch.cuda.is_available() and args.cuda else torch.device("cpu")
seed_everything(args.seed, args.torch_deterministic)

## Setup environment

In [62]:
# Vectorised wrapper around a single environment (index 0) built by `make_atari_env`.
# NOTE(review): presumably `make_atari_env` returns an env factory and uses `run_name`
# for video output when `capture_video` is set — confirm in atari.py.
envs = gym.vector.SyncVectorEnv([make_atari_env(args.env_id, args.seed, 0, args.capture_video, run_name)])
# QNetwork sizes its output layer from `single_action_space.n`, so only Discrete spaces work.
assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


## Few examples

Show a few environment steps and the transformed inputs.

## Define Q Network

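The Q-network is a small convolutional network: three convolutional layers followed by two fully connected layers. It takes a 4-channel observation (scaled by 1/255) and outputs one Q-value per discrete action.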

In [67]:
class QNetwork(nn.Module):
    """Convolutional Q-value estimator producing one value per discrete action."""

    def __init__(self, env):
        """Build the layer stack; ``env.single_action_space.n`` sets the output width."""
        super().__init__()
        num_actions = env.single_action_space.n
        # Layers are instantiated in forward order, so positions inside the
        # Sequential (and therefore the state-dict keys) stay 0..9.
        feature_layers = [
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        head_layers = [
            nn.Linear(in_features=3136, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=num_actions),
        ]
        self.network = nn.Sequential(*feature_layers, *head_layers)

    def forward(self, x):
        """Scale the input by 1/255 and return the network output."""
        scaled = x / 255.0
        return self.network(scaled)

## Setup DQN training

Set up the Q-network and target network, along with the replay buffer, optimizer, and epsilon schedule.

In [70]:
def linear_schedule(start_e: float, end_e: float, duration: float, t: int):
    """Linearly move from ``start_e`` to ``end_e`` over ``duration`` steps, then hold.

    Args:
        start_e: value at ``t == 0``.
        end_e: value reached at ``t == duration`` and held afterwards.
        duration: number of steps the transition takes; may be fractional
            (the caller passes ``exploration_fraction * total_timesteps``).
        t: current step.

    Returns:
        The scheduled value at step ``t``. When ``duration`` is zero or
        negative there is nothing to interpolate, so ``end_e`` is returned.
    """
    if duration <= 0:
        # Avoids a ZeroDivisionError when the exploration fraction is set to 0.
        return end_e
    slope = (end_e - start_e) / duration
    value = slope * t + start_e
    # Clamp on the side of end_e the schedule is heading towards, so an
    # increasing schedule is capped as well instead of growing without bound.
    if slope <= 0:
        return max(value, end_e)
    return min(value, end_e)

In [69]:
# Online network (the one being optimised) and its Adam optimiser.
q_network = QNetwork(envs).to(device)
optimizer = optim.Adam(q_network.parameters(), lr=args.learning_rate)
# Target network: same architecture, initialised as an exact copy of the online weights.
target_network = QNetwork(envs).to(device)
target_network.load_state_dict(q_network.state_dict())

# Replay buffer sized by `buffer_size`, built from the single-env observation/action spaces.
# NOTE(review): the effect of `optimize_memory_usage=True` depends on the local
# `buffers` module — confirm its storage layout before relying on it.
rb = ReplayBuffer(
    args.buffer_size,
    envs.single_observation_space,
    envs.single_action_space,
    device,
    optimize_memory_usage=True
)



In [84]:
def train_step(obs, global_step):
    """Run one environment step and, when due, one DQN update.

    Relies on the notebook-level ``args``, ``envs``, ``q_network``,
    ``target_network``, ``optimizer``, ``rb`` and ``device``. Episode returns
    are printed; nothing else is logged.

    Args:
        obs: current batched observation from ``envs``.
        global_step: index of this step; drives the epsilon schedule and the
            training / target-sync cadence.

    Returns:
        The batched observation returned by ``envs.step``, to be passed as
        ``obs`` on the next call.
    """
    # ALGO LOGIC: epsilon-greedy action selection
    epsilon = linear_schedule(args.start_e, args.end_e, args.exploration_fraction * args.total_timesteps, global_step)
    if random.random() < epsilon:
        actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
    else:
        # Picking the greedy action needs no gradients; skip building the graph.
        with torch.no_grad():
            q_values = q_network(torch.Tensor(obs).to(device))
        actions = torch.argmax(q_values, dim=1).cpu().numpy()

    # TRY NOT TO MODIFY: execute the game and log data.
    next_obs, rewards, dones, infos = envs.step(actions)

    # TRY NOT TO MODIFY: record rewards for plotting purposes
    for info in infos:
        if "episode" in info.keys():
            print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
            break

    # TRY NOT TO MODIFY: save data to replay buffer; handle `terminal_observation`
    # When an episode ended, `next_obs` is replaced by the final observation
    # reported in `infos`, so that observation is what gets stored.
    real_next_obs = next_obs.copy()
    for idx, d in enumerate(dones):
        if "terminal_observation" in infos[idx].keys() and d:
            real_next_obs[idx] = infos[idx]["terminal_observation"]
    rb.add(obs, real_next_obs, actions, rewards, dones)

    # ALGO LOGIC: training.
    if global_step > args.learning_starts:
        if global_step % args.train_frequency == 0:
            data = rb.sample(args.batch_size)
            with torch.no_grad():
                target_max, _ = target_network(data.next_observations).max(dim=1)
                td_target = data.rewards.flatten() + args.gamma * target_max * (1 - data.dones.flatten())
            # squeeze(1) drops only the gathered-action axis, so a batch of
            # size 1 keeps its batch dimension and matches td_target's shape.
            old_val = q_network(data.observations).gather(1, data.actions).squeeze(1)
            loss = F.mse_loss(td_target, old_val)

            # optimize the model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # update the target network; checked on every step after learning
        # starts so the sync is not skipped when target_network_frequency
        # does not line up with train_frequency
        if global_step % args.target_network_frequency == 0:
            target_network.load_state_dict(q_network.state_dict())
    return next_obs

In [85]:
# Drive training for `total_timesteps` steps. The environments are closed in
# `finally` so that interrupting the cell (or an error inside a step) still
# releases them.
try:
    obs = envs.reset()
    for global_step in range(args.total_timesteps):
        next_obs = train_step(obs, global_step)
        obs = next_obs
finally:
    envs.close()

global_step=255, episodic_return=4.0
global_step=416, episodic_return=1.0
global_step=559, episodic_return=1.0
global_step=846, episodic_return=4.0
global_step=961, episodic_return=0.0
global_step=1247, episodic_return=4.0
global_step=1456, episodic_return=2.0
global_step=1571, episodic_return=0.0
global_step=1759, episodic_return=2.0
global_step=1920, episodic_return=1.0
global_step=2033, episodic_return=0.0
global_step=2144, episodic_return=0.0
global_step=2259, episodic_return=0.0
global_step=2374, episodic_return=0.0
global_step=2612, episodic_return=3.0
global_step=2725, episodic_return=0.0
global_step=2838, episodic_return=0.0
global_step=3157, episodic_return=5.0
global_step=3270, episodic_return=0.0
global_step=3385, episodic_return=0.0
global_step=3542, episodic_return=1.0
global_step=3653, episodic_return=0.0
global_step=3766, episodic_return=0.0
global_step=3877, episodic_return=0.0
global_step=3992, episodic_return=0.0
global_step=4134, episodic_return=1.0
global_step=4247,

KeyboardInterrupt: 