In [1]:
import functools

from sample_factory.algo.utils.context import global_model_factory
from sample_factory.cfg.arguments import parse_full_cfg, parse_sf_args
from sample_factory.envs.env_utils import register_env
from sample_factory.train import run_rl

from sf_examples.vizdoom.doom.doom_model import make_vizdoom_encoder
from sf_examples.vizdoom.doom.doom_params import add_doom_env_args, doom_override_defaults
from sf_examples.vizdoom.doom.doom_utils import DOOM_ENVS, make_doom_env_from_spec



In [2]:
def register_vizdoom_envs():
    """Register every ViZDoom environment listed in ``DOOM_ENVS`` with Sample Factory.

    Each spec is bound to ``make_doom_env_from_spec`` via ``functools.partial`` and
    the resulting callable is registered under the spec's own ``name``.
    """
    for spec in DOOM_ENVS:
        # Bind the spec now; whoever calls the factory later supplies the remaining arguments.
        env_factory = functools.partial(make_doom_env_from_spec, spec)
        register_env(spec.name, env_factory)


def register_vizdoom_models():
    """Register ``make_vizdoom_encoder`` as the encoder factory on the global model factory.

    Sample Factory allows the registration of a custom neural network architecture; see
    https://github.com/alex-petrenko/sample-factory/blob/master/sf_examples/vizdoom/doom/doom_model.py
    for more details.
    """
    model_factory = global_model_factory()
    model_factory.register_encoder_factory(make_vizdoom_encoder)


def register_vizdoom_components():
    """Run both ViZDoom registration steps: environments first, then the custom encoder."""
    for register_step in (register_vizdoom_envs, register_vizdoom_models):
        register_step()


def parse_vizdoom_cfg(argv=None, evaluation=False):
    """Parse command-line style arguments into a final ViZDoom config.

    Args:
        argv: Argument strings (e.g. ``["--env=doom_basic"]``) or ``None``; forwarded
            unchanged to both ``parse_sf_args`` and ``parse_full_cfg``.
        evaluation: Forwarded to ``parse_sf_args``; the evaluation cells in this
            notebook pass ``True``.

    Returns:
        The configuration object produced by ``parse_full_cfg``.
    """
    # First pass: build the base parser (the second returned value is not needed here).
    arg_parser, _unused = parse_sf_args(argv=argv, evaluation=evaluation)

    # Extend the parser with Doom-specific arguments, then apply the Doom overrides
    # of the default algorithm parameters.
    add_doom_env_args(arg_parser)
    doom_override_defaults(arg_parser)

    # Second parsing pass yields the final configuration.
    return parse_full_cfg(arg_parser, argv)



## Train the agent

In [3]:
# Register the ViZDoom envs and the custom encoder before building the config.
# NOTE(review): the original note estimated ~15 minutes of training; confirm for this step budget.
register_vizdoom_components()

# The scenario we train on today is health gathering.
# Other scenarios include "doom_basic", "doom_two_colors_easy", "doom_dm", "doom_dwango5", "doom_my_way_home", "doom_deadly_corridor", "doom_defend_the_center", "doom_defend_the_line"
env = "doom_health_gathering_supreme"

# No --experiment flag is passed; the log below shows the run using train_dir/default_experiment.
train_args = [
    "--env=" + env,
    "--num_workers=8",
    "--num_envs_per_worker=4",
    "--train_for_env_steps=20000000",
]
cfg = parse_vizdoom_cfg(argv=train_args)

# Kept as a bare last expression so the notebook displays the value returned by run_rl.
run_rl(cfg)


[36m[2024-07-03 21:59:52,524][50933] register_encoder_factory: <function make_vizdoom_encoder at 0x720221f6a560>[0m
[33m[2024-07-03 21:59:52,531][50933] Saved parameter configuration for experiment default_experiment not found![0m
[33m[2024-07-03 21:59:52,533][50933] Starting experiment from scratch![0m
[36m[2024-07-03 21:59:52,539][50933] Experiment dir /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/default_experiment already exists![0m
[36m[2024-07-03 21:59:52,539][50933] Resuming existing experiment from /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/default_experiment...[0m
[36m[2024-07-03 21:59:52,540][50933] Weights and Biases integration disabled[0m
[36m[2024-07-03 21:59:55,257][50933] Queried available GPUs: 0
[0m
[37m[1m[2024-07-03 21:59:55,257][50933] Environment var CUDA_VISIBLE_DEVICES is 0
[0m
[36m[2024-07-03 21:59:57,119][52223] Doom resolution: 160x120, resize resolution: (128, 72)[0m
  logger.warn(
  logger.warn(
[

0

    "--algo"="APPO",                              Default
    "--env"="doom_health_gathering_supreme",
    "--experiment"="vizdoom_doom_health_gathering_supreme_2222",
    "--train_dir"="/scratch/sample_factory/train_dir/vizdoom",
    "--seed=2222",
    "--num_policies=1",                           Default
    "--num_workers=20",                           12
    "--num_envs_per_worker=12",                   2   Number of envs on a single CPU actor, in high throughput configurations this should be in 10-30 range for 
                                                      Atari/VizDoom. Must be even for double-buffered sampling! (default: 2)
    "--batch_size=2048",                          1024
    "--num_epochs=1",                             Default
    "--rollout=32",                               Default
    "--recurrence=32",                            Default value (-1) sets recurrence to rollout length for RNNs and to 1 (no recurrence) for feed-forward nets.
    "--gamma=0.99",                               Default
    "--max_grad_norm=0.0",                        Max L2 norm of the gradient vector, set to 0 to disable gradient clipping (default: 4.0)
    "--decorrelate_experience_max_seconds=1",     0
    "--heartbeat_reporting_interval=300",         180     How often in seconds the runner checks for heartbeats
    "--train_for_seconds=3600000",                10000000000     Stop training after this many seconds
    "--benchmark=false",                          Default
    "--use_rnn=true",                             Default
    "--rnn_type="lstm"",                          gru
    "--nonlinearity="relu""                       elu

    "--num_batches_per_epoch"                     1

  --encoder_conv_architecture {convnet_simple,convnet_impala,convnet_atari,resnet_impala}
                        Architecture of the convolutional encoder. See
                        models.py for details. VizDoom and DMLab examples
                        demonstrate how to define custom architectures.
                        (default: convnet_simple)

sampling_size = num_workers * num_envs_per_worker * rollout

If sampling_size >> batch_size then we will need many iterations of training to go through the data, which will make some experience stale by the time it is used for training (policy lag).

The above six parameters (batch_size, num_batches_per_epoch, rollout, num_epochs, num_workers, num_envs_per_worker) have the biggest influence on the data regime of the RL algorithm and thus on the sample efficiency and the training speed.

num_workers, num_envs_per_worker, and rollout define how many samples are collected per iteration (one rollout for all envs), which is sampling_size = num_workers * num_envs_per_worker * rollout (note that this is further multiplied by env's num_agents for multi-agent envs).

batch_size and num_batches_per_epoch define how many samples are used for training per iteration.

If sampling_size >> batch_size then we will need many iterations of training to go through the data, which will make some experience stale by the time it is used for training (policy lag). See the "Policy Lag" page of the Sample Factory documentation for additional information.

In [4]:
# Register the ViZDoom envs and the custom encoder before building the config.
# NOTE(review): the original note estimated ~15 minutes of training; with a budget of
# 600M env steps that estimate looks stale — confirm.
register_vizdoom_components()

# The scenario we train on today is health gathering.
# Other scenarios include "doom_basic", "doom_two_colors_easy", "doom_dm", "doom_dwango5", "doom_my_way_home", "doom_deadly_corridor", "doom_defend_the_center", "doom_defend_the_line"
env = "doom_health_gathering_supreme"

# No --experiment flag is passed; the log below shows the existing
# train_dir/default_experiment run being resumed.
train_args = [
    "--env=" + env,
    "--seed=200",
    "--num_workers=16",
    "--num_envs_per_worker=8",
    "--batch_size=2048",
    "--train_for_env_steps=600000000",
]
cfg = parse_vizdoom_cfg(argv=train_args)

# Kept as a bare last expression so the notebook displays the value returned by run_rl.
run_rl(cfg)



[36m[2024-07-05 14:54:06,141][03423] register_encoder_factory: <function make_vizdoom_encoder at 0x7e40f2f2b9a0>[0m
[33m[2024-07-05 14:54:06,153][03423] Loading existing experiment configuration from /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/default_experiment/config.json[0m
[36m[2024-07-05 14:54:06,154][03423] Overriding arg 'train_for_env_steps' with value 600000000 passed from command line[0m
[36m[2024-07-05 14:54:06,163][03423] Experiment dir /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/default_experiment already exists![0m
[36m[2024-07-05 14:54:06,164][03423] Resuming existing experiment from /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/default_experiment...[0m
[36m[2024-07-05 14:54:06,164][03423] Weights and Biases integration disabled[0m
[36m[2024-07-05 14:54:09,617][03423] Queried available GPUs: 0
[0m
[37m[1m[2024-07-05 14:54:09,618][03423] Environment var CUDA_VISIBLE_DEVICES is 0
[0m
[36m[2024-0

0

In [5]:
from sample_factory.enjoy import enjoy

# Evaluate the saved policy for 10 episodes without rendering and record a video.
# No --experiment flag is passed; the log below shows the config being loaded from
# train_dir/default_experiment. `env` comes from the training cell above.
eval_args = [
    "--env=" + env,
    "--num_workers=1",
    "--save_video",
    "--no_render",
    "--max_num_episodes=10",
]
cfg = parse_vizdoom_cfg(argv=eval_args, evaluation=True)
status = enjoy(cfg)

# Hand-recorded results from repeated runs of this cell.
# NOTE(review): presumably "<env steps trained>: <average reward reported by enjoy>" — confirm.
# 20000000: 11.939
# 30000000: 15.516
# 50000000: 15.954
# 70000000: 16.362
# 100000000: 18.66
# 150000000: 16.279
# 200000000: 19.264
# 250000000: 19.969
# 300000000: 17.653
# 350000000: 20.917
# 400000000: 18.08
# 450000000: 18.223
# 600000000: 19.392


[33m[2024-07-05 15:48:15,708][03423] Loading existing experiment configuration from /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/default_experiment/config.json[0m
[36m[2024-07-05 15:48:15,709][03423] Overriding arg 'num_workers' with value 1 passed from command line[0m
[36m[2024-07-05 15:48:15,709][03423] Adding new argument 'no_render'=True that is not in the saved config file![0m
[36m[2024-07-05 15:48:15,710][03423] Adding new argument 'save_video'=True that is not in the saved config file![0m
[36m[2024-07-05 15:48:15,710][03423] Adding new argument 'video_frames'=1000000000.0 that is not in the saved config file![0m
[36m[2024-07-05 15:48:15,710][03423] Adding new argument 'video_name'=None that is not in the saved config file![0m
[36m[2024-07-05 15:48:15,711][03423] Adding new argument 'max_num_frames'=1000000000.0 that is not in the saved config file![0m
[36m[2024-07-05 15:48:15,711][03423] Adding new argument 'max_num_episodes'=10 that is not in 

#### Convnet_impala

In [5]:
# Train the "conv_impala" experiment, which selects the convnet_impala encoder.
# NOTE(review): the original note estimated ~15 minutes of training; confirm for this step budget.
register_vizdoom_components()

# The scenario we train on today is health gathering.
# Other scenarios include "doom_basic", "doom_two_colors_easy", "doom_dm", "doom_dwango5", "doom_my_way_home", "doom_deadly_corridor", "doom_defend_the_center", "doom_defend_the_line"
env = "doom_health_gathering_supreme"

# sample_size = num_workers * num_envs_per_worker * rollout
# = 8 * 4 * 32 = 1024      (the worker/env counts used in this cell)
# = 16 * 8 * 32 = 4096
# = 20 * 12 * 32 = 7680
# NOTE(review): an earlier note here said "batch_size = 2048", but this cell
# passes --batch_size=1024 — confirm which value is intended.
train_args = [
    "--env=" + env,
    "--experiment=conv_impala",
    "--seed=200",
    "--num_workers=8",  # number of parallel environment workers
    "--num_envs_per_worker=4",  # number of envs on a single CPU actor
    "--batch_size=1024",
    "--encoder_conv_architecture=convnet_impala",
    "--train_for_env_steps=5000000",
]
cfg = parse_vizdoom_cfg(argv=train_args)

status = run_rl(cfg)



[33m[2024-07-04 23:44:42,771][40844] Environment doom_basic already registered, overwriting...[0m
[33m[2024-07-04 23:44:42,772][40844] Environment doom_two_colors_easy already registered, overwriting...[0m
[33m[2024-07-04 23:44:42,773][40844] Environment doom_two_colors_hard already registered, overwriting...[0m
[33m[2024-07-04 23:44:42,773][40844] Environment doom_dm already registered, overwriting...[0m
[33m[2024-07-04 23:44:42,773][40844] Environment doom_dwango5 already registered, overwriting...[0m
[33m[2024-07-04 23:44:42,773][40844] Environment doom_my_way_home_flat_actions already registered, overwriting...[0m
[33m[2024-07-04 23:44:42,774][40844] Environment doom_defend_the_center_flat_actions already registered, overwriting...[0m
[33m[2024-07-04 23:44:42,774][40844] Environment doom_my_way_home already registered, overwriting...[0m
[33m[2024-07-04 23:44:42,774][40844] Environment doom_deadly_corridor already registered, overwriting...[0m
[33m[2024-07-04 23:4

In [7]:
from sample_factory.enjoy import enjoy

# Evaluate the "conv_impala" experiment for 10 episodes without rendering and
# record a video. `env` comes from the training cell above.
eval_args = [
    "--env=" + env,
    "--experiment=conv_impala",
    "--num_workers=1",
    "--save_video",
    "--no_render",
    "--max_num_episodes=10",
]
cfg = parse_vizdoom_cfg(argv=eval_args, evaluation=True)
status = enjoy(cfg)



[33m[2024-07-04 23:50:28,906][40844] Loading existing experiment configuration from /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/conv_impala/config.json[0m
[36m[2024-07-04 23:50:28,907][40844] Overriding arg 'num_workers' with value 1 passed from command line[0m
[36m[2024-07-04 23:50:28,908][40844] Adding new argument 'no_render'=True that is not in the saved config file![0m
[36m[2024-07-04 23:50:28,909][40844] Adding new argument 'save_video'=True that is not in the saved config file![0m
[36m[2024-07-04 23:50:28,909][40844] Adding new argument 'video_frames'=1000000000.0 that is not in the saved config file![0m
[36m[2024-07-04 23:50:28,910][40844] Adding new argument 'video_name'=None that is not in the saved config file![0m
[36m[2024-07-04 23:50:28,910][40844] Adding new argument 'max_num_frames'=1000000000.0 that is not in the saved config file![0m
[36m[2024-07-04 23:50:28,911][40844] Adding new argument 'max_num_episodes'=10 that is not in the sav

#### resnet_impala (experiment `conv_resnet`)

In [3]:
# Train the "conv_resnet" experiment, which selects the resnet_impala encoder.
# NOTE(review): the original note estimated ~15 minutes of training; with a budget of
# 100M env steps that estimate looks stale — confirm.
register_vizdoom_components()

# The scenario we train on today is health gathering.
# Other scenarios include "doom_basic", "doom_two_colors_easy", "doom_dm", "doom_dwango5", "doom_my_way_home", "doom_deadly_corridor", "doom_defend_the_center", "doom_defend_the_line"
env = "doom_health_gathering_supreme"

# sample_size = num_workers * num_envs_per_worker * rollout
# = 8 * 4 * 32 = 1024      (the worker/env counts used in this cell)
# = 16 * 8 * 32 = 4096
# = 20 * 12 * 32 = 7680
# NOTE(review): an earlier note here said "batch_size = 2048", but this cell
# passes --batch_size=1024 — confirm which value is intended.
train_args = [
    "--env=" + env,
    "--experiment=conv_resnet",
    "--seed=200",
    "--num_workers=8",  # number of parallel environment workers
    "--num_envs_per_worker=4",  # number of envs on a single CPU actor
    "--batch_size=1024",
    "--encoder_conv_architecture=resnet_impala",
    "--train_for_env_steps=100000000",
]
cfg = parse_vizdoom_cfg(argv=train_args)

status = run_rl(cfg)



[36m[2024-07-05 20:12:24,277][25130] register_encoder_factory: <function make_vizdoom_encoder at 0x75c2f59d2560>[0m
[33m[2024-07-05 20:12:24,286][25130] Loading existing experiment configuration from /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/conv_resnet/config.json[0m
[36m[2024-07-05 20:12:24,287][25130] Overriding arg 'train_for_env_steps' with value 100000000 passed from command line[0m
[36m[2024-07-05 20:12:24,295][25130] Experiment dir /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/conv_resnet already exists![0m
[36m[2024-07-05 20:12:24,296][25130] Resuming existing experiment from /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/conv_resnet...[0m
[36m[2024-07-05 20:12:24,296][25130] Weights and Biases integration disabled[0m
[36m[2024-07-05 20:12:26,315][25130] Queried available GPUs: 0
[0m
[37m[1m[2024-07-05 20:12:26,316][25130] Environment var CUDA_VISIBLE_DEVICES is 0
[0m
[36m[2024-07-05 20:12:28,312][25

In [4]:
from sample_factory.enjoy import enjoy

# Evaluate the "conv_resnet" experiment for 10 episodes without rendering and
# record a video. `env` comes from the training cell above.
eval_args = [
    "--env=" + env,
    "--experiment=conv_resnet",
    "--num_workers=1",
    "--save_video",
    "--no_render",
    "--max_num_episodes=10",
]
cfg = parse_vizdoom_cfg(argv=eval_args, evaluation=True)
status = enjoy(cfg)

# Hand-recorded results from repeated runs of this cell.
# NOTE(review): presumably "<env steps trained> - <average reward reported by enjoy>" — confirm.
# 10000000 - 13.522
# 20000000 - 11.983
# 70000000 - 20.935
# 100000000 - 20.119


[33m[2024-07-05 21:11:59,040][25130] Loading existing experiment configuration from /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/conv_resnet/config.json[0m
[36m[2024-07-05 21:11:59,041][25130] Overriding arg 'num_workers' with value 1 passed from command line[0m
[36m[2024-07-05 21:11:59,041][25130] Adding new argument 'no_render'=True that is not in the saved config file![0m
[36m[2024-07-05 21:11:59,042][25130] Adding new argument 'save_video'=True that is not in the saved config file![0m
[36m[2024-07-05 21:11:59,042][25130] Adding new argument 'video_frames'=1000000000.0 that is not in the saved config file![0m
[36m[2024-07-05 21:11:59,043][25130] Adding new argument 'video_name'=None that is not in the saved config file![0m
[36m[2024-07-05 21:11:59,043][25130] Adding new argument 'max_num_frames'=1000000000.0 that is not in the saved config file![0m
[36m[2024-07-05 21:11:59,043][25130] Adding new argument 'max_num_episodes'=10 that is not in the sav

## Visualize the performance of the agent

In [None]:
from base64 import b64encode
from pathlib import Path

from IPython.display import HTML

# Path to the replay, relative to the notebook's working directory. The previous
# absolute "/content/..." path only exists on Colab; a relative path resolves to
# the same file there (working directory /content) and also works locally.
# NOTE(review): assumes the notebook is run from the directory that contains
# train_dir, as the experiment paths in the logs above suggest — confirm.
replay_path = Path("train_dir") / "default_experiment" / "replay.mp4"

# read_bytes() opens, reads and closes the file, so no file handle is leaked.
mp4 = replay_path.read_bytes()

# Embed the video inline as a base64 data URL so it plays inside the notebook.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(
    """
<video width=640 controls>
      <source src="%s" type="video/mp4">
</video>
"""
    % data_url
)



## Upload results to HF Hub

In [5]:
# Log in to the Hugging Face Hub from inside the notebook; the call renders the
# login widget shown in the output below.
# NOTE(review): presumably required before the --push_to_hub upload in the next cell — confirm.
from huggingface_hub import notebook_login
notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from sample_factory.enjoy import enjoy

hf_username = "ra9hu"  # insert your HuggingFace username here

# Evaluate the "conv_resnet" experiment (at most 10 episodes / 100000 frames),
# record a video, and push the result to the Hugging Face Hub repository below.
# `env` comes from an earlier training cell.
hub_repository = hf_username + "/rl_course_vizdoom_health_gathering_supreme"
eval_args = [
    "--env=" + env,
    "--experiment=conv_resnet",
    "--num_workers=1",
    "--save_video",
    "--no_render",
    "--max_num_episodes=10",
    "--max_num_frames=100000",
    "--push_to_hub",
    "--hf_repository=" + hub_repository,
]
cfg = parse_vizdoom_cfg(argv=eval_args, evaluation=True)
status = enjoy(cfg)



[33m[2024-07-05 17:23:13,711][04005] Loading existing experiment configuration from /home/raghu/DL/topics/RL/unit8B-AsyncPPO-SampleFactory/train_dir/conv_resnet/config.json[0m
[36m[2024-07-05 17:23:13,712][04005] Overriding arg 'num_workers' with value 1 passed from command line[0m
[36m[2024-07-05 17:23:13,712][04005] Adding new argument 'no_render'=True that is not in the saved config file![0m
[36m[2024-07-05 17:23:13,713][04005] Adding new argument 'save_video'=True that is not in the saved config file![0m
[36m[2024-07-05 17:23:13,713][04005] Adding new argument 'video_frames'=1000000000.0 that is not in the saved config file![0m
[36m[2024-07-05 17:23:13,714][04005] Adding new argument 'video_name'=None that is not in the saved config file![0m
[36m[2024-07-05 17:23:13,714][04005] Adding new argument 'max_num_frames'=100000 that is not in the saved config file![0m
[36m[2024-07-05 17:23:13,715][04005] Adding new argument 'max_num_episodes'=10 that is not in the saved con

events.out.tfevents.1720155218.Raghu-Laptop:   0%|          | 0.00/6.40k [00:00<?, ?B/s]

events.out.tfevents.1720155536.Raghu-Laptop:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

events.out.tfevents.1720156453.Raghu-Laptop:   0%|          | 0.00/511k [00:00<?, ?B/s]

events.out.tfevents.1720155624.Raghu-Laptop:   0%|          | 0.00/390k [00:00<?, ?B/s]

events.out.tfevents.1720175313.Raghu-Laptop:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Upload 9 LFS files:   0%|          | 0/9 [00:00<?, ?it/s]

best_000015017_61509632_reward_55.424.pth:   0%|          | 0.00/48.9M [00:00<?, ?B/s]

checkpoint_000016993_69603328.pth:   0%|          | 0.00/48.9M [00:00<?, ?B/s]

checkpoint_000017091_70004736.pth:   0%|          | 0.00/48.9M [00:00<?, ?B/s]

replay.mp4:   0%|          | 0.00/41.3M [00:00<?, ?B/s]

[37m[1m[2024-07-05 17:24:27,169][04005] The model has been pushed to https://huggingface.co/ra9hu/rl_course_vizdoom_health_gathering_supreme[0m


## Download sample run from HF Hub

In [5]:
from sample_factory.enjoy import enjoy


In [7]:
# Evaluate (not train) the "doom_health_gathering_supreme_2222" experiment stored
# under the relative directory train_dir: 10 episodes, no rendering, video saved.
# Components must be registered first so the env name and the encoder resolve.
register_vizdoom_components()

env = "doom_health_gathering_supreme"

eval_args = [
    "--env=" + env,
    "--num_workers=1",
    "--save_video",
    "--no_render",
    "--max_num_episodes=10",
    "--experiment=doom_health_gathering_supreme_2222",
    "--train_dir=train_dir",
]
cfg = parse_vizdoom_cfg(argv=eval_args, evaluation=True)
status = enjoy(cfg)


[36m[2024-07-04 19:45:47,107][19305] register_encoder_factory: <function make_vizdoom_encoder at 0x76a196e72560>[0m
[33m[2024-07-04 19:45:47,115][19305] Loading existing experiment configuration from train_dir/doom_health_gathering_supreme_2222/config.json[0m
[36m[2024-07-04 19:45:47,116][19305] Overriding arg 'experiment' with value 'doom_health_gathering_supreme_2222' passed from command line[0m
[36m[2024-07-04 19:45:47,117][19305] Overriding arg 'train_dir' with value 'train_dir' passed from command line[0m
[36m[2024-07-04 19:45:47,117][19305] Overriding arg 'num_workers' with value 1 passed from command line[0m
[36m[2024-07-04 19:45:47,118][19305] Adding new argument 'lr_adaptive_min'=1e-06 that is not in the saved config file![0m
[36m[2024-07-04 19:45:47,118][19305] Adding new argument 'lr_adaptive_max'=0.01 that is not in the saved config file![0m
[36m[2024-07-04 19:45:47,118][19305] Adding new argument 'env_gpu_observations'=True that is not in the saved config fi