In [1]:
import ray
# Initialize Ray: shut down any instance that is already running so that
# re-executing this cell starts from a fresh session, then start Ray with a
# fixed resource budget of 8 CPUs and 1 GPU.
ray.shutdown()
ray.init(
    num_cpus=8,
    num_gpus=1,
    ignore_reinit_error=True,
)

2020-09-27 16:56:59,457	INFO resource_spec.py:231 -- Starting Ray with 13.57 GiB memory available for workers and up to 6.81 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-09-27 16:57:00,007	INFO services.py:1193 -- View the Ray dashboard at [1m[32mlocalhost:8267[39m[22m


{'node_ip_address': '192.168.7.73',
 'raylet_ip_address': '192.168.7.73',
 'redis_address': '192.168.7.73:59810',
 'object_store_address': '/tmp/ray/session_2020-09-27_16-56-59_456841_47258/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-09-27_16-56-59_456841_47258/sockets/raylet',
 'webui_url': 'localhost:8267',
 'session_dir': '/tmp/ray/session_2020-09-27_16-56-59_456841_47258'}

In [2]:
# Display the resources Ray reports for this cluster (CPU, GPU, memory, ...).
# https://docs.ray.io/en/latest/tune/user-guide.html#parallelism-gpus
ray.cluster_resources()

{'object_store_memory': 96.0,
 'GPU': 1.0,
 'GPUType:GTX': 1.0,
 'memory': 278.0,
 'CPU': 8.0,
 'node:192.168.7.73': 1.0}

In [3]:
# Print the address of the Ray dashboard for the running session.
print("Dashboard URL: http://{}".format(ray.get_webui_url()))

Dashboard URL: http://localhost:8267


## MineRL Gym environment wrappers (observation and action spaces)

In [4]:
# Make sure env_wrappers.py is in the same directory
# https://github.com/minerllabs/baselines/blob/master/2019/general/chainerrl/baselines/env_wrappers.py
import gym
from env_wrappers import (
    SerialDiscreteActionWrapper, CombineActionWrapper, SerialDiscreteCombineActionWrapper,
    MoveAxisWrapper, FrameSkip, ObtainPoVWrapper, PoVWithCompassAngleWrapper, GrayScaleWrapper
)

# Sean: Skip logger for now
# from logging import getLogger
# logger = getLogger(__name__)

def wrap_env(env, args, test=False):
    """Apply the observation and action wrappers selected by ``args`` to ``env``.

    Wrappers are applied in a fixed order (time limit handling, observation
    wrappers, then action wrappers). The order matters because each wrapper
    operates on the environment produced by the previous one.

    Args:
        env: The Gym environment to wrap. If it is a
            ``gym.wrappers.TimeLimit``, that outer wrapper is stripped first.
        args: Configuration object. Reads ``frame_skip``, ``gray_scale``,
            ``env``, ``frame_stack`` and ``disable_action_prior``. The
            attributes ``always_keys``, ``reverse_keys``, ``exclude_keys`` and
            ``exclude_noop`` are optional and are only read when
            ``disable_action_prior`` is false.
        test: Kept for signature compatibility with the upstream baseline.
            It is currently unused because the monitoring and seeding code
            paths are disabled in this notebook.

    Returns:
        The wrapped environment.
    """
    # wrap env: time limit...
    if isinstance(env, gym.wrappers.TimeLimit):
        # Strip gym's own TimeLimit. The upstream baseline re-wraps the env with
        # ChainerRL's ContinuingTimeLimit at this point; that feature is not
        # used here, so no replacement time limit is applied.
        env = env.env

    # wrap env: observation...
    # NOTE: wrapping order matters!
    # (The upstream ContinuingTimeLimitMonitor step for `test and args.monitor`
    # is a ChainerRL feature and is intentionally left out.)
    if args.frame_skip is not None:
        env = FrameSkip(env, skip=args.frame_skip)
    if args.gray_scale:
        env = GrayScaleWrapper(env, dict_space_key='pov')
    if args.env.startswith('MineRLNavigate'):
        env = PoVWithCompassAngleWrapper(env)
    else:
        env = ObtainPoVWrapper(env)

    # ChainerRL-specific steps (MoveAxisWrapper hwc -> chw, ScaledFloatFrame)
    # are skipped.

    if args.frame_stack is not None and args.frame_stack > 0:
        # FrameStack is not part of this notebook's top-level import list, so
        # import it here; otherwise this branch fails with a NameError.
        # Assumes env_wrappers.py defines FrameStack, as the upstream baseline
        # file does -- TODO confirm.
        from env_wrappers import FrameStack
        # NOTE(review): channel_order='chw' is carried over from the baseline,
        # where MoveAxisWrapper converts frames to channel-first before
        # stacking. That wrapper is disabled above, so confirm the channel
        # order before enabling frame stacking.
        env = FrameStack(env, args.frame_stack, channel_order='chw')

    # wrap env: action...
    if not args.disable_action_prior:
        # The key lists are optional on `args`; missing ones fall back to
        # None / False instead of raising AttributeError.
        # NOTE(review): presumably SerialDiscreteActionWrapper accepts None for
        # the key arguments -- verify against env_wrappers.py.
        env = SerialDiscreteActionWrapper(
            env,
            always_keys=getattr(args, 'always_keys', None),
            reverse_keys=getattr(args, 'reverse_keys', None),
            exclude_keys=getattr(args, 'exclude_keys', None),
            exclude_noop=getattr(args, 'exclude_noop', False))
    else:
        env = CombineActionWrapper(env)
        env = SerialDiscreteCombineActionWrapper(env)

    # Seeding (env.seed with a train/test seed) is marked "not supported yet"
    # in the original baseline and is therefore not done here.
    return env




## Configuration for MineRL wrapper

In [5]:
# Arguments for the environment wrapper
from datetime import datetime
class Args:
    """Settings for the MineRL environment and its wrappers.

    Every attribute is set to a fixed default in ``__init__``; edit the values
    there to change the environment or the wrapper options.

    For the meaning of the options, see the baseline scripts:
    https://github.com/minerllabs/baselines/blob/master/2019/general/chainerrl/baselines/ppo.sh
    https://github.com/minerllabs/baselines/blob/master/2019/general/chainerrl/baselines/ppo.py
    """

    def __init__(self):
        # Environment name (alternative: 'MineRLNavigateDenseVectorObf-v0').
        self.env = 'MineRLNavigateDense-v0'

        # Frame skipping / frame stacking; None disables the wrapper.
        self.frame_skip = None
        self.frame_stack = None

        # Gray scale (True) or RGB (False) input.
        self.gray_scale = False

        # Toggle for monitoring / video recordings.
        self.monitor = False

        # Output folder for the monitor: one sub-folder per creation time.
        # The timestamp must be interpolated with braces; without them every
        # run would share the literal folder name "date_time".
        date_time = datetime.now().strftime("%m_%d_%Y_HR_%H_MIN_%M_SEC_%S")
        self.outdir = f'results/{self.env}/{date_time}'

        # Disable prior action sets such as repeating or excluding actions.
        self.disable_action_prior = True

        # Keys always pressed / repeated by the agent. Just an example!
        # NOTE(review): the baseline script appears to pass these as lists of
        # key names rather than one space-separated string -- confirm against
        # SerialDiscreteActionWrapper before setting disable_action_prior=False.
        self.always_keys = 'forward sprint attack'

        # Keys whose behavior is reversed; None means no reversed keys are
        # configured.
        self.reverse_keys = None

        # Keys excluded from the agent's action set. Just an example!
        # Stored on the instance so that wrap_env can read it.
        self.exclude_keys = 'back left right sneak place'

        # Whether the no-op action is excluded from the action set.
        self.exclude_noop = False

## Registering MineRL environments in RLlib

In [6]:
from ray.tune.registry import register_env

def minerl_env_creator(env_config):
    """Create the wrapped MineRL environment.

    ``env_config`` is accepted as the creator's single argument but is not
    read; all settings come from a freshly constructed ``Args``.

    Returns:
        The environment produced by ``wrap_env`` in training mode
        (``test=False``).
    """
    # Imported inside the function: loads the MineRL environments as Gym
    # environments before gym.make is called.
    import minerl

    wrapper_args = Args()

    # Only one environment is built. The original baseline (ppo.py) also
    # created a separate eval env, but multiple MineRL envs can't be created.
    return wrap_env(gym.make(wrapper_args.env), wrapper_args, test=False)

In [7]:
# Register the MineRL environment creator in RLlib under the name "minerl";
# the training config refers to the environment by this name.
register_env("minerl", minerl_env_creator)

In [8]:
# import minerl
# import gym
# # Check minerl environments
# # This only checks Gym environment not the ones registered in Ray!
# all_envs = gym.envs.registry.all()
# env_ids = [env_spec.id for env_spec in all_envs]
# print(env_ids)

In [9]:
# # Test run

# import minerl
# args = Args()
# core_env = gym.make(args.env)
# minerl_env = wrap_env(core_env, args, test=False)
# minerl_env.reset()
# for i in range(500):
#     minerl_env.step(minerl_env.action_space.sample())

## Register custom vision network to process input

In [10]:
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F

from ray import tune
from ray.tune.schedulers import ASHAScheduler

In [11]:
# Register a custom model (simple version: a fully connected network).
# We need this because the default configurations for the model network are
# not compatible with the shape of the observations from the MineRL environment.
# The observation is a 64 x 64 pixel RGBA image of the gameplay P.O.V. --> shape = (4, 64, 64)

# Otherwise we will get:
# ValueError: No default configuration for obs shape [4, 64, 64], you must specify 
# `conv_filters` manually as a model option. Default configurations are only available
# for inputs of shape [42, 42, K] and [84, 84, K]. You may alternatively want to use 
# a custom model or preprocessor.

from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC

# https://docs.ray.io/en/releases-0.8.5/rllib-examples.html
# The register custom env and model links to custom_env.py
# https://github.com/ray-project/ray/blob/master/rllib/examples/custom_env.py
class TorchCustomModel(TorchModelV2, nn.Module):
    """Custom PyTorch model that hands all computation to a fully connected net."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        """Initialise both base classes and build the wrapped sub-model.

        The sub-model receives exactly the arguments this model was given.
        """
        # Both bases are initialised explicitly, TorchModelV2 first.
        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        self.torch_sub_model = TorchFC(
            obs_space, action_space, num_outputs, model_config, name)

    def forward(self, input_dict, state, seq_lens):
        """Run the sub-model on the observation cast to float.

        ``input_dict["obs"]`` is replaced in place by its float version.
        Returns the sub-model's output together with an empty state list; the
        state returned by the sub-model is discarded.
        """
        input_dict["obs"] = input_dict["obs"].float()
        sub_model_result = self.torch_sub_model(input_dict, state, seq_lens)
        outputs, _discarded_state = sub_model_result
        return outputs, []

    def value_function(self):
        """Return the sub-model's value output reshaped to one dimension."""
        values = self.torch_sub_model.value_function()
        return torch.reshape(values, [-1])

# Register the model under the name "fc_pov"; the training config selects it
# through the "custom_model" model option.
ModelCatalog.register_custom_model("fc_pov", TorchCustomModel)
# Weird tensorflow error? even though using torch

Instructions for updating:
non-resource variables are not supported in the long term


In [12]:
# https://groups.google.com/forum/#!topic/ray-dev/coaz8dgHyYw
# https://docs.ray.io/en/latest/rllib-training.html#specifying-resources
# Setting resources_per_trial={"cpu": 8, "gpu": 1} will cause issues

In [13]:
from ray import tune
# https://docs.ray.io/en/latest/rllib-training.html#tuned-examples

# Train PPO on the registered "minerl" environment with the "fc_pov" custom
# model, stopping after two training iterations. The call is the cell's last
# expression, so the returned analysis object is displayed.
tune.run(
    "PPO",
    stop={"training_iteration": 2},
    config={
        "env": "minerl",
        "model": {
            "custom_model": "fc_pov",
        },
        "use_pytorch": True,
        "eager": False,
        "num_envs_per_worker": 1,
        "num_workers": 1,
        "num_gpus": 1,
        # 'monitor': True,
    },
)

Trial name,status,loc
PPO_minerl_24b7b_00000,RUNNING,


[2m[36m(pid=47392)[0m Instructions for updating:
[2m[36m(pid=47392)[0m non-resource variables are not supported in the long term
[2m[36m(pid=47392)[0m 2020-09-27 16:57:06,883	INFO trainer.py:632 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=47392)[0m 2020-09-27 16:57:44,736	INFO trainable.py:251 -- Trainable.setup took 38.238 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=47391)[0m Instructions for updating:
[2m[36m(pid=47391)[0m non-resource variables are not supported in the long term
[2m[36m(pid=47391)[0m   tensor = torch.from_numpy(np.asarray(item))


Result for PPO_minerl_24b7b_00000:
  custom_metrics: {}
  date: 2020-09-27_16-59-32
  done: false
  episode_len_mean: .nan
  episode_reward_max: .nan
  episode_reward_mean: .nan
  episode_reward_min: .nan
  episodes_this_iter: 0
  episodes_total: 0
  experiment_id: 5c00d60eb8004d97a567ce40e6a53f34
  experiment_tag: '0'
  hostname: blackbox
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.2
        cur_lr: 5.0e-05
        entropy: 2.45800544321537
        entropy_coeff: 0.0
        kl: 0.027558082132600248
        policy_loss: -0.010829467937583104
        total_loss: 0.2658954306971282
        vf_explained_var: 0.11666689068078995
        vf_loss: 0.2712132791057229
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_since_restore: 1
  node_ip: 192.168.7.73
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 41.34220779220779
    ram_util_percent: 63.077272727272735
  pid: 47392
  policy_rew

Trial name,status,loc,iter,total time (s),ts,reward
PPO_minerl_24b7b_00000,RUNNING,192.168.7.73:47392,1,107.599,4000,


Result for PPO_minerl_24b7b_00000:
  custom_metrics: {}
  date: 2020-09-27_17-00-46
  done: true
  episode_len_mean: 6000.0
  episode_reward_max: 8.999691009521484
  episode_reward_mean: 8.999691009521484
  episode_reward_min: 8.999691009521484
  episodes_this_iter: 1
  episodes_total: 1
  experiment_id: 5c00d60eb8004d97a567ce40e6a53f34
  experiment_tag: '0'
  hostname: blackbox
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.30000000000000004
        cur_lr: 5.0e-05
        entropy: 2.4419078305363655
        entropy_coeff: 0.0
        kl: 0.01417196518741548
        policy_loss: -0.02255858271382749
        total_loss: 0.14833057206124067
        vf_explained_var: 0.2861601710319519
        vf_loss: 0.166637564310804
    num_steps_sampled: 8000
    num_steps_trained: 8000
  iterations_since_restore: 2
  node_ip: 192.168.7.73
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 33.852380952380955
    ram_ut

Trial name,status,loc,iter,total time (s),ts,reward
PPO_minerl_24b7b_00000,TERMINATED,,2,181.444,8000,8.99969


Trial name,status,loc,iter,total time (s),ts,reward
PPO_minerl_24b7b_00000,TERMINATED,,2,181.444,8000,8.99969


[2m[36m(pid=47392)[0m *** Aborted at 1601251249 (unix time) try "date -d @1601251249" if you are using GNU date ***


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fb7fda97ed0>

[2m[36m(pid=47392)[0m PC: @                0x0 (unknown)
[2m[36m(pid=47392)[0m *** SIGSEGV (@0x7fc1eee1c9d0) received by PID 47566 (TID 0x7fc3cd309740) from PID 18446744073422358992; stack trace: ***
[2m[36m(pid=47392)[0m     @     0x7fc3cd67a3c0 (unknown)
[2m[36m(pid=47392)[0m     @     0x7fc3cd66faab __pthread_clockjoin_ex
[2m[36m(pid=47392)[0m     @     0x7fc3cb2452d3 std::thread::join()
[2m[36m(pid=47392)[0m     @     0x7fc3cb776493 ray::gcs::GlobalStateAccessor::Disconnect()
[2m[36m(pid=47392)[0m     @     0x7fc3cb615fbc __pyx_pw_3ray_7_raylet_19GlobalStateAccessor_5disconnect()
[2m[36m(pid=47392)[0m     @     0x55ac1e341b71 _PyMethodDef_RawFastCallKeywords
[2m[36m(pid=47392)[0m     @     0x55ac1e348aef _PyMethodDescr_FastCallKeywords
[2m[36m(pid=47392)[0m     @     0x55ac1e3ad37c _PyEval_EvalFrameDefault
[2m[36m(pid=47392)[0m     @     0x55ac1e34120b _PyFunction_FastCallKeywords
[2m[36m(pid=47392)[0m     @     0x55ac1e3a8e70 _PyEval_EvalFrameDe

In [None]:
## Something weird about using Trainer + Tune
# from ray import tune
# import ray.rllib.agents.ppo as ppo
# # from ray.rllib.agents.dqn import PPOTrainer
# # from ray.rllib.agents.dqn import DQNTrainer

# # https://docs.ray.io/en/latest/rllib-training.html#tuned-examples
# config = ppo.DEFAULT_CONFIG.copy()
# config["num_gpus"] = 1
# config["num_workers"] = 1
# config["num_envs_per_worker"] = 1
# config["eager"] = False
# trainer = ppo.PPOTrainer(config=config)#, env="minerl")

# tune.run(trainer,
#          config={"env": "minerl",
#                  "use_pytorch": True,
# #                  'monitor':True, 
#                  "model": {
#                     "custom_model": "fc_pov",
#                     }
#                  }
#          ,stop={"training_iteration": 2},
#         resources_per_trial={"cpu": 8, "gpu": 1})  