In [1]:
import gym
import ray

In [2]:
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F

from ray import tune
from ray.tune.schedulers import ASHAScheduler

In [3]:
# (Re)start Ray: stop any instance already attached to this kernel, then
# launch a local one capped at 8 CPUs and 1 GPU.
ray.shutdown()
ray.init(
    num_cpus=8,
    num_gpus=1,
    ignore_reinit_error=True,
)

2020-09-27 17:08:17,441	INFO resource_spec.py:231 -- Starting Ray with 13.13 GiB memory available for workers and up to 6.58 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-09-27 17:08:17,970	INFO services.py:1193 -- View the Ray dashboard at [1m[32mlocalhost:8269[39m[22m


{'node_ip_address': '192.168.7.73',
 'raylet_ip_address': '192.168.7.73',
 'redis_address': '192.168.7.73:45453',
 'object_store_address': '/tmp/ray/session_2020-09-27_17-08-17_440757_49014/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-09-27_17-08-17_440757_49014/sockets/raylet',
 'webui_url': 'localhost:8269',
 'session_dir': '/tmp/ray/session_2020-09-27_17-08-17_440757_49014'}

In [4]:
# https://docs.ray.io/en/latest/tune/user-guide.html#parallelism-gpus
# Display the resources Ray reports for this cluster (CPU / GPU counts, memory)
# so the limits passed to ray.init() above can be checked.
ray.cluster_resources()

{'node:192.168.7.73': 1.0,
 'GPUType:GTX': 1.0,
 'CPU': 8.0,
 'GPU': 1.0,
 'memory': 269.0,
 'object_store_memory': 92.0}

In [5]:
# Show where the Ray dashboard for this session can be opened.
print(f"Dashboard URL: http://{ray.get_webui_url()}")

Dashboard URL: http://localhost:8269


### Register the MineRL Gym environments with RLlib

In [6]:
from minerl_rllib.envs import register



In [7]:
# Run the registration helper imported from minerl_rllib.envs. Per the section
# heading, this presumably makes the MineRL Gym environments (such as the one
# named in the training config further down) available to RLlib.
register()

## Example of registering a custom model

In [8]:
# Registering a custom model, simple version --> a fully connected network.
# We need this because the default configurations for the model network are
# not compatible with the shape of the output from the MineRL environment.
# The output is a 64 x 64 pixel RGBA image of the gameplay P.O.V. --> shape = (4, 64, 64)

# Otherwise we will get:
# ValueError: No default configuration for obs shape [4, 64, 64], you must specify 
# `conv_filters` manually as a model option. Default configurations are only available
# for inputs of shape [42, 42, K] and [84, 84, K]. You may alternatively want to use 
# a custom model or preprocessor.

from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC

# https://docs.ray.io/en/releases-0.8.5/rllib-examples.html
# The custom env and model registration example on that page links to custom_env.py:
# https://github.com/ray-project/ray/blob/master/rllib/examples/custom_env.py
class TorchCustomModel(TorchModelV2, nn.Module):
    """Custom PyTorch model that wraps RLlib's fully connected network.

    Every computation is forwarded to an internal ``TorchFC`` instance that
    is built from the same constructor arguments as this model.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        # Both bases are initialised explicitly (TorchModelV2 first, then
        # nn.Module) before the sub-network is attached as an attribute.
        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        self.torch_sub_model = TorchFC(
            obs_space, action_space, num_outputs, model_config, name)

    def forward(self, input_dict, state, seq_lens):
        """Cast the observation to float and run the wrapped network.

        ``input_dict["obs"]`` is overwritten in place with its float
        version. Returns the wrapped network's first output together with
        an empty state list.
        """
        input_dict["obs"] = input_dict["obs"].float()
        sub_model_result = self.torch_sub_model(input_dict, state, seq_lens)
        outputs, _unused_state = sub_model_result
        return outputs, []

    def value_function(self):
        """Return the wrapped network's value output flattened to 1-D."""
        values = self.torch_sub_model.value_function()
        return torch.reshape(values, [-1])


Instructions for updating:
non-resource variables are not supported in the long term


In [9]:
# Register the model class under the name "fc_pov"; a trainer config can then
# select it with {"model": {"custom_model": "fc_pov"}}.
ModelCatalog.register_custom_model("fc_pov", TorchCustomModel)

## Run training

In [10]:
from ray import tune
# https://docs.ray.io/en/latest/rllib-training.html#tuned-examples

import ray.rllib.agents.ppo as ppo
from ray.rllib.agents.ppo import PPOTrainer

# Short PPO run on the MineRL navigate task: one trial, stopped after
# three training iterations.
tune.run(
    PPOTrainer,
    config=dict(
        env="MineRLNavigateDenseVectorObf-v0",
        num_gpus=1,
        num_workers=1,
        num_envs_per_worker=1,
        eager=False,
        use_pytorch=True,
        monitor=True,
        # Not enabled for this run: the custom model registered as "fc_pov"
        # could be selected by adding  model={"custom_model": "fc_pov"}.
    ),
    stop=dict(training_iteration=3),
)

Trial name,status,loc
PPO_MineRLNavigateDenseVectorObf-v0_b8415_00000,RUNNING,


[2m[36m(pid=49089)[0m Instructions for updating:
[2m[36m(pid=49089)[0m non-resource variables are not supported in the long term
[2m[36m(pid=49089)[0m 2020-09-27 17:08:24,419	INFO trainer.py:632 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=49090)[0m Instructions for updating:
[2m[36m(pid=49090)[0m non-resource variables are not supported in the long term
[2m[36m(pid=49090)[0m   tensor = torch.from_numpy(np.asarray(item))
[2m[36m(pid=49090)[0m   x[..., a:b] = e_x / e_x.sum(axis=-1)


Result for PPO_MineRLNavigateDenseVectorObf-v0_b8415_00000:
  custom_metrics: {}
  date: 2020-09-27_17-10-11
  done: false
  episode_len_mean: .nan
  episode_reward_max: .nan
  episode_reward_mean: .nan
  episode_reward_min: .nan
  episodes_this_iter: 0
  episodes_total: 0
  experiment_id: 24e4d9647e6b43e78da8476d3abdcbc8
  experiment_tag: '0'
  hostname: blackbox
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.2
        cur_lr: 5.0e-05
        entropy: 90.72712445259094
        entropy_coeff: 0.0
        kl: 0.01840121901477687
        policy_loss: -0.02523390165879391
        total_loss: 0.2114511642139405
        vf_explained_var: 0.0006016697734594345
        vf_loss: 0.2330048200674355
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_since_restore: 1
  node_ip: 192.168.7.73
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 39.44
    ram_util_percent: 54.817333333333345
  pid: 4908

Trial name,status,loc,iter,total time (s),ts,reward
PPO_MineRLNavigateDenseVectorObf-v0_b8415_00000,RUNNING,192.168.7.73:49089,1,104.744,4000,


Result for PPO_MineRLNavigateDenseVectorObf-v0_b8415_00000:
  custom_metrics: {}
  date: 2020-09-27_17-11-26
  done: false
  episode_len_mean: 6000.0
  episode_reward_max: -9.173072814941406
  episode_reward_mean: -9.173072814941406
  episode_reward_min: -9.173072814941406
  episodes_this_iter: 1
  episodes_total: 1
  experiment_id: 24e4d9647e6b43e78da8476d3abdcbc8
  experiment_tag: '0'
  hostname: blackbox
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.2
        cur_lr: 5.0e-05
        entropy: 90.82395577430725
        entropy_coeff: 0.0
        kl: 0.019408520980505273
        policy_loss: -0.023212998756207526
        total_loss: 0.22844246472232044
        vf_explained_var: 0.00020822696387767792
        vf_loss: 0.2477737539447844
    num_steps_sampled: 8000
    num_steps_trained: 8000
  iterations_since_restore: 2
  node_ip: 192.168.7.73
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 34.3065420

Trial name,status,loc,iter,total time (s),ts,reward
PPO_MineRLNavigateDenseVectorObf-v0_b8415_00000,RUNNING,192.168.7.73:49089,2,180.042,8000,-9.17307


Result for PPO_MineRLNavigateDenseVectorObf-v0_b8415_00000:
  custom_metrics: {}
  date: 2020-09-27_17-12-39
  done: true
  episode_len_mean: 6000.0
  episode_reward_max: 6.425428867340088
  episode_reward_mean: -1.3738219738006592
  episode_reward_min: -9.173072814941406
  episodes_this_iter: 1
  episodes_total: 2
  experiment_id: 24e4d9647e6b43e78da8476d3abdcbc8
  experiment_tag: '0'
  hostname: blackbox
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.2
        cur_lr: 5.0e-05
        entropy: 90.83398461341858
        entropy_coeff: 0.0
        kl: 0.02367725333897397
        policy_loss: -0.026379756396636367
        total_loss: 0.20250490598846227
        vf_explained_var: 0.00019955821335315704
        vf_loss: 0.2241492117755115
    num_steps_sampled: 12000
    num_steps_trained: 12000
  iterations_since_restore: 3
  node_ip: 192.168.7.73
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 32.4180952

Trial name,status,loc,iter,total time (s),ts,reward
PPO_MineRLNavigateDenseVectorObf-v0_b8415_00000,TERMINATED,,3,253.009,12000,-1.37382


Trial name,status,loc,iter,total time (s),ts,reward
PPO_MineRLNavigateDenseVectorObf-v0_b8415_00000,TERMINATED,,3,253.009,12000,-1.37382


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f232aebef90>

[2m[36m(pid=49090)[0m *** Aborted at 1601251962 (unix time) try "date -d @1601251962" if you are using GNU date ***
[2m[36m(pid=49090)[0m PC: @                0x0 (unknown)
[2m[36m(pid=49090)[0m *** SIGSEGV (@0x7ff06963c9d0) received by PID 49320 (TID 0x7ff238974740) from PID 1768147408; stack trace: ***
[2m[36m(pid=49090)[0m     @     0x7ff238ce53c0 (unknown)
[2m[36m(pid=49090)[0m     @     0x7ff238cdaaab __pthread_clockjoin_ex
[2m[36m(pid=49090)[0m     @     0x7ff2368b02d3 std::thread::join()
[2m[36m(pid=49090)[0m     @     0x7ff236de1493 ray::gcs::GlobalStateAccessor::Disconnect()
[2m[36m(pid=49090)[0m     @     0x7ff236c80fbc __pyx_pw_3ray_7_raylet_19GlobalStateAccessor_5disconnect()
[2m[36m(pid=49090)[0m     @     0x55f0600bfb71 _PyMethodDef_RawFastCallKeywords
[2m[36m(pid=49090)[0m     @     0x55f0600c6aef _PyMethodDescr_FastCallKeywords
[2m[36m(pid=49090)[0m     @     0x55f06012b37c _PyEval_EvalFrameDefault
[2m[36m(pid=49090)[0m     @     0x55