In [1]:
import os
import sys

# Resolve the working directory with plain Python rather than the IPython-only
# `%pwd` magic, so this cell also runs when the notebook is exported as a script.
pwd = os.getcwd()

# Put the parent directory first on the import path
# (presumably so the repository's own `srl` package is importable — confirm).
sys.path.insert(0, os.path.join(pwd, "../"))


# 自作環境（Env）の作り方

-------

自作環境を用意するには 'srl.base.env.EnvBase' を継承して作ります。  
ただ、作りたい環境によってはいらない情報もあるので、  
'srl.base.env.genre'に各環境に合わせたインタフェースも用意しています。  


ここでは一人プレイでアクションと状態が共に離散値を取る環境を作成していきます。  
'srl.base.env.genre.singleplay.SingleActionDiscrete' を継承してクラスを作っていきます。

以下のプロパティと関数が必要となります。

``` python
from typing import Any, List, Tuple, cast

import gym.spaces
import numpy as np  # needed for the `np.ndarray` annotation on render_rgb_array
from srl.base.env.genre.singleplay import SingleActionDiscrete
from srl.base.define import EnvObservationType

class MyEnv(SingleActionDiscrete):

    # Number of actions that can be taken
    @property
    def action_num(self) -> int:
        raise NotImplementedError()

    # Values an observation can take
    # (specified with gym.spaces; may change in the future - TODO)
    @property
    def observation_space(self) -> gym.spaces.Space:
        raise NotImplementedError()

    # Specifies the observation type of the environment
    @property
    def observation_type(self) -> EnvObservationType:
        raise NotImplementedError()

    # Maximum number of turns in one episode
    @property
    def max_episode_steps(self) -> int:
        raise NotImplementedError()

    # Called at the start of an episode
    # Returns the initial state
    def reset_single(self) -> Any:
        raise NotImplementedError()

    # One-step function
    # Takes an action, advances one step, and returns (state, reward, done, info)
    def step_single(self, action: int) -> Tuple[Any, float, bool, dict]:
        raise NotImplementedError()

    # Implement backup/restore so the environment state can be saved and restored
    def backup(self) -> Any:
        raise NotImplementedError()
    def restore(self, data: Any) -> None:
        raise NotImplementedError()

    # Rendering functions (optional)
    def render_terminal(self, **kwargs) -> None:
        raise NotImplementedError()

    def render_gui(self, **kwargs) -> None:
        raise NotImplementedError()

    def render_rgb_array(self, **kwargs) -> np.ndarray:
        raise NotImplementedError()
```


実際に作ってみます。

In [2]:
import enum
from dataclasses import dataclass
from typing import Any, List, Tuple, cast

import gym.spaces
import numpy as np
from srl.base.env.genre.singleplay import SingleActionDiscrete
from srl.base.define import EnvObservationType


class Action(enum.Enum):
    """Moves available to the player; each member's value is its integer action id."""

    LEFT = 0  # x - 1
    DOWN = 1  # y + 1 (rows are indexed top to bottom)
    RIGHT = 2  # x + 1
    UP = 3  # y - 1

@dataclass
class MyEnv(SingleActionDiscrete):
    """Single-player grid world with discrete actions and observations.

    The player starts in the bottom-left cell and moves one cell per step.
    Stepping onto the goal (field value 1) ends the episode with reward 1,
    stepping onto the hole (field value -1) ends it with reward -1, and every
    other step yields ``move_reward``. Cells with value 9 are walls; a move
    into a wall or off the grid leaves the player where it is.

    Attributes:
        move_reward: Reward returned for every non-terminal step.
    """

    move_reward: float = -0.04

    def __post_init__(self):
        """Build the field layout and derive the grid size from it."""
        # Indexed as base_field[y][x]: 0 = path, 1 = goal, -1 = hole, 9 = wall.
        self.base_field = [
            [0, 0, 0, 1],
            [0, 9, 0, -1],
            [0, 0, 0, 0],
        ]
        # Derived from the layout so the size cannot drift from the field.
        self.H = len(self.base_field)
        self.W = len(self.base_field[0])

    @property
    def action_num(self) -> int:
        """Number of discrete actions (one per ``Action`` member)."""
        return len(Action)

    @property
    def observation_space(self) -> gym.spaces.Space:
        """Space of the (x, y) player position.

        ``gym.spaces.Box`` bounds are inclusive, so the upper bounds are the
        largest valid indices (``W - 1``, ``H - 1``) rather than the grid size.
        """
        return gym.spaces.Box(
            low=0,
            high=np.asarray([self.W - 1, self.H - 1]),
            shape=(2,),
        )

    @property
    def observation_type(self) -> EnvObservationType:
        """Observations are discrete grid coordinates."""
        return EnvObservationType.DISCRETE

    @property
    def max_episode_steps(self) -> int:
        """Maximum number of steps in one episode."""
        return 100

    def reset_single(self) -> Any:
        """Place the player on the start cell and return the initial (x, y) state."""
        self.player_pos = [0, 2]
        return tuple(self.player_pos)

    def step_single(self, action_: int) -> Tuple[Any, float, bool, dict]:
        """Advance the environment by one step.

        Args:
            action_: Integer action id; must be the value of an ``Action`` member.

        Returns:
            A ``(state, reward, done, info)`` tuple where ``state`` is the
            player's (x, y) position after the move and ``info`` is empty.

        Raises:
            ValueError: If ``action_`` is not a valid ``Action`` value.
        """
        # Enum lookup rejects unknown ids with ValueError.
        action = Action(action_)

        dx, dy = {
            Action.LEFT: (-1, 0),
            Action.DOWN: (0, 1),
            Action.RIGHT: (1, 0),
            Action.UP: (0, -1),
        }[action]
        next_x = self.player_pos[0] + dx
        next_y = self.player_pos[1] + dy

        # The move only happens when the target is on the grid and not a wall.
        on_grid = 0 <= next_x < self.W and 0 <= next_y < self.H
        if on_grid and self.base_field[next_y][next_x] != 9:
            self.player_pos = [next_x, next_y]

        reward = self.move_reward
        done = False

        # Goal and hole cells are terminal and override the per-step reward.
        attribute = self.base_field[self.player_pos[1]][self.player_pos[0]]
        if attribute == 1:
            reward = 1
            done = True
        elif attribute == -1:
            reward = -1
            done = True

        return tuple(self.player_pos), reward, done, {}

    def backup(self) -> Any:
        """Return a copy of the player position, which is the only mutable state."""
        return self.player_pos[:]

    def restore(self, data: Any) -> None:
        """Restore the player position from a value produced by ``backup``.

        The data is copied so the caller's backup object is never shared with
        the environment and can be restored from more than once.
        """
        self.player_pos = list(data)

    def render_terminal(self, **kwargs) -> None:
        """Print the grid to stdout, followed by an empty line.

        ``**kwargs`` is accepted and ignored so the method can be called with
        the same signature as the base interface.
        """
        symbols = {0: ".", 1: "G", -1: "X"}  # path, goal, hole
        for y in range(self.H):
            s = ""
            for x in range(self.W):
                n = self.base_field[y][x]
                if self.player_pos[0] == x and self.player_pos[1] == y:  # player
                    s += "P"
                else:
                    # Any other cell value (e.g. the wall) is shown as its number.
                    s += symbols.get(n, str(n))
            print(s)
        print("")


作成出来たら登録します。  
引数は以下です。  

``` text
id: ユニークな名前  
entry_point: __name__ + ":" + クラス名  
kwargs: クラス生成時の引数
```

In [3]:
from srl.base.env import registration

# Register the environment class under a unique id.
# entry_point is "<module name>:<class name>"; kwargs are the arguments
# used when the class is instantiated.
registration.register(
    id="MyEnv", entry_point=__name__ + ":MyEnv", kwargs=dict(move_reward=-0.04)
)


以下のように実行できます。  
'SinglePlayerWrapper' を通すとシングルプレイ用のインタフェースで実行できます。


In [4]:
import srl
from srl.base.env.singleplay_wrapper import SinglePlayerWrapper

# Build the registered environment and wrap it for the single-player interface.
env = SinglePlayerWrapper(srl.envs.make("MyEnv"))  # change single play interface

total_reward = 0
step = 0
done = False

env.reset()
env.render()

# Play one episode with actions drawn from env.sample(), rendering every step.
while True:
    action = env.sample()
    state, reward, done, _ = env.step(action)
    total_reward += reward
    step += 1
    print(f"step {step}, action {action}, reward {reward}, done {done}")
    env.render()
    if done:
        break

print(total_reward)



...G
.9.X
P...

step 1, action 0, reward -0.04, done False
...G
.9.X
P...

step 2, action 2, reward -0.04, done False
...G
.9.X
.P..

step 3, action 2, reward -0.04, done False
...G
.9.X
..P.

step 4, action 3, reward -0.04, done False
...G
.9PX
....

step 5, action 1, reward -0.04, done False
...G
.9.X
..P.

step 6, action 3, reward -0.04, done False
...G
.9PX
....

step 7, action 1, reward -0.04, done False
...G
.9.X
..P.

step 8, action 0, reward -0.04, done False
...G
.9.X
.P..

step 9, action 1, reward -0.04, done False
...G
.9.X
.P..

step 10, action 1, reward -0.04, done False
...G
.9.X
.P..

step 11, action 3, reward -0.04, done False
...G
.9.X
.P..

step 12, action 1, reward -0.04, done False
...G
.9.X
.P..

step 13, action 2, reward -0.04, done False
...G
.9.X
..P.

step 14, action 1, reward -0.04, done False
...G
.9.X
..P.

step 15, action 2, reward -0.04, done False
...G
.9.X
...P

step 16, action 3, reward -1, done True
...G
.9.P
....

-1.6


Runnerを使うと以下のように学習できます。

In [5]:
import srl
from srl.runner import sequence
from srl.runner.callbacks import PrintProgress

# Pair the registered environment with the Q-learning configuration.
config = sequence.Config(env_name="MyEnv", rl_config=srl.rl.ql.Config())

# --- train
config.set_play_config(
    max_episodes=10000,
    training=True,
    callbacks=[PrintProgress()],
)
parameter, memory = sequence.train(config)


### env: MyEnv, max episodes: 10000, max steps: -1, timeout:  -1.00s
20:42:41   0.00s   10000ep   55143tr   0.00s(remain), -1.960 0.760 0.840 reward, 5.5 step, 0.00s/ep, 0.0000s/tr,        0 mem|Q 10.999|td_error 0.005


In [7]:
# Play 100 episodes with the trained parameter and report the mean reward.
# NOTE(review): `training` is not passed here; presumably set_play_config
# falls back to its default rather than keeping training=True — confirm.
config.set_play_config(max_episodes=100)
# Only the first of the three returned values (the rewards) is used.
rewards, _, _ = sequence.play(config, parameter)
print("100エピソードの平均結果", np.mean(rewards))


100エピソードの平均結果 0.8400000000000002


In [9]:
from srl.runner.callbacks import Rendering

# Play a single episode with the Rendering callback attached
# (step_stop=False is passed to the callback; its exact effect is defined by srl).
config.set_play_config(max_episodes=1, callbacks=[Rendering(step_stop=False)])
# The return value is not needed here, so it is discarded.
_ = sequence.play(config, parameter)


### 0
...G
.9.X
P...

 0: 0.42669
 1: 0.42669
*2: 0.51854
 3: 0.51854
### 1, done: False
player 0, action 2, reward: -0.04
...G
.9.X
.P..

env_info  : {}
work_info 0: {}
train_info: None
 0: 0.42669
 1: 0.51854
*2: 0.62060
 3: 0.51854
### 2, done: False
player 0, action 2, reward: -0.04
...G
.9.X
..P.

env_info  : {}
work_info 0: {}
train_info: None
 0: 0.51854
 1: 0.62060
 2: 0.51854
*3: 0.73400
### 3, done: False
player 0, action 3, reward: -0.04
...G
.9PX
....

env_info  : {}
work_info 0: {}
train_info: None
 0: 0.73400
 1: 0.62060
 2: -1.00000
*3: 0.86000
### 4, done: False
player 0, action 3, reward: -0.04
..PG
.9.X
....

env_info  : {}
work_info 0: {}
train_info: None
 0: 0.73400
 1: 0.73400
*2: 1.00000
 3: 0.86000
### 5, done: True
player 0, action 2, reward: 1.0
...P
.9.X
....

env_info  : {}
work_info 0: {}
train_info: None
