# Building Your First Distributed Application With Ray Core

In [1]:
# tag::discrete_actions[]
import random


class Discrete:
    """Finite action space whose actions are the integers 0 .. n - 1."""

    def __init__(self, num_actions: int):
        # `n` is the number of distinct actions this space offers.
        self.n = num_actions

    def sample(self):
        """Draw one action uniformly at random from the space."""
        highest_action = self.n - 1  # randint includes both end points
        return random.randint(0, highest_action)  # <1>


# Quick check: a space with four actions yields samples from {0, 1, 2, 3}.
space = Discrete(4)
print(space.sample())  # <2>
# end::discrete_actions[]

3


In [2]:
# tag::init_env[]
import os


class Environment:
    """A 5x5 grid world in which a seeker has to reach a goal.

    Positions are ``(row, column)`` tuples. ``render`` prints row 0 first, so
    the seeker starts in the top-left corner and the goal sits bottom-right.
    """

    # Class-level defaults: they describe the initial state and are what an
    # instance sees until `step` or `reset` assigns instance attributes.
    seeker, goal = (0, 0), (4, 4)  # <1>
    info = {'seeker': seeker, 'goal': goal}

    def __init__(self, *args, **kwargs):
        """Create the action and observation spaces; extra arguments are ignored."""
        self.action_space = Discrete(4)  # <2>
        self.observation_space = Discrete(5*5)  # <3>
# end::init_env[]

# tag::env_helpers[]
    def reset(self):  # <1>
        """Reset seeker and goal positions, return observations."""
        self.seeker = (0, 0)
        self.goal = (4, 4)
        # Keep the reported info in sync with the freshly reset positions.
        self.info = {'seeker': self.seeker, 'goal': self.goal}

        return self.get_observation()

    def get_observation(self):
        """Encode the seeker position as an integer in 0..24 (row-major)."""
        return 5 * self.seeker[0] + self.seeker[1]  # <2>

    def get_reward(self):
        """Return 1 if the seeker is on the goal, 0 otherwise."""
        return 1 if self.seeker == self.goal else 0  # <3>

    def is_done(self):
        """Return True once the seeker has found the goal."""
        return self.seeker == self.goal  # <4>
# end::env_helpers[]

# tag::env_step[]
    def step(self, action):
        """Take a step in a direction and return all available information.

        Args:
            action: 0 = down, 1 = left, 2 = up, 3 = right. Moves that would
                leave the grid are clamped, i.e. the seeker stays in place.

        Returns:
            A tuple ``(observation, reward, done, info)`` where ``info`` holds
            the current seeker and goal positions.

        Raises:
            ValueError: If ``action`` is not one of 0, 1, 2, 3.
        """
        if action == 0:  # move down
            self.seeker = (min(self.seeker[0] + 1, 4), self.seeker[1])
        elif action == 1:  # move left
            self.seeker = (self.seeker[0], max(self.seeker[1] - 1, 0))
        elif action == 2:  # move up
            self.seeker = (max(self.seeker[0] - 1, 0), self.seeker[1])
        elif action == 3:  # move right
            self.seeker = (self.seeker[0], min(self.seeker[1] + 1, 4))
        else:
            raise ValueError("Invalid action")

        # Build a fresh per-instance dict so `info` reflects the position after
        # the move; the class-level default is never mutated and therefore not
        # shared between environments.
        self.info = {'seeker': self.seeker, 'goal': self.goal}

        return self.get_observation(), self.get_reward(), self.is_done(), self.info  # <1>
# end::env_step[]

# tag::env_render[]
    def render(self, *args, **kwargs):
        """Render the environment, e.g. by printing its representation."""
        # Clear the terminal so consecutive frames look like an animation.
        os.system('cls' if os.name == 'nt' else 'clear')  # <1>
        # Five cells per row plus a closing bar and newline.
        grid = [['| ' for _ in range(5)] + ["|\n"] for _ in range(5)]
        grid[self.goal[0]][self.goal[1]] = '|G'
        # Drawn after the goal, so 'S' wins when both share a cell.
        grid[self.seeker[0]][self.seeker[1]] = '|S'  # <2>
        print(''.join([''.join(grid_row) for grid_row in grid]))  # <3>
# end::env_render[]

In [3]:
# tag::env_test[]
import time

# No reset() call is needed here: until the first step the seeker position
# comes from the class-level default (0, 0).
environment = Environment()

# Random walk: sample actions until the seeker lands on the goal, redrawing
# the grid after every move.
while not environment.is_done():
    random_action = environment.action_space.sample()  # <1>
    environment.step(random_action)
    time.sleep(0.1)  # slow down so each frame is visible
    environment.render()  # <2>
# end::env_test[]

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
|S| | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
|S| | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
|S| | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
|S| | | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
|S| | | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| |S| | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| | |S| | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| | |

In [4]:
# tag::policy[]
class Policy:
    """Tabular policy that stores one value per (state, action) pair."""

    def __init__(self, env):
        """A Policy suggests actions based on the current state.
        We do this by tracking the value of each state-action pair.

        Args:
            env: Object exposing ``observation_space.n`` (number of states)
                and ``action_space`` (with ``n`` and ``sample()``).
        """
        # One row per state, one column per action, all values start at 0.
        self.state_action_table = [
            [0 for _ in range(env.action_space.n)]
            for _ in range(env.observation_space.n)  # <1>
        ]
        self.action_space = env.action_space

    def get_action(self, state, explore=True, epsilon=0.1):
        """Explore randomly or exploit the best value currently available.

        Args:
            state: Index of the current state (row of the table).
            explore: If False, always pick the greedy action.
            epsilon: Probability of taking a random action when exploring.

        Returns:
            The chosen action as an integer index.
        """
        if explore and random.uniform(0, 1) < epsilon:  # <2>
            return self.action_space.sample()
        values = self.state_action_table[state]
        # Pure-Python argmax: avoids depending on a module imported only later
        # in the file and, like an argmax, returns the first index among ties.
        return max(range(len(values)), key=values.__getitem__)  # <3>
# end::policy[]

In [5]:
# tag::simulation[]
class Simulation:
    """Simulates rollouts of an environment, given a policy to follow."""

    def __init__(self, env):
        # The environment is reset at the start of every rollout.
        self.env = env

    def rollout(self, policy, render=False, explore=True, epsilon=0.1):  # <1>
        """Play one episode and return its experiences.

        Each experience is a list ``[state, action, reward, next_state]``.
        The episode ends when the environment reports it is done.
        """
        trajectory = []
        current = self.env.reset()  # <2>
        while True:
            chosen = policy.get_action(current, explore, epsilon)  # <3>
            following, reward, finished, _ = self.env.step(chosen)  # <4>
            trajectory.append([current, chosen, reward, following])  # <5>
            current = following
            if render:  # <6>
                time.sleep(0.05)
                self.env.render()
            if finished:
                break

        return trajectory
# end::simulation[]

In [6]:
# tag::naive_rollout[]
untrained_policy = Policy(environment)
sim = Simulation(environment)

# With epsilon=1.0 the exploration branch is always taken, so every action is
# sampled at random and the (all-zero) table is never consulted.
exp = sim.rollout(untrained_policy, render=True, epsilon=1.0)  # <1>
# rollout() only collects experiences; it does not modify the policy, so the
# table printed here still holds its initial zeros.
for row in untrained_policy.state_action_table:
    print(row)  # <2>
# end::naive_rollout[]

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
|S| | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
| |S| | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| |S| | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| |S| | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| |S| | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
| |S| | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
| |S| | | |
| | |

In [7]:
# tag::update_policy[]
import numpy as np


def update_policy(policy, experiences):
    """Apply one value update per experience to the policy's table, in place.

    `experiences` is an iterable of (state, action, reward, next_state)
    entries. Each entry blends the stored value with the observed reward plus
    the discounted best value of the next state.
    """
    alpha, gamma = 0.1, 0.6  # blending weight and discount factor
    table = policy.state_action_table
    for state, action, reward, next_state in experiences:  # <1>
        next_max = np.max(table[next_state])  # <2>
        value = table[state][action]  # <3>
        target = reward + gamma * next_max
        new_value = (1 - alpha) * value + alpha * target  # <4>
        table[state][action] = new_value  # <5>
# end::update_policy[]

In [8]:
# tag::train_policy[]
def train_policy(env, num_episodes=10000):
    """Train a fresh policy on `env` and return it.

    Runs `num_episodes` rollouts; after each one the policy is updated with
    the experiences that rollout produced.
    """
    policy, sim = Policy(env), Simulation(env)
    for _ in range(num_episodes):
        episode_experiences = sim.rollout(policy)  # <1>
        update_policy(policy, episode_experiences)  # <2>

    return policy


# Train on the module-level environment with the default number of episodes.
trained_policy = train_policy(environment)  # <3>
# end::train_policy[]

In [9]:
# tag::evaluate_policy[]
def evaluate_policy(env, policy, num_episodes=10):
    """Run rendered, exploration-free rollouts and print the mean episode length."""
    simulation = Simulation(env)
    episode_lengths = [
        len(simulation.rollout(policy, render=True, explore=False))  # <1>
        for _ in range(num_episodes)
    ]
    steps = sum(episode_lengths)  # <2>

    print(f"{steps / num_episodes} steps on average "
          f"for a total of {num_episodes} episodes.")


# Evaluate the trained policy over the default 10 episodes; every step is rendered.
evaluate_policy(environment, trained_policy)
# end::evaluate_policy[]

[H[2J| | | | | |
|S| | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
|S| | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
|S| | | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| | | | | |
|S| | | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| | | | | |
| |S| | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| | | | | |
| | |S| |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| | | | | |
| | | |S|G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |S|

[H[2J| | | | | |
|S| | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
|S| | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
|S| | | | |
| | | | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| | | | | |
|S| | | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| | | | | |
| |S| | |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| | | | | |
| | |S| |G|

[H[2J| | | | | |
| | | | | |
| | | | | |
| | |

In [None]:
# tag::ray_policy_simulation[]
import ray

ray.init()
environment = Environment()
# Put the environment into Ray's object store; the remote task and actors
# below fetch it through `env_ref`.
env_ref = ray.put(environment)  # <1>


@ray.remote
def create_policy():
    """Remote Ray task that builds a new Policy for the shared environment."""
    return Policy(ray.get(env_ref))  # <2>


@ray.remote
class SimulationActor(Simulation):  # <3>
    """Simulation exposed as a Ray actor, built on the shared environment."""
    def __init__(self):
        # Fetch the environment from the object store and hand it to Simulation.
        super().__init__(ray.get(env_ref))
# end::ray_policy_simulation[]

2022-04-29 19:15:12,562	INFO services.py:1456 -- View the Ray dashboard at http://127.0.0.1:8266


In [None]:
# tag::ray_training[]
@ray.remote
def update_policy_task(policy_ref, experiences_list):
    """Remote Ray task for updating a policy with experiences in parallel.

    Args:
        policy_ref: The policy to update. Callers pass an object reference;
            Ray resolves object refs passed as top-level task arguments, so
            inside the task this is expected to be the policy object itself.
        experiences_list: List of object refs, each pointing to the
            experiences of one rollout. Refs nested inside a list are not
            resolved automatically, hence the explicit ``ray.get``.

    Returns:
        The updated policy.
    """
    # Fetch all rollout results with a single ray.get call, then apply them
    # one after another with a plain loop (no throwaway list of None).
    for experiences in ray.get(experiences_list):  # <1>
        update_policy(policy_ref, experiences)
    return policy_ref


def train_policy_parallel(num_episodes=1000, num_simulations=10):
    """Train a policy with several simulation actors collecting rollouts.

    Each episode, every actor produces one rollout for the current policy and
    a remote update task folds all of them into the next policy. Only the
    final policy is fetched back to the caller.
    """
    policy_ref = create_policy.remote()  # <2>
    actors = [SimulationActor.remote() for _ in range(num_simulations)]  # <3>

    for _ in range(num_episodes):
        rollout_refs = [actor.rollout.remote(policy_ref) for actor in actors]  # <4>
        policy_ref = update_policy_task.remote(policy_ref, rollout_refs)  # <5>

    return ray.get(policy_ref)  # <6>
# end::ray_training[]

# ![Task dependency](https://raw.githubusercontent.com/maxpumperla/learning_ray/main/notebooks/images/chapter_03/train_policy.png)

In [None]:
# tag::ray_evaluation[]
# Train with the default settings (1000 episodes, 10 simulation actors), then
# evaluate the result locally with the same helper used for the serial policy.
parallel_policy = train_policy_parallel()
evaluate_policy(environment, parallel_policy)
# end::ray_evaluation[]

In [None]:

# tag::override_env[]
import gym
from gym.spaces import Discrete


class Environment(Environment, gym.Env):
    """Our original `Environment`, re-declared so that it is also a gym `Env`."""

    def __init__(self, *args, **kwargs):
        # Nothing to add: all arguments are forwarded unchanged to the
        # original Environment's constructor.
        super().__init__(*args, **kwargs)


# `Environment` now refers to the gym-compatible subclass defined above.
gym_env = Environment()
# end::override_env[]

In [None]:
# tag::rllib_dqn_simple[]
from ray.rllib.agents.dqn import DQNTrainer

# The environment class itself (not an instance) is handed to the trainer.
# NOTE(review): `ray.rllib.agents.dqn.DQNTrainer` looks like the Ray 1.x API
# (the log above dates this run to April 2022) — confirm the import path
# against the installed Ray version.
trainer = DQNTrainer(env=Environment)
# Run a single training iteration and keep its result.
res = trainer.train()
# end::rllib_dqn_simple[]