# 3 - RL Environment
In this part we are going to build the most essential environment needed to create an RL pipeline.

The first framework that we are going to use is **Ray**.

We are going to pass either a string name or a Python class to specify an environment. In particular, we are going to choose the simplest local environment.

Custom env classes passed directly to the algorithm must take a single env_config parameter in their constructor:



### Example 1 - Gym + Ray

In [1]:
import gym, ray
from gym import spaces
from ray.rllib.algorithms import ppo

class MyEnv(gym.Env):
    """Minimal one-step environment used to smoke-test the RL pipeline.

    There are five discrete actions (0-4). Only action ``2`` is rewarded
    (+1.0); every other action is penalised (-1.0). The episode always ends
    after a single step.

    Observations come from ``Discrete(2)``: ``0`` is the initial state
    returned by ``reset`` and ``1`` is the state reached after the step.
    """

    def __init__(self, env_config=None):
        # ``env_config`` is accepted because custom env classes must take it
        # in their constructor; this toy environment does not read it.
        self.action_space = spaces.Discrete(5)       # actions 0..4
        self.observation_space = spaces.Discrete(2)  # states 0 and 1

    def reset(self):
        """Start a new episode and return the initial observation (0)."""
        return 0

    def step(self, action):
        """Apply ``action`` and finish the episode.

        Returns:
            A ``(obs, reward, done, info)`` tuple where ``obs`` is always 1,
            ``reward`` is 1.0 for action 2 and -1.0 otherwise, ``done`` is
            always True and ``info`` is an empty dict.
        """
        # Having taken an action means we are now in state 1.
        state = 1

        # Return the reward as a float, as the step contract documents.
        reward = 1.0 if action == 2 else -1.0

        # Regardless of the action, the game is over after a single step.
        done = True

        info = {}
        return state, reward, done, info

Python 3.8.x
ray 1.0
tensorflow 2.3.1
tensorflow-probability 0.11
gym 0.17.3
pygame 2.0.0

numpy==1.23.1

In [2]:
from ray import tune

In [3]:
# Launch a Ray Tune experiment that trains the "SAC" algorithm on MyEnv and
# stops once 5,000 environment timesteps have been collected.
tune.run(
    "SAC", # name of the RL algorithm to train
    name = "Training1",
    # Checkpoint every 100 training iterations and once more at the end.
    checkpoint_freq = 100,
    checkpoint_at_end = True,
    local_dir = r'./ray_results/',
    config={
        "env": MyEnv,
        # 30 rollout workers at half a CPU each.
        "num_workers": 30,
        "num_cpus_per_worker": 0.5,
        # Passed to the MyEnv constructor as ``env_config``.
        # NOTE(review): the MyEnv defined above does not read any of these keys.
        "env_config":{
            "max_steps": 100,
            "export_frames": False,
            "export_states": False,
            # "reward_mode": "continuous",
            # "env_flipped": True,
            # "env_flipmode": True,
            }
        },
    stop = {
        "timesteps_total": 5_000,
        },
    )

2023-02-03 22:55:17,824	INFO worker.py:1538 -- Started a local Ray instance.


ImportError: Could not import PyTorch! RLlib requires you to install at least one deep-learning framework: `pip install [torch|tensorflow|jax]`.

In [None]:
# Start Ray only if it is not already running: an earlier cell (for example
# tune.run) may have started a local instance, and calling ray.init() a
# second time raises an error.
if not ray.is_initialized():
    ray.init()

# Alternative: pass an explicit env_config to the environment class.
#algo = ppo.PPO(env=MyEnv, config={"env_config": {}})

algo = ppo.PPO(env=MyEnv, config={"num_workers": 4})

In [None]:
# Train for 25 iterations and record the mean episode reward of each one.
# The loop index gets a real name: ``_`` conventionally marks an unused
# value and, in IPython, also holds the last cell result.
mean_ppo = []
for iteration in range(25):
    result = algo.train()
    print("episode reward mean:", iteration, result['episode_reward_mean'])
    mean_ppo.append(result['episode_reward_mean'])

In [None]:
import matplotlib.pyplot as plt

# Plot the mean episode reward against the training-iteration index.
xs = list(range(len(mean_ppo)))

plt.plot(xs, mean_ppo)
plt.show()


### How to use the trained algorithm in RL with PPO

In [None]:
trainer=algo

In [None]:
checkpoint = trainer.save()

In [None]:
print(checkpoint)

In [None]:
#TODO Fix the windows path
#evaluation = trainer.evaluate(checkpoint)

## Computing actions

In [None]:
env = MyEnv()

In [None]:
done = False
total_reward = 0

In [None]:
observations = env.reset()

In [None]:
print(observations) # The state which you should determine the action

Given any state, compute the action that yields the maximum reward according to the training.

In [None]:
action = trainer.compute_single_action(observations)

In [None]:
action

In [None]:
# Roll out one episode with the trained policy: ask it for an action, apply
# the action to the environment and accumulate the reward until the
# environment reports that the episode is done.
while not done:
    action = trainer.compute_single_action(observations)
    observations, reward, done, info = env.step(action)
    total_reward += reward
    print("observations, reward, done, info",observations, reward, done, info)


In [None]:
# Compute actions for several observations at once, keyed by name.
action = trainer.compute_actions({"obs_1": observations, "obs_2": observations})
print(action)
# NOTE(review): the original example output was truncated here; presumably it
# is a dict keyed like the input, e.g. {'obs_1': 0, 'obs_2': 0} — TODO confirm

In the following RL test we are going to use stable_baselines3.

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

import os
import time
import pygame
from pygame.surfarray import array3d

import matplotlib.pyplot as plt

In [None]:
from stable_baselines3.common.env_checker import check_env
env = MyEnv()
check_env(env)

In [None]:
# Use a single timestamp for both directories so the model and log folders
# of one run always share the same id.
run_id = int(time.time())
models_dir = f"models/{run_id}/"
logdir = f"logs/{run_id}/"

# Initialise pygame before using its sub-modules.
pygame.init()

fps_controller = pygame.time.Clock()
fps_controller.tick(60)

# Initialise game window
pygame.display.set_caption('Training')

# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

# Monitor records episode statistics into logdir; DummyVecEnv provides the
# vectorised interface expected by the model.
env = DummyVecEnv([lambda: Monitor(MyEnv(), logdir, allow_early_resets=True)])

model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=logdir, n_epochs=10)

TIMESTEPS = 100
iters = 0

obs = env.reset()
# NOTE: this loop trains until the cell is interrupted, saving a checkpoint
# after every learn() call.
while True:
    iters += 1
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="PPO")
    # os.path.join avoids the doubled "/" (models_dir already ends with one).
    model.save(os.path.join(models_dir, str(TIMESTEPS * iters)))

In the following example we are interested in including pygame.

### Example 2 - Gym + Ray + Pygame

In order to create an appropriate gym environment that works with Ray and pygame, we need to pay attention to two gym objects:

-- action (object): The action to be performed in the step() function. 
In a game of chess, the action would be the specific, legal move performed by a player.

--observation (object): This is all the information available to the agent to choose the next action. 
The observation is based only on the current state of the environment.

## Observation_space and Action_space
In particular, regarding the observation_space and action_space:
only certain actions and observations are valid for a specific environment.

To define a format, the observation_space and action_space variables need to be assigned to a respective gym.space class.

Spaces can differ in their dimensionality and their value range. Continuous and discrete spaces are both possible.

self.observation_space = <gym.space>
self.action_space = <gym.space>


We are going to consider an environment where there are two
points, one red and one blue. The purpose of the game is to place the blue point (worker) so that it intercepts the red point (target).

## Definition of action space

We want to control the position of the blue point.

So the action is the position: it consists of the coordinates that you provide to the environment.

action =[x, y]

The value of each coordinate is continuous and must be within the range given by the size of the box.

gym.spaces.Box(low=min_x, high=max_x, shape=(2,), dtype=np.float32)

# Analysis of Spaces
Before we continue, let us check some examples of spaces in order to understand how they work.

## Box

Box - Supports continuous (and discrete) vectors or matrices, used for vector observations, images, etc



In [None]:
from gym.spaces import Box, Discrete,MultiBinary , MultiDiscrete

In [None]:
# Example 1: a 3x4 Box in which every entry shares the bounds [-1.0, 2.0].
import numpy as np  # imported here so the cell runs on its own (needed for the dtype)

observation_space = Box(low=-1.0, high=2.0, shape=(3, 4), dtype=np.float32)
print(observation_space.sample().shape)
observation_space.sample()

In [None]:
#Example 2: per-dimension bounds given as arrays, with no explicit shape argument
observation_space = Box(low=np.array([-1.0, -2.0]), high=np.array([2.0, 4.0]), dtype=np.float32)
print(observation_space.sample().shape)
observation_space.sample()

In [None]:
#Example 2 (variant): bounds given as zero-dimensional arrays, with no explicit shape argument
observation_space = Box(low=np.array(-1.0), high=np.array(2.0), dtype=np.float32)
print(observation_space.sample().shape)
observation_space.sample()

In [None]:
# Example 1 (variant): two values, each bounded by [0, 200]
observation_space = Box(low=0, high=200, shape=(2,), dtype=np.float32)
print(observation_space.sample().shape)
observation_space.sample()

In [None]:
print(type(observation_space.sample()))

## Discrete

In [None]:
#Example 3
observation_space =Discrete(2)            # {0, 1}
observation_space.sample()

In [None]:
#Example 4
observation_space =Discrete(3)  # {0, 1, 2}
observation_space.sample()

## MultiBinary

In [None]:
# Example 5
observation_space = MultiBinary(5)
print(observation_space.sample().shape)
observation_space.sample()

In [None]:
# Example 5
observation_space = MultiBinary(2)
print(observation_space.sample().shape)
observation_space.sample()

# MultiDiscrete

In [None]:
# Example 6
observation_space =  MultiDiscrete(np.array([[1, 2], [3, 4]]))
print(observation_space.sample().shape)
observation_space.sample()

# Text

In [None]:
# Example 6a
#observation_space =Text(5)
# {"0", "42", "0123456789", ...}
#observation_space.sample()

In [None]:
# Example 6b
#import string
#observation_space = Text(min_length = 1,
#     max_length = 10,
#     charset = string.digits)
#observation_space.sample()

# Dict
Elements of this space are (ordered) dictionaries of elements from the constituent spaces

In [None]:
# Example 7
from gym.spaces import Dict, Discrete
observation_space = Dict({"position": Discrete(2), "velocity": Discrete(3)})
observation_space.sample()

In [None]:
#Example 8 [nested]: a Dict space whose values are themselves Dict spaces
from gym.spaces import Box, Dict, Discrete, MultiBinary, MultiDiscrete
observation_space =Dict(
    {
        "ext_controller": MultiDiscrete([5, 2, 2]),
        # Second level of nesting
        "inner_state": Dict(
            {
                "charge": Discrete(100),
                "system_checks": MultiBinary(10),
                # Third level of nesting
                "job_status": Dict(
                    {
                        "task": Discrete(5),
                        # shape=() is passed explicitly here
                        "progress": Box(low=0, high=100, shape=()),
                    }
                ),
            }
        ),
    }
)

In [None]:
type(observation_space.sample())

In [None]:
observation_space.sample()

In [None]:
# Example 9
from gym.spaces import Box, Discrete
observation_space = Dict({"position": Box(-1, 1, shape=(2,)), "color": Discrete(3)})

In [None]:
observation_space.sample()

In [None]:
import pandas as pd

In [None]:
df1=pd.read_csv('seats_dataset.csv')

In [None]:
df1.head()

In [None]:
#id  x, y z 

In [None]:
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
df2 = pd.read_pickle("./employees.pkl")  

In [None]:
df2.head()

In [None]:
df1.head()

In [None]:
#Example 10: a Dict observation mixing per-seat vectors with per-employee values
import gym
import numpy as np

# Length of every per-seat vector. This snippet originally read
# ``self.max_sit`` from inside an environment class; the value below is only
# illustrative — set it to the number of seats in your data.
max_sit = 10

observation_space = gym.spaces.Dict(
    {"x_position": gym.spaces.Box(low=0, high=6, shape=(max_sit,), dtype=np.uint8),
     "y_position": gym.spaces.Box(low=0, high=6, shape=(max_sit,), dtype=np.uint8),
     "cluster": gym.spaces.Box(low=0, high=6, shape=(max_sit,), dtype=np.uint8),
     "project": gym.spaces.Box(low=0, high=6, shape=(max_sit,), dtype=np.uint8),
     "energy_consumption": gym.spaces.Box(low=0, high=1, shape=(max_sit,)),
     "emp_project": gym.spaces.Box(low=0, high=6, shape=(1,), dtype=np.int32),
     "emp_energy_consumption": gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
     }
)

# Tuple

In [None]:
# Example 10
from gym.spaces import Box, Discrete, Tuple
observation_space = Tuple((Discrete(2), Box(-1, 1, shape=(2,))))
observation_space.sample()

# Sequence

In [None]:
# Example 11
#from gym.spaces import Sequence
#space = Sequence(Box(0, 1))
#space.sample()

For further information visit
https://gymnasium.farama.org/api/spaces/composite/#gymnasium.spaces.Dict

# ---- Summary---

Discrete - Supports a single discrete number of values with an optional start for the values

MultiBinary - Supports single or matrices of binary values, used for holding down a button or if an agent has an object

MultiDiscrete - Supports multiple discrete values with multiple axes, used for controller actions

Text - Supports strings, used for passing agent messages, mission details, etc

Composite Spaces
Often environment spaces require joining fundamental spaces together for vectorised environments, separate agents or readability of the space.

Dict - Supports a dictionary of keys and subspaces, used for a fixed number of unordered spaces

Tuple - Supports a tuple of subspaces, used for a fixed number of ordered spaces

Sequence - Supports a variable number of instances of a single subspace, used for entities spaces or selecting a variable number of actions

Graph - Supports graph based actions or observations with discrete or continuous nodes and edge values.


Example: if we want to build an observation space for a PNG image, we can use the following:


    # The action and observation spaces need to be gym.spaces objects:
    self.action_space = Discrete(4)  # up, left, right, down
    # Here's an observation space for 200 wide x 100 high RGB image inputs:
    self.observation_space = Box(
        low=0, high=255, shape=(100, 200, 3), dtype=np.uint8)

The standard structure of a Ray environment
https://docs.ray.io/en/latest/rllib/rllib-env.html
should be:

import gym, ray
from ray.rllib.algorithms import ppo

class MyEnv(gym.Env):
    def __init__(self, env_config):
        self.action_space = <gym.Space>
        self.observation_space = <gym.Space>
    def reset(self):
        return <obs>
    def step(self, action):
        return <obs>, <reward: float>, <done: bool>, <info: dict>

ray.init()
algo = ppo.PPO(env=MyEnv, config={
    "env_config": {},  # config to pass to env class
})

while True:
    print(algo.train())
    
    
    
https://docs.ray.io/en/latest/rllib/rllib-algorithms.html?highlight=%20APPOConfig()#appo    

In [None]:
import gym
from gym import spaces
import numpy as np
import pygame
from pygame import display
from pygame.surfarray import array3d
import random

BLACK = pygame.Color(0, 0, 0)
WHITE = pygame.Color(255, 255, 255)
RED = pygame.Color(255, 0, 0)
GREEN = pygame.Color(0, 255, 0)
BLUE = pygame.Color(0, 0, 255)
worker_pos=[0,0]


class MyEnv(gym.Env):
    """Two-point pygame environment: place the blue worker on the red target.

    An action is an ``(x, y)`` position for the worker. The reward is +1.0
    when the worker's bounding rectangle overlaps the target's and -1.0
    otherwise. An episode ends on the first overlap, or once ``max_steps``
    steps have been taken.

    The observation is the target position as an ``int32`` array of shape
    ``(2,)``.

    Args:
        env_config: Optional mapping of settings passed to the constructor.
            Only ``"max_steps"`` is read (default 1000); any other key is
            ignored.
    """

    metadata = {'render.modes': ['human']}

    # Radius, in pixels, of the circles drawn for the worker and the target.
    POINT_RADIUS = 6

    def __init__(self, env_config=None):
        env_config = env_config or {}
        # Maximum number of steps before an episode is cut off.
        self.max_steps = env_config.get("max_steps", 1000)

        # Initialise the display.
        self.frame_size_x = 200
        self.frame_size_y = 200
        self.game_window = pygame.display.set_mode((self.frame_size_x, self.frame_size_y))

        # Action and observation spaces must be gym.spaces objects.

        # ------------------ACTION SPACE----------------------------------
        # The actions the worker is allowed to take. In the previous example
        # this was a single discrete number; here an action is one pair of
        # coordinates (x, y).
        self.action_space = gym.spaces.Box(low=0, high=200, shape=(2,), dtype=np.int32)

        # ------------------OBSERVATION SPACE-----------------------------
        # What the agent observes of the state. In the previous example this
        # was a single discrete number; here it is an array of shape (2,).
        # Attention: the observation space must have the same shape and dtype
        # as the value returned by reset()/step(), otherwise training fails.
        self.observation_space = spaces.Box(low=0, high=200,
                                            shape=(2,), dtype=np.int32)

        # Initial conditions: white background, worker in the top-left
        # corner, target in the centre of the window.
        self.game_window.fill(WHITE)
        self.worker_pos = [0, 0]
        self.target_pos = [100, 100]
        print('Initial target position',self.target_pos[0],self.target_pos[1])

        # Keep the bounding rect of each drawn circle for collision tests.
        self.worker_rect = self._draw_point(BLUE, self.worker_pos)
        self.target_rect = self._draw_point(RED, self.target_pos)

        self.state = [self.target_pos[0], self.target_pos[1]]
        self.steps = 0
        self.reward = 0

    def _draw_point(self, color, pos):
        """Draw a filled circle centred on ``pos`` and return its bounding rect."""
        center = (int(pos[0]), int(pos[1]))
        return pygame.draw.circle(self.game_window, color, center, self.POINT_RADIUS)

    def reward_value(self, worker, target):
        """Return 1.0 if the ``worker`` rect collides with ``target``, else -1.0."""
        return 1.0 if worker.colliderect(target) else -1.0

    def step(self, action):
        """Move the worker to ``action`` and score the new position.

        Args:
            action: Sequence ``(x, y)`` with the new worker position.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is the
            target position as an ``int32`` array, ``reward`` is a float,
            ``done`` is True once the target is hit or the step limit is
            reached, and ``info`` is an empty dict.
        """
        # Store plain ints in a list so the position stays mutable for
        # worker_step(), whatever array type the caller passed in.
        self.worker_pos = [int(action[0]), int(action[1])]

        # Redraw both points at their current positions.
        self.update_game_state()

        reward = self.reward_value(self.worker_rect, self.target_rect)

        # Count this step before testing the limit so that an episode lasts
        # at most ``max_steps`` steps.
        step_index = self.steps
        self.steps += 1
        self.reward, done = self.game_over(reward)

        print('Reward in step:', step_index, self.reward)

        # The observation is the target position (an image of the window,
        # see get_image_array_from_game, would be an alternative).
        self.state = [self.target_pos[0], self.target_pos[1]]
        observation = np.array(self.state, dtype=np.int32)

        info = {}
        return observation, self.reward, done, info

    def worker_step(self, event):
        """Translate a pygame event from a human player into an action.

        A mouse click, or a drag with the left button held, moves the worker
        to the cursor and returns that position as ``[x, y]``. Escape posts a
        QUIT event; a QUIT event shuts pygame down and exits the interpreter.
        Any other event returns ``None``.
        """
        import sys  # local import: this cell does not import sys itself

        if event.type == pygame.QUIT:
            pygame.quit()
            sys.exit()

        clicked = event.type == pygame.MOUSEBUTTONDOWN
        # The ``buttons`` attribute is only read for MOUSEMOTION events.
        dragged = event.type == pygame.MOUSEMOTION and event.buttons[0] == 1
        if clicked or dragged:
            self.worker_pos = [event.pos[0], event.pos[1]]
            return [self.worker_pos[0], self.worker_pos[1]]

        if event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE:
            # Esc -> create an event to quit the game.
            pygame.event.post(pygame.event.Event(pygame.QUIT))

        return None

    def update_game_state(self):
        """Clear the window and redraw the worker and the target.

        Both bounding rects are refreshed from the drawn circles so that the
        collision test matches what is shown on screen.
        """
        self.game_window.fill(WHITE)
        self.worker_rect = self._draw_point(BLUE, self.worker_pos)
        self.target_rect = self._draw_point(RED, self.target_pos)

    def get_image_array_from_game(self):
        """Return the display surface as an array with its first two axes swapped."""
        img = array3d(display.get_surface())
        # Swap the first two axes (channel preprocessing needed for tensorflow).
        img = np.swapaxes(img, 0, 1)
        return img

    def reset(self):
        """Start a new episode and return the initial observation.

        The observation is the target position. Alternatives considered in
        this tutorial were a random sample of the observation space or an
        image of the window (see get_image_array_from_game).
        """
        self.state = [self.target_pos[0], self.target_pos[1]]
        observation = np.array(self.state, dtype=np.int32)

        self.steps = 0
        self.reward = 0

        return observation

    def render(self, mode='human'):
        """Refresh the pygame display when ``mode`` is ``"human"``."""
        if mode == "human":
            display.update()

    def close(self):
        """Nothing to release; the pygame window is left open."""
        pass

    def game_over(self, reward):
        """Decide whether the episode has ended.

        Args:
            reward: Reward of the current step as returned by reward_value.

        Returns:
            ``(reward, done)``: the reward is passed through and ``done`` is
            True when the target was hit; otherwise the reward is -1.0 and
            ``done`` is True only once ``max_steps`` steps have been taken.
        """
        if reward >= 1:
            return reward, True
        return -1.0, self.steps >= self.max_steps

# Ray testing

In [None]:
import gym, ray
from ray.rllib.algorithms import ppo
# Reuse the running Ray instance if an earlier cell already started one;
# calling ray.init() a second time raises an error.
if not ray.is_initialized():
    ray.init()
algo = ppo.PPO(env=MyEnv, config={
    "env_config": {},  # config to pass to env class
})

In [None]:
# Train for 25 iterations and record the mean episode reward of each one.
# The loop index gets a real name: ``_`` conventionally marks an unused
# value and, in IPython, also holds the last cell result.
mean_ppo = []
for iteration in range(25):
    result = algo.train()
    print("episode reward mean:", iteration, result['episode_reward_mean'])
    mean_ppo.append(result['episode_reward_mean'])

In [None]:
import matplotlib.pyplot as plt

# Plot the mean episode reward against the training-iteration index.
xs = list(range(len(mean_ppo)))

plt.plot(xs, mean_ppo)
plt.show()


# Testing Code

In [None]:
# method 1 - use local test class
# Testing local frame
env = MyEnv(env_config={})


In [None]:
test=env.reset()

In [None]:
test.shape

In [None]:
action = env.action_space.sample()
print('action',action)

In [None]:
#action=[100,100]
state, reward, done, info = env.step(action)
print("Reward = {} with action = {}".format(reward,action))
import matplotlib.pyplot as plt
print(reward, done, info)
#state = np.array(state)

print(state,type(state))
#plt.figure()
#plt.imshow(state)

In [None]:
# Testing multiple frames: drive the environment by hand with the mouse.
import sys
env = MyEnv(env_config={})
env.reset()
# Refresh rate of the main loop, in frames per second
FPS = 10
# FPS (frames per second) controller
fps_controller = pygame.time.Clock()
# pygame.init() reports how many modules initialised and how many failed
check_errors = pygame.init()
# Initialise game window
pygame.display.set_caption('Testing Game') 
# The main game loop
running = True
while running:
    # Turn every pending human input event into an environment step.
    for event in pygame.event.get():
        action = env.worker_step(event)
        if event.type == pygame.QUIT:
            running = False
        # worker_step returns None for events that are not an action.
        if action is not None:
            state, reward, done, info = env.step(action)
            print("Reward = {} with action = {} , done = {}".format(reward,action,done))
    # Refresh the screen once per frame, after the steps above have drawn,
    # so the window always shows the latest state.
    pygame.display.update()
    # Refresh rate
    fps_controller.tick(FPS)
    img = array3d(env.game_window)

# stable_baselines3

In [None]:
from stable_baselines3.common.env_checker import check_env
#env = MyEnv()
env = MyEnv(env_config={})

check_env(env)

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
import os
import time
import pygame
from pygame.surfarray import array3d
import matplotlib.pyplot as plt

In [None]:
# Use a single timestamp for both directories so the model and log folders
# of one run always share the same id.
run_id = int(time.time())
models_dir = f"models/{run_id}/"
logdir = f"logs/{run_id}/"

fps_controller = pygame.time.Clock()
fps_controller.tick(60)

# Initialise the pygame modules
pygame.init()

# Initialise game window
pygame.display.set_caption('Traning')

# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

In [None]:
# Wrap the existing ``env`` in a Monitor that is given ``logdir``, inside a
# single-environment DummyVecEnv.
envrl = DummyVecEnv([lambda: Monitor(env,logdir,allow_early_resets=True)])

In [None]:
# Train on the monitored, vectorised environment ``envrl`` rather than the
# raw ``env``, so the Monitor wrapper configured with logdir is actually used.
model = PPO("MlpPolicy", envrl, verbose=1, tensorboard_log=logdir, n_epochs=40)
#model = PPO("MultiInputPolicy", envrl,verbose=1, tensorboard_log=logdir,n_epochs=40)

In [None]:
TIMESTEPS = 100
iters = 0
obs = env.reset()
# NOTE: this loop trains until the cell is interrupted, saving a checkpoint
# after every learn() call.
while True:
    iters += 1
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="PPO")
    # os.path.join avoids the doubled "/" (models_dir already ends with one).
    model.save(os.path.join(models_dir, str(TIMESTEPS * iters)))

### Init Ray


In [None]:
import ray
#from ray.rllib.env.env_context import EnvContext
#from ray.rllib.algorithms import appo
#from ray.rllib.algorithms.appo import APPOConfig
# Start Ray only when no instance is running yet, then check that the
# start-up actually succeeded.
if not ray.is_initialized():
    ray.init()
    assert ray.is_initialized()
    

In [None]:
from ray.rllib.algorithms.appo import APPOConfig
# Build an APPO configuration for MyEnv with a rollout horizon of 10000 and
# an empty env_config.
# NOTE(review): ``config`` is reassigned in a later cell and no cell in view
# builds an algorithm from this object — confirm whether it is still needed.
config = (
    APPOConfig()
    .rollouts(horizon=10000)
    .environment(
        MyEnv,
        env_config={}
    )
)

# ray.rllib

In [None]:
import gym, ray
from ray.rllib.algorithms import ppo
#import ray.rllib.agents.ppo as ppo
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

In [None]:
#dir(ray)
ray.shutdown()
ray.init(ignore_reinit_error=True)

In [None]:
config = ppo.DEFAULT_CONFIG.copy()
config["log_level"] = "WARN"

In [None]:
algo = ppo.PPO(env=MyEnv, config=config) 

In [None]:
#print("Dashboard URL: http://{}".format(ray.get_webui_url()))

In [None]:
#config = PPOConfig()#.rollouts(horizon=200) 
# Plain-dict configuration: 30 rollout workers at half a CPU each, plus the
# settings handed to the environment constructor as ``env_config``.
# NOTE(review): this replaces the ``config`` that ``algo`` was built from in
# an earlier cell, and no later cell in view passes it to an algorithm.
config={
        "env": MyEnv,
        "num_workers": 30,
        "num_cpus_per_worker": 0.5,
        "env_config":{
            "max_steps": 1000,
            "export_frames": False,
            "export_states": False,
            # "reward_mode": "continuous",
            # "env_flipped": True,
            # "env_flipmode": True,
            }
        }

In [None]:
#algo = ppo.PPO(env=MyEnv, config={"env_config": {},  # config to pass to env class
#})

In [None]:
# Train for 25 iterations and record the mean episode reward of each one.
# The loop index gets a real name: ``_`` conventionally marks an unused
# value and, in IPython, also holds the last cell result.
mean_ppo = []
for iteration in range(25):
    result = algo.train()
    print("episode reward mean:", iteration, result['episode_reward_mean'])
    mean_ppo.append(result['episode_reward_mean'])

In [None]:
from ray import tune
# Launch a Ray Tune experiment that trains the "SAC" algorithm on MyEnv and
# stops once 5,000,000 environment timesteps have been collected.
tune.run(
    "SAC", # name of the RL algorithm to train
    name = "Training2",
    # Checkpoint every 100 training iterations and once more at the end.
    checkpoint_freq = 100,
    checkpoint_at_end = True,
    local_dir = r'./ray_results/',
    config={
        "env": MyEnv,
        # 30 rollout workers at half a CPU each.
        "num_workers": 30,
        "num_cpus_per_worker": 0.5,
        # Passed to the MyEnv constructor as ``env_config``.
        "env_config":{
            "max_steps": 1000,
            "export_frames": False,
            "export_states": False,
            # "reward_mode": "continuous",
            # "env_flipped": True,
            # "env_flipmode": True,
            }
        },
    stop = {
        "timesteps_total": 5_000_000,
        },
    )

In [None]:
N_ITER = 30
# Columns: iteration, reward min/mean/max, mean episode length, checkpoint.
s = "{:3d} reward {:6.2f}/{:6.2f}/{:6.2f} len {:6.2f} saved {}"

for n in range(N_ITER):
    result = algo.train()
    # Save a checkpoint after every iteration so that ``file_name`` is
    # defined for the report below (it was previously never assigned).
    file_name = algo.save()

    print(s.format(
        n + 1,
        result["episode_reward_min"],
        result["episode_reward_mean"],
        result["episode_reward_max"],
        result["episode_len_mean"],
        file_name,
    ))