# Ray library for RL

`!pip install box2d-py`<br>
`!pip install ray[rllib]`<br>
As of June 2022, there was a problem with the latest version of the OpenAI Gym library.

- [Algorithms](https://docs.ray.io/en/latest/rllib/rllib-algorithms.html)
- [Helpful video series](https://www.youtube.com/watch?v=krz8SCds7yA)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import ray

# Shut down any Ray runtime this process is still connected to, then start a
# fresh one. ignore_reinit_error=True asks ray.init not to raise if Ray turns
# out to be initialized already, so the cell can be re-run safely.
ray.shutdown()
ray.init(ignore_reinit_error=True)

## Choosing an environment and setting up file locations for checkpoints

In [None]:
import shutil

# Gym environment to train on; keep exactly one assignment uncommented.
#ENV_NAME = "CartPole-v1"
#ENV_NAME = "MountainCar-v0"
#ENV_NAME = "LunarLander-v2"
ENV_NAME = "BipedalWalker-v3"

# Output folders: checkpoints go into a per-environment folder under tmp/.
CHECKPOINT_ROOT = f"tmp/{ENV_NAME}"
RAY_RESULTS = "tmp/ray_results/"

# Remove both folders so that every run starts clean. With ignore_errors=True
# a folder that does not exist yet is silently skipped.
shutil.rmtree(CHECKPOINT_ROOT, ignore_errors=True)
shutil.rmtree(RAY_RESULTS, ignore_errors=True)

## Choose a method

- https://docs.ray.io/en/latest/rllib/rllib-algorithms.html - List of methods
- https://docs.ray.io/en/latest/rllib/rllib-training.html  - Common parameters

### 1. PPO: Proximal Policy Optimization   
<b>Actions</b> discrete:`Yes`, continuous:`Yes`<br>
Doesn't work for MountainCar-v0

In [None]:
import ray.rllib.agents.ppo as trainer

# Start from RLlib's PPO defaults. dict.copy() is shallow: nested dicts such
# as config['model'] are still the very same objects as in DEFAULT_CONFIG.
config = trainer.DEFAULT_CONFIG.copy()

config['num_workers'] = 1           # number of parallel workers
config['num_sgd_iter'] = 50
config['sgd_minibatch_size'] = 250
# Replace the nested 'model' dict with a modified copy instead of mutating it
# in place, so the module-level PPO defaults stay untouched for later cells.
config['model'] = dict(config['model'], fcnet_hiddens=[512, 512])

agent = trainer.PPOTrainer(config, env=ENV_NAME)

### 2. DQN: Deep Q-Network
<b>Actions</b> discrete:`Yes`, continuous:`No`<br>
Possible for MountainCar-v0

In [None]:
import ray.rllib.agents.dqn as trainer

# Start from RLlib's DQN defaults and override two top-level settings at once.
config = trainer.DEFAULT_CONFIG.copy()
config.update({
    'metrics_num_episodes_for_smoothing': 100,   # default: 100 (used to compute mean, min, max)
    'framework': 'tf',                           # default: 'tf'; 'torch' is also possible
})
print(config['model'])

agent = trainer.DQNTrainer(config, env=ENV_NAME)

### 3. DDPG  (TD3):  Deep Deterministic Policy Gradients
<b>Actions</b> discrete:`No`, continuous:`Yes`<br>


In [None]:
import ray.rllib.agents.ddpg as trainer

# Take a (shallow) copy of the DDPG defaults and show the 'model' sub-config.
config = trainer.DEFAULT_CONFIG.copy()
print(config['model'])

# Build the DDPG trainer for the environment selected in ENV_NAME.
agent = trainer.DDPGTrainer(config, env=ENV_NAME)

### 4. SAC
<b>Actions</b> discrete:`Yes`, continuous:`Yes`<br>

In [None]:
import ray.rllib.agents.sac as trainer

# Use the SAC defaults unchanged (a shallow copy of DEFAULT_CONFIG).
config = trainer.DEFAULT_CONFIG.copy()

# Build the SAC trainer for the environment selected in ENV_NAME.
agent = trainer.SACTrainer(config, env=ENV_NAME)

### 5. A3C
<b>Actions</b> discrete:`Yes`, continuous:`Yes`<br>

In [None]:
import ray.rllib.agents.a3c as trainer

# Start from the A3C defaults and select the PyTorch framework.
config = trainer.DEFAULT_CONFIG.copy()
config["framework"] = 'torch'

# Print a few top-level settings first ...
for name in ('gamma', 'train_batch_size', 'batch_mode', 'lr'):
    print(f"{name:20s}: {config[name]}")
# ... then the network entries nested under config['model'].
for name in ('fcnet_hiddens', 'fcnet_activation'):
    print(f"{name:20s}: {config['model'][name]}")

agent = trainer.A3CTrainer(config, env=ENV_NAME)
#warnings.filterwarnings('ignore')

### 6. PG: vanilla Policy Gradients
<b>Actions</b> discrete:`Yes`, continuous:`Yes`<br>
Ray: we include a vanilla policy gradients implementation as an example algorithm.

In [None]:
import ray.rllib.agents.pg as trainer

# Start from the PG defaults and select the PyTorch framework.
config = trainer.DEFAULT_CONFIG.copy()
config["framework"] = 'torch'

# Print a few top-level settings first ...
for name in ('gamma', 'train_batch_size', 'batch_mode', 'lr'):
    print(f"{name:20s}: {config[name]}")
# ... then the network entries nested under config['model'].
for name in ('fcnet_hiddens', 'fcnet_activation'):
    print(f"{name:20s}: {config['model'][name]}")

agent = trainer.PGTrainer(config, env=ENV_NAME)


## Start training

In [None]:
# Train `agent` until more than MAX_EPISODES episodes were played (or MAX_ITER
# iterations ran), checkpoint every improvement of the mean episode reward,
# and finally plot mean reward against the number of episodes played.
MAX_ITER     = 100000                 # hard cap on training iterations
MAX_EPISODES = 1000                   # stop once more episodes than this were played
file_name, episode, history = "", 0, []

# -inf instead of a magic number: any finite first mean reward is an improvement.
best_mean = float("-inf")
for it in range(1, MAX_ITER + 1):
    res = agent.train()               # one training iteration; returns a dict of metrics

    episode += res['episodes_this_iter']

    mean = res['episode_reward_mean']
    history.append([episode, mean])
    if mean > best_mean:
        # Save only when the mean reward improves; remember the returned path.
        file_name = agent.save(CHECKPOINT_ROOT)
        best_mean = mean

    if it % 10 == 0:
        # "\r" together with end="" rewrites the same output line in place.
        print(f"\r{it:3d} episode:{episode:5d}  reward: {mean:6.2f}  ({res['episode_reward_min']:6.2f}, {res['episode_reward_max']:6.2f}), best: {best_mean:.2f}  {file_name:30s}", end="")

    if episode > MAX_EPISODES:
        print("\nfinish")
        break

history = np.array(history)
# The title is labelled "best", so show the best mean reward seen during
# training rather than the mean of whichever iteration happened to run last.
plt.title(f"{ENV_NAME} best: {best_mean:.2f}", fontsize=18)
plt.plot(history[:,0], history[:,1])
plt.show()

## Model used
### For `config["framework"]  = 'tf'`

In [None]:
# Show the layer table of the network behind the trained policy. This cell
# expects the TensorFlow framework, where base_model is presumably a Keras model.
policy = agent.get_policy()
model = policy.model
# Keras' summary() prints the table itself and returns None, so it must not
# be wrapped in print() - that would append a stray "None" line.
model.base_model.summary()

### For `config["framework"]  = 'torch' `

In [None]:
# Fetch the trained policy and print its model object (per the section
# heading, this cell is meant for config["framework"] = 'torch').
policy = agent.get_policy()
model = policy.model
print(model)

## Testing

In [None]:
import numpy as np
import gym 
env = gym.make(ENV_NAME)                         # create the environment selected in ENV_NAME
 
def run(episodes = 1000, ticks = 10000, render = True):
    """Play the global `agent` in the global `env` and print reward statistics.

    Args:
        episodes: number of episodes (games) to play.
        ticks: maximum number of time steps per episode.
        render: call env.render() after every step; set to False to skip rendering.

    Prints the mean episode reward with its standard error, the standard
    deviation, and the minimum and maximum episode reward. Returns None.
    """
    rews = []                                    # total reward of every episode
    for _episode in range(episodes):
        tot = 0                                  # reward accumulated in this episode
        obs = env.reset()                        # initial observation
        for _tick in range(ticks):
            action = agent.compute_single_action(obs)   # let the agent choose an action

            obs, rew, done, _info = env.step(action)    # advance the environment one step
            tot += rew
            if render:
                env.render()
            if done:
                break

        rews.append(tot)

    if not rews:
        # Nothing was played: np.min / np.max would raise on an empty list.
        print("Reward: no episodes were played")
        return

    print(f"Reward: {np.mean(rews):.2f} ± {np.std(rews)/len(rews)**0.5:.2f}, std: {np.std(rews):.2f}, min: {np.min(rews)}, max: {np.max(rews)}")
    
run(episodes = 10)                               # evaluate the agent over 10 episodes

## Loading the Model

In [None]:
# Re-create a trainer of the algorithm that produced the checkpoint and
# restore its state from disk. ENV_NAME has to be the environment the
# checkpoint was trained on (MountainCar-v0 for the example path below).
import ray.rllib.agents.dqn as dqn

agent = dqn.DQNTrainer(config, env=ENV_NAME)
# For a PPO checkpoint use instead:
#   import ray.rllib.agents.ppo as ppo
#   agent = ppo.PPOTrainer(config, env=ENV_NAME)

# Forward slashes work on every platform, whereas a backslash inside a normal
# string literal starts an escape sequence.
agent.restore("tmp/MountainCar-v0/checkpoint_000120/checkpoint-120")


## Config

In [None]:
# Dump the active configuration: values that are dicts are expanded one level
# (one indented line per entry); every other value is printed on a single line.
for k, v in config.items():
    if isinstance(v, dict):           # isinstance also accepts dict subclasses
        print(f"{k}: ")
        for kk, vv in v.items():
            print(f"      {kk:50s}:", vv)
    else:
        print(f"{k:30s}:", v)