# Colab notebook for training a Ray RLlib agent (default: PPO) and saving checkpoints to Google Drive

### Install the gym platform environment and required libs

In [1]:
!pip install -e git+https://github.com/cycraig/gym-platform#egg=gym_platform
!pip install ray[rllib]
!pip install lz4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Obtaining gym_platform from git+https://github.com/cycraig/gym-platform#egg=gym_platform
  Updating ./src/gym-platform clone
  Running command git fetch -q --tags
  Running command git reset --hard -q e9329879dbb62badbbef89648162c97ba1e4d837
Installing collected packages: gym-platform
  Attempting uninstall: gym-platform
    Found existing installation: gym-platform 0.0.1
    Can't uninstall 'gym-platform'. No files were found to uninstall.
  Running setup.py develop for gym-platform
Successfully installed gym-platform-0.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Mount Google Drive to save checkpoints there

In [2]:
# Colab-only: mount the user's Google Drive at /content/drive. The training
# loop writes periodic checkpoints to the MyDrive/RL/ folder on this drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Install Weights&Biases
(Not used)

In [3]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Import required libraries for environment and agents

In [4]:
import gym
import gym_platform
from gym_platform.envs.platform_env import PlatformEnv
import ray
from ray.tune.registry import register_env
import ray.rllib.agents.ppo as ppo
import ray.rllib.agents.impala as impala
import ray.rllib.agents.ddpg as ddpg
import shutil
import time

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


### Define the training function

In [7]:
import time 

# Algorithm that train_agent() trains: "ppo", "impala" or "td3".
# The active assignment selects it; the commented-out lines are the alternatives.
algorithm = "ppo"
#algorithm = "impala"
#algorithm = "td3"

def train_agent(env_cur, n_episodes):
    '''
    Train an RLlib agent on the Platform environment and checkpoint it.

    The algorithm is selected by the module-level ``algorithm`` variable
    ("ppo", "impala" or "td3"). A checkpoint is written to the local
    ``train_dir`` directory after every iteration; every 10th iteration and
    the final one are additionally saved to Google Drive.

        Parameters:
            env_cur: name under which the environment is registered with Ray
            n_episodes: number of ``agent.train()`` calls, i.e. training
                iterations (each iteration reports statistics over the
                episodes it sampled)
        Returns:
            agent: the trained RLlib trainer
            sav_file: last checkpoint written to ``train_dir``, or None when
                ``n_episodes`` is 0
        Raises:
            ValueError: if ``algorithm`` is not a supported algorithm name
    '''

    # Fail early with a clear message instead of a NameError on `config`
    # after Ray has already been started.
    supported = ("ppo", "impala", "td3")
    if algorithm not in supported:
        raise ValueError(
            f"Unsupported algorithm {algorithm!r}; expected one of {supported}")

    save_dir = "train_dir"
    # Absolute path matching drive.mount('/content/drive'), so the checkpoint
    # lands on Drive regardless of the current working directory.
    drive_dir = "/content/drive/MyDrive/RL/"
    shutil.rmtree(save_dir, ignore_errors=True)

    # ignore_reinit_error lets this function be re-run in the same kernel
    # even if a previous Ray instance is still alive.
    ray.init(ignore_reinit_error=True)
    register_env(env_cur, lambda env_config: PlatformEnv())

    if algorithm == "ppo":
        config = ppo.DEFAULT_CONFIG.copy()
        trainer_cls = ppo.PPOTrainer
    elif algorithm == "impala":
        config = impala.DEFAULT_CONFIG.copy()
        trainer_cls = impala.ImpalaTrainer
    else:  # "td3"
        config = ddpg.td3.TD3_DEFAULT_CONFIG.copy()
        trainer_cls = ddpg.TD3Trainer

    config["framework"] = "torch"
    config["log_level"] = "WARN"

    if algorithm == "ppo":
        # NOTE(review): with lr_schedule set, RLlib presumably follows the
        # schedule and ignores this base lr -- confirm.
        config["lr"] = 5e-6
        # [[timestep, lr], ...]: anneal from 5e-5 at step 0 to 1e-6 at 200k steps
        config["lr_schedule"] = [[0, 5e-5], [200000, 1e-6]]
        # Custom fully-connected network
        config["model"] = {"fcnet_hiddens": [512, 512, 512],
                           "fcnet_activation": "relu"}

    #config["sgd_minibatch_size"] = 128 #128
    #config["train_batch_size"] = 4000 #4000
    #config["num_gpus"] = 1

    print(config)

    agent = trainer_cls(config, env=env_cur)

    # train agent
    sav_file = None  # stays None when n_episodes == 0
    for i in range(n_episodes):
        result = agent.train()
        sav_file = agent.save(save_dir)

        # Not every algorithm reports cur_lr; print None rather than crash.
        learner_stats = (result.get("info", {})
                               .get("learner", {})
                               .get("default_policy", {})
                               .get("learner_stats", {}))
        cur_lr = learner_stats.get("cur_lr")

        print(f'{i}: reward: mean={result["episode_reward_mean"]}, '
              f'max={result["episode_reward_max"]}, min={result["episode_reward_min"]}. '
              f'length={result["episode_len_mean"]}. file={sav_file}, '
              f"lr={cur_lr}")

        # Back up to Drive periodically, and always after the last iteration
        # so the final model is not lost when the Colab VM is recycled.
        if i % 10 == 0 or i == n_episodes - 1:
            agent.save(drive_dir)

    return agent, sav_file


def view_agent(env_cur, agent, sav_file, n_step=10):
    '''
    Restore a checkpoint and render the agent acting in the environment.

    Prints every action taken and, whenever an episode ends, the total
    reward collected in that episode.

        Parameters:
            env_cur: id of the gym environment to create with ``gym.make``
            agent: RL agent; its state is overwritten from ``sav_file``
            sav_file: checkpoint file saved from training
            n_step: number of environment steps to run (default 10)
        Returns:
            None
    '''

    # run the policy
    agent.restore(sav_file)
    env = gym.make(env_cur)

    try:
        obs = env.reset()
        ttl_reward = 0

        for _ in range(n_step):
            # compute_single_action replaces the deprecated compute_action alias
            action = agent.compute_single_action(obs)

            print("Action", action)

            obs, reward, done, _info = env.step(action)
            ttl_reward += reward

            env.render()
            time.sleep(0.001)
            if done:
                # reward at the end of episode
                print("reward", ttl_reward)
                obs = env.reset()
                ttl_reward = 0
    finally:
        # Release the environment (and its render window) even if a step fails.
        env.close()

In [None]:
# Shut down any Ray instance left over from an earlier run of this cell so
# that train_agent() can start a fresh one.
ray.shutdown()


def main():
    """Train an agent on the Platform-v0 environment."""
    env_name = "Platform-v0"
    n_iterations = 100  # 1000 for a full-length run
    trained_agent, checkpoint_path = train_agent(env_name, n_iterations)
    # Uncomment to watch the trained agent:
    #view_agent(env_name, trained_agent, checkpoint_path)


if __name__ == "__main__":
    main()

2022-10-23 17:51:02,175	INFO worker.py:1421 -- Started a local Ray instance.


{'extra_python_environs_for_driver': {}, 'extra_python_environs_for_worker': {}, 'num_gpus': 0, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, '_fake_gpus': False, 'custom_resources_per_worker': {}, 'placement_strategy': 'PACK', 'eager_tracing': False, 'eager_max_retraces': 20, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'env': None, 'env_config': {}, 'observation_space': None, 'action_space': None, 'env_task_fn': None, 'render_env': False, 'clip_rewards': None, 'normalize_actions': True, 'clip_actions': False, 'disable_env_checking': False, 'num_workers': 2, 'num_envs_per_worker': 1, 'sample_collector': <class 'ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector'>, 'sample_async': False, 'en

[2m[36m(pid=1257)[0m   import imp
[2m[36m(pid=1256)[0m   import imp
[2m[36m(pid=1257)[0m Instructions for updating:
[2m[36m(pid=1257)[0m experimental_relax_shapes is deprecated, use reduce_retracing instead
[2m[36m(pid=1257)[0m Instructions for updating:
[2m[36m(pid=1257)[0m experimental_relax_shapes is deprecated, use reduce_retracing instead
[2m[36m(pid=1256)[0m Instructions for updating:
[2m[36m(pid=1256)[0m experimental_relax_shapes is deprecated, use reduce_retracing instead
[2m[36m(pid=1256)[0m Instructions for updating:
[2m[36m(pid=1256)[0m experimental_relax_shapes is deprecated, use reduce_retracing instead
[2m[36m(RolloutWorker pid=1257)[0m   logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
[2m[36m(RolloutWorker pid=1256)[0m   logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Tuple(Box(0.0, 1.0, (9,), float32), Discrete(200))
Model summary FullyConnectedNetwork(
  (_logits): SlimFC(
    (_model): Sequential(
      (0): Linear(in_features=512, out_features=9, bias=True)
    )
  )
  (_hidden_layers): Sequential(
    (0): SlimFC(
      (_model): Sequential(
        (0): Linear(in_features=209, out_features=512, bias=True)
        (1): ReLU()
      )
    )
    (1): SlimFC(
      (_model): Sequential(
        (0): Linear(in_features=512, out_features=512, bias=True)
        (1): ReLU()
      )
    )
    (2): SlimFC(
      (_model): Sequential(
        (0): Linear(in_features=512, out_features=512, bias=True)
        (1): ReLU()
      )
    )
  )
  (_value_branch_separate): Sequential(
    (0): SlimFC(
      (_model): Sequential(
        (0): Linear(in_features=209, out_features=512, bias=True)
        (1): ReLU()
      )
    )
    (1): SlimFC(
      (_model): Sequential(
        (0): Linear(in_features=512, out_features=512, bias=True)
        (1): ReLU()
      