# Soft Actor Critic - Reward based on CTE

Train a Soft Actor-Critic (SAC) agent with the rl_coach framework: https://nervanasystems.github.io/coach/components/agents/policy_optimization/sac.html

Several "filters" are applied to each observation before it is supplied to the Soft Actor-Critic policy:
- Convert the tensor from uint8 to float32
- Convert RGB images to grayscale
- Crop the image from (120, 160) to (80, 160)
- Apply a Sobel filter (https://en.wikipedia.org/wiki/Sobel_operator)
  - Binarize the image by setting each element to 0 or 1

## Reward function

The reward ranges from 0 to 5: 5 at the center of the track, decreasing to 0 at the maximum allowed CTE.

A penalty of -200 is applied on crash.

In [None]:
import os
import tempfile

import pandas as pd
import numpy as np

import mlflow
from mlflow.tracking.artifact_utils import _download_artifact_from_uri

from rl_coach.agents.soft_actor_critic_agent import SoftActorCriticAgentParameters
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import SimpleSchedule
from rl_coach.base_parameters import TaskParameters, VisualizationParameters
from rl_coach.core_types import EnvironmentSteps, TrainingSteps, SelectedPhaseOnlyDumpFilter, RunPhase
from rl_coach.schedules import LinearSchedule

from rl_coach import logger

from xebikart.gym.envs import rewards as gym_rewards

import matplotlib.pyplot as plt

import math

In [None]:
# parameters

# Schedule (consumed by the SimpleSchedule configured further down)
heatup_steps = 300
improve_steps = 1_200_000
steps_between_evaluation_periods = 300_000
evaluation_steps = 800
# Assigned to the agent's num_consecutive_training_steps
num_training_per_episode = 50

# Optional MLflow checkpoint to restore; keep both at None to start from scratch.
# Example values: run id "3e46980d55fd457f9afdc56be5636d36",
# checkpoint id "18_Step-200000.ckpt"
checkpoint_runid = None
checkpoint_id = None

# Environment
# max space between the car and the center of the road before ending an episode
max_cte_error = 6.0
throttle = 0.30

# Reward shaping
scale_cte_reward = 5
crash_reward = -200

In [None]:
# Download checkpoint from mlflow
def download_ckpt(run_id, ckpt_id):
    """
    Download the artifacts of an MLflow run whose path contains ``ckpt_id``.

    :param run_id: id of the MLflow run whose artifacts are listed
    :param ckpt_id: text an artifact path must contain for it to be downloaded
    :return: the string ``"<temporary directory>/<ckpt_id>"``
    :raises RuntimeError: when no listed artifact path contains ``ckpt_id``
    """
    client = mlflow.tracking.MlflowClient()
    matching = [
        artifact
        for artifact in client.list_artifacts(run_id)
        if ckpt_id in artifact.path
    ]
    if not matching:
        raise RuntimeError(f"No checkpoint found for run {run_id} and checkpoint {ckpt_id}")

    # All matching artifacts are downloaded into one fresh temporary directory.
    output = tempfile.mkdtemp()
    for artifact in matching:
        artifact_uri = f"runs:/{run_id}/{artifact.path}"
        print(artifact_uri)
        _download_artifact_from_uri(artifact_uri, output_path=output)
    return f"{output}/{ckpt_id}"

In [None]:
# A checkpoint is restored only when both the run id and the checkpoint id are set.
if checkpoint_runid is None or checkpoint_id is None:
    checkpoint_path = None
else:
    checkpoint_path = download_ckpt(checkpoint_runid, checkpoint_id)

## Reward distribution based on cte

In [None]:
def reward_cte(cte, scale, max_error=None):
    """
    Quadratic reward based on the magnitude of ``cte``.

    The result is ``scale`` when ``cte`` is 0, reaches 0 when ``|cte|`` equals
    ``max_error`` and becomes negative beyond that.

    :param cte: cross-track error; only its absolute value is used
    :param scale: reward returned when ``cte`` is 0
    :param max_error: ``|cte|`` at which the reward reaches 0; when ``None``
        (the default) the module-level ``max_cte_error`` is used
    :return: ``scale * (1 - (|cte| / max_error) ** 2)``
    """
    if max_error is None:
        # Looked up at call time so the notebook-level parameter stays the default.
        max_error = max_cte_error
    ratio = math.fabs(cte) / max_error
    # ``**`` binds tighter than unary minus, so this is scale * -(ratio squared).
    return (scale * -(ratio ** 2)) + scale

In [None]:
# Sample the reward over the whole allowed cte range
ran = np.linspace(-max_cte_error, max_cte_error, 100)
rew = [reward_cte(cte_value, scale_cte_reward) for cte_value in ran]

# Vertical extent shared by the marker lines drawn below
y_span = (-1, scale_cte_reward)

plt.plot(ran, rew)
# Dashed line at cte == 0
plt.plot((0, 0), y_span, '--')
# Green lines at -max_cte_error and +max_cte_error
for x_limit in (-max_cte_error, max_cte_error):
    plt.plot((x_limit, x_limit), y_span, '-', c="green")
plt.title("Reward distribution based on cte")
plt.show()

In [None]:
# custom reward
def build_reward_fn(cte_scale_reward_weight, crash_reward_weight):
    """
    Build a reward callback with the signature ``(reward, done, info)``.

    :param cte_scale_reward_weight: scale passed to ``reward_cte`` while the episode is running
    :param crash_reward_weight: value returned as soon as ``done`` is true
    :return: the reward callback
    """
    def _custom_reward_fn(reward, done, info):
        """
        Custom reward function

        :param reward: not used by this function
        :param done: when true, ``crash_reward_weight`` is returned
        :param info: mapping; only ``info["cte"]`` is read here, and only when ``done`` is false.
            Keys listed by the original author:
            "x", "y", "z", "speed", "cte", "hit", "throttle", "steering"
        :return: ``crash_reward_weight`` when ``done``, otherwise the cte-based reward
        """
        # A finished episode is penalized, so the agent is discouraged from
        # getting off the road fast; otherwise reward staying close to cte == 0.
        return (
            crash_reward_weight
            if done
            else reward_cte(info["cte"], cte_scale_reward_weight)
        )
    return _custom_reward_fn

In [None]:
# define the environment parameters
# The environment is referenced by the 'module:callable' string passed as `level`.
env_params = GymVectorEnvironment(level='xebikart.gym.envs:create_fix_throttle_env')
# Extra settings for the simulator: the throttle value, the maximum cte and
# the custom reward callback built from the notebook's reward parameters.
env_params.additional_simulator_parameters = {
  'throttle': throttle, 'max_cte_error': max_cte_error, 
  'reward_fn': build_reward_fn(cte_scale_reward_weight=scale_cte_reward, crash_reward_weight=crash_reward)
}

# Soft Actor Critic
agent_params = SoftActorCriticAgentParameters()
agent_params.algorithm.num_consecutive_training_steps = num_training_per_episode
agent_params.algorithm.act_for_full_episodes = True
# True only when a checkpoint path is available (i.e. a checkpoint was downloaded)
agent_params.algorithm.heatup_using_network_decisions = checkpoint_path is not None
# exploration schedules
# NOTE(review): presumably a linear decay from 0.3 to 0 over improve_steps — confirm against rl_coach
agent_params.exploration.noise_schedule = LinearSchedule(0.3, 0., improve_steps)
agent_params.exploration.evaluation_noise = 0.


# visualization parameters
vis_params = VisualizationParameters()
vis_params.print_networks_summary = True
vis_params.dump_parameters_documentation = True
vis_params.dump_mp4 = True
# Default rules, dump at evaluation phase when a new total reward has been achieved 
# Uncomment to dump all video during evaluation phase
# NOTE(review): the filter below selects RunPhase.TRAIN, not an evaluation phase — confirm which is intended
#vis_params.video_dump_filters = [SelectedPhaseOnlyDumpFilter(RunPhase.TRAIN)]

# schedule
# Heatup, evaluation period and evaluation length are expressed as EnvironmentSteps;
# the improve phase is expressed as TrainingSteps.
schedule_params = SimpleSchedule()
schedule_params.heatup_steps = EnvironmentSteps(heatup_steps)
schedule_params.improve_steps = TrainingSteps(improve_steps)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(steps_between_evaluation_periods)
schedule_params.evaluation_steps = EnvironmentSteps(evaluation_steps)

# Assemble agent, environment, visualization and schedule into one graph manager
graph_manager = BasicRLGraphManager(
    agent_params=agent_params,
    env_params=env_params,
    vis_params=vis_params,
    schedule_params=schedule_params
)

In [None]:
# Create a temporary directory for this experiment; it is used below as the
# experiment path and checkpoint directory, and its content is logged to MLflow.
experiment_tempdir = tempfile.mkdtemp()

In [None]:
# create graph
task_params = TaskParameters()
# CPU-only run
task_params.num_gpu = 0
task_params.use_cpu = True
# Experiment outputs and checkpoints both go to the temporary directory
task_params.experiment_path = experiment_tempdir
task_params.checkpoint_save_dir = experiment_tempdir
# NOTE(review): 6 * 60 * 30 = 10800, i.e. 3 hours if the unit is seconds as the
# attribute name suggests; the previous comment said "30 min" (which would be
# 30 * 60 = 1800) — confirm which interval is intended.
task_params.checkpoint_save_secs = 6 * 60 * 30
# Use to start experiment from a checkpoint
# (checkpoint_path is None when no checkpoint was requested)
task_params.checkpoint_restore_path = checkpoint_path

graph_manager.create_graph(task_params)

# Point the rl_coach logger at the experiment path reported by the graph logger
logger.experiment_path = graph_manager.graph_logger.experiments_path

In [None]:
mlflow.set_experiment("rl_reward_cte")

with mlflow.start_run():
    # Log every notebook parameter. The environment / reward-shaping values
    # (max_cte_error, scale_cte_reward, crash_reward) are included so that a
    # run can be reproduced and compared from MLflow alone.
    run_params = {
        "improve_steps": improve_steps,
        "heatup_steps": heatup_steps,
        "evaluation_steps": evaluation_steps,
        "steps_between_evaluation_periods": steps_between_evaluation_periods,
        "num_training_per_episode": num_training_per_episode,
        "throttle": throttle,
        "checkpoint_runid": checkpoint_runid,
        "checkpoint_id": checkpoint_id,
        "max_cte_error": max_cte_error,
        "scale_cte_reward": scale_cte_reward,
        "crash_reward": crash_reward,
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # TODO: think about saving the graph after improve
    # TODO: maybe not needed
    graph_manager.save_graph()
    graph_manager.improve()
    graph_manager.save_checkpoint()

    # Upload everything written to the experiment directory
    mlflow.log_artifacts(experiment_tempdir)

    # logs metrics (summary statistics over the agent's CSV log)
    agent_metrics = pd.read_csv(f"{experiment_tempdir}/worker_0.simple_rl_graph.main_level.main_level.agent_0.csv")
    episode_length = agent_metrics["Episode Length"]
    training_reward = agent_metrics["Training Reward"]
    mlflow.log_metric("episode_length_max", episode_length.max())
    mlflow.log_metric("episode_length_mean", episode_length.mean())
    mlflow.log_metric("episode_reward_max", training_reward.max())
    mlflow.log_metric("episode_reward_mean", training_reward.mean())

In [None]:
# Close the graph manager once training and MLflow logging are finished
graph_manager.close()