# Soft Actor Critic

Train a Soft Actor-Critic (SAC) agent with the rl_coach framework: https://nervanasystems.github.io/coach/components/agents/policy_optimization/sac.html

Some "filters" are applied to the observations before they are supplied to the soft actor critic policies:
- Convert tensor uint8 type into float32
- Convert rgb images to grayscale
- Reshape image by cropping from (120, 160) -> (80, 160)
- Apply sobel filter (https://en.wikipedia.org/wiki/Sobel_operator)
  - Binarize images by setting elements to 0 or 1
- Embed the image with a variational autoencoder (VAE)


In [None]:
import os

import mlflow
import mlflow.keras
import tempfile

import pandas as pd

from rl_coach.agents.soft_actor_critic_agent import SoftActorCriticAgentParameters
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import SimpleSchedule, SimpleScheduleWithoutEvaluation
from rl_coach.base_parameters import TaskParameters, VisualizationParameters
from rl_coach.core_types import EnvironmentSteps, TrainingSteps, SelectedPhaseOnlyDumpFilter, RunPhase
from rl_coach import logger

from rl_coach.schedules import Schedule, LinearSchedule

In [None]:
# Run parameters. All of them are logged to MLflow together with the run.

# MLflow run id of the pre-trained VAE (loaded through a "runs:/<id>/model" URI).
vae_runid = "1882ffed18594d8abba5239f106f7efe"

# Schedule lengths: heatup, evaluation and the gap between evaluation periods
# are given to the schedule as environment steps, improve as training steps.
heatup_steps = 300
improve_steps = 200_000
evaluation_steps = 800
steps_between_evaluation_periods = 50_000

# Used as the agent's number of consecutive training steps.
num_training_per_episode = 50

In [None]:
# Environment: the pre-trained VAE is fetched from its MLflow run (without
# compiling the Keras model) and handed to the gym environment factory.
vae = mlflow.keras.load_model(f"runs:/{vae_runid}/model", compile=False)

env_params = GymVectorEnvironment(level='xebikart.gym.envs:create_env')
env_params.additional_simulator_parameters = {
    'level': 4,
    'frame_skip': 2,
    'max_cte_error': 3.0,
    'min_steering': -1,
    'max_steering': 1,
    'min_throttle': 0.2,
    'max_throttle': 0.4,
    'vae': vae,
    'n_history': 10,
    'max_steering_diff': 0.15,
    'jerk_penalty_weight': 0.0,
}

# Agent: Soft Actor Critic, acting for full episodes and then running
# `num_training_per_episode` consecutive training steps.
agent_params = SoftActorCriticAgentParameters()
agent_params.algorithm.act_for_full_episodes = True
agent_params.algorithm.num_consecutive_training_steps = num_training_per_episode

# Schedule: improve is counted in training steps, everything else in
# environment steps.
schedule_params = SimpleSchedule()
schedule_params.heatup_steps = EnvironmentSteps(heatup_steps)
schedule_params.improve_steps = TrainingSteps(improve_steps)
schedule_params.evaluation_steps = EnvironmentSteps(evaluation_steps)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(
    steps_between_evaluation_periods
)

# Visualization parameters: print the network summaries, dump the parameters
# documentation and record mp4 videos.
vis_params = VisualizationParameters()
vis_params.dump_mp4 = True
vis_params.dump_parameters_documentation = True
vis_params.print_networks_summary = True
# The default dump rules are kept (dump at evaluation phase when a new total
# reward has been achieved). Uncomment to dump during the TEST phase only:
#vis_params.video_dump_filters = [SelectedPhaseOnlyDumpFilter(RunPhase.TEST)]

# Tie agent, environment, schedule and visualization together.
graph_manager = BasicRLGraphManager(
    agent_params=agent_params,
    env_params=env_params,
    schedule_params=schedule_params,
    vis_params=vis_params,
)

In [None]:
# Create a temporary directory that receives the rl_coach experiment output
# (it is used below both as experiment path and as checkpoint directory).
# NOTE: tempfile.mkdtemp() never deletes the directory on its own; remove it
# manually once the artifacts have been logged if disk space matters.
experiment_tempdir = tempfile.mkdtemp()

In [None]:
# Task parameters: run on CPU only and write both the experiment output and
# the checkpoints into the temporary experiment directory.
task_params = TaskParameters()
task_params.use_cpu = True
task_params.num_gpu = 0
task_params.experiment_path = experiment_tempdir
task_params.checkpoint_save_dir = experiment_tempdir
# Checkpoint interval in seconds: currently 1 minute.
# NOTE(review): an earlier comment said "30 min"; confirm which interval is
# intended (30 minutes would be 30 * 60).
task_params.checkpoint_save_secs = 1 * 60
# To resume an experiment from a checkpoint, set:
#task_params.checkpoint_restore_path = ""

# Build the graph from the task parameters.
graph_manager.create_graph(task_params)

# Point rl_coach's logger at the experiments path used by the graph logger.
logger.experiment_path = graph_manager.graph_logger.experiments_path

In [None]:
# Record the training run in MLflow: hyper-parameters first, then the rl_coach
# output directory as artifacts, then summary metrics from the agent's CSV log.
mlflow.set_experiment("soft_actor_critic")

with mlflow.start_run():
    # Every parameter is logged exactly once under its own name. Logging
    # num_training_per_episode a second time under the "vae_runid" key would
    # either overwrite the VAE run id or be rejected by MLflow, which does not
    # allow changing the value of an already-logged param.
    run_params = {
        "improve_steps": improve_steps,
        "heatup_steps": heatup_steps,
        "evaluation_steps": evaluation_steps,
        "steps_between_evaluation_periods": steps_between_evaluation_periods,
        "num_training_per_episode": num_training_per_episode,
        "vae_runid": vae_runid,
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # Save the graph, train, then upload everything rl_coach wrote to the
    # experiment directory.
    graph_manager.save_graph()
    graph_manager.improve()
    mlflow.log_artifacts(experiment_tempdir)

    # TODO: compute some metrics (avg reward, max reward, ...)
    # Summary metrics come from the per-agent CSV in the experiment directory.
    agent_metrics_path = os.path.join(
        experiment_tempdir,
        "worker_0.simple_rl_graph.main_level.main_level.agent_0.csv",
    )
    agent_metrics = pd.read_csv(agent_metrics_path)
    episode_length = agent_metrics["Episode Length"]
    training_reward = agent_metrics["Training Reward"]
    mlflow.log_metric("episode_length_max", episode_length.max())
    mlflow.log_metric("episode_length_mean", episode_length.mean())
    mlflow.log_metric("episode_reward_max", training_reward.max())
    mlflow.log_metric("episode_reward_mean", training_reward.mean())

In [None]:
# Close the graph manager once the run is finished.
# NOTE(review): presumably this releases the environment and other resources
# held by rl_coach; confirm against the GraphManager.close documentation.
graph_manager.close()