# Soft Actor Critic

Train a Soft Actor-Critic (SAC) agent using the rl_coach framework: https://nervanasystems.github.io/coach/components/agents/policy_optimization/sac.html

Several "filters" are applied to each observation before it is supplied to the Soft Actor-Critic policy:
- Convert the tensor from uint8 to float32
- Convert RGB images to grayscale
- Crop the image from (120, 160) to (80, 160)
- Apply a Sobel filter (https://en.wikipedia.org/wiki/Sobel_operator)
  - Binarize the image by setting each element to 0 or 1
- Embed the image with a variational autoencoder (VAE)
- Concatenate the last X actions

In [None]:
import os
import re
import tempfile

import pandas as pd
import matplotlib.pyplot as plt

import mlflow
import mlflow.keras

from rl_coach.agents.soft_actor_critic_agent import SoftActorCriticAgentParameters
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import SimpleSchedule
from rl_coach.base_parameters import TaskParameters, VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import EnvironmentSteps, TrainingSteps, SelectedPhaseOnlyDumpFilter, RunPhase
from rl_coach.schedules import LinearSchedule

from rl_coach import logger

%matplotlib inline

In [None]:
# Notebook parameters: every tunable value of the experiment lives in this cell.

# --- Inputs -----------------------------------------------------------------
# MLflow run id from which the pre-trained VAE model is loaded.
vae_runid = "ad896cebfe4544e69e946d4f1bdc24aa"
# Checkpoint to restore before training; None starts from scratch.
# e.g. "/workspace/mlruns/4/7f18a82d78a84cd58f7947bb4e3986ff/artifacts/17_Step-198210.ckpt"
checkpoint_path = None

# --- Environment ------------------------------------------------------------
# Forwarded to the simulator as 'n_history'
# (presumably the number of past actions appended to the observation -- confirm).
n_history = 30

# --- Schedule ---------------------------------------------------------------
# Heat-up length, in environment steps.
heatup_steps = 300
# Improve phase length, in training steps; also the length of the exploration
# noise decay.
improve_steps = 20_000
# Environment steps between two evaluation periods.
steps_between_evaluation_periods = 10_000
# Environment steps spent in each evaluation period.
evaluation_steps = 800

# --- Agent ------------------------------------------------------------------
# Assigned to the agent's `num_consecutive_training_steps`.
num_training_per_episode = 1_000

In [None]:
# ---- Environment -----------------------------------------------------------
# Fetch the VAE logged under `vae_runid` in MLflow (loaded uncompiled); it is
# handed to the simulator through the environment parameters below.
vae = mlflow.keras.load_model(f"runs:/{vae_runid}/model", compile=False)

env_params = GymVectorEnvironment(level='xebikart.gym.envs:create_env')
env_params.additional_simulator_parameters = {
    'level': 4,
    'frame_skip': 2,
    'max_cte_error': 6.0,
    'min_steering': -1,
    'max_steering': 1,
    'min_throttle': 0.2,
    'max_throttle': 0.4,
    'vae': vae,
    'n_history': n_history,
    'max_steering_diff': 0.15,
    'jerk_penalty_weight': 0.,
}

# ---- Agent: Soft Actor Critic ----------------------------------------------
agent_params = SoftActorCriticAgentParameters()

algorithm = agent_params.algorithm
algorithm.act_for_full_episodes = True
algorithm.num_consecutive_training_steps = num_training_per_episode
# Only let the network drive the heat-up phase when resuming from a checkpoint.
algorithm.heatup_using_network_decisions = checkpoint_path is not None

# Exploration noise decays linearly from 0.1 to 0 over `improve_steps`;
# no noise is added during evaluation.
exploration = agent_params.exploration
exploration.evaluation_noise = 0.
exploration.noise_schedule = LinearSchedule(0.1, 0., improve_steps)

# ---- Schedule --------------------------------------------------------------
# Heat-up, evaluation length and evaluation period are counted in environment
# steps; the improve phase is counted in training steps.
schedule_params = SimpleSchedule()
schedule_params.heatup_steps = EnvironmentSteps(heatup_steps)
schedule_params.improve_steps = TrainingSteps(improve_steps)
schedule_params.evaluation_steps = EnvironmentSteps(evaluation_steps)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(steps_between_evaluation_periods)

# ---- Visualization ---------------------------------------------------------
vis_params = VisualizationParameters()
vis_params.dump_mp4 = True
vis_params.dump_parameters_documentation = True
vis_params.print_networks_summary = True
# With the default dump filters, a video is dumped during the evaluation phase
# when a new best total reward has been achieved (per the original author's note).
# To dump every video of the evaluation phase instead, uncomment:
#vis_params.video_dump_filters = [SelectedPhaseOnlyDumpFilter(RunPhase.TEST)]

# ---- Graph manager ---------------------------------------------------------
graph_manager = BasicRLGraphManager(
    agent_params=agent_params,
    env_params=env_params,
    schedule_params=schedule_params,
    vis_params=vis_params,
)

In [None]:
# Create a fresh temporary directory for this experiment. Later cells use it as
# the rl_coach experiment path and checkpoint directory, then upload its
# contents to MLflow as artifacts.
# NOTE: tempfile.mkdtemp() does not delete the directory; cleanup is manual.
experiment_tempdir = tempfile.mkdtemp()

In [None]:
# Task configuration: CPU-only run whose experiment output and checkpoints are
# both written to the temporary experiment directory.
task_params = TaskParameters()
task_params.use_cpu = True
task_params.num_gpu = 0
task_params.experiment_path = experiment_tempdir
task_params.checkpoint_save_dir = experiment_tempdir
# Save a checkpoint every 30 minutes.
task_params.checkpoint_save_secs = 30 * 60
# When `checkpoint_path` is set, the experiment starts from that checkpoint.
task_params.checkpoint_restore_path = checkpoint_path

# Build the graph from the task configuration.
graph_manager.create_graph(task_params)

# Point the rl_coach logger at the experiment directory reported by the
# graph logger.
logger.experiment_path = graph_manager.graph_logger.experiments_path

In [None]:
# Run the training inside an MLflow run: log the notebook parameters, train,
# upload everything written to the experiment directory, then log summary
# metrics computed from the agent CSV.
mlflow.set_experiment("soft_actor_critic")

with mlflow.start_run():
    # Parameters are logged one by one, in this order.
    run_parameters = (
        ("improve_steps", improve_steps),
        ("heatup_steps", heatup_steps),
        ("evaluation_steps", evaluation_steps),
        ("steps_between_evaluation_periods", steps_between_evaluation_periods),
        ("num_training_per_episode", num_training_per_episode),
        ("vae_runid", vae_runid),
        ("n_history", n_history),
        ("checkpoint", checkpoint_path),
    )
    for param_name, param_value in run_parameters:
        mlflow.log_param(param_name, param_value)

    # TODO: think about saving the graph after improve
    # TODO: maybe not needed
    graph_manager.save_graph()
    graph_manager.improve()
    graph_manager.save_checkpoint()

    # Upload the whole experiment directory as run artifacts.
    mlflow.log_artifacts(experiment_tempdir)

    # Summary metrics: max and mean of selected columns of the agent CSV.
    agent_metrics = pd.read_csv(f"{experiment_tempdir}/worker_0.simple_rl_graph.main_level.main_level.agent_0.csv")
    for column, max_key, mean_key in (
        ("Episode Length", "episode_length_max", "episode_length_mean"),
        ("Training Reward", "episode_reward_max", "episode_reward_mean"),
    ):
        series = agent_metrics[column]
        mlflow.log_metric(max_key, series.max())
        mlflow.log_metric(mean_key, series.mean())

### Visualize Reward Evolution

In [None]:
%%capture
# Locate the CSV log in the rl_coach experiment directory and load it into `df`.
#
# File names are filtered with str.endswith rather than by running a regex over
# the space-joined directory listing. The previous pattern '[\w.]*.csv' was a
# non-raw string with an unescaped '.', so it could yield truncated names (for
# files containing characters such as '-') that do not exist on disk.
list_file = os.listdir(logger.experiment_path)
# os.listdir returns entries in arbitrary order; sort so that the same file is
# selected on every run.
csvfile = sorted(name for name in list_file if name.endswith('.csv'))
if not csvfile:
    # Fail with an explicit message instead of an opaque IndexError below.
    raise FileNotFoundError(
        f"No CSV file found in experiment directory: {logger.experiment_path}"
    )
df = pd.read_csv(os.path.join(logger.experiment_path, csvfile[0]))

In [None]:
# Plot the shaped training reward from the loaded CSV on a wide figure,
# using the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(18, 5))
ax.plot(df['Shaped Training Reward'])
ax.set_title('Reward Evolution')
ax.set_xlabel('Episodes')

plt.show()

In [None]:
# Display the first rows of the CSV log loaded above as a quick sanity check.
df.head()

In [None]:
# Close the graph manager now that training and analysis are done
# (presumably releases the environment and session resources -- confirm against rl_coach).
graph_manager.close()