# Train a Soft Actor-Critic (SAC) Agent

Apply the following image transformations to the observations:
- crop (1/3 of height)
- Sobel edge detection (`sobel_edges`)
- variational autoencoder (VAE)

In [None]:
import os

import mlflow
import mlflow.keras
import tempfile

from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv, VecVideoRecorder

from xebikart.gym.envs.donkey_env import DonkeyEnv
from xebikart.gym.envs.wrappers import CropObservationWrapper, ConvVariationalAutoEncoderObservationWrapper, \
    HistoryBasedWrapper, EdgingObservationWrapper

from xebikart.rl.sac import CustomSAC, CustomSACPolicy

## Parameters

In [None]:
# --- Training schedule ---
steps = 1000          # total timesteps handed to model.learn()
learning_start = 300  # forwarded to the model as `learning_starts`

# --- Simulator / policy ---
level = 0             # donkey simulator level passed to DonkeyEnv
sac_policy = CustomSACPolicy

# --- Observation pipeline ---
use_vae = True        # encode observations with the VAE loaded from MLflow
vae_mlflow_runid = "576b04c6640d4d7c834a928b6bb4fb9a"
use_history = False   # only takes effect when use_vae is True
n_history = 10        # passed to HistoryBasedWrapper as n_command_history

In [None]:
# Create a fresh temporary directory. The cells below use it as the root for
# the monitor log, the recorded videos, the tensorboard log and the saved
# model, and finally upload it as MLflow artifacts.
# NOTE(review): mkdtemp() never deletes the directory on its own and no cell
# shown here removes it -- clean it up manually if disk usage matters.
tempdir = tempfile.mkdtemp()

## Create Donkey Env

Start a donkey simulator in level 0.

Add wrappers (Crop -> Edging -> VAE -> History)

In [None]:
# Build the simulator environment. Steering is bounded to [-1, 1] and
# throttle to [0.4, 0.6]; the level comes from the parameters cell.
donkey_env_kwargs = dict(
    level=level,
    frame_skip=2,
    max_cte_error=3.0,
    min_steering=-1,
    max_steering=1,
    min_throttle=0.4,
    max_throttle=0.6,
)
donkey_env = DonkeyEnv(**donkey_env_kwargs)

In [None]:
# Observation pipeline: Crop -> Edging -> [VAE -> [History]].
#
# Only the stages selected by `use_vae` / `use_history` are built, so the VAE
# model is not fetched from MLflow (and the unused wrappers are not
# instantiated) when the flags disable them. The optional stages stay defined
# as None in that case so the names always exist.
vae = None
vae_obs = None
history_obs = None

# CropObservation
crop_obs = CropObservationWrapper(donkey_env, 0, 40, 160, 80)
# Edging
edging_obs = EdgingObservationWrapper(crop_obs)

# `use_obs` is the outermost wrapper that the rest of the notebook trains on.
use_obs = edging_obs
if use_vae:
    # Load VAE (compile=False: the model is loaded without compiling it)
    vae = mlflow.keras.load_model(f"runs:/{vae_mlflow_runid}/model", compile=False)
    # VAE
    vae_obs = ConvVariationalAutoEncoderObservationWrapper(edging_obs, vae)
    use_obs = vae_obs
    if use_history:
        # History (stacked on top of the VAE output only)
        history_obs = HistoryBasedWrapper(
            vae_obs,
            n_command_history=n_history,
            max_steering_diff=0.15,
            jerk_penalty_weight=0.,
        )
        use_obs = history_obs

In [None]:
# Locations (under the temporary run directory) for the monitor log and videos
log_dir = os.path.join(tempdir, "log_dir")
videos_path = os.path.join(tempdir, "videos")

# Wrap the selected observation pipeline in a Monitor
donkey_env_monitored = Monitor(use_obs, log_dir, allow_early_resets=True)


def _make_monitored_env():
    """Factory for DummyVecEnv: hand back the already-built monitored env."""
    return donkey_env_monitored


def _record_from_first_step(step):
    """Video trigger: true only for step 0, i.e. record from the first step."""
    return step == 0


# Single-environment vectorised wrapper
env = DummyVecEnv([_make_monitored_env])
# Record a 1000-step video starting at the first step
env = VecVideoRecorder(
    env,
    videos_path,
    record_video_trigger=_record_from_first_step,
    video_length=1000,
)

## Define Model and parameters

- param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
- param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
- param gamma: (float) the discount factor
- param learning_rate: (float or callable) learning rate for adam optimizer,
    the same learning rate will be used for all networks (Q-Values, Actor and Value function)
    it can be a function of the current progress (from 1 to 0)
- param buffer_size: (int) size of the replay buffer
- param batch_size: (int) Minibatch size for each gradient update
- param tau: (float) the soft update coefficient ("polyak update", between 0 and 1)
- param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to
    inverse of reward scale in the original SAC paper.)  Controlling exploration/exploitation trade-off.
    Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value)
- param train_freq: (int) Update the model every `train_freq` steps.
- param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
- param target_update_interval: (int) update the target network every `target_update_interval` steps.
- param gradient_steps: (int) How many gradient updates to perform after each step
- param target_entropy: (str or float) target entropy when learning ent_coef (ent_coef = 'auto')
- param action_noise: (ActionNoise) the action noise type (None by default), this can help
    for hard exploration problem. Cf DDPG for the different action noise type.
- param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy)
    This is not needed for SAC normally but can help exploring when using HER + SAC.
    This hack was present in the original OpenAI Baselines repo (DDPG + HER)
- param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
- param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
- param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
- param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
- param full_tensorboard_log: (bool) enable additional logging when using tensorboard
    Note: this has no effect on SAC logging for now


In [None]:
# Tensorboard event files are written under the temporary run directory
tensorboard_log = os.path.join(tempdir, "tensorboard")

# Hyper-parameters of the SAC agent, kept together for readability
sac_hyperparams = dict(
    learning_rate=3e-4,
    gamma=0.99,
    buffer_size=30000,
    batch_size=64,
    train_freq=300,
    gradient_steps=600,
    learning_starts=learning_start,
    ent_coef="auto_0.1",
    tensorboard_log=tensorboard_log,
    full_tensorboard_log=True,
    verbose=1,
)

model = CustomSAC(env=env, policy=sac_policy, **sac_hyperparams)

In [None]:
# Train an agent from scratch and track the run in MLflow
mlflow.set_experiment("soft_actor_critical_edges")

# Parameters recorded for the run; VAE / history settings are logged as None
# when the corresponding feature is switched off.
logged_vae_run_id = vae_mlflow_runid if use_vae else None
logged_n_history = n_history if use_history else None
run_params = {
    "sac_policy": sac_policy,
    "donkey_car_level": level,
    "use_vae": use_vae,
    "vae_run_id": logged_vae_run_id,
    "use_history": use_history,
    "n_history": logged_n_history,
    "steps": steps,
}
run_metrics = {"steps": steps}

# The trained model is saved inside the temp dir so it is uploaded with it
model_path = os.path.join(tempdir, "model")

with mlflow.start_run():
    mlflow.log_params(run_params)
    mlflow.log_metrics(run_metrics, step=10)
    model.learn(total_timesteps=steps)
    # Save trained model
    model.save(model_path)
    # Upload everything written under the temp dir as run artifacts
    mlflow.log_artifacts(tempdir)

In [None]:
# Release resources: the simulator env first, then the vectorised/recording env
for closable in (donkey_env, env):
    closable.close()

In [None]:
#from mlflow.tracking.artifact_utils import _download_artifact_from_uri

#run_id = "f288e78b08374a6a90078fe08ae5b9cc"
#local_path_uri = _download_artifact_from_uri(f"runs:/{run_id}/model.pkl")
#sac = CustomSAC.load(local_path_uri)
# NOTE: gradient_steps is not saved with the model, so it has to be set again after loading
#sac.gradient_steps = 600