This notebook substitutes some classes in an experiment with "debug" versions of them, which write almost every intermediate step to file, to help detect any incoherence in the code.

In [1]:
import os

# Root folder under which the experiment is stored (passed to the pipeline as "base_directory").
# Built with os.path.join so the separator matches the platform: a literal "\\" only works on
# Windows and would create a single directory named "data\rl_training" elsewhere.
PATH_TO_STORE_EXPERIMENTS = os.path.join("data", "rl_training")

In [2]:
#experiment_name = "dqn_sb3_cartpole"
#experiment_name = "dqn_sb3_cartpole_ppo"

# Name of this experiment; later passed to the pipeline as its "artifact_relative_directory"
# and reused as the label of the result plots at the end of the notebook.
experiment_name = "ppo_cartpole"

# The base Experiment

## Base Configuration

In [None]:
#from automl.base_configurations.environment.cart_pole import dqn_sb3 as base_rl_configuration
from automl.base_configurations.environment.cart_pole import ppo_sb3 as base_rl_configuration


# Configuration dictionary of the base RL pipeline. The cells below read its "input" entry and
# override parts of it in place before the pipeline is generated from it.
rl_pipeline_config = base_rl_configuration.config_dict()

## Base Configuration Interpretation

In [None]:
# Shortcuts into the nested configuration. These are references (not copies), so changing them
# in later cells changes `rl_pipeline_config` itself.
rl_pipeline_input = rl_pipeline_config["input"]

# Component entries are indexed with [1] to reach their input dictionary; later cells rebuild them
# as (DebugClass, original[1]), so index 0 presumably holds the component class — confirm.
rl_trainer_tuple = rl_pipeline_input["rl_trainer"]
rl_trainer_input = rl_trainer_tuple[1]

agents_input = rl_pipeline_input["agents_input"]

policy_tuple = agents_input["policy"]
policy_input = policy_tuple[1]

agents_trainers_input = rl_trainer_input["agents_trainers_input"]

In [None]:
# Learner entry of the agent trainers and, nested inside its input, the optimizer entry.
learner_tuple = agents_trainers_input["learner"]
learner_input = learner_tuple[1]

optimizer_tuple = learner_input["optimizer"]
optimizer_input = optimizer_tuple[1]

In [None]:
# Memory entry of the agent trainers; its class is swapped for a debug variant further below.
memory_tuple = agents_trainers_input["memory"]

In [None]:
# Environment entry of the pipeline input and the input dictionary stored at its index 1.
environment = rl_pipeline_config["input"]["environment"]
environment_input = environment[1]

# Debug Changes

## Change logging system

We change the logging so we have full, immediate visibility of results

In [None]:
from automl.loggers.logger_component import LoggerSchema 


# Default values overridden on the logger schema for this debug session, applied in order.
# NOTE(review): -1 presumably disables line buffering so every line reaches the file at once — confirm.
debug_logger_defaults = {
    "write_to_file_when_text_lines_over": -1,
    "necessary_logger_level": "INFO",
}

for parameter_name, debug_default in debug_logger_defaults.items():
    LoggerSchema.get_schema_parameter_signature(parameter_name).change_default_value(debug_default)

In [None]:
from automl.loggers.component_with_results import ResultLogger


# Change the default of "save_results_on_log" to True for every ResultLogger created from here on.
# NOTE(review): presumably this persists results on each log call — confirm against ResultLogger.
ResultLogger.get_schema_parameter_signature("save_results_on_log").change_default_value(True)

## Classes to Help Debug

In [None]:
from automl.ml.models.torch_model_components import TorchModelComponent
from automl.rl.trainers.agent_trainer_ppo import AgentTrainerPPO
from automl.rl.trainers.debug.agent_trainer_debug import AgentTrainerDebug
from automl.ml.models.torch_model_utils import model_parameter_distance
import torch



# we add our own debug functionality
class AgentTrainerPPODebugClass(AgentTrainerDebug, AgentTrainerPPO):
    """PPO agent trainer combined with the generic debug trainer.

    After every optimization it writes, to "model_optimization.txt", the ids and shapes of the
    model parameters and the ids of the parameters held by the actor optimizer, so the two
    lists can be compared.
    """

    def _proccess_input_internal(self):
        """Delegate input processing to the parent classes unchanged."""
        super()._proccess_input_internal()

    def optimizeAgent(self):
        """Run the regular optimization, then dump model and optimizer parameter ids."""
        super().optimizeAgent()

        dump_file = "model_optimization.txt"

        # Identity (and shape) of every parameter the model currently owns
        self.lg.writeLine("\nParam IDs of the model:", file=dump_file)
        for parameter in self.model.model.parameters():
            self.lg.writeLine(f"{id(parameter)} {parameter.shape}", file=dump_file)

        # Identity of every parameter registered in the actor optimizer's param groups
        self.lg.writeLine("\nParam IDs being optimized by the actor optimizer:", file=dump_file)
        optimizer_groups = self.learner.actor_optimizer.torch_adam_opt.param_groups
        for group in optimizer_groups:
            for parameter in group['params']:
                self.lg.writeLine(f"optimizer param id: {id(parameter)}", file=dump_file)

        
    

In [None]:
import os
from automl.basic_components.artifact_management import open_or_create_folder
from automl.component import requires_input_proccess
from automl.loggers.logger_component import ComponentWithLogging
from automl.rl.learners.ppo_learner import PPOLearner
from automl.rl.learners.debug.learner_debug import LearnerDebug
import torch
import torch.nn.functional as F

class PPOLearnerDebugLearnSubstitute(PPOLearner):
    """PPO learner whose `_learn` step writes its intermediate computations to file.

    `_learn` is re-implemented here with the numerical steps interleaved with logging, so
    each learning step produces a text file (inside the "learning" folder of this component's
    logger) showing how next values, deltas, advantages and gradients were obtained.
    """

    def _proccess_input_internal(self):
        """Process the input as the parent does and make sure the "learning" folder exists."""
        super()._proccess_input_internal()

        self.lg.open_or_create_relative_folder("learning")

    @requires_input_proccess
    def _learn(self, trajectory, discount_factor):
        """Do one PPO optimization step over `trajectory`, logging every intermediate value.

        Args:
            trajectory: batch of transitions; it is split by `self._interpret_trajectory` into
                state, action, next state, reward, done and log-probability batches.
            discount_factor: discount applied to the next-state values and, together with
                `self.lamda_gae`, to the running advantage.
        """

        # NOTE(review): presumably yields a fresh path when "computation.txt" already exists,
        # giving one file per learning step — confirm against the logger implementation.
        path_to_write = self.lg.new_relative_path_if_exists("computation.txt", dir="learning")

        def log(line):
            # Every line of this step goes to the same file, without time stamps
            self.lg.writeLine(line, file=path_to_write, use_time_stamp=False)

        self.number_of_times_optimized += 1

        state_batch, action_batch, next_state_batch, reward_batch, done_batch, log_prob_batch = self._interpret_trajectory(trajectory)

        # Compute value estimates (next-state values carry no gradient)
        values = self.critic.predict(state_batch).squeeze(-1)
        with torch.no_grad():
            next_values = self.critic.predict(next_state_batch).squeeze(-1)

        # Mask out terminal states (no bootstrapping after done)
        next_values = next_values * (1 - done_batch)

        log("\nComputed next_values:")
        log("next_value = critic(next_state) if not done else 0\n")

        for i in range(len(next_values)):
            log(f"{next_values[i]} = critic({next_state_batch[i]}) if not {done_batch[i]} else 0")

        # Compute advantages using Generalized Advantage Estimation (GAE)
        deltas = reward_batch + discount_factor * next_values - values
        advantages = torch.zeros_like(deltas, device=self.device)

        log(f"\nComputing {len(deltas)} deltas:")
        log("delta = r + discount_factor * next_values - values\n")

        for i in range(len(deltas)):
            log(f"{deltas[i]} = {reward_batch[i]} + {discount_factor} * {next_values[i]} - {values[i]}")

        log("\nComputing running advantage:")
        log("running_advantage[t] = deltas[t] + discount_factor * lamda_gae * running_advantage")
        log("advantages[t] = running_advantage\n")

        # GAE computation in reverse
        running_advantage = 0
        for t in reversed(range(len(deltas))):
            old_running_advantage = running_advantage
            # Snapshot the previous value: `advantages[t]` alone is a view into `advantages`
            # and would already show the new value once the assignment below has run.
            old_advantage = advantages[t].detach().clone()

            running_advantage = deltas[t] + discount_factor * self.lamda_gae * running_advantage
            advantages[t] = running_advantage

            log(f"{running_advantage} = {deltas[t]} + {discount_factor} * {self.lamda_gae} * {old_running_advantage}")
            log(f"{old_advantage} substituted by {running_advantage}")

        log("\nComputing new advantages:")
        log("advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)\n")

        advantages_mean = advantages.mean()
        advantages_std = advantages.std()

        # Kept only so the normalization can be logged element by element
        old_advantages = advantages.clone()

        log(f"Advantages mean: {advantages_mean}")
        log(f"Advantages std: {advantages_std}\n")

        advantages = (advantages - advantages_mean) / (advantages_std + 1e-8)

        for i in range(len(advantages)):
            log(f"{advantages[i]} = ({old_advantages[i]} - {advantages_mean}) / ({advantages_std} + 1e-8)")

        # NOTE(review): returns are built from the *normalized* advantages; left untouched so this
        # debug variant keeps the same numerics — presumably mirrors PPOLearner._learn, confirm.
        returns = advantages + values.detach()

        # Compute new log probabilities from the policy
        new_log_probs, entropy = self._evaluate_actions(state_batch, action_batch)

        # Compute ratio (pi_theta / pi_theta_old)
        ratio = torch.exp(new_log_probs - log_prob_batch)

        # Unclipped surrogate objective
        surrogate1 = ratio * advantages

        # Clipped surrogate objective
        surrogate2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * advantages

        # This is the policy loss we want to minimize
        policy_loss = -torch.min(surrogate1, surrogate2).mean()

        # Compute value loss
        value_loss = F.mse_loss(values, returns)

        # Total loss
        loss : torch.Tensor = policy_loss + self.value_loss_coef * value_loss - self.entropy_coef * entropy.mean()

        self.actor_optimizer.clear_optimizer_gradients()
        self.critic_optimizer.clear_optimizer_gradients()

        # A single backward pass here, so the gradients are available to both optimizers
        loss.backward()

        log("\nDid backward step, noticing the gradients: ")

        for name, p in self.model.model.named_parameters():
            if p.grad is None:
                log(f"Grad {name} is None")
            else:
                grad_norm = p.grad.detach().norm().item()
                log(f"Grad {name} norm: {grad_norm}")

        self.actor_optimizer.optimize_with_backward_pass_done()
        self.critic_optimizer.optimize_with_backward_pass_done()


class PPOLearnerDebug(LearnerDebug, PPOLearnerDebugLearnSubstitute):
    """Learner used in the debug run: `LearnerDebug` layered over the file-logging PPO `_learn`."""



In [None]:


from automl.ml.memory.torch_memory_component import TorchMemoryComponent
from automl.ml.memory.debug.memory_debug import MemoryDebug


class TorchMemoryComponentDebug(MemoryDebug, TorchMemoryComponent):
    """Torch memory component combined with the debug memory mixin; adds no behavior of its own."""


    def _proccess_input_internal(self):
        """Delegate input processing to the parent classes unchanged."""
        super()._proccess_input_internal()
        


In [None]:
from automl.rl.rl_pipeline import RLTrainerComponent
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

class RLTrainerDebug(RLTrainerComponent):
    """RL trainer that logs every episode step and redraws the training progress after each episode."""

    def _proccess_input_internal(self):
        """Process the input as the parent does and create the figure that is redisplayed per episode."""
        super()._proccess_input_internal()

        plt.ion()  # turn on interactive mode

        self.fig, self.ax = plt.subplots(figsize=(6,4))

    def run_episode_step_for_agent_name(self, i_episode, agent_name):
        """Run the parent's episode step, record in "observations.txt" whether it ended the episode, and return that flag."""

        done = super().run_episode_step_for_agent_name(i_episode, agent_name)

        self.lg.writeLine(f"Doing episode step in episode {i_episode} for agent {agent_name} was over: {done}", file="observations.txt", use_time_stamp=False)

        return done

    def run_single_episode(self, i_episode):
        """Run the parent's episode, refresh the progress plot in the cell output, and return the parent's result.

        The parent's return value is propagated so this override does not hide it from callers
        (it is simply None when the parent returns nothing).
        """

        episode_result = super().run_single_episode(i_episode)

        # Replace the previous cell output instead of stacking one figure per episode
        clear_output(wait=True)

        self.ax.clear()

        results_logger = self.get_results_logger()
        results_logger.plot_confidence_interval(x_axis='episode', y_column='episode_reward',show_std=False, to_show=False)
        results_logger.plot_linear_regression(x_axis='episode', y_axis='episode_reward', to_show=False, y_label='linear')

        self.ax.set_title(f"Training progress (update {i_episode})")
        display(self.fig)

        return episode_result



# Change experiment with Debug variants

In [None]:
# Agent trainers are now created from the debug PPO trainer class defined above.
rl_trainer_input["default_trainer_class"] = AgentTrainerPPODebugClass

In [None]:
# Swap the learner class for its debug variant, keeping the original learner input (index 1).
learner_tuple = (PPOLearnerDebug, learner_tuple[1])
agents_trainers_input["learner"] = learner_tuple

In [None]:
# Swap the memory class for its debug variant, keeping the original memory input (index 1).
memory_tuple = (TorchMemoryComponentDebug, memory_tuple[1])
agents_trainers_input["memory"] = memory_tuple


In [None]:
# Swap the RL trainer class for its debug variant, keeping the original trainer input (index 1).
rl_trainer_tuple = (RLTrainerDebug, rl_trainer_tuple[1])
rl_pipeline_input["rl_trainer"] = rl_trainer_tuple


# Manual Hyperparameter Tuning

### Base Model

In [None]:
# When True, the next cell points the policy at a stored model and nests the experiment
# under that model's name; when False the base configuration is used as is.
LOAD_MODEL = False

In [None]:
import os

if LOAD_MODEL:

    # Paths are built with os.path.join: with literal "\\" separators os.path.basename only
    # extracts the model name on Windows (elsewhere it returns the whole string).
    models_directory = os.path.join("data", "models")

    #base_model_path = os.path.join(models_directory, "sb3_CartPole_dqn", "sb3_CartPole_dqn_perturbed_0_10")
    #base_model_path = os.path.join(models_directory, "sb3_CartPole_dqn", "sb3_CartPole_dqn_perturbed_5_50")
    base_model_path = os.path.join(models_directory, "sb3_CartPole_ppo", "sb3_CartPole_ppo_gaussian_0_0.8_0.9")

    #base_model_path = os.path.join(models_directory, "FC_CartPole_ppo", "FC_CartPole_ppo")

    model_name = os.path.basename(base_model_path)

    # Nest the experiment under the model name, but only once: without this guard every
    # re-run of the cell would append the model name to `experiment_name` again.
    if os.path.basename(experiment_name) != model_name:
        experiment_name = os.path.join(experiment_name, model_name)

    rl_pipeline_input = rl_pipeline_config["input"]

    # The policy receives the path of the stored model instead of its configured model entry
    policy_input["model"] = base_model_path


In [None]:
#learner_input["critic_model"] = 'data\\models\\sb3_CartPole_ppo_critic\\sb3_CartPole_ppo_critic'

In [None]:
#rl_trainer_input["limit_total_steps"] = 1000

#rl_trainer_input.pop("limit_total_steps", None)

#rl_trainer_input["num_episodes"] = 4000


In [None]:
#agents_trainers_input["learning_start_step_delay"] = 5000
#agents_trainers_input["learning_start_ep_delay"] = 150

In [None]:
#agents_trainers_input["optimization_interval"] = 2048
#agents_trainers_input["times_to_learn"] = 1

In [None]:
#optimizer_input["clip_grad_norm"] = 0.1
#optimizer_input["clip_grad_value"] = 0.1
#optimizer_tuple = learner_input["learning_rate"] = 0.0012

# Gen RL Pipeline

In [None]:

from automl.rl.rl_pipeline import RLPipelineComponent
from automl.utils.json_utils.json_component_utils import gen_component_from

# Build the pipeline component from the (now modified) configuration dictionary.
rl_pipeline : RLPipelineComponent = gen_component_from(rl_pipeline_config)

In [None]:
# Tell the pipeline where its artifacts go, then read back the directory it resolved.
artifact_location_input = {
    "base_directory" : PATH_TO_STORE_EXPERIMENTS,
    "artifact_relative_directory" : experiment_name,
    "create_new_directory" : True,
}
rl_pipeline.pass_input(artifact_location_input)

experiment_path = rl_pipeline.get_artifact_directory()

print(f"Experiment path: {experiment_path}")

# Do the training

In [None]:
from automl.loggers.global_logger import activate_global_logger

# The global logger is activated on the pipeline's own artifact directory.
activate_global_logger(rl_pipeline.get_artifact_directory())

In [None]:
# Process the pipeline input in its own cell, separately from the training run below.
rl_pipeline.proccess_input_if_not_proccesd()

In [None]:
# Run the pipeline (the training itself); RLTrainerDebug redraws the progress plot after each episode.
rl_pipeline.run()

### Save configuration

In [None]:
#rl_pipeline.save_configuration(save_exposed_values=True)
from automl.basic_components.state_management import save_state


# Save the pipeline state together with its definition.
save_state(rl_pipeline, save_definition=True)

## See Results

In [None]:
# Passed as `aggregate_number` to the confidence-interval plot below.
# NOTE(review): presumably the number of consecutive episodes grouped per plotted point — confirm.
AGGREGATE_NUMBER = 5

In [None]:

from automl.loggers.result_logger import RESULTS_FILENAME, ResultLogger

import os

# Folder holding the trainer's results, joined with the platform separator rather than a literal "\\".
# NOTE(review): the trainer class was replaced by RLTrainerDebug above; if artifact folders are
# named after the component class this folder may be "RLTrainerDebug" instead — confirm.
results_directory = os.path.join(experiment_path, "RLTrainerComponent")

# Logger opened over the existing results file (no new directory is created)
results_logger = ResultLogger(input={
                                        "results_filename" : RESULTS_FILENAME,
                                        "base_directory" : results_directory,
                                        "artifact_relative_directory" : '',
                                        "create_new_directory" : False
                            })

In [None]:
#results_logger.plot_graph(x_axis='episode', y_axis=[('total_reward', name)], to_show=False)
# Episode reward per episode, aggregated by AGGREGATE_NUMBER, with its standard deviation,
# followed by a linear regression over the same data; both are labelled with the experiment name.
results_logger.plot_confidence_interval(x_axis='episode', y_column='episode_reward',show_std=True, to_show=False, y_values_label=experiment_name, aggregate_number=AGGREGATE_NUMBER)
results_logger.plot_linear_regression(x_axis='episode', y_axis='episode_reward', to_show=False, y_label=experiment_name + '_linear')
