This notebook replaces some classes in an experiment with "debug" versions of them, which write almost every intermediate step to file, to help detect any inconsistencies in the code

In [None]:
import os

# Directory passed to the pipeline as its "base_directory" input.
# Built with os.path.join so the separator matches the host OS (the previous
# hard-coded backslash only formed a nested path on Windows).
PATH_TO_STORE_EXPERIMENTS = os.path.join("data", "rl_training")

In [None]:
# Name of this experiment: passed to the pipeline as its
# "artifact_relative_directory" input and used as the label in the result plots.
experiment_name = "dqn_multi_agent"

# Preparation before loading experiment

## Change logging system

In [None]:
from automl.loggers.logger_component import LoggerSchema 

# Override LoggerSchema parameter defaults before the experiment is generated.
# NOTE(review): -1 presumably disables buffering by line count so text is
# written to file immediately — confirm against LoggerSchema.
LoggerSchema.get_schema_parameter_signature("write_to_file_when_text_lines_over").change_default_value(-1)
# Default the required logger level to "DEBUG".
LoggerSchema.get_schema_parameter_signature("necessary_logger_level").change_default_value("DEBUG")

In [None]:
from automl.loggers.component_with_results import ResultLogger

# Default `save_results_on_log` to True on ResultLogger.
# NOTE(review): presumably makes results persist on every log call — confirm.
ResultLogger.get_schema_parameter_signature("save_results_on_log").change_default_value(True)

# The base Experiment

## Base Configuration

In [None]:
from automl.rl.whole_configurations import rl_multi_agent_pipeline as base_rl_configuration
# Configuration dictionary of the base multi-agent RL pipeline; the cells
# below patch it before the pipeline component is generated from it.
rl_pipeline_config = base_rl_configuration.config_dict()

## Debug changes

In [None]:
#from automl.rl.trainers.debug.rl_trainer_debug import RLTrainerDebug
from automl.rl.trainers.rl_trainer.parallel_rl_trainer import RLTrainerComponentParallel
import matplotlib.pyplot as plt
from IPython.display import display, clear_output


class RLTrainerDebug(RLTrainerComponentParallel):
    """Parallel RL trainer that logs every agent step and live-plots rewards.

    Adds two debugging aids on top of ``RLTrainerComponentParallel``:

    * each per-agent episode step is written to ``observations.txt`` through
      the component logger (``self.lg``);
    * after each episode the ``episode_reward`` curve is redrawn in the
      notebook output.
    """

    # Marks this class as a debug variant.
    # NOTE(review): presumably read by the debug-substitution utilities — confirm.
    is_debug_schema = True

    def _proccess_input_internal(self):
        """Run the parent's input processing, then create the live-plot figure."""
        super()._proccess_input_internal()

        plt.ion()  # turn on interactive mode

        # One figure/axes pair, reused for every redraw in run_single_episode.
        self.fig, self.ax = plt.subplots(figsize=(6, 4))

    def run_episode_step_for_agent_name(self, i_episode, agent_name):
        """Run the parent's step for one agent and log whether it finished.

        Args:
            i_episode: Index of the current episode.
            agent_name: Name of the agent taking the step.

        Returns:
            Whatever the parent implementation returned (logged as ``done``).
        """
        done = super().run_episode_step_for_agent_name(i_episode, agent_name)

        self.lg.writeLine(f"Doing episode step in episode {i_episode} for agent {agent_name} was over: {done}", file="observations.txt", use_time_stamp=False)

        return done

    def run_single_episode(self, i_episode):
        """Run one episode via the parent, then redraw the reward plot.

        Args:
            i_episode: Index of the episode to run.

        Returns:
            Whatever the parent implementation returned, so that wrapping the
            trainer in this debug class does not hide that value from callers.
        """
        result = super().run_single_episode(i_episode)

        # Replace the previous plot in the cell output instead of stacking
        # a new one per episode.
        clear_output(wait=True)

        self.ax.clear()

        results_logger = self.get_results_logger()
        results_logger.plot_confidence_interval(x_axis='episode', y_column='episode_reward', show_std=False, to_show=False, ax=self.ax)
        results_logger.plot_linear_regression(x_axis='episode', y_axis='episode_reward', to_show=False, y_label='linear', ax=self.ax)

        self.ax.set_title(f"Training progress (update {i_episode})")
        display(self.fig)

        return result



In [None]:
from automl.core.debug.debug_utils import substitute_classes_by_debug_classes
from automl.ml.models.debug.torch_model_debug import TorchModelComponentDebug
from automl.rl.learners.debug.learner_debug import DQNLearnerDebug
from automl.rl.trainers.debug.agent_trainer_debug import AgentTrainerDebug

# Pass the configuration and the list of debug classes to the substitution
# helper; its return value replaces the original configuration.
# The commented-out entries are debug variants that are currently disabled.
rl_pipeline_config = substitute_classes_by_debug_classes(rl_pipeline_config, [
    AgentTrainerDebug, 
    DQNLearnerDebug, 
    TorchModelComponentDebug,
    #TranslatorDebug,
    #MemoryDebug,
    #RLTrainerDebug,
    #AdamOptimizerDebug,

    ])

## Base Configuration Interpretation

In [None]:
from automl.rl.policy.policy import ModelComponent


from typing import Any

# Unpack the nested pipeline configuration into named handles.
# Component definitions are stored as tuples indexed as [0] = component class
# and [1] = input dictionary. Every name bound here is a reference into
# rl_pipeline_config (not a copy), so in-place changes made through these
# handles in later cells are part of the configuration the pipeline is
# generated from.

rl_pipeline_input: dict[str, Any] = rl_pipeline_config["input"]
# `.get` yields None when "child_components" is missing; the `[0]` lookup
# further down would then raise TypeError.
rl_pipeline_children: list = rl_pipeline_config.get("child_components", None)

rl_trainer_tuple = rl_pipeline_input["rl_trainer"]
rl_trainer_input: dict[str, Any] = rl_trainer_tuple[1]

agents_input: dict[str, Any] = rl_pipeline_input["agents_input"]

policy_tuple = agents_input["policy"]
policy_input: dict[str, Any] = policy_tuple[1]

model_tuple = policy_input["model"]
model_class: type[ModelComponent] = model_tuple[0]
model_input: dict[str, Any] = model_tuple[1]

agents_trainers_input: dict[str, Any] = rl_trainer_input["agents_trainers_input"]

state_translator_tuple = agents_input["state_translator"]
state_translator_input: dict[str, Any] = state_translator_tuple[1]

learner_tuple = agents_trainers_input["learner"]
learner_input: dict[str, Any] = learner_tuple[1]

optimizer_tuple = learner_input["optimizer"]
optimizer_input: dict[str, Any] = optimizer_tuple[1]

memory_tuple = agents_trainers_input["memory"]
memory_input: dict[str, Any] = memory_tuple[1]

environment = rl_pipeline_input["environment"]
environment_input: dict[str, Any] = environment[1]

exploration_strategy_tuple = agents_trainers_input["exploration_strategy"]
exploration_strategy_input: dict[str, Any] = exploration_strategy_tuple[1]

# The policy model's input holds a sequence of model definitions under
# "models"; the entry at index 1 is the one unpacked here.
agent_models_in_sequence = model_input["models"]

agent_fcn_model_tuple = agent_models_in_sequence[1]

agent_fct_model_class: type[ModelComponent] = agent_fcn_model_tuple[0]
agent_fct_model_input: dict[str, Any] = agent_fcn_model_tuple[1]

# The first child component is a dict-style definition
# ("__type__" / "input" keys) rather than a tuple.
shared_model_definition = rl_pipeline_children[0]

shared_model_type: type[ModelComponent] = shared_model_definition["__type__"]
shared_model_input: dict[str, Any] = shared_model_definition["input"]

## Change experiment with Debug variants

In [None]:
from automl.utils.collection_utils import substitute_value_in_dict


# Turn off the model-difference check after optimisation: first as the schema
# default of AgentTrainerDebug ...
AgentTrainerDebug.get_schema_parameter_signature("verify_model_difference_after_optimize").change_default_value(False)

# ... then in the agents-trainer input itself.
# NOTE(review): substitute_value_in_dict presumably overwrites the key where
# it already occurs in the dict — confirm its behaviour for missing keys.
substitute_value_in_dict(agents_trainers_input, "verify_model_difference_after_optimize", False)


In [None]:
# Turn off the DQNLearnerDebug target-model comparisons as schema defaults.
DQNLearnerDebug.get_schema_parameter_signature("compare_old_and_new_target_predictions").change_default_value(False)
DQNLearnerDebug.get_schema_parameter_signature("compare_old_and_new_target_model_params").change_default_value(False)

# Apply the same switches to the learner input.
# NOTE(review): "compare_old_and_new_model_predictions" is substituted here but
# its schema default is not changed above like the other two — confirm
# whether that asymmetry is intentional.
substitute_value_in_dict(learner_input, "compare_old_and_new_target_predictions", False)
substitute_value_in_dict(learner_input, "compare_old_and_new_target_model_params", False)
substitute_value_in_dict(learner_input, "compare_old_and_new_model_predictions", False)

## Manual Hyperparameter Tuning

### Experiment

### Base Model

### Experiment duration

In [None]:
# Set the trainer's "num_episodes" to 1000.
substitute_value_in_dict(rl_trainer_input, "num_episodes", 1000)

# Remove "limit_total_steps" from the trainer input if present (no error when
# absent). NOTE(review): presumably so the episode count alone bounds the
# run — confirm.
rl_trainer_input.pop("limit_total_steps", None)

### Other value changes

In [None]:
#agents_trainers_input["learning_start_step_delay"] = 5000
#agents_trainers_input["learning_start_ep_delay"] = 150

#substitute_value_in_dict(agents_trainers_input, "learning_start_ep_delay", 2897)

In [None]:
# Set "batch_size" to 32 in the agents-trainer input.
substitute_value_in_dict(agents_trainers_input, "batch_size", 32)

In [None]:
# Set "optimization_interval" to 20 and "times_to_learn" to 2 in the
# agents-trainer input.
# NOTE(review): units (steps vs. episodes) are not visible here — confirm
# against the agent trainer schema.
substitute_value_in_dict(agents_trainers_input, "optimization_interval", 20)
substitute_value_in_dict(agents_trainers_input, "times_to_learn", 2)

In [None]:
# Set the memory component's "capacity" to 50.
# NOTE(review): this is only slightly above the batch size of 32 set in an
# earlier cell — presumably deliberately small for debugging; confirm.
substitute_value_in_dict(memory_input, "capacity", 50)

In [None]:
#optimizer_input["clip_grad_norm"] = 0.1

#substitute_value_in_dict(optimizer_input, "clip_grad_value", 10)

#substitute_value_in_dict(optimizer_input, "learning_rate", 0.006807860813523758)

In [None]:
#substitute_value_in_dict(model_input, "layers", [32, 16])

In [None]:
#substitute_value_in_dict(agents_trainers_input, "discount_factor", 0.8790365307757482)

In [None]:
#substitute_value_in_dict(learner_input, "target_update_rate", 0.5511208693081078)

In [None]:
#substitute_value_in_dict(exploration_strategy_input, "epsilon_end", 0.009535369612528788)

# Gen RL Pipeline

In [None]:

from automl.rl.rl_pipeline import RLPipelineComponent
from automl.utils.json_utils.json_component_utils import gen_component_from

# Generate the pipeline component from the patched configuration dictionary.
rl_pipeline : RLPipelineComponent = gen_component_from(rl_pipeline_config)

In [None]:
# Give the pipeline its storage location and setup switches.
rl_pipeline.pass_input(
    {
        "base_directory": PATH_TO_STORE_EXPERIMENTS,
        "artifact_relative_directory": experiment_name,
        "create_new_directory": True,
        "do_full_setup_of_seed": True,
    }
)

# Report the artifact directory the pipeline resolved for this run.
experiment_path = rl_pipeline.get_artifact_directory()
print(f"Experiment path: {experiment_path}")

# Do the training

In [None]:
from automl.loggers.global_logger import activate_global_logger

# Activate the global logger, handing it the pipeline's artifact directory.
activate_global_logger(rl_pipeline.get_artifact_directory())

In [None]:
# NOTE(review): `save_state` is imported from `exec_component` here, while the
# "Save configuration" cell later imports it from `state_management` —
# confirm which module is the canonical source.
from automl.basic_components.exec_component import save_state


# Save the pipeline state before training is run.
save_state(rl_pipeline)

In [None]:
# Run the pipeline (the training itself).
rl_pipeline.run()

## Save configuration

In [None]:
#rl_pipeline.save_configuration(save_exposed_values=True)
# NOTE(review): this rebinds `save_state` from `state_management`, while the
# pre-training cell imported it from `exec_component` — confirm which module
# is the canonical source.
from automl.basic_components.state_management import save_state


# Save the state again after the run, this time with save_definition=True.
save_state(rl_pipeline, save_definition=True)

## See Results

In [None]:
# Value passed as `aggregate_number` to the confidence-interval plot below.
AGGREGATE_NUMBER = 5

In [None]:
    
# Results logger of the pipeline, used by the plotting cell below.
results_logger = rl_pipeline.get_results_logger()

In [None]:
#results_logger.plot_graph(x_axis='episode', y_axis=[('total_reward', name)], to_show=False)

# Plotting is best-effort: a failure is reported but does not stop the notebook.
try:
    # Episode reward vs. episode, aggregated, with the standard deviation shown.
    results_logger.plot_confidence_interval(
        x_axis='episode',
        y_column='episode_reward',
        show_std=True,
        to_show=False,
        y_values_label=experiment_name,
        aggregate_number=AGGREGATE_NUMBER,
    )
    # Linear-regression trend line over the same data.
    results_logger.plot_linear_regression(
        x_axis='episode',
        y_axis='episode_reward',
        to_show=False,
        y_label=experiment_name + '_linear',
    )
except Exception as plot_error:
    print(f"Error: {plot_error}")

## Evaluate

In [None]:
from automl.rl.evaluators.rl_component_evaluator import RLPipelineEvaluator
from automl.rl.evaluators.rl_evaluator_player import EvaluatorWithPlayer
from automl.rl.evaluators.rl_std_avg_evaluator import LastValuesAvgStdEvaluator
from automl.rl.rl_player.rl_parallel_player import RLParallelPlayer


# Dict-style component definition for the evaluator: an EvaluatorWithPlayer
# configured with a parallel player and a base evaluator that works on
# "episode_reward".
evaluator_definition = {
    "__type__": EvaluatorWithPlayer,
    "name": "EvaluatorWithPlayer",
    "input": {
        "rl_player_definition": (RLParallelPlayer, {}),
        "base_evaluator": (
            LastValuesAvgStdEvaluator,
            {"value_to_use": "episode_reward"},
        ),
    },
}

In [None]:
# Generate the evaluator component from its definition and register it as a
# child of the pipeline.
evaluator_component : RLPipelineEvaluator = gen_component_from(evaluator_definition)
rl_pipeline.define_component_as_child(evaluator_component)

In [None]:
# Evaluate the pipeline; the result is kept in `evaluation` (being an
# assignment, this cell does not display it).
evaluation = evaluator_component.evaluate(rl_pipeline)