This notebook replaces some classes of an experiment with "debug" versions of them, which write almost every intermediate step to file, in order to help detect any incoherence in the code.

In [None]:
import os

# Base directory for experiment artifacts. Built with os.path.join so the
# separator is correct on every OS (a literal "\\" only works on Windows).
PATH_TO_STORE_EXPERIMENTS = os.path.join("data", "rl_training")

In [None]:
# Name of this experiment; used below as the artifact sub-directory and as the plot label.
experiment_name = "dqn_multi_agent"

# Preparation before loading experiment

## Change logging system

In [None]:
from automl.loggers.logger_component import LoggerSchema 

# Logger defaults used for this debug run, set before the experiment is loaded.
# NOTE(review): -1 presumably disables the line-count threshold — confirm against LoggerSchema.
for _parameter_name, _debug_default in (
    ("write_to_file_when_text_lines_over", -1),
    ("necessary_logger_level", "INFO"),
):
    LoggerSchema.get_schema_parameter_signature(_parameter_name).change_default_value(_debug_default)

In [None]:
from automl.loggers.component_with_results import ResultLogger


# Change the default of `save_results_on_log` to True for this debug run
# (presumably makes results be saved on every log call — confirm against ResultLogger).
ResultLogger.get_schema_parameter_signature("save_results_on_log").change_default_value(True)

# The base Experiment

## Base Configuration

In [None]:
from automl.rl.whole_configurations import rl_multi_agent_pipeline as base_rl_configuration


# Start from the stock multi-agent RL configuration; later cells edit its nested dicts in place.
rl_pipeline_config = base_rl_configuration.config_dict()

## Base Configuration Interpretation

In [None]:
# Top-level input dict of the pipeline configuration.
rl_pipeline_input = rl_pipeline_config["input"]

# Component entries are tuples: index 0 is replaced with a class further down,
# index 1 is used as that component's input dict.
rl_trainer_tuple = rl_pipeline_input["rl_trainer"]
rl_trainer_input = rl_trainer_tuple[1]

# Input shared by the agents.
agents_input = rl_pipeline_input["agents_input"]

policy_tuple = agents_input["policy"]
policy_input = policy_tuple[1]

# Input for the per-agent trainers, nested inside the RL trainer input.
agents_trainers_input = rl_trainer_input["agents_trainers_input"]

In [None]:
# State translator entry of the agents: index 0 is read as its class later on, index 1 as its input.
state_translator_tuple = agents_input["state_translator"]
state_translator_input = state_translator_tuple[1]

In [None]:
# Learner entry of the agent trainers and its input dict.
learner_tuple = agents_trainers_input["learner"]
learner_input = learner_tuple[1]

# Optimizer entry nested inside the learner input.
optimizer_tuple = learner_input["optimizer"]
optimizer_input = optimizer_tuple[1]

In [None]:
# Memory entry of the agent trainers and its input dict.
memory_tuple = agents_trainers_input["memory"]
memory_input = memory_tuple[1]

In [None]:
# Environment entry of the pipeline input (same dict as `rl_pipeline_input`) and its input dict.
environment = rl_pipeline_config["input"]["environment"]
environment_input = environment[1]

In [None]:
# Exploration strategy entry of the agent trainers and its input dict.
exploration_strategy_tuple = agents_trainers_input["exploration_strategy"]
exploration_strategy_input = exploration_strategy_tuple[1]

# Changes to the base configuration

## Code to help alter experiment

In [None]:
def substitute_value_in_dict(dict_with_value : dict, key, new_value):
    """Set ``dict_with_value[key]`` to ``new_value``, printing the old and new values.

    The old value is reported as ``None`` when ``key`` is not present yet.
    The dict is modified in place and nothing is returned.
    """
    previous_value = dict_with_value.get(key)
    print(f"Old value for key '{key}': {previous_value}, new value: {new_value}")
    dict_with_value.update({key: new_value})

def remove_value_in_dict(dict_with_value : dict, key, new_value=None):
    """Remove ``key`` from ``dict_with_value`` in place, printing the value being dropped.

    Args:
        dict_with_value: dict to modify.
        key: key to remove; a missing key is ignored.
        new_value: unused. Kept (now optional) only so existing three-argument
            calls keep working.
    """
    print(f"Old value for key '{key}': {dict_with_value.get(key, None)}, to be removed...")
    dict_with_value.pop(key, None)



def substitute_tuple_value_in_dict(dict_with_tuple : dict, key, tuple_index, new_value):
    """Replace one position of the tuple stored at ``dict_with_tuple[key]``.

    Tuples are immutable, so a new tuple is built and stored back under ``key``.

    Args:
        dict_with_tuple: dict holding the tuple; modified in place.
        key: key of the tuple to change.
        tuple_index: position to replace. Negative indices count from the end,
            like normal sequence indexing.
        new_value: value to put at ``tuple_index``.

    Raises:
        KeyError: if ``key`` is not in the dict.
        IndexError: if ``tuple_index`` is out of range.
    """

    entries = list(dict_with_tuple[key])

    # Indexing first keeps the out-of-range failure and makes negative indices
    # replace the same element that is reported as the old value.
    old_value = entries[tuple_index]

    print(f"Old value for tuple pos {tuple_index}: {old_value}, new value: {new_value}")

    entries[tuple_index] = new_value

    dict_with_tuple[key] = tuple(entries)


## Changes to debug

### Classes to Help Debug

In [None]:
from automl.rl.trainers.debug.agent_trainer_debug import AgentTrainerDebug
from automl.rl.trainers.agent_trainer_component_dqn import AgentTrainerDQN

# we add our own debug functionality
class AgentTrainerDQNDebugClass(AgentTrainerDebug, AgentTrainerDQN):
    """DQN agent trainer combined with the ``AgentTrainerDebug`` mixin; adds no members of its own."""
        

In [None]:
from automl.component import requires_input_proccess
from automl.rl.learners.debug.learner_debug import DQNLearnerDebug
from automl.rl.learners.q_learner import DeepQLearnerSchema, DoubleDeepQLearnerSchema
import torch

QLearnerSchema = DoubleDeepQLearnerSchema

class CustomDQNLearnerDebugStrategy(QLearnerSchema):
    """Q-learner that logs the intermediate values of every learning step.

    Each overridden hook delegates to the parent implementation and writes the
    per-sample values it received or produced to the file selected in ``learn``.
    """

    def _proccess_input_internal(self):
        """Run the parent's input processing, then open or create the ``learning`` folder."""
        super()._proccess_input_internal()
        self.lg.open_or_create_relative_folder("learning")

    def __write_debug_line(self, text):
        """Write ``text`` without a timestamp to the file chosen by the current ``learn`` call."""
        self.lg.writeLine(text, file=self.__path_to_write, use_time_stamp=False)

    # NOTE(review): the loops below index the first ``self.batch_size`` entries of
    # each batch; this assumes every batch holds at least that many — confirm.

    def _apply_model_prediction_given_state_action_pairs(self, state_batch, action_batch):
        """Return the parent's (values of all actions, values of the passed actions), logging both per sample."""

        all_action_values, chosen_action_values = super()._apply_model_prediction_given_state_action_pairs(state_batch, action_batch)

        self.__write_debug_line(f"\nComputed predicted_actions_values and value for action chosen:\n")

        for row in range(self.batch_size):
            self.__write_debug_line(f"{row}: {all_action_values[row]} [ {action_batch[row]} ] -> {chosen_action_values[row]}")

        return all_action_values, chosen_action_values

    def _apply_value_prediction_to_next_state(self, next_state_batch, done_batch, reward_batch, discount_factor):
        """Return the parent's (next-state q values, next-state v values), logging them with the done flag per sample."""

        next_q_values, next_v_values = super()._apply_value_prediction_to_next_state(next_state_batch, done_batch, reward_batch, discount_factor)

        self.__write_debug_line(f"\nComputed done, next_state_values computed by target and q value of action chosen:\n")

        for row in range(self.batch_size):
            self.__write_debug_line(f"{row}: {done_batch[row]}, {next_q_values[row]} -> {next_v_values[row]}")

        return next_q_values, next_v_values

    def _calculate_chosen_actions_correct_q_values(self, next_state_v_values, discount_factor, reward_batch):
        """Return the parent's target q values, logging how each one was obtained."""

        # Copy taken before delegating, so the log shows the values as they were passed in.
        values_before_update = next_state_v_values.clone()

        target_q_values = super()._calculate_chosen_actions_correct_q_values(next_state_v_values, discount_factor, reward_batch)

        self.__write_debug_line(f"\nNext action values after multiplying by discount factor {discount_factor} and adding reward:\n")

        for row in range(self.batch_size):
            self.__write_debug_line(f"{row}: {target_q_values[row]} = {values_before_update[row]} * {discount_factor} + {reward_batch[row]}")

        return target_q_values

    def _optimize_with_predicted_model_values_and_correct_values(self, predicted_values, correct_values):
        """Log each predicted/target pair, then let the parent run the optimisation."""

        self.__write_debug_line(f"\nOptimizing using error of original predicted action values and target done on future state:\n")

        for row in range(self.batch_size):
            self.__write_debug_line(f"{row}: {predicted_values[row]} vs {correct_values[row]}")

        super()._optimize_with_predicted_model_values_and_correct_values(predicted_values, correct_values)

    def learn(self, trajectory, discount_factor) -> None:
        """Pick the ``computation.txt`` path in ``learning`` for this call, then run the parent's ``learn``."""

        self.__path_to_write = self.lg.new_relative_path_if_exists("computation.txt", dir="learning")

        super().learn(trajectory, discount_factor)



class CustomDQNLearnerDebug(DQNLearnerDebug, CustomDQNLearnerDebugStrategy):
    """Combines ``DQNLearnerDebug`` with the step-logging learner above; adds no members of its own."""


In [None]:


from automl.ml.memory.torch_memory_component import TorchMemoryComponent
from automl.ml.memory.debug.memory_debug import MemoryDebug


class TorchMemoryComponentDebug(MemoryDebug, TorchMemoryComponent):
    """Torch memory component combined with the ``MemoryDebug`` mixin; adds no members of its own."""

In [None]:
#from automl.rl.trainers.debug.rl_trainer_debug import RLTrainerDebug
from automl.rl.trainers.rl_trainer.parallel_rl_trainer import RLTrainerComponentParallel
import matplotlib.pyplot as plt
from IPython.display import display, clear_output


class RLTrainerDebug(RLTrainerComponentParallel):
    """Parallel RL trainer that logs every agent step and redraws a reward plot after each episode."""

    # NOTE(review): presumably marks this class as a debug variant for the framework — confirm.
    is_debug_schema = True

    def _proccess_input_internal(self):
        """Run the parent's input processing and create the figure reused for the live plot."""
        super()._proccess_input_internal()

        plt.ion()  # turn on interactive mode

        self.fig, self.ax = plt.subplots(figsize=(6,4))


    def run_episode_step_for_agent_name(self, i_episode, agent_name):
        """Run the parent's step for one agent, log its result to ``observations.txt`` and return it."""

        done = super().run_episode_step_for_agent_name( i_episode, agent_name)

        self.lg.writeLine(f"Doing episode step in episode {i_episode} for agent {agent_name} was over: {done}", file="observations.txt", use_time_stamp=False)

        return done

    def run_single_episode(self, i_episode):
        """Run one episode through the parent, then refresh the training-progress plot.

        Returns:
            Whatever the parent's ``run_single_episode`` returns, so this
            override stays transparent to callers that use the result.
        """

        episode_result = super().run_single_episode(i_episode)

        # Replace the previous cell output instead of stacking one figure per episode.
        clear_output(wait=True)

        self.ax.clear()

        self.get_results_logger().plot_confidence_interval(x_axis='episode', y_column='episode_reward',show_std=False, to_show=False, ax=self.ax)
        self.get_results_logger().plot_linear_regression(x_axis='episode', y_axis='episode_reward', to_show=False, y_label='linear', ax=self.ax)

        self.ax.set_title(f"Training progress (update {i_episode})")
        display(self.fig)

        return episode_result



In [None]:
from automl.ml.optimizers.debug.debug_optimizers import AdamOptimizerDebug

In [None]:
#from automl.rl.trainers.debug.rl_trainer_debug import RLTrainerDebug
from automl.fundamentals.translator.translator import Translator
from automl.rl.policy.policy import ComponentWithLogging

import torch
# Print tensors without summarisation and with very wide lines, so tensors written to the logs are complete.
torch.set_printoptions(threshold=float('inf'), linewidth=30000)


# Class currently configured as the state translator; used as a base class of TranslatorDebug below.
translator_type : type[Translator] = state_translator_tuple[0]


class TranslatorDebug(translator_type, ComponentWithLogging):
    """Configured state translator that also logs every translation to ``translations.txt``."""

    # NOTE(review): presumably marks this class as a debug variant for the framework — confirm.
    is_debug_schema = True

    def translate_state(self, state):
        """Translate ``state`` with the parent class, logging the input before and the output after."""
        self.lg.writeLine(
            f"-----------------------------------------------\nTranslating:\n\n{state}\ninto",
            file="translations.txt",
            use_time_stamp=False,
        )

        translated_state = super().translate_state(state)

        self.lg.writeLine(
            f"\n{translated_state}\n-----------------------------------------------",
            file="translations.txt",
            use_time_stamp=False,
        )

        return translated_state



In [None]:
from automl.ml.models.neural_model import FullyConnectedModelSchema


class FullyConnectedModelDebug(FullyConnectedModelSchema, ComponentWithLogging):
    """Fully connected model whose ``predict`` logs input and output shapes to ``predictions.txt``."""

    # NOTE(review): this calls ``self.model`` directly instead of ``super().predict``;
    # confirm the parent's ``predict`` does nothing beyond that.
    @requires_input_proccess
    def predict(self, state):
        """Run ``self.model`` on ``state`` and return the result, logging shapes before and after."""
        self.lg.writeLine(
            f"Predicting for input with state {state.shape}, while input size is {self.input_size}",
            file="predictions.txt",
            use_time_stamp=False,
        )

        prediction = self.model(state)

        self.lg.writeLine(
            f"To return shape: {prediction.shape}\n",
            file="predictions.txt",
            use_time_stamp=False,
        )

        return prediction

### Change experiment with Debug variants

In [None]:
#substitute_value_in_dict(rl_trainer_input, "default_trainer_class", AgentTrainerDQNDebugClass)


In [None]:
# Use the logging learner: replaces position 0 (the class) of the "learner" entry.
substitute_tuple_value_in_dict(agents_trainers_input, "learner", 0, CustomDQNLearnerDebug)

In [None]:
# Use the logging translator: replaces position 0 (the class) of the "state_translator" entry.
substitute_tuple_value_in_dict(agents_input, "state_translator", 0, TranslatorDebug)

In [None]:
#substitute_tuple_value_in_dict(agents_trainers_input, "memory", 0, TorchMemoryComponentDebug)

In [None]:
#substitute_tuple_value_in_dict(rl_pipeline_input, "rl_trainer", 0, RLTrainerDebug)

In [None]:
#substitute_tuple_value_in_dict(learner_input, "optimizer", 0, AdamOptimizerDebug)

In [None]:
# Use the logging model: replaces position 0 (the class) of the policy's "model" entry.
substitute_tuple_value_in_dict(policy_input, "model", 0, FullyConnectedModelDebug)

## Manual Hyperparameter Tuning

### Experiment

### Base Model

### Other value changes

In [None]:
#rl_trainer_input["limit_total_steps"] = 1000

#rl_trainer_input.pop("limit_total_steps", None)

#rl_trainer_input["num_episodes"] = 4000


In [None]:
#agents_trainers_input["learning_start_step_delay"] = 5000
#agents_trainers_input["learning_start_ep_delay"] = 150

#substitute_value_in_dict(agents_trainers_input, "learning_start_ep_delay", 2897)

In [None]:
# Manual overrides of the agent trainers' optimisation settings for this debug run.
# NOTE(review): units of both values (steps vs. episodes, repetitions per update) are not shown here — confirm.
substitute_value_in_dict(agents_trainers_input, "optimization_interval", 10)
substitute_value_in_dict(agents_trainers_input, "times_to_learn", 2)

In [None]:
# Manual override of the memory "capacity" for this debug run.
substitute_value_in_dict(memory_input, "capacity", 20)

In [None]:
#optimizer_input["clip_grad_norm"] = 0.1

#substitute_value_in_dict(optimizer_input, "clip_grad_value", 0.2956984463839789)

#substitute_value_in_dict(optimizer_input, "learning_rate", 0.006807860813523758)

In [None]:
#substitute_value_in_dict(agents_trainers_input, "discount_factor", 0.8790365307757482)

In [None]:
#substitute_value_in_dict(learner_input, "target_update_rate", 0.5511208693081078)

In [None]:
#substitute_value_in_dict(exploration_strategy_input, "epsilon_end", 0.009535369612528788)

# Gen RL Pipeline

In [None]:

from automl.rl.rl_pipeline import RLPipelineComponent
from automl.utils.json_utils.json_component_utils import gen_component_from

# Build the pipeline component from the (now modified) configuration dict.
rl_pipeline : RLPipelineComponent = gen_component_from(rl_pipeline_config)

In [None]:
# Pass the storage location and the directory/seed flags to the pipeline.
rl_pipeline.pass_input(
    {
        "base_directory": PATH_TO_STORE_EXPERIMENTS,
        "artifact_relative_directory": experiment_name,
        "create_new_directory": True,
        "do_full_setup_of_seed": True,
    }
)

# Directory the pipeline reports for its artifacts; reused by the results cells.
experiment_path = rl_pipeline.get_artifact_directory()

print(f"Experiment path: {experiment_path}")

# Do the training

In [None]:
from automl.loggers.global_logger import activate_global_logger

# Activate the global logger with the pipeline's artifact directory.
activate_global_logger(rl_pipeline.get_artifact_directory())

In [None]:
# Run the configured pipeline (the training step of this notebook).
rl_pipeline.run()

## Save configuration

In [None]:
#rl_pipeline.save_configuration(save_exposed_values=True)
from automl.basic_components.state_management import save_state


# Save the pipeline's state, with `save_definition` enabled.
save_state(rl_pipeline, save_definition=True)

## See Results

In [None]:
# Passed as `aggregate_number` to the confidence-interval plot below.
AGGREGATE_NUMBER = 5

In [None]:

from automl.loggers.result_logger import RESULTS_FILENAME, ResultLogger

import os

# Folder holding the trainer's results. Joined with os.path.join so the
# separator is correct on every OS (a literal "\\" only works on Windows).
# NOTE(review): assumes the trainer stores its results under a folder named
# "RLTrainerComponent" — confirm this still holds for the trainer class in use.
results_directory = os.path.join(str(experiment_path), "RLTrainerComponent")

# Logger pointed at the existing results file; no new directory is created.
results_logger = ResultLogger(input={
                                        "results_filename" : RESULTS_FILENAME,
                                        "base_directory" : results_directory,
                                        "artifact_relative_directory" : '',
                                        "create_new_directory" : False
                            })

In [None]:
#results_logger.plot_graph(x_axis='episode', y_axis=[('total_reward', name)], to_show=False)
# Episode reward with its confidence interval, plus a linear regression over it.
results_logger.plot_confidence_interval(
    x_axis='episode',
    y_column='episode_reward',
    show_std=True,
    to_show=False,
    y_values_label=experiment_name,
    aggregate_number=AGGREGATE_NUMBER,
)
results_logger.plot_linear_regression(
    x_axis='episode',
    y_axis='episode_reward',
    to_show=False,
    y_label=experiment_name + '_linear',
)
