From 1b86b141b1033b1e3cf74e49084158012e7c0c9d Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Fri, 9 Aug 2024 00:00:27 +0200 Subject: [PATCH 01/15] state2state_rl implementation --- src/qutip_qoc/_rl.py | 144 ++++++++++++++++++++++++++++------- src/qutip_qoc/pulse_optim.py | 2 +- tests/test_result.py | 34 +++++---- 3 files changed, 137 insertions(+), 43 deletions(-) diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index bb12900..94989c0 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -3,6 +3,7 @@ """ import qutip as qt from qutip import Qobj, QobjEvo +from qutip_qoc import Result import numpy as np @@ -11,6 +12,7 @@ from stable_baselines3 import PPO from stable_baselines3.common.env_checker import check_env +import time class _RL(gym.Env): # TODO: this should be similar to your GymQubitEnv(gym.Env) implementation """ @@ -19,50 +21,83 @@ class _RL(gym.Env): # TODO: this should be similar to your GymQubitEnv(gym.Env) def __init__( self, - objective, + objectives, + control_parameters, time_interval, time_options, - control_parameters, alg_kwargs, - guess_params, - **integrator_kwargs, + optimizer_kwargs, + minimizer_kwargs, + integrator_kwargs, + qtrl_optimizers, ): - super().__init__() # TODO: super init your gym environment here + super(_RL,self).__init__() # TODO:(ok) super init your gym environment here # ------------------------------- copied from _GOAT class ------------------------------- # TODO: you dont have to use (or keep them) if you don't need the following attributes # this is just an inspiration how to extract information from the input - - self._Hd = objective.H[0] - self._Hc_lst = objective.H[1:] + self._Hd_lst, self._Hc_lst = [], [] + if not isinstance(objectives, list): + objectives = [objectives] + for objective in objectives: + # extract drift and control Hamiltonians from the objective + self._Hd_lst.append(objective.H[0]) + self._Hc_lst.append([H[0] if isinstance(H, list) else H for H in objective.H[1:]]) self._control_parameters = control_parameters - self._guess_params = guess_params - self._H = self._prepare_generator() + # extract bounds for _control_parameters + bounds = [] + for key in control_parameters.keys(): + bounds.append(control_parameters[key].get("bounds")) + self.lbound = [b[0][0] for b in bounds] + self.ubound = [b[0][1] for b in bounds] + + #self._H = self._prepare_generator() self._initial = objective.initial self._target = objective.target + self.state = None + + self._result = Result( + objectives = objectives, + time_interval = time_interval, + start_local_time = time.localtime(), # initial optimization time + n_iters = 0, # Number of iterations(episodes) until convergence + iter_seconds = [], # list containing the time taken for each iteration(episode) of the optimization + var_time = False, # Whether the optimization was performed with variable time + ) + + #for the reward + self._step_penalty = 1 - self._evo_time = time_interval.evo_time + # To check if it exceeds the maximum number of steps in an episode + self.current_step = 0 + + #self._evo_time = time_interval.evo_time + self._fid_err_targ = alg_kwargs["fid_err_targ"] # inferred attributes self._norm_fac = 1 / self._target.norm() + # control function + self.pulse = self._pulse + + # contains the action of the last completed or truncated episode + self.action = None + # integrator options self._integrator_kwargs = integrator_kwargs - self._rtol = self._integrator_kwargs.get("rtol", 1e-5) self._atol = self._integrator_kwargs.get("atol", 1e-5) # choose solver and 
fidelity type according to problem - if self._Hd.issuper: + if self._Hd_lst[0].issuper: self._fid_type = alg_kwargs.get("fid_type", "TRACEDIFF") - self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs) - + #self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs) else: self._fid_type = alg_kwargs.get("fid_type", "PSU") - self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs) + #self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs) self.infidelity = self._infid # TODO: should be used to calculate the reward @@ -70,18 +105,25 @@ def __init__( # TODO: set up your gym environment as you did correctly in post10 self.max_episode_time = time_interval.evo_time # maximum time for an episode self.max_steps = time_interval.n_tslots # maximum number of steps in an episode - self.step_duration = time_interval.tslots[-1] / time_interval.n_tslots # step duration for mesvole() - ... - + self.step_duration = time_interval.tslots[-1] / time_interval.n_tslots # step duration for mesvole + self.max_episodes = alg_kwargs["max_iter"] # maximum number of episodes for training + self.total_timesteps = self.max_episodes * self.max_steps # for learn() of gym + # Define action and observation spaces (Gym) + self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32) # Continuous action space from -1 to +1, as suggested from gym + self.observation_space = spaces.Box(low=-1, high=1, shape=(4,), dtype=np.float32) # Observation space, |v> have 2 real and 2 imaginary numbers -> 4 # ---------------------------------------------------------------------------------------- - def _infid(self, params): + def _pulse(self, t, p): + return p + + def _infid(self, params=None): """ Calculate infidelity to be minimized """ + ''' X = self._solver.run( - self._initial, [0.0, self._evo_time], args={"p": params} + self.state, [0.0, self.step_duration], args={"p": params} ).final_state if self._fid_type == "TRACEDIFF": @@ -96,17 +138,65 @@ def _infid(self, params): infid = 1 - np.abs(g) elif self._fid_type == "SU": # f_SU (incl global phase) infid = 1 - np.real(g) - + ''' + infid = 1 - qt.fidelity(self.state, self._target) return infid # TODO: don't hesitate to add the required methods for your rl environment def step(self, action): - ... 
+ action = action[0] + alpha = ((action + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0] # for example, scale action from [-1, 1] to [9, 13] + args = {"alpha" : alpha} + + H = [self._Hd_lst[0], [self._Hc_lst[0][0], lambda t, args: self.pulse(t, args["alpha"])]] + step_result = qt.mesolve(H, self.state, [0, self.step_duration], args = args) + self.state = step_result.states[-1] + + infidelity = self.infidelity() + self._result.infidelity = infidelity + reward = (1 - infidelity) - self._step_penalty + + terminated = infidelity <= self._fid_err_targ # the episode ended reaching the goal + truncated = self.current_step >= self.max_steps # if the episode ended without reaching the goal + + if terminated or truncated: + time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time) + self._result.iter_seconds.append(time_diff) + self.current_step = 0 # Reset the step counter + self.action = alpha + + observation = self._get_obs() + return observation, reward, bool(terminated), bool(truncated), {} + + def _get_obs(self): + rho = self.state.full().flatten() # to have state vector as NumPy array and flatten into one dimensional array.[a+i*b c+i*d] + obs = np.concatenate((np.real(rho), np.imag(rho))) + return obs.astype(np.float32) # Gymnasium expects the observation to be of type float32 + + def reset(self, seed=None): + self.state = self._initial + return self._get_obs(), {} + + def result(self): + # TODO: return qoc.Result object with the optimized pulse amplitudes + self._result.message = "Optimization finished!" + self._result.end_local_time = time.localtime() + self._result.n_iters = len(self._result.iter_seconds) + self._result.optimized_params = np.array(self.action) + self._result._optimized_controls = np.array(self.action) #TODO: is equal to optimized_params ? + self._result._final_states = (self._result._final_states if self._result._final_states is not None else []) + [self.state] + self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time) # Convert to a string + self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time) # Convert to a string + self._result._guess_controls = [] + return self._result def train(self): - ... + # Check if the environment follows Gym API + check_env(self, warn=True) - def result(self): - # TODO: return qoc.Result object with the optimized pulse amplitudes - ... 
\ No newline at end of file + # Create the model + model = PPO('MlpPolicy', self, verbose=1) # verbose = 1 to show training in terminal + + # Train the model + model.learn(total_timesteps = self.total_timesteps) \ No newline at end of file diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py index 661c577..573a51d 100644 --- a/src/qutip_qoc/pulse_optim.py +++ b/src/qutip_qoc/pulse_optim.py @@ -152,7 +152,7 @@ def optimize_pulses( # extract guess and bounds for the control pulses x0, bounds = [], [] for key in control_parameters.keys(): - x0.append(control_parameters[key].get("guess")) + x0.append(control_parameters[key].get("guess")) # TODO: for now only consider bounds bounds.append(control_parameters[key].get("bounds")) try: # GRAPE, CRAB format lbound = [b[0][0] for b in bounds] diff --git a/tests/test_result.py b/tests/test_result.py index 084aa74..a7460d7 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -157,7 +157,7 @@ def sin_z_jax(t, r, **kwargs): # you can use this routine to test your implementation # state to state transfer -init = qt.basis(2, 0) +initial = qt.basis(2, 0) target = qt.basis(2, 1) H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()] # control Hamiltonians @@ -169,34 +169,38 @@ def sin_z_jax(t, r, **kwargs): state2state_rl = Case( objectives=[Objective(initial, H, target)], - control_parameters={"bounds": [-13, 13]}, # TODO: for now only consider bounds + #control_parameters={"bounds": [-13, 13]}, # TODO: for now only consider bounds + control_parameters = { + "p": {"bounds": [(-13, 13)],} + }, tlist=np.linspace(0, 10, 100), # TODO: derive single step duration and max evo time / max num steps from this algorithm_kwargs={ "fid_err_targ": 0.01, "alg": "RL", - "max_iter": 100, - } + "max_iter": 700, + }, + optimizer_kwargs={} ) # TODO: no big difference for unitary evolution -initial = qt.qeye(2) # Identity -target = qt.gates.hadamard_transform() +#initial = qt.qeye(2) # Identity +#target = qt.gates.hadamard_transform() -unitary_rl = state2state_rl._replace( - objectives=[Objective(initial, H, target)], -) +#unitary_rl = state2state_rl._replace( +# objectives=[Objective(initial, H, target)], +#) @pytest.fixture( params=[ - pytest.param(state2state_grape, id="State to state (GRAPE)"), - pytest.param(state2state_crab, id="State to state (CRAB)"), - pytest.param(state2state_param_crab, id="State to state (param. CRAB)"), - pytest.param(state2state_goat, id="State to state (GOAT)"), - pytest.param(state2state_jax, id="State to state (JAX)"), + #pytest.param(state2state_grape, id="State to state (GRAPE)"), + #pytest.param(state2state_crab, id="State to state (CRAB)"), + #pytest.param(state2state_param_crab, id="State to state (param. CRAB)"), + #pytest.param(state2state_goat, id="State to state (GOAT)"), + #pytest.param(state2state_jax, id="State to state (JAX)"), pytest.param(state2state_rl, id="State to state (RL)"), - pytest.param(unitary_rl, id="Unitary (RL)"), + #pytest.param(unitary_rl, id="Unitary (RL)"), ] ) def tst(request): From 2c6c3025f63a35a6315437499b268b32c12e0d28 Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Sat, 10 Aug 2024 21:36:20 +0200 Subject: [PATCH 02/15] Using QobjEvo with Hc, Hd and controls. Dimension action space variable based on how many controls there are. 
using mesolve with QobjEvo object _H --- src/qutip_qoc/_rl.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index 94989c0..315e074 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -45,6 +45,13 @@ def __init__( self._Hd_lst.append(objective.H[0]) self._Hc_lst.append([H[0] if isinstance(H, list) else H for H in objective.H[1:]]) + # create the QobjEvo with Hd, Hc and controls(args) + self.args = {f"alpha{i+1}": (1) for i in range(len(self._Hc_lst[0]))} # set the control parameters to 1 fo all the Hc + self._H_lst = [self._Hd_lst[0]] + for i, Hc in enumerate(self._Hc_lst[0]): + self._H_lst.append([Hc, lambda t, args: self.pulse(t, self.args, i+1)]) + self._H = qt.QobjEvo(self._H_lst, self.args) + self._control_parameters = control_parameters # extract bounds for _control_parameters bounds = [] @@ -80,9 +87,6 @@ def __init__( # inferred attributes self._norm_fac = 1 / self._target.norm() - # control function - self.pulse = self._pulse - # contains the action of the last completed or truncated episode self.action = None @@ -110,12 +114,15 @@ def __init__( self.total_timesteps = self.max_episodes * self.max_steps # for learn() of gym # Define action and observation spaces (Gym) - self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32) # Continuous action space from -1 to +1, as suggested from gym + #self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32) # Continuous action space from -1 to +1, as suggested from gym + self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32) self.observation_space = spaces.Box(low=-1, high=1, shape=(4,), dtype=np.float32) # Observation space, |v> have 2 real and 2 imaginary numbers -> 4 # ---------------------------------------------------------------------------------------- - def _pulse(self, t, p): - return p + #def _pulse(self, t, p): + # return p + def pulse(self, t, args, idx): + return 1*args[f"alpha{idx}"] def _infid(self, params=None): """ @@ -145,12 +152,16 @@ def _infid(self, params=None): # TODO: don't hesitate to add the required methods for your rl environment def step(self, action): - action = action[0] - alpha = ((action + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0] # for example, scale action from [-1, 1] to [9, 13] - args = {"alpha" : alpha} + alphas = [((action[i] + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0] for i in range(len(action))] #TODO: use ubound[i] lbound[i] + + for i, value in enumerate(alphas): + self.args[f"alpha{i+1}"] = value + self._H = qt.QobjEvo(self._H_lst, self.args) + + #H = [self._Hd_lst[0], [self._Hc_lst[0][0], lambda t, args: self.pulse(t, args["alpha"])]] + #step_result = qt.mesolve(H, self.state, [0, self.step_duration], args = args) + step_result = qt.mesolve(self._H, self.state, [0, self.step_duration]) - H = [self._Hd_lst[0], [self._Hc_lst[0][0], lambda t, args: self.pulse(t, args["alpha"])]] - step_result = qt.mesolve(H, self.state, [0, self.step_duration], args = args) self.state = step_result.states[-1] infidelity = self.infidelity() @@ -164,7 +175,7 @@ def step(self, action): time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time) self._result.iter_seconds.append(time_diff) self.current_step = 0 # Reset the step counter - self.action = alpha + self.action = alphas observation = self._get_obs() return observation, reward, bool(terminated), 
bool(truncated), {} From 973095976abd7cc7840ae5e61f97be4a1d1f09f5 Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Sun, 11 Aug 2024 21:42:37 +0200 Subject: [PATCH 03/15] added _solver. _infidelity() more general --- src/qutip_qoc/_rl.py | 54 +++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index 315e074..832c44c 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -46,7 +46,7 @@ def __init__( self._Hc_lst.append([H[0] if isinstance(H, list) else H for H in objective.H[1:]]) # create the QobjEvo with Hd, Hc and controls(args) - self.args = {f"alpha{i+1}": (1) for i in range(len(self._Hc_lst[0]))} # set the control parameters to 1 fo all the Hc + self.args = {f"alpha{i+1}": (1) for i in range(len(self._Hc_lst[0]))} # set the control parameters to 1 for all the Hc self._H_lst = [self._Hd_lst[0]] for i, Hc in enumerate(self._Hc_lst[0]): self._H_lst.append([Hc, lambda t, args: self.pulse(t, self.args, i+1)]) @@ -61,6 +61,7 @@ def __init__( self.ubound = [b[0][1] for b in bounds] #self._H = self._prepare_generator() + self._alg_kwargs = alg_kwargs self._initial = objective.initial self._target = objective.target @@ -87,24 +88,14 @@ def __init__( # inferred attributes self._norm_fac = 1 / self._target.norm() - # contains the action of the last completed or truncated episode - self.action = None + self.temp_actions = [] # temporary list to save episode actions + self.actions = [] # list of actions(lists) of the last episode # integrator options self._integrator_kwargs = integrator_kwargs self._rtol = self._integrator_kwargs.get("rtol", 1e-5) self._atol = self._integrator_kwargs.get("atol", 1e-5) - # choose solver and fidelity type according to problem - if self._Hd_lst[0].issuper: - self._fid_type = alg_kwargs.get("fid_type", "TRACEDIFF") - #self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs) - else: - self._fid_type = alg_kwargs.get("fid_type", "PSU") - #self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs) - - self.infidelity = self._infid # TODO: should be used to calculate the reward - # ---------------------------------------------------------------------------------------- # TODO: set up your gym environment as you did correctly in post10 self.max_episode_time = time_interval.evo_time # maximum time for an episode @@ -118,7 +109,18 @@ def __init__( self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32) self.observation_space = spaces.Box(low=-1, high=1, shape=(4,), dtype=np.float32) # Observation space, |v> have 2 real and 2 imaginary numbers -> 4 # ---------------------------------------------------------------------------------------- - + + def update_solver(self): + # choose solver and fidelity type according to problem + if self._Hd_lst[0].issuper: + self._fid_type = self._alg_kwargs.get("fid_type", "TRACEDIFF") + self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs) + else: + self._fid_type = self._alg_kwargs.get("fid_type", "PSU") + self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs) + + self.infidelity = self._infid # TODO: should be used to calculate the reward + #def _pulse(self, t, p): # return p def pulse(self, t, args, idx): @@ -128,10 +130,10 @@ def _infid(self, params=None): """ Calculate infidelity to be minimized """ - ''' X = self._solver.run( self.state, [0.0, self.step_duration], args={"p": params} ).final_state + self.state = X if self._fid_type == 
"TRACEDIFF": diff = X - self._target @@ -145,8 +147,7 @@ def _infid(self, params=None): infid = 1 - np.abs(g) elif self._fid_type == "SU": # f_SU (incl global phase) infid = 1 - np.real(g) - ''' - infid = 1 - qt.fidelity(self.state, self._target) + #infid = 1 - qt.fidelity(self.state, self._target) return infid # TODO: don't hesitate to add the required methods for your rl environment @@ -158,13 +159,11 @@ def step(self, action): self.args[f"alpha{i+1}"] = value self._H = qt.QobjEvo(self._H_lst, self.args) - #H = [self._Hd_lst[0], [self._Hc_lst[0][0], lambda t, args: self.pulse(t, args["alpha"])]] - #step_result = qt.mesolve(H, self.state, [0, self.step_duration], args = args) - step_result = qt.mesolve(self._H, self.state, [0, self.step_duration]) - - self.state = step_result.states[-1] - + self.update_solver() # _H has changed infidelity = self.infidelity() + + self.current_step += 1 + self.temp_actions.append(alphas) self._result.infidelity = infidelity reward = (1 - infidelity) - self._step_penalty @@ -175,7 +174,7 @@ def step(self, action): time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time) self._result.iter_seconds.append(time_diff) self.current_step = 0 # Reset the step counter - self.action = alphas + self.actions = self.temp_actions.copy() observation = self._get_obs() return observation, reward, bool(terminated), bool(truncated), {} @@ -186,6 +185,7 @@ def _get_obs(self): return obs.astype(np.float32) # Gymnasium expects the observation to be of type float32 def reset(self, seed=None): + self.temp_actions = [] self.state = self._initial return self._get_obs(), {} @@ -194,12 +194,14 @@ def result(self): self._result.message = "Optimization finished!" self._result.end_local_time = time.localtime() self._result.n_iters = len(self._result.iter_seconds) - self._result.optimized_params = np.array(self.action) - self._result._optimized_controls = np.array(self.action) #TODO: is equal to optimized_params ? 
+ self._result.optimized_params = self.actions.copy() + self._result._optimized_controls = self.actions.copy() self._result._final_states = (self._result._final_states if self._result._final_states is not None else []) + [self.state] self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time) # Convert to a string self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time) # Convert to a string self._result._guess_controls = [] + self._result._optimized_H = [self._H] + self._result.guess_params = [] return self._result def train(self): From 2667b6403f6ed2134d44fc845aa359a556060f0d Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Wed, 14 Aug 2024 10:05:40 +0200 Subject: [PATCH 04/15] unitary evolution implementation --- src/qutip_qoc/_rl.py | 15 ++++++++++----- tests/test_result.py | 16 ++++++++-------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index 832c44c..8961c2e 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -63,9 +63,10 @@ def __init__( #self._H = self._prepare_generator() self._alg_kwargs = alg_kwargs - self._initial = objective.initial - self._target = objective.target + self._initial = objectives[0].initial + self._target = objectives[0].target self.state = None + self.dim = self._initial.shape[0] self._result = Result( objectives = objectives, @@ -105,9 +106,13 @@ def __init__( self.total_timesteps = self.max_episodes * self.max_steps # for learn() of gym # Define action and observation spaces (Gym) - #self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32) # Continuous action space from -1 to +1, as suggested from gym - self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32) - self.observation_space = spaces.Box(low=-1, high=1, shape=(4,), dtype=np.float32) # Observation space, |v> have 2 real and 2 imaginary numbers -> 4 + #self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32) + if self._initial.isket: + obs_shape = (2 * self.dim,) + else: # for unitary operators + obs_shape = (2 * self.dim * self.dim,) + self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32) # Continuous action space from -1 to +1, as suggested from gym + self.observation_space = spaces.Box(low=-1, high=1, shape=obs_shape, dtype=np.float32) # Observation space # ---------------------------------------------------------------------------------------- def update_solver(self): diff --git a/tests/test_result.py b/tests/test_result.py index a7460d7..10fb8bf 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -182,14 +182,14 @@ def sin_z_jax(t, r, **kwargs): optimizer_kwargs={} ) -# TODO: no big difference for unitary evolution +# no big difference for unitary evolution -#initial = qt.qeye(2) # Identity -#target = qt.gates.hadamard_transform() +initial = qt.qeye(2) # Identity +target = qt.gates.hadamard_transform() -#unitary_rl = state2state_rl._replace( -# objectives=[Objective(initial, H, target)], -#) +unitary_rl = state2state_rl._replace( + objectives=[Objective(initial, H, target)], +) @pytest.fixture( @@ -199,8 +199,8 @@ def sin_z_jax(t, r, **kwargs): #pytest.param(state2state_param_crab, id="State to state (param. 
CRAB)"), #pytest.param(state2state_goat, id="State to state (GOAT)"), #pytest.param(state2state_jax, id="State to state (JAX)"), - pytest.param(state2state_rl, id="State to state (RL)"), - #pytest.param(unitary_rl, id="Unitary (RL)"), + #pytest.param(state2state_rl, id="State to state (RL)"), + pytest.param(unitary_rl, id="Unitary (RL)"), ] ) def tst(request): From 19436092cbe89004cd1c8424018c827bc7576059 Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Sun, 18 Aug 2024 19:51:28 +0200 Subject: [PATCH 05/15] docstring added --- src/qutip_qoc/_rl.py | 73 ++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index 8961c2e..8434fe6 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -1,5 +1,7 @@ """ -This module contains ... +This module contains functions that implement quantum optimal control +using reinforcement learning (RL) techniques, allowing for the optimization +of control pulse sequences in quantum systems. """ import qutip as qt from qutip import Qobj, QobjEvo @@ -14,9 +16,13 @@ import time -class _RL(gym.Env): # TODO: this should be similar to your GymQubitEnv(gym.Env) implementation +class _RL(gym.Env): """ - Class for storing a control problem and ... + Class for storing a control problem and implementing quantum optimal + control using reinforcement learning. This class defines a custom + Gym environment that models the dynamics of quantum systems + under various control pulses, and uses RL algorithms to optimize the + parameters of these pulses. """ def __init__( @@ -31,12 +37,14 @@ def __init__( integrator_kwargs, qtrl_optimizers, ): - super(_RL,self).__init__() # TODO:(ok) super init your gym environment here + """ + Initialize the reinforcement learning environment for quantum + optimal control. Sets up the system Hamiltonian, control parameters, + and defines the observation and action spaces for the RL agent. 
+ """ - # ------------------------------- copied from _GOAT class ------------------------------- + super(_RL,self).__init__() - # TODO: you dont have to use (or keep them) if you don't need the following attributes - # this is just an inspiration how to extract information from the input self._Hd_lst, self._Hc_lst = [], [] if not isinstance(objectives, list): objectives = [objectives] @@ -60,7 +68,6 @@ def __init__( self.lbound = [b[0][0] for b in bounds] self.ubound = [b[0][1] for b in bounds] - #self._H = self._prepare_generator() self._alg_kwargs = alg_kwargs self._initial = objectives[0].initial @@ -83,7 +90,6 @@ def __init__( # To check if it exceeds the maximum number of steps in an episode self.current_step = 0 - #self._evo_time = time_interval.evo_time self._fid_err_targ = alg_kwargs["fid_err_targ"] # inferred attributes @@ -97,8 +103,6 @@ def __init__( self._rtol = self._integrator_kwargs.get("rtol", 1e-5) self._atol = self._integrator_kwargs.get("atol", 1e-5) - # ---------------------------------------------------------------------------------------- - # TODO: set up your gym environment as you did correctly in post10 self.max_episode_time = time_interval.evo_time # maximum time for an episode self.max_steps = time_interval.n_tslots # maximum number of steps in an episode self.step_duration = time_interval.tslots[-1] / time_interval.n_tslots # step duration for mesvole @@ -106,17 +110,19 @@ def __init__( self.total_timesteps = self.max_episodes * self.max_steps # for learn() of gym # Define action and observation spaces (Gym) - #self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32) if self._initial.isket: obs_shape = (2 * self.dim,) else: # for unitary operators obs_shape = (2 * self.dim * self.dim,) self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32) # Continuous action space from -1 to +1, as suggested from gym self.observation_space = spaces.Box(low=-1, high=1, shape=obs_shape, dtype=np.float32) # Observation space - # ---------------------------------------------------------------------------------------- def update_solver(self): - # choose solver and fidelity type according to problem + """ + Update the solver and fidelity type based on the problem setup. + Chooses the appropriate solver (Schrödinger or master equation) and + prepares for infidelity calculation. + """ if self._Hd_lst[0].issuper: self._fid_type = self._alg_kwargs.get("fid_type", "TRACEDIFF") self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs) @@ -124,16 +130,17 @@ def update_solver(self): self._fid_type = self._alg_kwargs.get("fid_type", "PSU") self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs) - self.infidelity = self._infid # TODO: should be used to calculate the reward + self.infidelity = self._infid - #def _pulse(self, t, p): - # return p def pulse(self, t, args, idx): + """ + Returns the control pulse value at time t for a given index. + """ return 1*args[f"alpha{idx}"] def _infid(self, params=None): """ - Calculate infidelity to be minimized + The agent performs a step, then calculate infidelity to be minimized of the current state against the target state. 
""" X = self._solver.run( self.state, [0.0, self.step_duration], args={"p": params} @@ -152,13 +159,14 @@ def _infid(self, params=None): infid = 1 - np.abs(g) elif self._fid_type == "SU": # f_SU (incl global phase) infid = 1 - np.real(g) - #infid = 1 - qt.fidelity(self.state, self._target) return infid - # TODO: don't hesitate to add the required methods for your rl environment - def step(self, action): - alphas = [((action[i] + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0] for i in range(len(action))] #TODO: use ubound[i] lbound[i] + """ + Perform a single time step in the environment, applying the scaled action (control pulse) + chosen by the RL agent. Updates the system's state and computes the reward. + """ + alphas = [((action[i] + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0] for i in range(len(action))] for i, value in enumerate(alphas): self.args[f"alpha{i+1}"] = value @@ -185,17 +193,27 @@ def step(self, action): return observation, reward, bool(terminated), bool(truncated), {} def _get_obs(self): - rho = self.state.full().flatten() # to have state vector as NumPy array and flatten into one dimensional array.[a+i*b c+i*d] + """ + Get the current state observation for the RL agent. Converts the system's + quantum state or matrix into a real-valued NumPy array suitable for RL algorithms. + """ + rho = self.state.full().flatten() obs = np.concatenate((np.real(rho), np.imag(rho))) return obs.astype(np.float32) # Gymnasium expects the observation to be of type float32 def reset(self, seed=None): + """ + Reset the environment to the initial state, preparing for a new episode. + """ self.temp_actions = [] self.state = self._initial return self._get_obs(), {} def result(self): - # TODO: return qoc.Result object with the optimized pulse amplitudes + """ + Retrieve the results of the optimization process, including the optimized + pulse sequences, final states, and performance metrics. + """ self._result.message = "Optimization finished!" self._result.end_local_time = time.localtime() self._result.n_iters = len(self._result.iter_seconds) @@ -210,11 +228,14 @@ def result(self): return self._result def train(self): + """ + Train the RL agent on the defined quantum control problem using the specified + reinforcement learning algorithm. Checks environment compatibility with Gym API. 
+ """ # Check if the environment follows Gym API check_env(self, warn=True) # Create the model - model = PPO('MlpPolicy', self, verbose=1) # verbose = 1 to show training in terminal - + model = PPO('MlpPolicy', self, verbose=1) # verbose = 1 to display training progress and statistics in the terminal # Train the model model.learn(total_timesteps = self.total_timesteps) \ No newline at end of file From 13ddba0253afc2b76807d55ca65991f2b0d50d15 Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Tue, 20 Aug 2024 23:11:54 +0200 Subject: [PATCH 06/15] added callback to stop training --- src/qutip_qoc/_rl.py | 94 ++++++++++++++++++++++++++++++++++++++------ tests/test_result.py | 20 +++++----- 2 files changed, 93 insertions(+), 21 deletions(-) diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index 8434fe6..943bcd3 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -13,6 +13,7 @@ from gymnasium import spaces from stable_baselines3 import PPO from stable_baselines3.common.env_checker import check_env +from stable_baselines3.common.callbacks import BaseCallback import time @@ -90,6 +91,10 @@ def __init__( # To check if it exceeds the maximum number of steps in an episode self.current_step = 0 + self.terminated = False + self.truncated = False + self.episode_info = [] # to contain some information from the latest episode + self._fid_err_targ = alg_kwargs["fid_err_targ"] # inferred attributes @@ -108,6 +113,7 @@ def __init__( self.step_duration = time_interval.tslots[-1] / time_interval.n_tslots # step duration for mesvole self.max_episodes = alg_kwargs["max_iter"] # maximum number of episodes for training self.total_timesteps = self.max_episodes * self.max_steps # for learn() of gym + self.current_episode = 0 # To keep track of the current episode # Define action and observation spaces (Gym) if self._initial.isket: @@ -137,6 +143,19 @@ def pulse(self, t, args, idx): Returns the control pulse value at time t for a given index. """ return 1*args[f"alpha{idx}"] + + def save_episode_info(self): + """ + Save the information of the last episode before resetting the environment. + """ + episode_data = { + "episode": self.current_episode, + "final_infidelity": self._result.infidelity, + "terminated": self.terminated, + "truncated": self.truncated, + "steps_used": self.current_step + } + self.episode_info.append(episode_data) def _infid(self, params=None): """ @@ -180,17 +199,11 @@ def step(self, action): self._result.infidelity = infidelity reward = (1 - infidelity) - self._step_penalty - terminated = infidelity <= self._fid_err_targ # the episode ended reaching the goal - truncated = self.current_step >= self.max_steps # if the episode ended without reaching the goal - - if terminated or truncated: - time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time) - self._result.iter_seconds.append(time_diff) - self.current_step = 0 # Reset the step counter - self.actions = self.temp_actions.copy() + self.terminated = infidelity <= self._fid_err_targ # the episode ended reaching the goal + self.truncated = self.current_step >= self.max_steps # if the episode ended without reaching the goal observation = self._get_obs() - return observation, reward, bool(terminated), bool(truncated), {} + return observation, reward, bool(self.terminated), bool(self.truncated), {} def _get_obs(self): """ @@ -205,6 +218,15 @@ def reset(self, seed=None): """ Reset the environment to the initial state, preparing for a new episode. 
""" + self.save_episode_info() + + time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time) + self._result.iter_seconds.append(time_diff) + self.current_step = 0 # Reset the step counter + self.current_episode += 1 # Increment episode counter + self.actions = self.temp_actions.copy() + self.terminated = False + self.truncated = False self.temp_actions = [] self.state = self._initial return self._get_obs(), {} @@ -214,7 +236,6 @@ def result(self): Retrieve the results of the optimization process, including the optimized pulse sequences, final states, and performance metrics. """ - self._result.message = "Optimization finished!" self._result.end_local_time = time.localtime() self._result.n_iters = len(self._result.iter_seconds) self._result.optimized_params = self.actions.copy() @@ -237,5 +258,56 @@ def train(self): # Create the model model = PPO('MlpPolicy', self, verbose=1) # verbose = 1 to display training progress and statistics in the terminal + + stop_callback = EarlyStopTraining(verbose=1) + # Train the model - model.learn(total_timesteps = self.total_timesteps) \ No newline at end of file + model.learn(total_timesteps = self.total_timesteps, callback=stop_callback) + +class EarlyStopTraining(BaseCallback): + """ + A callback to stop training based on specific conditions (steps, infidelity, max iterations) + """ + def __init__(self, verbose: int = 0): + super(EarlyStopTraining, self).__init__(verbose) + self.stop_train = False + + def _on_step(self) -> bool: + """ + This method is required by the BaseCallback class. We use it only to stop the training. + """ + # Check if we need to stop training + if self.stop_train: + return False # Stop training + return True # Continue training + + def _on_rollout_start(self) -> None: + """ + This method is called before the rollout starts (before collecting new samples). + Checks: + - If all of the last 100 episodes have infidelity below the target and use the same number of steps, stop training. + - Stop training if the maximum number of episodes is reached. + """ + env = self.training_env.envs[0].unwrapped + max_episodes = env.max_episodes + fid_err_targ = env._fid_err_targ + + if len(env.episode_info) >= 100: + last_100_episodes = env.episode_info[-100:] + + min_steps = min(info['steps_used'] for info in last_100_episodes) + steps_condition = all(ep['steps_used'] == min_steps for ep in last_100_episodes) + infid_condition = all(ep['final_infidelity'] <= fid_err_targ for ep in last_100_episodes) + + if steps_condition and infid_condition: + env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid." + #print(f"Stopping training as no episode in the last 100 used fewer steps and infidelity was below target infid.") + self.stop_train = True # Stop training + #print([ep['steps_used'] for ep in last_100_episodes]) + #print([ep['final_infidelity'] for ep in last_100_episodes]) + + # Check max episodes condition + if env.current_episode >= max_episodes: + env._result.message = f"Reached {max_episodes} episodes, stopping training." 
+ #print(f"Reached {max_episodes} episodes, stopping training.") + self.stop_train = True # Stop training diff --git a/tests/test_result.py b/tests/test_result.py index 10fb8bf..f2ff583 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -158,7 +158,7 @@ def sin_z_jax(t, r, **kwargs): # state to state transfer initial = qt.basis(2, 0) -target = qt.basis(2, 1) +target = (qt.basis(2, 0) + qt.basis(2, 1)).unit() # |+⟩ H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()] # control Hamiltonians @@ -173,23 +173,23 @@ def sin_z_jax(t, r, **kwargs): control_parameters = { "p": {"bounds": [(-13, 13)],} }, - tlist=np.linspace(0, 10, 100), # TODO: derive single step duration and max evo time / max num steps from this + tlist=np.linspace(0, 10, 100), algorithm_kwargs={ "fid_err_targ": 0.01, "alg": "RL", - "max_iter": 700, + "max_iter": 70000, }, optimizer_kwargs={} ) # no big difference for unitary evolution -initial = qt.qeye(2) # Identity -target = qt.gates.hadamard_transform() +#initial = qt.qeye(2) # Identity +#target = qt.gates.hadamard_transform() -unitary_rl = state2state_rl._replace( - objectives=[Objective(initial, H, target)], -) +#unitary_rl = state2state_rl._replace( +# objectives=[Objective(initial, H, target)], +#) @pytest.fixture( @@ -199,8 +199,8 @@ def sin_z_jax(t, r, **kwargs): #pytest.param(state2state_param_crab, id="State to state (param. CRAB)"), #pytest.param(state2state_goat, id="State to state (GOAT)"), #pytest.param(state2state_jax, id="State to state (JAX)"), - #pytest.param(state2state_rl, id="State to state (RL)"), - pytest.param(unitary_rl, id="Unitary (RL)"), + pytest.param(state2state_rl, id="State to state (RL)"), + #pytest.param(unitary_rl, id="Unitary (RL)"), ] ) def tst(request): From fadb42e52f9008e2f9846ced4686f77e29c44d04 Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Thu, 22 Aug 2024 17:56:25 +0200 Subject: [PATCH 07/15] update docstring --- src/qutip_qoc/pulse_optim.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py index 573a51d..efc5d84 100644 --- a/src/qutip_qoc/pulse_optim.py +++ b/src/qutip_qoc/pulse_optim.py @@ -25,7 +25,7 @@ def optimize_pulses( integrator_kwargs=None, ): """ - Run GOAT, JOPT, GRAPE or CRAB optimization. + Run GOAT, JOPT, GRAPE, CRAB or RL optimization. Parameters ---------- @@ -37,6 +37,7 @@ def optimize_pulses( Dictionary of options for the control pulse optimization. The keys of this dict must be a unique string identifier for each control Hamiltonian / function. For the GOAT and JOPT algorithms, the dict may optionally also contain the key "__time__". + For RL you don't need to specify the guess. For each control function it must specify: control_id : dict @@ -71,14 +72,15 @@ def optimize_pulses( - alg : str Algorithm to use for the optimization. - Supported are: "GRAPE", "CRAB", "GOAT", "JOPT". + Supported are: "GRAPE", "CRAB", "GOAT", "JOPT" and "RL". - fid_err_targ : float, optional Fidelity error target for the optimization. - max_iter : int, optional Maximum number of iterations to perform. - Referes to local minimizer steps. + Referes to local minimizer steps or in the context of + `alg: "RL"` to the max. number of episodes. Global steps default to 0 (no global optimization). Can be overridden by specifying in minimizer_kwargs. 
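The docstring change above documents the new "RL" option for optimize_pulses. For orientation, here is a minimal sketch of such a call, modelled on the state-to-state RL case in tests/test_result.py from this series; the drift term H_d, the composition of H, the import paths, and the specific parameter values are illustrative assumptions rather than lines taken from the diff:

    import numpy as np
    import qutip as qt
    from qutip_qoc import Objective, optimize_pulses

    # State-to-state transfer |0> -> |1> with three control Hamiltonians,
    # mirroring the RL test case in this patch series.
    initial = qt.basis(2, 0)
    target = qt.basis(2, 1)
    H_d = qt.sigmaz()                              # assumed drift term
    H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()]  # control Hamiltonians
    H = [H_d] + H_c                                # H[0] = drift, H[1:] = controls

    result = optimize_pulses(
        objectives=[Objective(initial, H, target)],
        control_parameters={"p": {"bounds": [(-13, 13)]}},  # RL only uses the bounds, no guess
        tlist=np.linspace(0, 10, 100),   # sets step duration and max. steps per episode
        algorithm_kwargs={
            "alg": "RL",
            "fid_err_targ": 0.01,
            "max_iter": 300,             # with "RL": maximum number of training episodes
        },
    )
    print(result.infidelity, result.optimized_params)

With alg="RL", max_iter is read as the maximum number of training episodes, and tlist determines both the per-step evolution time and the maximum number of steps per episode, as implemented in _rl.py above.
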
From 0c6840581c9dde925efcc283015129f946d8dd66 Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Thu, 22 Aug 2024 18:00:09 +0200 Subject: [PATCH 08/15] update docstring --- src/qutip_qoc/pulse_optim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py index efc5d84..662fb67 100644 --- a/src/qutip_qoc/pulse_optim.py +++ b/src/qutip_qoc/pulse_optim.py @@ -1,7 +1,7 @@ """ This module is the entry point for the optimization of control pulses. It provides the function `optimize_pulses` which prepares and runs the -GOAT, JOPT, GRAPE or CRAB optimization. +GOAT, JOPT, GRAPE, CRAB or RL optimization. """ import numpy as np From e66da0ffd63d65fb9fdd4b0d56071c47440c0d7c Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Sat, 24 Aug 2024 17:32:49 +0200 Subject: [PATCH 09/15] Added shorter_pulses to algorithm_kwargs and changed callback to stop train early, removed update_solver(), use of var_time = True supported, other minor fixes --- src/qutip_qoc/_rl.py | 77 ++++++++++++++++-------------------- src/qutip_qoc/pulse_optim.py | 12 ++++-- tests/test_result.py | 28 ++++++------- 3 files changed, 55 insertions(+), 62 deletions(-) diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index 943bcd3..ce4ae74 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -47,8 +47,6 @@ def __init__( super(_RL,self).__init__() self._Hd_lst, self._Hc_lst = [], [] - if not isinstance(objectives, list): - objectives = [objectives] for objective in objectives: # extract drift and control Hamiltonians from the objective self._Hd_lst.append(objective.H[0]) @@ -61,8 +59,7 @@ def __init__( self._H_lst.append([Hc, lambda t, args: self.pulse(t, self.args, i+1)]) self._H = qt.QobjEvo(self._H_lst, self.args) - self._control_parameters = control_parameters - # extract bounds for _control_parameters + # extract bounds for control_parameters bounds = [] for key in control_parameters.keys(): bounds.append(control_parameters[key].get("bounds")) @@ -70,6 +67,7 @@ def __init__( self.ubound = [b[0][1] for b in bounds] self._alg_kwargs = alg_kwargs + self.shorter_pulses = self._alg_kwargs.get("shorter_pulses", False) # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps self._initial = objectives[0].initial self._target = objectives[0].target @@ -82,7 +80,7 @@ def __init__( start_local_time = time.localtime(), # initial optimization time n_iters = 0, # Number of iterations(episodes) until convergence iter_seconds = [], # list containing the time taken for each iteration(episode) of the optimization - var_time = False, # Whether the optimization was performed with variable time + var_time = True, # Whether the optimization was performed with variable time ) #for the reward @@ -123,12 +121,7 @@ def __init__( self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32) # Continuous action space from -1 to +1, as suggested from gym self.observation_space = spaces.Box(low=-1, high=1, shape=obs_shape, dtype=np.float32) # Observation space - def update_solver(self): - """ - Update the solver and fidelity type based on the problem setup. - Chooses the appropriate solver (Schrödinger or master equation) and - prepares for infidelity calculation. 
- """ + # create the solver if self._Hd_lst[0].issuper: self._fid_type = self._alg_kwargs.get("fid_type", "TRACEDIFF") self._solver = qt.MESolver(H=self._H, options=self._integrator_kwargs) @@ -136,8 +129,6 @@ def update_solver(self): self._fid_type = self._alg_kwargs.get("fid_type", "PSU") self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs) - self.infidelity = self._infid - def pulse(self, t, args, idx): """ Returns the control pulse value at time t for a given index. @@ -153,7 +144,8 @@ def save_episode_info(self): "final_infidelity": self._result.infidelity, "terminated": self.terminated, "truncated": self.truncated, - "steps_used": self.current_step + "steps_used": self.current_step, + "elapsed_time": time.mktime(time.localtime()) } self.episode_info.append(episode_data) @@ -189,10 +181,8 @@ def step(self, action): for i, value in enumerate(alphas): self.args[f"alpha{i+1}"] = value - self._H = qt.QobjEvo(self._H_lst, self.args) - self.update_solver() # _H has changed - infidelity = self.infidelity() + infidelity = self._infid() self.current_step += 1 self.temp_actions.append(alphas) @@ -220,7 +210,7 @@ def reset(self, seed=None): """ self.save_episode_info() - time_diff = time.mktime(time.localtime()) - time.mktime(self._result.start_local_time) + time_diff = self.episode_info[-1]["elapsed_time"] - (self.episode_info[-2]["elapsed_time"] if len(self.episode_info) > 1 else time.mktime(self._result.start_local_time)) self._result.iter_seconds.append(time_diff) self.current_step = 0 # Reset the step counter self.current_episode += 1 # Increment episode counter @@ -238,7 +228,7 @@ def result(self): """ self._result.end_local_time = time.localtime() self._result.n_iters = len(self._result.iter_seconds) - self._result.optimized_params = self.actions.copy() + self._result.optimized_params = self.actions.copy() + [self._result.total_seconds] # If var_time is True, the last parameter is the evolution time self._result._optimized_controls = self.actions.copy() self._result._final_states = (self._result._final_states if self._result._final_states is not None else []) + [self.state] self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time) # Convert to a string @@ -274,11 +264,21 @@ def __init__(self, verbose: int = 0): def _on_step(self) -> bool: """ - This method is required by the BaseCallback class. We use it only to stop the training. + This method is required by the BaseCallback class. We use it to stop the training. + - Stop training if the maximum number of episodes is reached. + - Stop training if it finds an episode with infidelity <= than target infidelity """ + env = self.training_env.envs[0].unwrapped + # Check if we need to stop training if self.stop_train: return False # Stop training + elif env.current_episode >= env.max_episodes: + env._result.message = f"Reached {env.max_episodes} episodes, stopping training." + return False # Stop training + elif (env._result.infidelity <= env._fid_err_targ) and not(env.shorter_pulses): + env._result.message = f"Stop training because an episode with infidelity <= target infidelity was found" + return False # Stop training return True # Continue training def _on_rollout_start(self) -> None: @@ -286,28 +286,19 @@ def _on_rollout_start(self) -> None: This method is called before the rollout starts (before collecting new samples). Checks: - If all of the last 100 episodes have infidelity below the target and use the same number of steps, stop training. 
- - Stop training if the maximum number of episodes is reached. """ + #could be moved to on_step + env = self.training_env.envs[0].unwrapped - max_episodes = env.max_episodes - fid_err_targ = env._fid_err_targ - - if len(env.episode_info) >= 100: - last_100_episodes = env.episode_info[-100:] - - min_steps = min(info['steps_used'] for info in last_100_episodes) - steps_condition = all(ep['steps_used'] == min_steps for ep in last_100_episodes) - infid_condition = all(ep['final_infidelity'] <= fid_err_targ for ep in last_100_episodes) - - if steps_condition and infid_condition: - env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid." - #print(f"Stopping training as no episode in the last 100 used fewer steps and infidelity was below target infid.") - self.stop_train = True # Stop training - #print([ep['steps_used'] for ep in last_100_episodes]) - #print([ep['final_infidelity'] for ep in last_100_episodes]) - - # Check max episodes condition - if env.current_episode >= max_episodes: - env._result.message = f"Reached {max_episodes} episodes, stopping training." - #print(f"Reached {max_episodes} episodes, stopping training.") - self.stop_train = True # Stop training + #Only if specified in alg_kwargs, the algorithm will search for shorter pulses, resulting in episodes with fewer steps. + if env.shorter_pulses: + if len(env.episode_info) >= 100: + last_100_episodes = env.episode_info[-100:] + + min_steps = min(info['steps_used'] for info in last_100_episodes) + steps_condition = all(ep['steps_used'] == min_steps for ep in last_100_episodes) + infid_condition = all(ep['final_infidelity'] <= env._fid_err_targ for ep in last_100_episodes) + + if steps_condition and infid_condition: + env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid." + self.stop_train = True # Stop training \ No newline at end of file diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py index 662fb67..6964b9a 100644 --- a/src/qutip_qoc/pulse_optim.py +++ b/src/qutip_qoc/pulse_optim.py @@ -37,7 +37,6 @@ def optimize_pulses( Dictionary of options for the control pulse optimization. The keys of this dict must be a unique string identifier for each control Hamiltonian / function. For the GOAT and JOPT algorithms, the dict may optionally also contain the key "__time__". - For RL you don't need to specify the guess. For each control function it must specify: control_id : dict @@ -46,6 +45,7 @@ def optimize_pulses( where ``n`` is the number of independent variables. - bounds : sequence, optional + For RL you don't need to specify the guess. Sequence of ``(min, max)`` pairs for each element in `guess`. None is used to specify no bound. @@ -84,6 +84,11 @@ def optimize_pulses( Global steps default to 0 (no global optimization). Can be overridden by specifying in minimizer_kwargs. + - shorter_pulses : bool, optional + If set to True, allows the algorithm to search for shorter control + pulses that can achieve the desired fidelity target using fewer steps. + By default, it is set to False, only attempting to reach the target infidelity. + Algorithm specific keywords for GRAPE,CRAB can be found in :func:`qutip_qtrl.pulseoptim.optimize_pulse`. 
@@ -154,7 +159,7 @@ def optimize_pulses( # extract guess and bounds for the control pulses x0, bounds = [], [] for key in control_parameters.keys(): - x0.append(control_parameters[key].get("guess")) # TODO: for now only consider bounds + x0.append(control_parameters[key].get("guess")) bounds.append(control_parameters[key].get("bounds")) try: # GRAPE, CRAB format lbound = [b[0][0] for b in bounds] @@ -351,8 +356,7 @@ def optimize_pulses( qtrl_optimizers.append(qtrl_optimizer) - # TODO: we can deal with proper handling later - if alg == "RL": + elif alg == "RL": rl_env = _RL( objectives, control_parameters, diff --git a/tests/test_result.py b/tests/test_result.py index f2ff583..1ffa3df 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -153,8 +153,6 @@ def sin_z_jax(t, r, **kwargs): ) # ----------------------- RL -------------------- -# TODO: this is the input for optimiz_pulses() function -# you can use this routine to test your implementation # state to state transfer initial = qt.basis(2, 0) @@ -169,7 +167,6 @@ def sin_z_jax(t, r, **kwargs): state2state_rl = Case( objectives=[Objective(initial, H, target)], - #control_parameters={"bounds": [-13, 13]}, # TODO: for now only consider bounds control_parameters = { "p": {"bounds": [(-13, 13)],} }, @@ -177,30 +174,31 @@ def sin_z_jax(t, r, **kwargs): algorithm_kwargs={ "fid_err_targ": 0.01, "alg": "RL", - "max_iter": 70000, + "max_iter": 300, + "shorter_pulses": True, }, optimizer_kwargs={} ) # no big difference for unitary evolution -#initial = qt.qeye(2) # Identity -#target = qt.gates.hadamard_transform() +initial = qt.qeye(2) # Identity +target = qt.gates.hadamard_transform() -#unitary_rl = state2state_rl._replace( -# objectives=[Objective(initial, H, target)], -#) +unitary_rl = state2state_rl._replace( + objectives=[Objective(initial, H, target)], +) @pytest.fixture( params=[ - #pytest.param(state2state_grape, id="State to state (GRAPE)"), - #pytest.param(state2state_crab, id="State to state (CRAB)"), - #pytest.param(state2state_param_crab, id="State to state (param. CRAB)"), - #pytest.param(state2state_goat, id="State to state (GOAT)"), - #pytest.param(state2state_jax, id="State to state (JAX)"), + pytest.param(state2state_grape, id="State to state (GRAPE)"), + pytest.param(state2state_crab, id="State to state (CRAB)"), + pytest.param(state2state_param_crab, id="State to state (param. 
CRAB)"), + pytest.param(state2state_goat, id="State to state (GOAT)"), + pytest.param(state2state_jax, id="State to state (JAX)"), pytest.param(state2state_rl, id="State to state (RL)"), - #pytest.param(unitary_rl, id="Unitary (RL)"), + pytest.param(unitary_rl, id="Unitary (RL)"), ] ) def tst(request): From 6cee77c3823e63ad8039710de6f1f9d1f66d2c03 Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Sat, 24 Aug 2024 19:07:37 +0200 Subject: [PATCH 10/15] Fixed final state saving --- src/qutip_qoc/_rl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index ce4ae74..ede6d5e 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -218,6 +218,7 @@ def reset(self, seed=None): self.terminated = False self.truncated = False self.temp_actions = [] + self._result._final_states = [self.state] self.state = self._initial return self._get_obs(), {} @@ -230,7 +231,6 @@ def result(self): self._result.n_iters = len(self._result.iter_seconds) self._result.optimized_params = self.actions.copy() + [self._result.total_seconds] # If var_time is True, the last parameter is the evolution time self._result._optimized_controls = self.actions.copy() - self._result._final_states = (self._result._final_states if self._result._final_states is not None else []) + [self.state] self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time) # Convert to a string self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time) # Convert to a string self._result._guess_controls = [] From 318081f1d34ac17e484622cce117290756a57f9b Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Mon, 26 Aug 2024 10:28:31 +0200 Subject: [PATCH 11/15] edit configuration files --- .pre-commit-config.yaml | 2 ++ requirements.txt | 2 ++ setup.cfg | 2 ++ 3 files changed, 6 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0bcf23a..0c027ee 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -92,3 +92,5 @@ repos: - jaxlib - diffrax - pytest + - gymnasium + - stable-baselines3 diff --git a/requirements.txt b/requirements.txt index a08565e..9687463 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ qutip>=5.0.1 qutip-qtrl qutip-jax pre-commit +gymnasium>=0.29.1 +stable-baselines3>=2.3.2 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 3903195..7872023 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,6 +35,8 @@ install_requires = qutip-qtrl qutip-jax numpy>=1.16.6,<2.0 + gymnasium>=0.29.1 + stable-baselines3>=2.3.2 setup_requires = cython>=1.0 packaging From 4485f9d42685364962ac4b436d4810200e2e1a26 Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Sat, 14 Sep 2024 20:17:11 +0200 Subject: [PATCH 12/15] Docstring fixes, use of __time__ parameter instead of shorter_pulses, added underscore for internal variables, args is now passed as a parameter to _infid(), changes in callback function --- src/qutip_qoc/_rl.py | 126 +++++++++++++++++------------------ src/qutip_qoc/pulse_optim.py | 10 +-- tests/test_result.py | 9 ++- 3 files changed, 69 insertions(+), 76 deletions(-) diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index ede6d5e..66b8474 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -52,27 +52,34 @@ def __init__( self._Hd_lst.append(objective.H[0]) self._Hc_lst.append([H[0] if isinstance(H, list) else H for H in objective.H[1:]]) - # create the QobjEvo with Hd, Hc and controls(args) - self.args = {f"alpha{i+1}": (1) for i 
in range(len(self._Hc_lst[0]))} # set the control parameters to 1 for all the Hc + def create_pulse_func(idx): + """ + Create a control pulse lambda function for a given index. + """ + return lambda t, args: self._pulse(t, args, idx+1) + + # create the QobjEvo with Hd, Hc and controls(args) self._H_lst = [self._Hd_lst[0]] + dummy_args = {f'alpha{i+1}': 1.0 for i in range(len(self._Hc_lst[0]))} for i, Hc in enumerate(self._Hc_lst[0]): - self._H_lst.append([Hc, lambda t, args: self.pulse(t, self.args, i+1)]) - self._H = qt.QobjEvo(self._H_lst, self.args) + self._H_lst.append([Hc, create_pulse_func(i)]) + self._H = qt.QobjEvo(self._H_lst, args=dummy_args) + + self.shorter_pulses = False if time_options == {} else True # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps # extract bounds for control_parameters bounds = [] for key in control_parameters.keys(): bounds.append(control_parameters[key].get("bounds")) - self.lbound = [b[0][0] for b in bounds] - self.ubound = [b[0][1] for b in bounds] + self._lbound = [b[0][0] for b in bounds] + self._ubound = [b[0][1] for b in bounds] self._alg_kwargs = alg_kwargs - self.shorter_pulses = self._alg_kwargs.get("shorter_pulses", False) # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps self._initial = objectives[0].initial self._target = objectives[0].target - self.state = None - self.dim = self._initial.shape[0] + self._state = None + self._dim = self._initial.shape[0] self._result = Result( objectives = objectives, @@ -87,19 +94,19 @@ def __init__( self._step_penalty = 1 # To check if it exceeds the maximum number of steps in an episode - self.current_step = 0 + self._current_step = 0 self.terminated = False self.truncated = False - self.episode_info = [] # to contain some information from the latest episode + self._episode_info = [] # to contain some information from the latest episode self._fid_err_targ = alg_kwargs["fid_err_targ"] # inferred attributes self._norm_fac = 1 / self._target.norm() - self.temp_actions = [] # temporary list to save episode actions - self.actions = [] # list of actions(lists) of the last episode + self._temp_actions = [] # temporary list to save episode actions + self._actions = [] # list of actions(lists) of the last episode # integrator options self._integrator_kwargs = integrator_kwargs @@ -108,16 +115,16 @@ def __init__( self.max_episode_time = time_interval.evo_time # maximum time for an episode self.max_steps = time_interval.n_tslots # maximum number of steps in an episode - self.step_duration = time_interval.tslots[-1] / time_interval.n_tslots # step duration for mesvole + self._step_duration = time_interval.tslots[-1] / time_interval.n_tslots # step duration for mesvole self.max_episodes = alg_kwargs["max_iter"] # maximum number of episodes for training - self.total_timesteps = self.max_episodes * self.max_steps # for learn() of gym + self._total_timesteps = self.max_episodes * self.max_steps # for learn() of gym self.current_episode = 0 # To keep track of the current episode # Define action and observation spaces (Gym) if self._initial.isket: - obs_shape = (2 * self.dim,) + obs_shape = (2 * self._dim,) else: # for unitary operators - obs_shape = (2 * self.dim * self.dim,) + obs_shape = (2 * self._dim * self._dim,) self.action_space = spaces.Box(low=-1, high=1, shape=(len(self._Hc_lst[0]),), dtype=np.float32) # Continuous action space from -1 to +1, as suggested from gym self.observation_space = spaces.Box(low=-1, 
high=1, shape=obs_shape, dtype=np.float32) # Observation space @@ -129,13 +136,14 @@ def __init__( self._fid_type = self._alg_kwargs.get("fid_type", "PSU") self._solver = qt.SESolver(H=self._H, options=self._integrator_kwargs) - def pulse(self, t, args, idx): + def _pulse(self, t, args, idx): """ Returns the control pulse value at time t for a given index. """ - return 1*args[f"alpha{idx}"] + alpha = args[f"alpha{idx}"] + return alpha - def save_episode_info(self): + def _save_episode_info(self): """ Save the information of the last episode before resetting the environment. """ @@ -144,19 +152,19 @@ def save_episode_info(self): "final_infidelity": self._result.infidelity, "terminated": self.terminated, "truncated": self.truncated, - "steps_used": self.current_step, + "steps_used": self._current_step, "elapsed_time": time.mktime(time.localtime()) } - self.episode_info.append(episode_data) + self._episode_info.append(episode_data) - def _infid(self, params=None): + def _infid(self, args): """ The agent performs a step, then calculate infidelity to be minimized of the current state against the target state. """ X = self._solver.run( - self.state, [0.0, self.step_duration], args={"p": params} + self._state, [0.0, self._step_duration], args=args ).final_state - self.state = X + self._state = X if self._fid_type == "TRACEDIFF": diff = X - self._target @@ -177,20 +185,18 @@ def step(self, action): Perform a single time step in the environment, applying the scaled action (control pulse) chosen by the RL agent. Updates the system's state and computes the reward. """ - alphas = [((action[i] + 1) / 2 * (self.ubound[0] - self.lbound[0])) + self.lbound[0] for i in range(len(action))] - - for i, value in enumerate(alphas): - self.args[f"alpha{i+1}"] = value + alphas = [((action[i] + 1) / 2 * (self._ubound[0] - self._lbound[0])) + self._lbound[0] for i in range(len(action))] - infidelity = self._infid() + args = {f"alpha{i+1}": value for i, value in enumerate(alphas)} + _infidelity = self._infid(args) - self.current_step += 1 - self.temp_actions.append(alphas) - self._result.infidelity = infidelity - reward = (1 - infidelity) - self._step_penalty + self._current_step += 1 + self._temp_actions.append(alphas) + self._result.infidelity = _infidelity + reward = (1 - _infidelity) - self._step_penalty - self.terminated = infidelity <= self._fid_err_targ # the episode ended reaching the goal - self.truncated = self.current_step >= self.max_steps # if the episode ended without reaching the goal + self.terminated = _infidelity <= self._fid_err_targ # the episode ended reaching the goal + self.truncated = self._current_step >= self.max_steps # if the episode ended without reaching the goal observation = self._get_obs() return observation, reward, bool(self.terminated), bool(self.truncated), {} @@ -200,7 +206,7 @@ def _get_obs(self): Get the current state observation for the RL agent. Converts the system's quantum state or matrix into a real-valued NumPy array suitable for RL algorithms. """ - rho = self.state.full().flatten() + rho = self._state.full().flatten() obs = np.concatenate((np.real(rho), np.imag(rho))) return obs.astype(np.float32) # Gymnasium expects the observation to be of type float32 @@ -208,18 +214,18 @@ def reset(self, seed=None): """ Reset the environment to the initial state, preparing for a new episode. 
""" - self.save_episode_info() + self._save_episode_info() - time_diff = self.episode_info[-1]["elapsed_time"] - (self.episode_info[-2]["elapsed_time"] if len(self.episode_info) > 1 else time.mktime(self._result.start_local_time)) + time_diff = self._episode_info[-1]["elapsed_time"] - (self._episode_info[-2]["elapsed_time"] if len(self._episode_info) > 1 else time.mktime(self._result.start_local_time)) self._result.iter_seconds.append(time_diff) - self.current_step = 0 # Reset the step counter + self._current_step = 0 # Reset the step counter self.current_episode += 1 # Increment episode counter - self.actions = self.temp_actions.copy() + self._actions = self._temp_actions.copy() self.terminated = False self.truncated = False - self.temp_actions = [] - self._result._final_states = [self.state] - self.state = self._initial + self._temp_actions = [] + self._result._final_states = [self._state] + self._state = self._initial return self._get_obs(), {} def result(self): @@ -229,8 +235,8 @@ def result(self): """ self._result.end_local_time = time.localtime() self._result.n_iters = len(self._result.iter_seconds) - self._result.optimized_params = self.actions.copy() + [self._result.total_seconds] # If var_time is True, the last parameter is the evolution time - self._result._optimized_controls = self.actions.copy() + self._result.optimized_params = self._actions.copy() + [self._result.total_seconds] # If var_time is True, the last parameter is the evolution time + self._result._optimized_controls = self._actions.copy() self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time) # Convert to a string self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time) # Convert to a string self._result._guess_controls = [] @@ -252,7 +258,7 @@ def train(self): stop_callback = EarlyStopTraining(verbose=1) # Train the model - model.learn(total_timesteps = self.total_timesteps, callback=stop_callback) + model.learn(total_timesteps = self._total_timesteps, callback=stop_callback) class EarlyStopTraining(BaseCallback): """ @@ -267,33 +273,20 @@ def _on_step(self) -> bool: This method is required by the BaseCallback class. We use it to stop the training. - Stop training if the maximum number of episodes is reached. - Stop training if it finds an episode with infidelity <= than target infidelity + - If all of the last 100 episodes have infidelity below the target and use the same number of steps, stop training. """ env = self.training_env.envs[0].unwrapped # Check if we need to stop training - if self.stop_train: - return False # Stop training - elif env.current_episode >= env.max_episodes: + if env.current_episode >= env.max_episodes: env._result.message = f"Reached {env.max_episodes} episodes, stopping training." return False # Stop training elif (env._result.infidelity <= env._fid_err_targ) and not(env.shorter_pulses): env._result.message = f"Stop training because an episode with infidelity <= target infidelity was found" return False # Stop training - return True # Continue training - - def _on_rollout_start(self) -> None: - """ - This method is called before the rollout starts (before collecting new samples). - Checks: - - If all of the last 100 episodes have infidelity below the target and use the same number of steps, stop training. - """ - #could be moved to on_step - - env = self.training_env.envs[0].unwrapped - #Only if specified in alg_kwargs, the algorithm will search for shorter pulses, resulting in episodes with fewer steps. 
- if env.shorter_pulses: - if len(env.episode_info) >= 100: - last_100_episodes = env.episode_info[-100:] + elif env.shorter_pulses: + if len(env._episode_info) >= 100: + last_100_episodes = env._episode_info[-100:] min_steps = min(info['steps_used'] for info in last_100_episodes) steps_condition = all(ep['steps_used'] == min_steps for ep in last_100_episodes) @@ -301,4 +294,5 @@ def _on_rollout_start(self) -> None: if steps_condition and infid_condition: env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid." - self.stop_train = True # Stop training \ No newline at end of file + return False # Stop training + return True # Continue training \ No newline at end of file diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py index 6964b9a..8a1f4dc 100644 --- a/src/qutip_qoc/pulse_optim.py +++ b/src/qutip_qoc/pulse_optim.py @@ -41,16 +41,17 @@ def optimize_pulses( control_id : dict - guess: ndarray, shape (n,) + For RL you don't need to specify the guess. Initial guess. Array of real elements of size (n,), where ``n`` is the number of independent variables. - bounds : sequence, optional - For RL you don't need to specify the guess. Sequence of ``(min, max)`` pairs for each element in `guess`. None is used to specify no bound. __time__ : dict, optional - Only supported by GOAT and JOPT. + Only supported by GOAT, JOPT and RL. + For RL the values of guess and bounds are not relevant. If given the pulse duration is treated as optimization parameter. It must specify both: @@ -84,11 +85,6 @@ def optimize_pulses( Global steps default to 0 (no global optimization). Can be overridden by specifying in minimizer_kwargs. - - shorter_pulses : bool, optional - If set to True, allows the algorithm to search for shorter control - pulses that can achieve the desired fidelity target using fewer steps. - By default, it is set to False, only attempting to reach the target infidelity. - Algorithm specific keywords for GRAPE,CRAB can be found in :func:`qutip_qtrl.pulseoptim.optimize_pulse`. 
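For reference, the RL-related options documented above can be exercised with a minimal state-to-state call along the lines of the `state2state_rl` test case that follows. This is only a sketch, not the definitive API: it assumes the top-level `optimize_pulses` and `Objective` imports exposed by `qutip_qoc`, and the `__time__` entry carries dummy guess/bounds values because the RL path does not use them.

    import numpy as np
    import qutip as qt
    from qutip_qoc import Objective, optimize_pulses

    # qubit state-to-state transfer: |0> -> |+>
    initial = qt.basis(2, 0)
    target = (qt.basis(2, 0) + qt.basis(2, 1)).unit()
    H_d = 0.5 * (0.1 * qt.sigmaz() + 1.0 * qt.sigmax())   # drift Hamiltonian
    H = [H_d, qt.sigmax(), qt.sigmay(), qt.sigmaz()]       # drift + control Hamiltonians

    result = optimize_pulses(
        objectives=[Objective(initial, H, target)],
        control_parameters={
            "p": {"bounds": [(-13, 13)]},
            # dummy values: RL ignores the __time__ guess and bounds
            "__time__": {"guess": np.array([0.0]), "bounds": [(0.0, 0.0)]},
        },
        tlist=np.linspace(0, 10, 100),
        algorithm_kwargs={"alg": "RL", "fid_err_targ": 0.01, "max_iter": 300},
    )
    print(result.message, result.infidelity)

Note that a later commit in this series replaces the RL use of `__time__` with the `shorter_pulses` entry in `algorithm_kwargs`; the rest of the call stays the same.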
diff --git a/tests/test_result.py b/tests/test_result.py index 1ffa3df..b74a211 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -168,14 +168,17 @@ def sin_z_jax(t, r, **kwargs): state2state_rl = Case( objectives=[Objective(initial, H, target)], control_parameters = { - "p": {"bounds": [(-13, 13)],} + "p": {"bounds": [(-13, 13)]}, + "__time__": { + "guess": np.array([0.0]), #dummy value + "bounds": [(0.0, 0.0)] #dummy value + } }, tlist=np.linspace(0, 10, 100), algorithm_kwargs={ "fid_err_targ": 0.01, "alg": "RL", - "max_iter": 300, - "shorter_pulses": True, + "max_iter": 20000, }, optimizer_kwargs={} ) From 3889a63d688e11a67ce69b382a1049725e79972f Mon Sep 17 00:00:00 2001 From: LegionAtol Date: Mon, 30 Sep 2024 17:17:00 +0200 Subject: [PATCH 13/15] added _backup_result to keep track of the result even if the algorithm continues to search for solutions with shorter pulses --- src/qutip_qoc/_rl.py | 64 ++++++++++++++++++++++++++++++++++---------- tests/test_result.py | 12 +++++++++ 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index 66b8474..3bdb1fc 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -88,7 +88,19 @@ def create_pulse_func(idx): n_iters = 0, # Number of iterations(episodes) until convergence iter_seconds = [], # list containing the time taken for each iteration(episode) of the optimization var_time = True, # Whether the optimization was performed with variable time + guess_params=[] ) + + self._backup_result = Result( # used as a backup in case the algorithm with shorter_pulses does not find an episode with infid bool: """ @@ -279,12 +308,18 @@ def _on_step(self) -> bool: # Check if we need to stop training if env.current_episode >= env.max_episodes: - env._result.message = f"Reached {env.max_episodes} episodes, stopping training." + if env._use_backup_result == True: + env._backup_result.message = f"Reached {env.max_episodes} episodes, stopping training. Return the last founded episode with infid < target_infid" + else: + env._result.message = f"Reached {env.max_episodes} episodes, stopping training." return False # Stop training elif (env._result.infidelity <= env._fid_err_targ) and not(env.shorter_pulses): env._result.message = f"Stop training because an episode with infidelity <= target infidelity was found" return False # Stop training elif env.shorter_pulses: + if(env._result.infidelity <= env._fid_err_targ): # if it finds an episode with infidelity lower than target infidelity, I'll save it in the meantime + env._use_backup_result = True + env._save_result() if len(env._episode_info) >= 100: last_100_episodes = env._episode_info[-100:] @@ -293,6 +328,7 @@ def _on_step(self) -> bool: infid_condition = all(ep['final_infidelity'] <= env._fid_err_targ for ep in last_100_episodes) if steps_condition and infid_condition: + env._use_backup_result = False env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid." 
return False # Stop training return True # Continue training \ No newline at end of file diff --git a/tests/test_result.py b/tests/test_result.py index b74a211..9bb5496 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -190,6 +190,18 @@ def sin_z_jax(t, r, **kwargs): unitary_rl = state2state_rl._replace( objectives=[Objective(initial, H, target)], + control_parameters = { + "p": {"bounds": [(-13, 13)]}, + "__time__": { + "guess": np.array([0.0]), #dummy value + "bounds": [(0.0, 0.0)] #dummy value + } + }, + algorithm_kwargs={ + "fid_err_targ": 0.01, + "alg": "RL", + "max_iter": 300, + }, ) From 8fe527ba5affa1a544546fb526cb91560b7ed418 Mon Sep 17 00:00:00 2001 From: flowerthrower Date: Wed, 2 Oct 2024 11:25:51 +0200 Subject: [PATCH 14/15] back to 'shorter_pulses' --- src/qutip_qoc/_rl.py | 2 +- src/qutip_qoc/pulse_optim.py | 3 +-- tests/test_result.py | 10 ++-------- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index 3bdb1fc..cdcaa01 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -65,7 +65,7 @@ def create_pulse_func(idx): self._H_lst.append([Hc, create_pulse_func(i)]) self._H = qt.QobjEvo(self._H_lst, args=dummy_args) - self.shorter_pulses = False if time_options == {} else True # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps + self.shorter_pulses = alg_kwargs.get("shorter_pulses", False) # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps # extract bounds for control_parameters bounds = [] diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py index 8a1f4dc..d320799 100644 --- a/src/qutip_qoc/pulse_optim.py +++ b/src/qutip_qoc/pulse_optim.py @@ -50,8 +50,7 @@ def optimize_pulses( `guess`. None is used to specify no bound. __time__ : dict, optional - Only supported by GOAT, JOPT and RL. - For RL the values of guess and bounds are not relevant. + Only supported by GOAT, JOPT (for RL use `algorithm_kwargs: 'shorter_pulses'`). If given the pulse duration is treated as optimization parameter. 
It must specify both: diff --git a/tests/test_result.py b/tests/test_result.py index 9bb5496..7b9cda3 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -169,16 +169,13 @@ def sin_z_jax(t, r, **kwargs): objectives=[Objective(initial, H, target)], control_parameters = { "p": {"bounds": [(-13, 13)]}, - "__time__": { - "guess": np.array([0.0]), #dummy value - "bounds": [(0.0, 0.0)] #dummy value - } }, tlist=np.linspace(0, 10, 100), algorithm_kwargs={ "fid_err_targ": 0.01, "alg": "RL", "max_iter": 20000, + "shorter_pulses": True, }, optimizer_kwargs={} ) @@ -192,15 +189,12 @@ def sin_z_jax(t, r, **kwargs): objectives=[Objective(initial, H, target)], control_parameters = { "p": {"bounds": [(-13, 13)]}, - "__time__": { - "guess": np.array([0.0]), #dummy value - "bounds": [(0.0, 0.0)] #dummy value - } }, algorithm_kwargs={ "fid_err_targ": 0.01, "alg": "RL", "max_iter": 300, + "shorter_pulses": True, }, ) From 604b12e2f827d82f8823cc4c141cd0e247ea27d1 Mon Sep 17 00:00:00 2001 From: flowerthrower Date: Wed, 2 Oct 2024 11:37:49 +0200 Subject: [PATCH 15/15] pre-commit formating --- requirements.txt | 2 +- src/qutip_qoc/_rl.py | 227 +++++++++++++++++++++-------------- src/qutip_qoc/pulse_optim.py | 2 +- tests/test_result.py | 18 +-- 4 files changed, 150 insertions(+), 99 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9687463..50da21d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,4 @@ qutip-qtrl qutip-jax pre-commit gymnasium>=0.29.1 -stable-baselines3>=2.3.2 \ No newline at end of file +stable-baselines3>=2.3.2 diff --git a/src/qutip_qoc/_rl.py b/src/qutip_qoc/_rl.py index cdcaa01..9a75a55 100644 --- a/src/qutip_qoc/_rl.py +++ b/src/qutip_qoc/_rl.py @@ -1,10 +1,10 @@ """ -This module contains functions that implement quantum optimal control -using reinforcement learning (RL) techniques, allowing for the optimization +This module contains functions that implement quantum optimal control +using reinforcement learning (RL) techniques, allowing for the optimization of control pulse sequences in quantum systems. """ import qutip as qt -from qutip import Qobj, QobjEvo +from qutip import Qobj from qutip_qoc import Result import numpy as np @@ -17,12 +17,13 @@ import time + class _RL(gym.Env): """ - Class for storing a control problem and implementing quantum optimal - control using reinforcement learning. This class defines a custom - Gym environment that models the dynamics of quantum systems - under various control pulses, and uses RL algorithms to optimize the + Class for storing a control problem and implementing quantum optimal + control using reinforcement learning. This class defines a custom + Gym environment that models the dynamics of quantum systems + under various control pulses, and uses RL algorithms to optimize the parameters of these pulses. """ @@ -39,33 +40,37 @@ def __init__( qtrl_optimizers, ): """ - Initialize the reinforcement learning environment for quantum - optimal control. Sets up the system Hamiltonian, control parameters, + Initialize the reinforcement learning environment for quantum + optimal control. Sets up the system Hamiltonian, control parameters, and defines the observation and action spaces for the RL agent. 
""" - super(_RL,self).__init__() - + super(_RL, self).__init__() + self._Hd_lst, self._Hc_lst = [], [] for objective in objectives: # extract drift and control Hamiltonians from the objective self._Hd_lst.append(objective.H[0]) - self._Hc_lst.append([H[0] if isinstance(H, list) else H for H in objective.H[1:]]) + self._Hc_lst.append( + [H[0] if isinstance(H, list) else H for H in objective.H[1:]] + ) def create_pulse_func(idx): """ Create a control pulse lambda function for a given index. """ - return lambda t, args: self._pulse(t, args, idx+1) - - # create the QobjEvo with Hd, Hc and controls(args) + return lambda t, args: self._pulse(t, args, idx + 1) + + # create the QobjEvo with Hd, Hc and controls(args) self._H_lst = [self._Hd_lst[0]] - dummy_args = {f'alpha{i+1}': 1.0 for i in range(len(self._Hc_lst[0]))} + dummy_args = {f"alpha{i+1}": 1.0 for i in range(len(self._Hc_lst[0]))} for i, Hc in enumerate(self._Hc_lst[0]): self._H_lst.append([Hc, create_pulse_func(i)]) self._H = qt.QobjEvo(self._H_lst, args=dummy_args) - self.shorter_pulses = alg_kwargs.get("shorter_pulses", False) # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps + self.shorter_pulses = alg_kwargs.get( + "shorter_pulses", False + ) # lengthen the training to look for pulses of shorter duration, therefore episodes with fewer steps # extract bounds for control_parameters bounds = [] @@ -82,27 +87,29 @@ def create_pulse_func(idx): self._dim = self._initial.shape[0] self._result = Result( - objectives = objectives, - time_interval = time_interval, - start_local_time = time.localtime(), # initial optimization time - n_iters = 0, # Number of iterations(episodes) until convergence - iter_seconds = [], # list containing the time taken for each iteration(episode) of the optimization - var_time = True, # Whether the optimization was performed with variable time - guess_params=[] + objectives=objectives, + time_interval=time_interval, + start_local_time=time.localtime(), # initial optimization time + n_iters=0, # Number of iterations(episodes) until convergence + iter_seconds=[], # list containing the time taken for each iteration(episode) of the optimization + var_time=True, # Whether the optimization was performed with variable time + guess_params=[], + ) + + self._backup_result = Result( # used as a backup in case the algorithm with shorter_pulses does not find an episode with infid= self.max_steps # if the episode ended without reaching the goal + self.terminated = ( + _infidelity <= self._fid_err_targ + ) # the episode ended reaching the goal + self.truncated = ( + self._current_step >= self.max_steps + ) # if the episode ended without reaching the goal observation = self._get_obs() return observation, reward, bool(self.terminated), bool(self.truncated), {} def _get_obs(self): """ - Get the current state observation for the RL agent. Converts the system's + Get the current state observation for the RL agent. Converts the system's quantum state or matrix into a real-valued NumPy array suitable for RL algorithms. """ rho = self._state.full().flatten() obs = np.concatenate((np.real(rho), np.imag(rho))) - return obs.astype(np.float32) # Gymnasium expects the observation to be of type float32 - + return obs.astype( + np.float32 + ) # Gymnasium expects the observation to be of type float32 + def reset(self, seed=None): """ Reset the environment to the initial state, preparing for a new episode. 
""" self._save_episode_info() - time_diff = self._episode_info[-1]["elapsed_time"] - (self._episode_info[-2]["elapsed_time"] if len(self._episode_info) > 1 else time.mktime(self._result.start_local_time)) + time_diff = self._episode_info[-1]["elapsed_time"] - ( + self._episode_info[-2]["elapsed_time"] + if len(self._episode_info) > 1 + else time.mktime(self._result.start_local_time) + ) self._result.iter_seconds.append(time_diff) - self._current_step = 0 # Reset the step counter - self.current_episode += 1 # Increment episode counter + self._current_step = 0 # Reset the step counter + self.current_episode += 1 # Increment episode counter self._actions = self._temp_actions.copy() self.terminated = False self.truncated = False @@ -239,61 +268,74 @@ def reset(self, seed=None): self._result._final_states = [self._state] self._state = self._initial return self._get_obs(), {} - + def _save_result(self): """ - Save the results of the optimization process, including the optimized + Save the results of the optimization process, including the optimized pulse sequences, final states, and performance metrics. """ result_obj = self._backup_result if self._use_backup_result else self._result - if(self._use_backup_result): + if self._use_backup_result: self._backup_result.iter_seconds = self._result.iter_seconds.copy() self._backup_result._final_states = self._result._final_states.copy() self._backup_result.infidelity = self._result.infidelity result_obj.end_local_time = time.localtime() - result_obj.n_iters = len(self._result.iter_seconds) - result_obj.optimized_params = self._actions.copy() + [self._result.total_seconds] # If var_time is True, the last parameter is the evolution time + result_obj.n_iters = len(self._result.iter_seconds) + result_obj.optimized_params = self._actions.copy() + [ + self._result.total_seconds + ] # If var_time is True, the last parameter is the evolution time result_obj._optimized_controls = self._actions.copy() result_obj._guess_controls = [] result_obj._optimized_H = [self._H] - def result(self): """ Final conversions and return of optimization results - """ + """ if self._use_backup_result: - self._backup_result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._backup_result.start_local_time) # Convert to a string - self._backup_result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._backup_result.end_local_time) # Convert to a string + self._backup_result.start_local_time = time.strftime( + "%Y-%m-%d %H:%M:%S", self._backup_result.start_local_time + ) # Convert to a string + self._backup_result.end_local_time = time.strftime( + "%Y-%m-%d %H:%M:%S", self._backup_result.end_local_time + ) # Convert to a string return self._backup_result else: self._save_result() - self._result.start_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.start_local_time) # Convert to a string - self._result.end_local_time = time.strftime("%Y-%m-%d %H:%M:%S", self._result.end_local_time) # Convert to a string + self._result.start_local_time = time.strftime( + "%Y-%m-%d %H:%M:%S", self._result.start_local_time + ) # Convert to a string + self._result.end_local_time = time.strftime( + "%Y-%m-%d %H:%M:%S", self._result.end_local_time + ) # Convert to a string return self._result def train(self): """ - Train the RL agent on the defined quantum control problem using the specified + Train the RL agent on the defined quantum control problem using the specified reinforcement learning algorithm. Checks environment compatibility with Gym API. 
""" # Check if the environment follows Gym API check_env(self, warn=True) # Create the model - model = PPO('MlpPolicy', self, verbose=1) # verbose = 1 to display training progress and statistics in the terminal - + model = PPO( + "MlpPolicy", self, verbose=1 + ) # verbose = 1 to display training progress and statistics in the terminal + stop_callback = EarlyStopTraining(verbose=1) - + # Train the model - model.learn(total_timesteps = self._total_timesteps, callback=stop_callback) + model.learn(total_timesteps=self._total_timesteps, callback=stop_callback) + class EarlyStopTraining(BaseCallback): """ A callback to stop training based on specific conditions (steps, infidelity, max iterations) """ + def __init__(self, verbose: int = 0): super(EarlyStopTraining, self).__init__(verbose) @@ -304,31 +346,40 @@ def _on_step(self) -> bool: - Stop training if it finds an episode with infidelity <= than target infidelity - If all of the last 100 episodes have infidelity below the target and use the same number of steps, stop training. """ - env = self.training_env.envs[0].unwrapped + env = self.training_env.get_attr("unwrapped")[0] # Check if we need to stop training if env.current_episode >= env.max_episodes: - if env._use_backup_result == True: + if env._use_backup_result is True: env._backup_result.message = f"Reached {env.max_episodes} episodes, stopping training. Return the last founded episode with infid < target_infid" else: - env._result.message = f"Reached {env.max_episodes} episodes, stopping training." - return False # Stop training - elif (env._result.infidelity <= env._fid_err_targ) and not(env.shorter_pulses): - env._result.message = f"Stop training because an episode with infidelity <= target infidelity was found" + env._result.message = ( + f"Reached {env.max_episodes} episodes, stopping training." + ) + return False # Stop training + elif (env._result.infidelity <= env._fid_err_targ) and not (env.shorter_pulses): + env._result.message = "Stop training because an episode with infidelity <= target infidelity was found" return False # Stop training elif env.shorter_pulses: - if(env._result.infidelity <= env._fid_err_targ): # if it finds an episode with infidelity lower than target infidelity, I'll save it in the meantime + if ( + env._result.infidelity <= env._fid_err_targ + ): # if it finds an episode with infidelity lower than target infidelity, I'll save it in the meantime env._use_backup_result = True env._save_result() if len(env._episode_info) >= 100: last_100_episodes = env._episode_info[-100:] - min_steps = min(info['steps_used'] for info in last_100_episodes) - steps_condition = all(ep['steps_used'] == min_steps for ep in last_100_episodes) - infid_condition = all(ep['final_infidelity'] <= env._fid_err_targ for ep in last_100_episodes) + min_steps = min(info["steps_used"] for info in last_100_episodes) + steps_condition = all( + ep["steps_used"] == min_steps for ep in last_100_episodes + ) + infid_condition = all( + ep["final_infidelity"] <= env._fid_err_targ + for ep in last_100_episodes + ) if steps_condition and infid_condition: env._use_backup_result = False env._result.message = "Training finished. No episode in the last 100 used fewer steps and infidelity was below target infid." 
return False # Stop training - return True # Continue training \ No newline at end of file + return True # Continue training diff --git a/src/qutip_qoc/pulse_optim.py b/src/qutip_qoc/pulse_optim.py index d320799..9e8ecc3 100644 --- a/src/qutip_qoc/pulse_optim.py +++ b/src/qutip_qoc/pulse_optim.py @@ -79,7 +79,7 @@ def optimize_pulses( - max_iter : int, optional Maximum number of iterations to perform. - Referes to local minimizer steps or in the context of + Referes to local minimizer steps or in the context of `alg: "RL"` to the max. number of episodes. Global steps default to 0 (no global optimization). Can be overridden by specifying in minimizer_kwargs. diff --git a/tests/test_result.py b/tests/test_result.py index 7b9cda3..7146b04 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -156,18 +156,18 @@ def sin_z_jax(t, r, **kwargs): # state to state transfer initial = qt.basis(2, 0) -target = (qt.basis(2, 0) + qt.basis(2, 1)).unit() # |+⟩ +target = (qt.basis(2, 0) + qt.basis(2, 1)).unit() # |+⟩ -H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()] # control Hamiltonians +H_c = [qt.sigmax(), qt.sigmay(), qt.sigmaz()] # control Hamiltonians w, d, y = 0.1, 1.0, 0.1 -H_d = 1 / 2 * (w * qt.sigmaz() + d * qt.sigmax()) # drift Hamiltonian +H_d = 1 / 2 * (w * qt.sigmaz() + d * qt.sigmax()) # drift Hamiltonian -H = [H_d] + H_c # total Hamiltonian +H = [H_d] + H_c # total Hamiltonian state2state_rl = Case( objectives=[Objective(initial, H, target)], - control_parameters = { + control_parameters={ "p": {"bounds": [(-13, 13)]}, }, tlist=np.linspace(0, 10, 100), @@ -177,17 +177,17 @@ def sin_z_jax(t, r, **kwargs): "max_iter": 20000, "shorter_pulses": True, }, - optimizer_kwargs={} + optimizer_kwargs={}, ) # no big difference for unitary evolution -initial = qt.qeye(2) # Identity -target = qt.gates.hadamard_transform() +initial = qt.qeye(2) # Identity +target = qt.gates.hadamard_transform() unitary_rl = state2state_rl._replace( objectives=[Objective(initial, H, target)], - control_parameters = { + control_parameters={ "p": {"bounds": [(-13, 13)]}, }, algorithm_kwargs={