# In [None]:
import gym
from gym import spaces
import numpy as np
import itertools
import math

class StandardDeviationEnv(gym.Env):
    """
    Gym environment that tunes (Delta1, Delta2) so that an over-determined
    linear system in (C1, C2) becomes as self-consistent as possible.

    For the current (Delta1, Delta2) a list of equations ``B*C1 + C*C2 = D``
    is built: one per entry of ``z_samples`` plus five fixed derivative
    equations.  Every pair of equations is solved for (C1, C2); the reward is
    large when the relative spread (std / |mean|) of those solutions is small.

    Actions (Box, shape (2,), each in [-1, 1]):
        - Step in Delta1 space (scaled by ``delta_lr``)
        - Step in Delta2 space (scaled by ``delta_lr``)
    Observations (Box, shape (5,)):
        - Delta1
        - Delta2
        - C1 (mean over all pairwise solutions)
        - C2 (mean over all pairwise solutions)
        - std_over_mean_sum (error metric, >= 0)

    The class follows the classic gym API: ``reset()`` returns only the
    observation and ``step()`` returns ``(obs, reward, done, info)``.
    """
    metadata = {'render.modes': ['human']}

    # Reward returned by step() when the proposed deltas leave their bounds
    # or the resulting observation is not finite.
    _INVALID_STEP_REWARD = -10.0
    # Reward and error metric used when the pairwise solve yields no usable
    # statistics.
    _NO_SOLUTION_REWARD = -1e6
    _NO_SOLUTION_METRIC = 1e6
    # Floor applied to |mean| and to std/|mean| so that divisions and
    # logarithms stay finite.
    _EPSILON = 1e-8
    # (prefactor, power of Delta, right-hand side) of each derivative
    # equation.  The prefactors equal 2/n! for n = 1, 3, 5, 7, 9 and the
    # powers are 2n; the right-hand sides are precomputed constants whose
    # derivation is not part of this file.
    _DERIVATIVE_TERMS = (
        (2.0, 2, 7.147988863019252),
        (0.3333333333333333, 6, 3.8980371419131926),
        (0.016666666666666667, 10, 2.3602663911472502),
        (0.00039682539682539683, 14, 0.8810978138186971),
        (5.511463844797178e-6, 18, 0.19556132338694376),
    )

    def __init__(
        self,
        min_delta1: float = 0.5,
        max_delta1: float = 1.5,
        min_delta2: float = 1.6,
        max_delta2: float = 2.4,
        delta_lr: float = 0.2,
        max_episode_steps: int = 50,
        z_samples: "np.ndarray | None" = None,
    ):
        """
        Args:
            min_delta1, max_delta1: inclusive bounds for Delta1.
            min_delta2, max_delta2: inclusive bounds for Delta2.
            delta_lr: factor applied to each action component before it is
                added to the corresponding delta.
            max_episode_steps: number of steps after which an episode ends.
            z_samples: z values at which the main equation is evaluated;
                defaults to ``np.linspace(0.3, 0.49, 20)``.
        """
        super().__init__()
        # dtype of the action/observation spaces and of emitted observations
        self.data_type = np.float32

        # Parameters
        self.min_delta1 = min_delta1
        self.max_delta1 = max_delta1
        self.min_delta2 = min_delta2
        self.max_delta2 = max_delta2
        self.delta_lr = delta_lr  # Learning rate for delta updates
        self.max_episode_steps = max_episode_steps

        # z_samples for main equations.  The default is built per instance
        # and the input is copied so that instances never share (or expose)
        # one mutable array.
        if z_samples is None:
            z_samples = np.linspace(0.3, 0.49, 20)
        self.z_samples = np.array(z_samples, dtype=np.float64)

        # State that reset() fills in with real values below
        self.reward = 0.0
        self.cs = np.zeros(2)
        self.std_over_mean_sum = None
        self.num_steps = 0

        # Action and observation spaces
        # Actions: delta1_step, delta2_step
        action_low = np.array([-1.0, -1.0], dtype=self.data_type)
        action_high = np.array([1.0, 1.0], dtype=self.data_type)

        # Observations: delta1, delta2, c1, c2, std_over_mean_sum
        obs_low = np.array(
            [self.min_delta1, self.min_delta2, -np.inf, -np.inf, 0.0],
            dtype=self.data_type
        )
        obs_high = np.array(
            [self.max_delta1, self.max_delta2, np.inf, np.inf, np.inf],
            dtype=self.data_type
        )

        self.action_space = spaces.Box(low=action_low, high=action_high, dtype=self.data_type)
        self.observation_space = spaces.Box(low=obs_low, high=obs_high, dtype=self.data_type)

        # Draw the initial deltas and compute the matching reward/statistics
        self.reset()

    # Equation construction

    def ftoy(self, z, delta):
        """
        Compute ftoy(z, delta) = Exp[-delta^2 * z] - Exp[-delta^2 * (1 - z)]
        """
        return math.exp(-delta**2 * z) - math.exp(-delta**2 * (1 - z))

    def main_equation(self, z, Delta1, Delta2):
        """
        Compute the coefficients (B, C, D) of the main equation at ``z``.

        Equation: B * C1 + C * C2 = D, with
            B = Exp[-z*Delta1^2] - Exp[-(1 - z)*Delta1^2]
            C = Exp[-z*Delta2^2] - Exp[-(1 - z)*Delta2^2]
            D = -A
            A = Exp[-4*(1 - z)] - Exp[-4*z]
                + 1/100*(Exp[-64*(1 - z)] - Exp[-64*z])
                - 5*(-Exp[-1 + z] + Exp[-z])

        Returns:
            Tuple ``(B, C, D)``.
        """
        # Compute the constant terms
        A = (math.exp(-4 * (1 - z)) - math.exp(-4 * z) +
             (1/100) * (math.exp(-64 * (1 - z)) - math.exp(-64 * z)) -
             5 * (-math.exp(-1 + z) + math.exp(-z)))

        # Compute coefficients for C1 and C2
        B = -math.exp(-((1 - z) * Delta1**2)) + math.exp(-z * Delta1**2)
        C = -math.exp(-((1 - z) * Delta2**2)) + math.exp(-z * Delta2**2)

        D = -A  # Rearranged to B*C1 + C*C2 = D
        return (B, C, D)

    def derivative_equations(self, Delta1, Delta2):
        """
        Build the five derivative equations.

        Each equation has the form ``B_deriv * C1 + C_deriv * C2 = D_deriv``
        where ``B_deriv = k * Exp[-Delta1^2 / 2] * Delta1^p`` (and likewise
        for ``C_deriv`` with Delta2); ``k``, ``p`` and ``D_deriv`` come from
        ``_DERIVATIVE_TERMS``.

        Returns:
            List of five ``(B_deriv, C_deriv, D_deriv)`` tuples.
        """
        # The exponential factor is shared by all five equations
        damping1 = math.exp(-0.5 * Delta1**2)
        damping2 = math.exp(-0.5 * Delta2**2)
        return [
            (coef * damping1 * Delta1**power, coef * damping2 * Delta2**power, rhs)
            for coef, power, rhs in self._DERIVATIVE_TERMS
        ]

    def get_all_equations(self, Delta1, Delta2):
        """
        Combine main equations for all z_samples with derivative equations.
        Returns a list of equations where each equation is a tuple (B, C, D).
        Represented as B*C1 + C*C2 = D
        """
        equations = [self.main_equation(z, Delta1, Delta2) for z in self.z_samples]
        equations.extend(self.derivative_equations(Delta1, Delta2))
        return equations

    # Solving and scoring

    def solve_subsets(self, equations):
        """
        Solve every 2-element subset of ``equations`` for (C1, C2).

        Exactly singular pairs and pairs whose solution is not finite are
        skipped.  The systems are solved in float64: many pairs are nearly
        parallel, and single precision would add rounding noise to the very
        spread that the reward measures.

        Returns:
            means: Tuple containing mean of C1 and mean of C2
            stds: Tuple containing sample std (ddof=1) of C1 and of C2
            ``(None, None)`` when fewer than two usable solutions exist, since
            a sample standard deviation is undefined in that case.
        """
        solutions = []
        for (b1, c1, d1), (b2, c2, d2) in itertools.combinations(equations, 2):
            coeffs = np.array([[b1, c1], [b2, c2]], dtype=np.float64)
            rhs = np.array([d1, d2], dtype=np.float64)
            try:
                sol = np.linalg.solve(coeffs, rhs)
            except np.linalg.LinAlgError:
                # Exactly singular matrix, skip this subset
                continue
            if np.all(np.isfinite(sol)):
                solutions.append(sol)

        if len(solutions) < 2:
            return None, None

        solutions = np.array(solutions)
        C1_mean = np.mean(solutions[:, 0])
        C1_std = np.std(solutions[:, 0], ddof=1)
        C2_mean = np.mean(solutions[:, 1])
        C2_std = np.std(solutions[:, 1], ddof=1)

        return (C1_mean, C2_mean), (C1_std, C2_std)

    def _relative_spread(self, means, stds):
        """Return |std| / max(|mean|, epsilon) element-wise as a float64 array."""
        means = np.asarray(means, dtype=np.float64)
        stds = np.asarray(stds, dtype=np.float64)
        # Flooring the magnitude (rather than adding epsilon to the signed
        # mean) keeps the denominator away from zero for negative means too.
        return np.abs(stds) / np.maximum(np.abs(means), self._EPSILON)

    def get_reward(self, means, stds):
        """
        Calculate the reward based on the mean and std of C1 and C2.
        Reward = - (log(abs(std1 / mean1)) + log(abs(std2 / mean2)))

        Each ratio is floored at 1e-8 before the logarithm is taken.
        """
        ratios = np.clip(self._relative_spread(means, stds), self._EPSILON, None)
        return float(-np.sum(np.log(ratios)))

    def compute_reward(self, Delta1, Delta2):
        """
        Compute the reward for given Delta1 and Delta2 based on the sampled z values.

        Side effects: updates ``self.cs`` (mean C1, C2) and
        ``self.std_over_mean_sum``.  When the pairwise solve yields no
        statistics, ``cs`` is zeroed, the metric is set to 1e6 and the
        reward is -1e6.
        """
        equations = self.get_all_equations(Delta1, Delta2)
        means, stds = self.solve_subsets(equations)
        if means is None or stds is None:
            # Assign a very low reward if no solutions are found
            self.cs = np.array([0.0, 0.0])
            self.std_over_mean_sum = self._NO_SOLUTION_METRIC
            return self._NO_SOLUTION_REWARD

        self.cs = np.array(means)
        # Error metric exposed in the observation
        self.std_over_mean_sum = float(np.sum(self._relative_spread(means, stds)))
        return self.get_reward(means, stds)

    # Rest of the environment methods

    def _get_obs(self):
        """Return [delta1, delta2, C1, C2, std_over_mean_sum] in ``data_type``."""
        # The last slot is the non-negative error metric, matching the
        # declared observation space (lower bound 0.0); the reward is
        # returned separately by step().
        return np.array(
            [self.delta1, self.delta2, self.cs[0], self.cs[1], self.std_over_mean_sum],
            dtype=self.data_type
        )

    def reset(self):
        """Start a new episode at uniformly random in-bounds deltas; return the observation."""
        self.num_steps = 0

        # Initialize delta1 and delta2 randomly within bounds
        self.delta1 = np.random.uniform(self.min_delta1, self.max_delta1)
        self.delta2 = np.random.uniform(self.min_delta2, self.max_delta2)

        # Compute initial reward and the statistics used by the observation
        self.reward = self.compute_reward(self.delta1, self.delta2)

        return self._get_obs()

    def step(self, action):
        """
        Move the deltas by ``delta_lr * action`` and score the new point.

        Returns:
            ``(observation, reward, done, info)``.  If the proposed deltas
            leave their bounds the move is rejected (deltas unchanged), the
            reward is -10 and the episode ends.  The episode also ends after
            ``max_episode_steps`` steps, or with reward -10 if the
            observation is not finite.
        """
        self.num_steps += 1

        # float() keeps the delta arithmetic in double precision regardless
        # of the dtype of the incoming action.
        delta1 = self.delta1 + self.delta_lr * float(action[0])
        delta2 = self.delta2 + self.delta_lr * float(action[1])

        if not self.delta_bc(delta1, delta2):
            # Penalize the agent for invalid deltas; the state is left as is
            self.reward = self._INVALID_STEP_REWARD
            return self._get_obs(), self.reward, True, {}

        # Update deltas
        self.delta1 = delta1
        self.delta2 = delta2

        # Compute the new reward
        self.reward = self.compute_reward(self.delta1, self.delta2)

        # Check if episode should be truncated due to max steps
        done = self.num_steps >= self.max_episode_steps

        observation = self._get_obs()
        if not np.all(np.isfinite(observation)):
            # Never hand NaN/inf to the agent: zero the offending entries,
            # penalize and end the episode.
            observation = np.nan_to_num(observation, nan=0.0, posinf=0.0, neginf=0.0)
            self.reward = self._INVALID_STEP_REWARD
            done = True

        return observation, self.reward, done, {}

    def delta_bc(self, delta1, delta2):
        """Return True when both deltas lie within their inclusive bounds."""
        if delta1 < self.min_delta1 or delta1 > self.max_delta1:
            return False
        if delta2 < self.min_delta2 or delta2 > self.max_delta2:
            return False
        return True

    def render(self, mode='human'):
        """
        Print the current step count, deltas, C1/C2, error metric and reward.
        """
        print(f"Step: {self.num_steps}")
        print(f"Delta1: {self.delta1:.4f}, Delta2: {self.delta2:.4f}")
        print(f"C1: {self.cs[0]:.4f}, C2: {self.cs[1]:.4f}")
        print(f"Std over Mean Sum: {self.std_over_mean_sum:.4f}")
        print(f"Reward: {self.reward:.4f}")
        print("-" * 30)

    def close(self):
        """Nothing to release."""
        pass


# In [None]:

from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
import matplotlib.pyplot as plt

# Training cell: fit a PPO agent on StandardDeviationEnv and save it to disk.
# StandardDeviationEnv must already be defined or imported before this runs.

env = StandardDeviationEnv()

# PPO with the MLP policy; training metrics are written for TensorBoard.
model = PPO(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    tensorboard_log="./ppo_standard_deviation_tensorboard/",
)

# Scale total_timesteps to the compute budget available.
model.learn(total_timesteps=100000)

# Persist the trained agent.
# Restore it later with: model = PPO.load("ppo_standard_deviation")
model.save("ppo_standard_deviation")


# In [None]:
# Evaluation cell: roll the trained policy out and report where it ends up.
# Requires `model` from the training cell.
env = StandardDeviationEnv()

# Number of evaluation episodes
num_episodes = 100

# Per-episode results: the deltas the policy ended on and their reward
delta1_list = []
delta2_list = []
reward_list = []

for episode in range(num_episodes):
    observation = env.reset()

    # Follow the deterministic policy until the environment ends the episode
    # (step limit reached or a move rejected).
    done = False
    while not done:
        action, _states = model.predict(observation, deterministic=True)
        observation, reward, done, info = env.step(action)

    # The action is only a bounded *step*, not a delta, so the deltas are
    # read from the environment.  A rejected move leaves env.delta1/delta2
    # at the last accepted values while step() returns a penalty, so the
    # final point is scored directly to keep each (delta, reward) pair
    # consistent.
    delta1_list.append(float(env.delta1))
    delta2_list.append(float(env.delta2))
    reward_list.append(float(env.compute_reward(env.delta1, env.delta2)))

# Convert lists to numpy arrays for easier analysis
delta1_array = np.array(delta1_list)
delta2_array = np.array(delta2_list)
reward_array = np.array(reward_list)

# Find the best delta1 and delta2 (nanargmax: a NaN reward must never win)
best_idx = np.nanargmax(reward_array)
best_delta1 = delta1_array[best_idx]
best_delta2 = delta2_array[best_idx]
best_reward = reward_array[best_idx]

print(f"Best Delta1: {best_delta1}")
print(f"Best Delta2: {best_delta2}")
print(f"Best Reward: {best_reward}")

# Plotting the distribution of the final deltas
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(delta1_array, bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of Delta1')
plt.xlabel('Delta1')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
plt.hist(delta2_array, bins=20, color='salmon', edgecolor='black')
plt.title('Distribution of Delta2')
plt.xlabel('Delta2')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Plot reward over episodes
plt.figure(figsize=(10, 5))
plt.plot(reward_array, color='green')
plt.title('Reward over Episodes')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()