In [10]:
# Settings consumed by BanditEnv; every entry becomes an attribute of the environment.
bandit_config = dict(
    bandit_actions=10,
    true_q_value_mean=0.5,
    true_q_value_std=0.1,
    q_value_std=0.0,
    q_drift_mean=0.0,
    q_drift_std=0.01,
    random_seed=None,
)

class BanditEnv:
    """Minimal bandit environment that exposes its configuration as attributes.

    Args:
        bandit_config: Dictionary of settings; each key/value pair is copied
            onto the instance as an attribute of the same name.
        render_mode: Stored on ``self.render_mode``. A ``"render_mode"`` key in
            ``bandit_config`` takes precedence over this argument.
    """

    def __init__(self, bandit_config: dict, render_mode=None):
        # Keep the argument instead of silently discarding it. It is assigned
        # first so that a "render_mode" entry in the config still wins.
        self.render_mode = render_mode
        for key, value in bandit_config.items():
            setattr(self, key, value)

# Build an environment from the config and confirm the key became an attribute.
env = BanditEnv(bandit_config, render_mode=None)
print(env.true_q_value_mean)



0.5


bandit_actions: int = 10,
true_q_value_mean = 0., # for a specific initial q-value set this to the value of choice otherwise use mean = 0
true_q_value_std = 0.1, # for a uniform initial q-value set this to zero otherwise use e.g., std = 1.
q_value_std = 0., # for q-values drawn from a standard normal distribution set this to 1, otherwise leave it at 0. In either case the true_q_value_mean of each arm is used as the mean, and that mean drifts if the next two parameters are set to make it drift.
q_drift_mean: float = 0.0, # for a stationary problem set this to zero
q_drift_std: float = 0.01, # for a stationary problem set this to zero else use e.g. std = 0.1 for a light drift
random_seed: int = None,

In [24]:



# Configuration handed to BanditEnv, which validates each entry on construction.
bandit_config = {
    "bandit_actions": 10,
    "true_q_value_mean": 0.5,
    "true_q_value_std": 0.1,
    "q_value_std": 0.0,
    "q_drift_mean": 0.0,
    "q_drift_std": 0.01,
    "random_seed": None,
}

class BanditEnv:
    """Bandit environment whose parameters come from a configuration dict.

    Each key/value pair of ``bandit_config`` is copied onto the instance as an
    attribute, after which the required parameters are validated.

    Args:
        bandit_config: Object with an ``items()`` method (normally a dict)
            providing ``bandit_actions``, ``true_q_value_mean``,
            ``true_q_value_std``, ``q_value_std``, ``q_drift_mean``,
            ``q_drift_std`` and ``random_seed``.
        render_mode: Stored unchanged on ``self.render_mode``.

    Raises:
        ValueError: If a required parameter is missing, has the wrong type, or
            ``bandit_actions`` is not a positive integer.
    """

    def __init__(self, bandit_config: dict, render_mode=None):
        prefix = "Invalid BanditEnv configuration: "
        numeric_names = (
            "true_q_value_mean",
            "true_q_value_std",
            "q_value_std",
            "q_drift_mean",
            "q_drift_std",
        )

        # Explicit checks are used instead of `assert` because assert statements
        # are removed when Python runs with -O, which would disable validation.
        try:
            for key, value in bandit_config.items():
                setattr(self, key, value)

            actions = self.bandit_actions
            # bool is a subclass of int, so True/False must be rejected by hand.
            if isinstance(actions, bool) or not isinstance(actions, int) or actions <= 0:
                raise ValueError(
                    f"{prefix}bandit_actions must be an integer greater than 0, "
                    f"got {type(actions)} with value {actions}"
                )

            for name in numeric_names:
                value = getattr(self, name)
                if not isinstance(value, (int, float)):
                    raise ValueError(f"{prefix}{name} must be a number, got {type(value)}")

            seed = self.random_seed
            if not isinstance(seed, (int, type(None))):
                raise ValueError(
                    f"{prefix}random_seed must be an integer or None, got {type(seed)}"
                )
        except AttributeError as e:
            # Raised when bandit_config has no items() or a required key is absent.
            raise ValueError(f"{prefix}{e}") from e

        self.render_mode = render_mode


# Instantiate the validated environment and show how many arms it was given.
env = BanditEnv(bandit_config, render_mode=None)
print(env.bandit_actions)

-10


In [None]:
class BanditEnv:
    """Bandit environment configured through keyword arguments.

    Every keyword argument other than ``render_mode`` is stored as an instance
    attribute. ``bandit_actions``, ``true_q_value_mean``, ``true_q_value_std``,
    ``q_value_std``, ``q_drift_mean``, ``q_drift_std`` and ``random_seed`` are
    required and validated before the spaces and value arrays are created.
    """

    def __init__(self, *, render_mode=None, **kwargs):
        """Store the parameters, validate them and allocate the environment state.

        Args:
            render_mode: Stored unchanged on ``self.render_mode``.
            **kwargs: Environment parameters; each becomes an attribute.

        Raises:
            AssertionError: If a parameter has the wrong type or
                ``bandit_actions`` is not a positive integer.
            AttributeError: If a required parameter was not supplied.
        """
        self.render_mode = render_mode
        self.__setattr_from_dict__(kwargs)

        # Type checks for the parameters. AssertionError is raised explicitly
        # (rather than via `assert`) so the checks still run under `python -O`
        # while callers keep seeing the same exception type.
        actions = self.bandit_actions
        # bool is a subclass of int, so True/False must be rejected by hand.
        if isinstance(actions, bool) or not isinstance(actions, int) or actions <= 0:
            raise AssertionError(
                f"bandit_actions must be an integer greater than 0, "
                f"got {type(actions)} with value {actions}"
            )
        for name in ("true_q_value_mean", "true_q_value_std", "q_value_std",
                     "q_drift_mean", "q_drift_std"):
            value = getattr(self, name)
            if not isinstance(value, (int, float)):
                raise AssertionError(f"{name} must be a number, got {type(value)}")
        if not isinstance(self.random_seed, (int, type(None))):
            raise AssertionError(
                f"random_seed must be an integer or None, got {type(self.random_seed)}"
            )

        # observation and action space of the BanditEnv
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.bandit_actions,), dtype=np.float64)
        self.action_space = spaces.Discrete(self.bandit_actions)

        # The arrays that hold the true and actual q-values of each bandit arm
        self.arms_true_q_values = np.zeros(self.bandit_actions)
        self.arms_q_values = np.zeros(self.bandit_actions)

        # Variables to store and communicate the agent's efficiency
        self.step_error = 0.
        self.optimal_action = False

        # ... rest of the code remains the same ...

    def __setattr_from_dict__(self, d):
        """Copy every key/value pair of ``d`` onto the instance as an attribute."""
        for key, value in d.items():
            setattr(self, key, value)

In [9]:
import numpy as np
random_seed = 42

# Two generators created from the same seed should produce identical draws.
rng, rng1 = (np.random.default_rng(seed=random_seed) for _ in range(2))
# rng1 = np.random.default_rng(seed=rng.bit_generator.state['state']['state'])

# Bit generator state before any number has been drawn.
print(rng.bit_generator.state['state']['state'])

# Draw once from each generator per distribution and compare the results.
same_normal = rng.normal(loc=0, scale=1) == rng1.normal(loc=0, scale=1)
print(same_normal)
same_uniform = rng.random() == rng1.random()
print(same_uniform)

# Bit generator state after the two draws above.
print(rng.bit_generator.state['state']['state'])

274674114334540486603088602300644985544
True
True
180456145198944327624639367796045148994
