Atari

In [1]:
import gymnasium as gym  # Import gymnasium instead of gym
from stable_baselines3 import DQN
import ale_py
from stable_baselines3.common.vec_env import DummyVecEnv


def make_pong_env():
    """Build one preprocessed Pong environment.

    The observation pipeline is: grayscale -> resize to 84 -> stack the last
    4 frames. No action-space wrapper is applied, so the agent acts in the
    environment's native action space.

    Returns:
        The wrapped gymnasium environment (not yet vectorized).
    """
    pong_env = gym.make("ALE/Pong-v5", render_mode=None)
    pong_env = gym.wrappers.GrayScaleObservation(pong_env)  # Grayscale observations for efficiency
    pong_env = gym.wrappers.ResizeObservation(pong_env, 84)  # Smaller frames for faster training
    pong_env = gym.wrappers.FrameStack(pong_env, 4)  # Stack frames (a common practice in RL)
    return pong_env


# Vectorize the environment for SB3. A named factory is passed instead of
# `lambda: env`: that lambda closed over the same name that is rebound to the
# VecEnv on this line, so any later call of the thunk would have returned the
# VecEnv itself rather than a Pong environment.
env = DummyVecEnv([make_pong_env])

Bayes

In [2]:
from scipy.stats import norm

# Example Bayesian function that adjusts action probabilities
def bayesian_action_probability(action_success_rate, current_performance, scale=0.1):
    """Score the current performance against an expected success rate.

    Evaluates the cumulative distribution function of a normal distribution
    centred on ``action_success_rate`` at ``current_performance``. The result
    is 0.5 when the two are equal and approaches 1 (or 0) as performance rises
    above (or falls below) the expected rate.

    Args:
        action_success_rate: Mean (``loc``) of the normal distribution.
        current_performance: Point at which the CDF is evaluated. May be a
            scalar or an array; the return value has the matching shape.
        scale: Standard deviation of the normal distribution. Defaults to
            0.1, the value that was previously hard-coded.

    Returns:
        The CDF value(s), in the range [0, 1].

    Raises:
        ValueError: If ``scale`` is not strictly positive (the CDF would
            otherwise silently evaluate to NaN).
    """
    if scale <= 0:
        raise ValueError(f"scale must be positive, got {scale!r}")
    # Assume a normal distribution for simplicity
    return norm.cdf(current_performance, loc=action_success_rate, scale=scale)


Fuzzy

In [3]:
import numpy as np
import skfuzzy as fuzz
from skfuzzy import control as ctrl

# Fuzzy input (state quality) and output (action intensity), each defined on
# the integer universe 0..10.
state_quality = ctrl.Antecedent(np.arange(0, 11, 1), 'state_quality')
action_intensity = ctrl.Consequent(np.arange(0, 11, 1), 'action_intensity')

# Both variables use the same three triangular membership functions, listed
# here from the lowest term to the highest.
_TRIANGLE_POINTS = ((0, 0, 5), (0, 5, 10), (5, 10, 10))
_TERM_NAMES = (
    (state_quality, ('poor', 'average', 'good')),
    (action_intensity, ('low', 'medium', 'high')),
)
for _variable, _names in _TERM_NAMES:
    for _name, _points in zip(_names, _TRIANGLE_POINTS):
        _variable[_name] = fuzz.trimf(_variable.universe, list(_points))

# One rule per quality level: poor -> low, average -> medium, good -> high.
rule1, rule2, rule3 = (
    ctrl.Rule(state_quality[_quality], action_intensity[_intensity])
    for _quality, _intensity in (('poor', 'low'), ('average', 'medium'), ('good', 'high'))
)

# Assemble the rules into a control system and a reusable simulation.
action_ctrl = ctrl.ControlSystem([rule1, rule2, rule3])
action_decision = ctrl.ControlSystemSimulation(action_ctrl)

# Example fuzzy decision
def fuzzy_action_decision(state_quality_value):
    """Map a state-quality value to an action intensity via the fuzzy controller.

    Feeds ``state_quality_value`` into the module-level ``action_decision``
    simulation, evaluates the rules, and returns the simulation's
    ``'action_intensity'`` output. The shared simulation object is mutated
    on every call.
    """
    simulation = action_decision
    simulation.input['state_quality'] = state_quality_value
    simulation.compute()
    intensity = simulation.output['action_intensity']
    return intensity


Meta

In [4]:
class MetaAgent:
    """Tracks an exploration rate and the blend weights for two decision sources.

    All three values start at 0.5 and are nudged by ``adjust_weights`` each
    time a performance figure is reported. Every value is clamped to a
    configured range so that repeated updates cannot drive the exploration
    rate above 1.0 (or to zero) or let the weights grow without limit.

    Args:
        min_exploration_rate: Lower clamp for ``exploration_rate``.
        max_exploration_rate: Upper clamp for ``exploration_rate``.
        min_weight: Lower clamp for both weights (0.3, as before).
        max_weight: Upper clamp for both weights.
    """

    def __init__(self, min_exploration_rate=0.01, max_exploration_rate=1.0,
                 min_weight=0.3, max_weight=1.0):
        self.exploration_rate = 0.5  # adjust based on performance
        self.fuzzy_weight = 0.5  # weight for fuzzy decisions
        self.bayesian_weight = 0.5  # weight for Bayesian decisions
        self.min_exploration_rate = min_exploration_rate
        self.max_exploration_rate = max_exploration_rate
        self.min_weight = min_weight
        self.max_weight = max_weight

    @staticmethod
    def _clamp(value, lower, upper):
        """Return ``value`` limited to the closed interval [lower, upper]."""
        return min(upper, max(lower, value))

    def adjust_weights(self, performance):
        """Update the exploration rate and both weights from ``performance``.

        Args:
            performance: Scalar performance figure. Above 0.7 the exploration
                rate shrinks by 10%; otherwise it grows by 10%. Each weight
                moves by ``0.05 * (0.5 - performance)``.
        """
        if performance > 0.7:
            scaled_rate = self.exploration_rate * 0.9  # reduce exploration as agent performs well
        else:
            scaled_rate = self.exploration_rate * 1.1  # increase exploration if performance is low
        # Without the clamp, repeated 1.1x growth pushes the rate past 1.0 and
        # eventually overflows to inf.
        self.exploration_rate = self._clamp(
            scaled_rate, self.min_exploration_rate, self.max_exploration_rate
        )

        # Adjust weights dynamically.
        # NOTE(review): both updates move in the same direction (each adds
        # 0.05 * (0.5 - performance)); confirm whether the weights were meant
        # to trade off against each other instead.
        self.fuzzy_weight = self._clamp(
            self.fuzzy_weight - 0.05 * (performance - 0.5), self.min_weight, self.max_weight
        )
        self.bayesian_weight = self._clamp(
            self.bayesian_weight + 0.05 * (0.5 - performance), self.min_weight, self.max_weight
        )


Training

In [5]:
# Run settings (tune here rather than inside the loop)
N_EPISODES = 1
TRAIN_TIMESTEPS_PER_EPISODE = 10_000

# Initialize the MetaAgent and the DQN model.
# device="auto" uses CUDA when it is available and falls back to CPU otherwise,
# so the cell also runs on machines without a GPU.
meta_agent = MetaAgent()
model = DQN("CnnPolicy", env, verbose=1, device="auto", buffer_size=50000, batch_size=128)

try:
    for episode in range(N_EPISODES):
        # Train once per episode, never inside the step loop: model.learn()
        # drives (and resets) this same `env`, so calling it after every step
        # would invalidate the `obs`/`done` of the rollout below and start a
        # full training run per frame.
        # reset_num_timesteps=False keeps the timestep counter running across
        # calls instead of restarting it each episode.
        # NOTE(review): with N_EPISODES > 1 the model's cached last observation
        # may be stale for the first step of the next learn() call because the
        # rollout below also steps `env`; use a separate evaluation env if
        # that matters.
        model.learn(total_timesteps=TRAIN_TIMESTEPS_PER_EPISODE, reset_num_timesteps=False)

        # Roll out one full episode with the current policy.
        obs = env.reset()
        done = False
        score = 0.0

        while not done:
            # State quality is a placeholder here (uniform random in [0, 10]).
            state_quality_value = np.random.uniform(0, 10)
            fuzzy_action = fuzzy_action_decision(state_quality_value)

            # Running performance signal shared by the Bayesian term and the meta-agent.
            performance = score / (episode + 1)
            bayesian_prob = bayesian_action_probability(0.6, performance)
            meta_agent.adjust_weights(performance)

            # Blend the fuzzy and Bayesian signals. The blend does not choose
            # the action; it only decides whether the policy acts
            # deterministically on this step.
            blended_value = (
                fuzzy_action * meta_agent.fuzzy_weight
                + bayesian_prob * meta_agent.bayesian_weight
            )
            blend_level = int(np.asarray(blended_value).item())
            act_greedily = blend_level > meta_agent.exploration_rate

            # Predict on the batched observation so the returned action is
            # already in the batched form that the VecEnv expects.
            action_batch, _ = model.predict(obs, deterministic=act_greedily)

            # VecEnv.step returns arrays with one entry per sub-environment;
            # unwrap the single entry so `score` and `done` stay plain scalars.
            obs, rewards, dones, infos = env.step(action_batch)
            score += float(rewards[0])
            done = bool(dones[0])

        print(f"Episode {episode} - Score: {score}")

    # Save the trained model
    model.save("dqn_pong_model")
    print("Model saved!")
finally:
    # Close the environment even if training is interrupted or fails.
    env.close()

Using cuda device
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 191      |
|    time_elapsed     | 16       |
|    total_timesteps  | 3144     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0113   |
|    n_updates        | 760      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 191      |
|    time_elapsed     | 34       |
|    total_timesteps  | 6521     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0035   |
|    n_updates        | 1605     |
----------------------------------
----------------------------------
| rollout/            |          |
| 