# In [3]:
import numpy as np

# Initialize Q-table with appropriate dimensions
def initialize_q_table(state_space, action_space):
    """Create a zero-filled Q-table.

    Args:
        state_space: Iterable of ints giving the size of each state
            dimension (a list, a tuple, or a 1-D integer array).
        action_space: Number of discrete actions; becomes the size of the
            last axis of the table.

    Returns:
        A ``numpy.ndarray`` of zeros with shape
        ``(*state_space, action_space)``.
    """
    # Assemble the shape as a tuple so any iterable of sizes works: ``+``
    # between a tuple and a list raises, and between an ndarray and a list
    # it adds element-wise instead of appending the action axis.
    shape = (*state_space, action_space)
    return np.zeros(shape)

# Choose action using epsilon-greedy strategy
def choose_action(state, q_table, epsilon, action_space):
    """Pick an action index with an epsilon-greedy rule.

    Args:
        state: Sequence of indices; converted to a tuple and used to index
            ``q_table``.
        q_table: Array of action values whose entry at ``tuple(state)``
            holds the values of the actions available in that state.
        epsilon: Threshold compared against a uniform draw from ``[0, 1)``;
            a draw below it triggers a random action.
        action_space: Number of actions; random actions are drawn from
            ``0 .. action_space - 1``.

    Returns:
        The index of the selected action.
    """
    explore = np.random.rand() < epsilon
    if not explore:
        # Greedy branch: take the highest-valued action for this state.
        greedy_values = q_table[tuple(state)]
        return np.argmax(greedy_values)
    # Exploratory branch: uniform random action index.
    return np.random.randint(action_space)

# Update Q-table using Q-learning algorithm
def update_q_table(q_table, state, action, reward, next_state, alpha, gamma, done=False):
    """Apply one Q-learning update to ``q_table`` in place.

    The entry for ``(state, action)`` is moved towards the temporal-difference
    target ``reward + gamma * max_a Q(next_state, a)`` by a fraction ``alpha``.

    Args:
        q_table: Array of action values, modified in place.
        state: Sequence of indices identifying the current state.
        action: Index of the action whose value is updated.
        reward: Reward observed for the transition.
        next_state: Sequence of indices identifying the successor state.
        alpha: Learning rate (step size of the update).
        gamma: Discount factor applied to the successor state's value.
        done: When true, the transition is treated as terminal and the
            target is just ``reward``. Defaults to ``False``, which keeps
            the previous behaviour of always bootstrapping.

    Returns:
        None.
    """
    state_idx = tuple(state)
    # A terminal transition has no successor value to bootstrap from.
    if done:
        future_value = 0.0
    else:
        future_value = np.max(q_table[tuple(next_state)])
    td_target = reward + gamma * future_value
    td_error = td_target - q_table[state_idx][action]
    q_table[state_idx][action] += alpha * td_error

# Decay epsilon value
def decay_epsilon(epsilon, epsilon_decay, epsilon_min):
    """Return the next exploration rate.

    Args:
        epsilon: Current exploration rate.
        epsilon_decay: Multiplicative factor applied to ``epsilon``.
        epsilon_min: Lower bound for the returned value.

    Returns:
        ``epsilon * epsilon_decay``, or ``epsilon_min`` when the product
        does not exceed it.
    """
    decayed = epsilon * epsilon_decay
    if decayed > epsilon_min:
        return decayed
    return epsilon_min

# Generate random actions for each user
def generate_random_actions(num_users, env):
    """Draw one random action dictionary per user.

    Each parameter is sampled uniformly from a grid of 10 evenly spaced
    values: ``alpha_m`` from ``[0, 1]`` and the others from ``1`` up to the
    matching limit read from ``env`` (``B``, ``P_max``, ``F_max_ue``,
    ``F_max_es``).

    Args:
        num_users: Number of action dictionaries to generate.
        env: Object exposing the attributes ``B``, ``P_max``, ``F_max_ue``
            and ``F_max_es``.

    Returns:
        A list of ``num_users`` dicts with the keys ``'alpha_m'``, ``'b_m'``,
        ``'p_m'``, ``'f_ue_m'`` and ``'f_es_m'``.
    """
    levels = 10
    # The order of these pairs fixes the order of the random draws per user.
    grids = (
        ('alpha_m', np.linspace(0, 1, levels)),
        ('b_m', np.linspace(1, env.B, levels)),
        ('p_m', np.linspace(1, env.P_max, levels)),
        ('f_ue_m', np.linspace(1, env.F_max_ue, levels)),
        ('f_es_m', np.linspace(1, env.F_max_es, levels)),
    )
    return [
        {key: np.random.choice(grid) for key, grid in grids}
        for _ in range(num_users)
    ]

# Run the simulation with Q-learning
def run_simulation(env, q_table, num_episodes, alpha, gamma, initial_epsilon, epsilon_decay, epsilon_min, action_space):
    """Run ``num_episodes`` episodes against ``env`` and update ``q_table``.

    Every episode starts from ``env.reset()`` and steps the environment
    until it reports ``done``. After each episode the exploration rate is
    decayed and a progress line is printed.

    Args:
        env: Environment exposing ``reset()``, ``step(actions)`` and ``M``,
            plus the attributes read by ``generate_random_actions``.
        q_table: Array of action values, updated in place.
        num_episodes: Number of episodes to run.
        alpha: Learning rate forwarded to ``update_q_table``.
        gamma: Discount factor forwarded to ``update_q_table``.
        initial_epsilon: Exploration rate used for the first episode.
        epsilon_decay: Per-episode multiplicative decay of the rate.
        epsilon_min: Lower bound of the exploration rate.
        action_space: Number of discrete actions, forwarded to
            ``choose_action``.

    Returns:
        None.
    """
    epsilon = initial_epsilon
    for episode in range(num_episodes):
        # ``reset`` is expected to return the initial state of the episode.
        state = env.reset()
        while True:
            # NOTE(review): the environment is stepped with the randomly
            # generated per-user ``actions``; the epsilon-greedy ``action``
            # is only used as the index of the Q-table update. Confirm that
            # this decoupling is intended.
            actions = generate_random_actions(env.M, env)
            action = choose_action(state, q_table, epsilon, action_space)
            # NOTE(review): the result is unpacked as (reward, next_state,
            # done); verify this order against the environment's ``step``.
            reward, next_state, done = env.step(actions)
            update_q_table(q_table, state, action, reward, next_state, alpha, gamma)
            state = next_state
            if done:
                break
        epsilon = decay_epsilon(epsilon, epsilon_decay, epsilon_min)
        print(f'Episode {episode + 1}/{num_episodes}, Epsilon: {epsilon:.4f}')

# Define a render function for visualization
def render(env):
    """Print a plain-text summary of the environment to standard output.

    Args:
        env: Object exposing ``get_state()`` and the attributes ``M``, ``B``,
            ``P_max``, ``F_max_ue`` and ``F_max_es``.

    Returns:
        None.
    """
    def _lines():
        # Lazily produced so each value is read only just before its line
        # is printed.
        yield "Rendering the environment state:"
        yield f"Current State: {env.get_state()}"
        yield f"Total Users: {env.M}"
        yield f"Bandwidth: {env.B}"
        yield f"Maximum Power: {env.P_max}"
        yield f"Maximum UE Frequency: {env.F_max_ue}"
        yield f"Maximum ES Frequency: {env.F_max_es}"

    for line in _lines():
        print(line)

# Test simulation
def test_simulation():
    """Build an environment, train a Q-table on it and print a summary.

    Creates an ``EdgeComputingEnvironment``, sizes a Q-table from its state,
    runs 50 training episodes with fixed hyperparameters, then renders the
    environment.

    Returns:
        None.
    """
    env = EdgeComputingEnvironment()

    # A single state axis whose length is the number of entries in the
    # state returned by the environment.
    # NOTE(review): this gives a 2-D table, while the Q-learning helpers
    # index it with ``tuple(state)`` (one index per state entry); confirm
    # that the two agree for this environment's state.
    state_space = [len(env.get_state())]
    # Number of discrete actions in the Q-table.
    action_space = 10 ** 5

    # Fixed Q-learning hyperparameters for this run.
    hyperparams = {
        'alpha': 0.1,
        'gamma': 0.9,
        'initial_epsilon': 1.0,
        'epsilon_decay': 0.99,
        'epsilon_min': 0.01,
    }

    q_table = initialize_q_table(state_space, action_space)

    run_simulation(
        env,
        q_table,
        num_episodes=50,
        action_space=action_space,
        **hyperparams,
    )

    # Show the environment as it stands after training.
    render(env)

# Run the test simulation.
# NOTE: this call is not wrapped in an ``if __name__ == "__main__":`` guard,
# so the whole simulation runs whenever this code is executed or imported.
test_simulation()
