In [4]:
import numpy as np
import matplotlib.pyplot as plt
class QLearningAgent:
    def __init__(self, env, num_users, alpha=0.1, gamma=0.9, epsilon=0.1, max_steps_per_episode=10):
        self.env = env
        self.num_users = num_users
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = {}
        self.max_steps_per_episode = max_steps_per_episode

    def get_state(self):
        state_array = self.env.get_state()
        return tuple(state_array)

    def get_action(self, state):
        state_key = tuple(state)
        if state_key not in self.q_table:
            self.q_table[state_key] = self.initialize_q_values()

        if np.random.rand() < self.epsilon:
            return self.generate_random_action()
        else:
            best_action_key = max(self.q_table[state_key], key=self.q_table[state_key].get)
            return dict(best_action_key)

    def initialize_q_values(self):
        alpha_values = np.linspace(0, 1, 10)
        b_values = np.linspace(0, 0.5, 10)
        p_values = np.linspace(1, self.env.P_max, 10)
        f_ue_values = np.linspace(1, self.env.F_max_ue, 10)
        f_es_values = np.linspace(1, self.env.F_max_es/10, 10)

        q_values = {}
        for alpha in alpha_values:
            for b in b_values:
                for p in p_values:
                    for f_ue in f_ue_values:
                        for f_es in f_es_values:
                            action = {
                                'alpha_m': alpha,
                                'b_m': b,
                                'p_m': p,
                                'f_ue_m': f_ue,
                                'f_es_m': f_es
                            }
                            q_values[frozenset(action.items())] = 0

        return q_values

    def generate_random_action(self):
        alpha_values = np.linspace(0, 1, 10)
        b_values = np.linspace(0, 0.5, 10)
        p_values = np.linspace(1, self.env.P_max, 10)
        f_ue_values = np.linspace(1, self.env.F_max_ue, 10)
        f_es_values = np.linspace(1, self.env.F_max_es/10, 10)

        return {
            'alpha_m': np.random.choice(alpha_values),
            'b_m': np.random.choice(b_values),
            'p_m': np.random.choice(p_values),
            'f_ue_m': np.random.choice(f_ue_values),
            'f_es_m': np.random.choice(f_es_values)
        }

    def update_q_table(self, state, action, reward, next_state):
        next_state = tuple(next_state)
        state = tuple(state)

        if next_state not in self.q_table:
            self.q_table[next_state] = self.initialize_q_values()

        action_key = frozenset(action.items())
        best_next_action = max(self.q_table[next_state], key=self.q_table[next_state].get)
        self.q_table[state][action_key] += self.alpha * (reward + self.gamma * self.q_table[next_state][best_next_action] - self.q_table[state][action_key])

    def train(self, num_episodes):
        rewards = []  # List to store rewards for each episode
        for episode in range(num_episodes):
            state = self.get_state()  # Get the initial state
            self.env.reset()  # Reset the environment
            total_reward = 0  # Initialize total reward for this episode
            total_delay = 0  # Initialize total delay
            total_energy = 0  # Initialize total energy consumption
            done = False  # Initialize the done flag
            steps = 0  # Initialize step counter
            task_count = 0  # Initialize task counter

            while not done and steps < self.max_steps_per_episode:
                action = self.get_action(state)  # Select an action
                reward, next_state, done = self.env.step([action for _ in range(self.num_users)])  # Take a step in the environment
                self.update_q_table(state, action, reward, next_state)  # Update the Q-table
                state = next_state  # Update the state
                total_reward += reward  # Accumulate reward
                total_delay += next_state['total_delay']  # Accumulate delay
                total_energy += next_state['total_energy']  # Accumulate energy consumption
                steps += 1  # Increment step counter
                task_count += 1  # Increment task counter

            rewards.append(total_reward)  # Store the total reward for this episode
            avg_delay = total_delay / steps  # Calculate average delay
            avg_energy = total_energy / steps  # Calculate average energy consumption
            print(f"Episode {episode + 1}/{num_episodes} - Total Reward: {total_reward}, Avg Delay: {avg_delay}, Avg Energy: {avg_energy}, Tasks Created: {task_count}")

        return rewards


    def test(self, S_max_es_values, E_max_values):
        results = []
        for S_max_es in S_max_es_values:
            for E_max in E_max_values:
                self.env.S_max_es = S_max_es
                self.env.E_max = E_max

                delays = []
                for _ in range(10):
                    self.env.reset()
                    state = self.get_state()
                    done = False
                    total_delay = 0

                    while not done:
                        action = self.get_action(state)
                        _, next_state, done = self.env.step([action for _ in range(self.num_users)])
                        total_delay += next_state['total_delay']
                        state = next_state

                    delays.append(total_delay)

                avg_delay = np.mean(delays)
                results.append((S_max_es, E_max, avg_delay))

        return results

# Create the environment
env = EdgeComputingEnvironment()

# Initialize the Q-learning agent
agent = QLearningAgent(env, num_users=env.M)

# Train the agent
num_episodes = 1
training_rewards = agent.train(num_episodes)

# Test the agent with different S_max_es and E_max values
S_max_es_values = [60]
E_max_values = [1.5]

test_results = agent.test(S_max_es_values, E_max_values)

# Plot the results
x = np.arange(len(E_max_values))
for S_max_es in S_max_es_values:
    y = [result[2] for result in test_results if result[0] == S_max_es]
    plt.plot(x, y, label=f'S_max_es = {S_max_es} KB')

plt.xlabel('E_max (mJ)')
plt.ylabel('Average Delay (ms)')
plt.title('Average Delay vs. E_max for Different S_max_es Values')
plt.xticks(x, E_max_values)
plt.legend()
plt.grid(True)
plt.show()

Episode 1/1 - Total Reward: [-1383787.33239409], Avg Delay: [1304875.29183511], Avg Energy: [0.00061789], Tasks Created: 10


KeyboardInterrupt: 