# In [1]:
import numpy as np
import random
from tensorflow.keras import models, layers, optimizers
from collections import deque
import matplotlib.pyplot as plt

# In [2]:
# Double Deep Q-Network (DDQN) Agent
class DDQNAgent:
    def __init__(self, env, alpha=0.001, gamma=0.95, epsilon=1.0, batch_size=64, max_steps_per_episode=1, update_target_freq=25):
        self.env = env  # Environment for the agent
        self.num_users = env.M  # Number of users/devices in the environment
        self.num_tasks = env.M # Number of task
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor for future rewards
        self.epsilon = epsilon  # Exploration rate
        self.batch_size = batch_size
        self.max_steps_per_episode = max_steps_per_episode  # Maximum steps per episode
        self.save_interval = 1000  # save model 
        self.num_candidates = 1000  # Define the number of candidate actions to sample
        self.state_dim = 5  # state dimensions
        self.action_dim = 6  # action dimensions
        self.all_action = self.sample_all_action()  # Actions Space = 200000
        self.is_training = True
        self.update_target_freq = update_target_freq  # Frequency to update the target network

        self.memory = deque(maxlen=2000)
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_network()  # Initialize the target network

    def build_model(self):
        """Build and compile the network that scores one (state, action) pair.

        The state vector (``state_dim`` values) and the action vector
        (``action_dim`` values) are concatenated, passed through three
        64-unit ReLU layers and reduced to a single linear output.

        Returns:
            A compiled Keras model taking ``[state, action]`` as inputs, trained
            with MSE loss and Adam (learning rate ``self.alpha``, gradient
            values clipped at 1.0).
        """
        state_in = layers.Input(shape=(self.state_dim,))
        action_in = layers.Input(shape=(self.action_dim,))

        # Joint representation of the pair, then the hidden stack
        hidden = layers.Concatenate()([state_in, action_in])
        for _ in range(3):
            hidden = layers.Dense(64, activation='relu')(hidden)

        # Single scalar score for the pair
        score = layers.Dense(1, activation='linear')(hidden)

        network = models.Model(inputs=[state_in, action_in], outputs=score)
        adam = optimizers.Adam(learning_rate=self.alpha, clipvalue=1.0)
        network.compile(loss='mse', optimizer=adam)
        return network

    def update_target_network(self):
        tau = 0.125
        weights = self.model.get_weights() # give weights
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * tau + target_weights[i] * (1 - tau) # update weights
        
        self.target_model.set_weights(target_weights) # add weights that updated
        # self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def update_epsilon(self, episode, total_episodes):
        """
        Adjust the epsilon value based on the episode index.
        The decay rate changes based on different stages of the training process.
        """
        # Define three different decay rates for three stages of training
        if episode < 0.25 * total_episodes:  # Stage 1 (0–25% of episodes)
            decay_rate = 0.9992  # Slow decay for broad exploration
        elif 0.25 * total_episodes <= episode < 0.75 * total_episodes:  # Stage 2 (25–75%)
            decay_rate = 0.9980  # Moderate decay
        else:  # Stage 3 (75–100%)
            decay_rate = 0.9930  # Faster decay to focus on exploitation

        # Update epsilon value
        min_epsilon = 0.01  # Set a minimum epsilon value
        self.epsilon = max(min_epsilon, self.epsilon * decay_rate)

    def act(self, state):
        if np.random.rand() <= self.epsilon and self.is_training:
            # Choose a random action from the predefined set (exploration)
            action = self.sample_random_action()

        else:
            if self.is_training:
                # Sample a subset of actions from all actions for training
                candidate_actions = np.array([self.sample_random_action() for _ in range(self.num_candidates)])

            else:
                # Use all actions during testing
                candidate_actions = self.all_action

            # Prepare the state array for batch prediction
            state_batch = np.tile(state, (candidate_actions.shape[0], 1))

            # Normalize actions
            normalized_action = np.array([self.normalize_action(action) for action in candidate_actions])

            # Predict the reward for each candidate action
            predicted_rewards = self.model.predict([state_batch, normalized_action])

            # Select the action with the highest predicted reward
            best_action_index = np.argmax(predicted_rewards)
            action = candidate_actions[best_action_index]

        return action

    def sample_random_action(self):
        # Generate random values for the action parameters
        # Generate a random value from a discrete set of 10 values ​​(for discretization). Discretization reduces the operation space, improving performance with minimal impact on results.
        alpha = np.round(np.random.choice(np.linspace(0, 1, 10)),3)
        b = np.round(np.random.choice(np.linspace(0.005, (2 / self.num_users), 10)),3)  # The minimum is set to 0.01 to avoid zero values, and the upper limit ensures fair distribution across users.
        p = np.round(np.random.choice(np.linspace(0.005, self.env.P_max, 10)),3)  # The lower limit is 0.01 to avoid zero values, which are impractical and can cause calculation errors.
        f_ue = np.round(np.random.choice(np.linspace(1e6, self.env.F_max_ue, 10))) # The minimum is set to 1 MHz to avoid unrealistic values, ensuring reasonable usage.
        f_es = np.round(np.random.choice(np.linspace(1e6, ((2 * self.env.F_max_es) / self.num_users), 10))) # The lower limit prevents unrealistic values, while the upper limit ensures fair resource distribution.
        cache_hit = np.random.choice([0, 1])  # Generate a random cache hit value, either 0 (no cache hit) or 1 (cache hit).
        
        return np.array([alpha, b, p, f_ue, f_es, cache_hit])
    
    def sample_all_action(self):
        # Generate all action
        alpha = np.round(np.linspace(0, 1, 10),3) 
        b = np.round(np.linspace(0.005, (2 / self.num_users), 10),3) 
        p = np.round(np.linspace(0.005, self.env.P_max, 10),3) 
        f_ue = np.round(np.linspace(1e6, self.env.F_max_ue, 10))
        f_es = np.round(np.linspace(1e6, ((2 * self.env.F_max_es) / self.num_users), 10))
        cache_hit = [0, 1]

        samples = []

        for i in alpha:
            for j in b:
                for k in p:
                    for l in f_ue:
                        for m in f_es:
                            for n in cache_hit:
                                samples.append(np.array([i, j, k, l, m, n]))

        samples = np.array(samples)

        return samples

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        # Prioritized sampling based on rewards
        minibatch = random.sample(self.memory, self.batch_size)

        # Vectorized state, action, next_state extraction and reshaping
        states = np.array([sample[0] for sample in minibatch])
        actions = np.array([sample[1] for sample in minibatch])
        rewards = np.array([sample[2] for sample in minibatch])
        next_states = np.array([sample[3] for sample in minibatch])
        dones = np.array([sample[4] for sample in minibatch])

        states = states.reshape(self.batch_size, -1)
        actions = actions.reshape(self.batch_size, -1)
        next_states = next_states.reshape(self.batch_size, -1)

        # Sample a subset of actions from all actions for training
        candidate_actions = np.array([self.sample_random_action() for _ in range(self.num_candidates)])
        normalized_candidate_actions = np.array([self.normalize_action(action) for action in candidate_actions])

        # Expand dimensions to match state_batch and normalized_candidate_actions
        state_batch = np.repeat(next_states, self.num_candidates, axis=0)
        action_batch = np.tile(normalized_candidate_actions, (self.batch_size, 1))

        # Predict Q-values ​​for each next_state-action pair
        predicted_q_values = self.model.predict([state_batch, action_batch])
        max_q_values = np.max(predicted_q_values.reshape(self.batch_size, self.num_candidates), axis=1)

        # Compute target for the Q-learning update
        targets = rewards + self.gamma * max_q_values * (1 - dones)

        # Predict current Q-values ​​and update them
        target_f = self.model.predict([states, actions])

        for i in range(self.batch_size):
            target_f[i, 0] = targets[i]
        
        # Train the model on the updated Q-values
        self.model.fit([states, actions], target_f, epochs=1, verbose=0)

    def normalize_state(self, state):
        # Normalize states between values ​​0 and 1
        normalized_state = np.array([
                state[0] / 300,  # Normalizing task complexity (eta_m)
                state[1],  # Bandwidth (assuming it's already normalized to [0, 1])
                state[2] / self.env.F_max_es,  # Normalizing computation
                state[3], # cache hit
                state[4] / (self.env.area_size/2) # distance
        ])
        return normalized_state
    
    def normalize_action(self, action):
        # Normalize actions between values ​​0 and 1
        normalized_action = np.array([
                action[0] ,  # alpha
                action[1] / (2 / self.num_users),  # bandwidth
                action[2] / self.env.P_max,  # power transmission
                action[3] / self.env.F_max_ue,  # Normalizing computation of user
                action[4] / ((2 * self.env.F_max_es) / self.num_users),  # Normalizing computation of server
                action[5] # cache hit
        ])
        return normalized_action

    def load(self, name):
        self.model.load_weights(f'{name}model.weights.h5')
        self.target_model.load_weights(f'{name}target_model.weights.h5')

    def save(self, name):
        self.model.save_weights(f'{name}model.weights.h5')
        self.target_model.save_weights(f'{name}target_model.weights.h5')

    def train(self, num_episodes):
        # Lists to store average delay and energy values for each episode
        avg_delays = []
        avg_energies = []
        avg_rewards = []

        self.env.is_training = True
        self.is_training = True

        for episode in range(num_episodes):

            self.env.reset()

            # Initialize total delay and energy for this episode
            total_delay = 0
            total_energy = 0
            total_reward = 0

            done_step = False

            # Initialize the number of tasks in this episode
            num_all_tasks = 0
            actual_steps = 0

            for step in range(self.max_steps_per_episode):

                if done_step:
                    break

                self.env.create_task()

                for item in range(self.num_users):

                    state = np.array(self.env.get_state())

                    normalized_state = np.array(self.normalize_state(state))

                    action = self.act(normalized_state)
                    
                    # Execute the actions in the environment
                    reward, next_state_info, done = self.env.step(action)

                    # Extract delay and energy values from the next device information
                    delay = next_state_info.pop(0)
                    energy = next_state_info.pop(0)

                    next_state = np.array(next_state_info)

                    normalized_action = self.normalize_action(action)
                    normal_nextstate = self.normalize_state(next_state)

                    self.remember(normalized_state, normalized_action, reward, normal_nextstate, done)

                    # Accumulate the total delay and energy for the episode
                    total_delay += delay
                    total_energy += energy
                    total_reward += reward

                    num_all_tasks += 1

                    if done:
                        done_step = True
                        # Exit the loop if the episode is done
                        break

                self.env.increase_time()

                actual_steps += 1
                
            self.env.render()

            # Calculate and store average delay and energy for the episode
            avg_delay = (total_delay / num_all_tasks) * 1000  # Convert to milliseconds
            avg_energy = total_energy / num_all_tasks
            avg_reward = total_reward / num_all_tasks
            avg_delays.append(avg_delay)
            avg_energies.append(avg_energy)
            avg_rewards.append(avg_reward)

            # Update epsilon for the epsilon-greedy strategy
            self.update_epsilon(episode, num_episodes)

            self.replay()

            # save model
            if (episode + 1) % self.save_interval == 0:
                self.save(f'Model{episode+1}')

            if (episode + 1) % self.update_target_freq == 0:
                self.update_target_network()
            
            # Print the episode's results
            # print(f"Train : Episode {episode + 1}/{num_episodes} - Steps Count {actual_steps} - Tasks Count {num_all_tasks} - Avg Delay: {avg_delay}, Avg Energy: {avg_energy}, Avg Reward: {avg_reward}")
            # print("-" * 100)

        # Optionally plot the results
        # self.plot_results(avg_delays, avg_energies, avg_rewards)

    def test(self, num_test_steps=1):
        # Initialize total delay, alpha values, and rewards for the test
        total_delay = 0
        total_alpha = 0
        total_energy = 0
        total_reward = 0
        
        done_step = False

        # Counter for actual steps
        num_all_tasks = 0
        actual_steps = 0
        
        # Set epsilon to 0 for testing (no exploration)
        self.epsilon = 0

        self.env.is_training = False
        self.is_training = False
        
        # Reset the environment 
        self.env.reset()

        for step in range(num_test_steps):

            if done_step:
                    break

            self.env.create_task()

            for item in range(self.num_users):

                state = np.array(self.env.get_state())

                normalized_state = self.normalize_state(state)

                action = self.act(normalized_state)
                
                # Execute the actions in the environment
                reward, next_state_info, done = self.env.step(action)

                # Extract delay and energy values from the next device information
                delay = next_state_info.pop(0)
                energy = next_state_info.pop(0)

                # Accumulate delay and alpha values
                total_delay += delay
                total_energy += energy
                total_alpha += 1 - action[0]

                num_all_tasks += 1

                if done:
                    done_step = True
                    # Exit the loop if the episode is done
                    break

            self.env.increase_time()

            # Increment the actual steps counter
            actual_steps += 1

        self.env.render()

        # Calculate and return the average delay and alpha for the test
        total_delay = total_delay * 1000  # Convert to milliseconds
        avg_energy = (total_energy / num_all_tasks) * 1000  # Convert to milliJoule 
        avg_alpha = total_alpha / num_all_tasks
        avg_reward = total_reward / num_all_tasks


        # Print the episode's results
        # print(f"Test : Steps Count {actual_steps} - Tasks Count {num_all_tasks} - Delay: {total_delay}, Avg Energy: {avg_energy}, Avg Reward: {avg_reward}, Avg Alpha: {avg_alpha}")
        # print("-" * 100)

        # if num_all_tasks == 15:
        #     return avg_delay, avg_alpha

        return total_delay, avg_alpha


    def plot_results(self, avg_delays, avg_energies, avg_rewards):
        """Plot three per-episode curves side by side and show the figure.

        Args:
            avg_delays: Per-episode average delays; its length defines the x axis.
            avg_energies: Per-episode average energies.
            avg_rewards: Per-episode average rewards.
        """
        episodes = np.arange(1, len(avg_delays) + 1)

        # (series, legend label, y-axis label, panel title), left to right
        panels = (
            (avg_delays, 'Avg Delay', 'Average Delay', 'Average Delay per Episode'),
            (avg_energies, 'Avg Energy', 'Average Energy', 'Average Energy per Episode'),
            (avg_rewards, 'Avg Reward', 'Average Reward', 'Average Reward per Episode'),
        )

        plt.figure(figsize=(12, 6))

        for position, (series, label, y_label, title) in enumerate(panels, start=1):
            plt.subplot(1, 3, position)
            plt.plot(episodes, series, label=label)
            plt.xlabel('Episode')
            plt.ylabel(y_label)
            plt.title(title)
            plt.legend()

        plt.tight_layout()
        plt.show()