In [1]:
import numpy as np
import random
from scipy.special import erfcinv

import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from collections import deque

import os
import matplotlib.pyplot as plt

In [2]:
# Edge Computing Environment
class EdgeComputingEnvironment:
    def __init__(self, M=15, area_size=100, D_m=1354, eta_m_range=(100, 300), F_max_ue=1.5, P_max=23, B=5, T_max=10, F_max_es=30, S_max_es=60, epsilon=10**-7, E_max=3, theta=10**-26, L=8, phi=0.02, N0=-174, f_es_dev=0.02, f_ue_dev=0.02):
        """
        Initialize the edge computing environment with given parameters.
        """
        self.M = M  # Number of users
        self.area_size = area_size  # Size of the area in which users are distributed
        self.D_m = D_m  # Task data size
        self.eta_m_range = eta_m_range  # Range of Task complexity
        self.F_max_ue = F_max_ue * 10**9  # Maximum frequency of user equipment
        self.P_max = 10 ** ((P_max - 30) / 10)  # Convert maximum transmission power from dBm to Watts
        self.B = B * 10**6  # Bandwidth
        self.T_max = T_max * 10**-3  # Maximum tolerable delay
        self.F_max_es = F_max_es * 10**9  # Maximum frequency of edge server
        self.S_max_es = S_max_es * 10**3  # Maximum cache size of edge server
        self.epsilon = epsilon  # Error tolerance for rate calculation
        self.E_max = E_max * 10**-3  # Maximum energy consumption
        self.theta = theta  # Energy coefficient
        self.L = L  # Number of antennas
        self.phi = phi * 10**-3  # Transmission time interval
        self.R_min = 10**6  # Minimum data rate
        self.N0 = N0  # Noise power in dBm
        self.N0 = 10 ** ((N0 - 30) / 10)  # Convert noise power from dBm/Hz to Watts/Hz
        self.PL_d = lambda d: -35.3 - (37.6 * np.log10(d))  # Path loss model
        self.f_es_dev = f_es_dev  #The deviation between the estimated value and the actual value of the processing rate of the ES
        self.f_ue_dev = f_ue_dev  #The deviation between the estimated value and the actual value of the processing rate of the UE
        self.is_training = True
        self.tasks = []
        self.current_task = {}
        self.Task_processed = 0
        self.penalty = 10
        self.penalties = [0,0,0,0,0,0]

        self.user_device_params = []  # List to store parameters for each user device
        self.initialize_user_device_params()  # Initialize user device parameters

        self.cache = []  # Cache to store tasks
        self.current_cache_size = 0  # Current size of the cache
        self.transmitting_tasks = []  # List to store transmitting tasks
        self.processing_tasks = []  # List to store processing tasks
        self.current_time = 0.0  # Current simulation time

        # Initialize bandwidth and computation attributes
        self.total_bandwidth = 0 # Initialize total bandwidth
        self.total_computation = 0 # Initialize total computation

    def initialize_user_device_params(self):
        """
        Initialize parameters for each user device.
        Randomly generates user-specific parameters such as path loss.
        """
        for device_id in range(self.M):
            d = np.random.uniform(1, self.area_size / 2)  # Distance to server
            PL_dB = self.PL_d(d)
            g_m = 10 ** (PL_dB / 10)  # Convert path loss from dB to linear scale
            h_bar = np.random.randn(1, self.L) + 1j * np.random.randn(1, self.L)  # Channel gain

            self.user_device_params.append({
                'device_id': device_id,  # Assign a unique ID to each device
                'd': d,
                'g_m': g_m,
                'h_bar': h_bar,
            })

    def create_task(self):    
        task_distribution = np.random.choice(range(self.M), self.M, replace=True)
        self.tasks = []
        for user_id in task_distribution:
            eta_m =  np.round(np.random.choice(np.linspace(self.eta_m_range[0], self.eta_m_range[1], 50)))
            T_max_task = self.T_max  # Static according to article
            order = np.round(random.uniform(1,5))

            task_info = {
            'eta_m': eta_m,
            'T_max': T_max_task,
            'D_m': 1354,  # Task data size
            'user_id' : user_id,
            'order' : order
            }

            self.tasks.append(task_info)

    def get_state(self):

        self.tasks = sorted(self.tasks, key=lambda x: x['order'], reverse=True)

        task = self.tasks.pop()

        self.current_task = task

        cache_hit = 1 if any(task == task_info[0] for task_info in self.cache) else 0

        state = [
            task['eta_m'],
            self.total_bandwidth,
            self.total_computation,
            cache_hit,
            self.user_device_params[task['user_id']]['d']
        ]

        return state

    def calculate_gamma_m(self, b_m, p_m, user_id):
        """
        Calculate the signal-to-noise ratio (SNR) for a given user.

        Parameters:
        - b_m (float): Bandwidth allocation
        - p_m (float): Transmission power
        - user_id (int): ID of the user

        Returns:
        - gamma_m (array): SNR values for the user's communication channel
        """
        h_m = np.sqrt(self.user_device_params[user_id]['g_m']) * self.user_device_params[user_id]['h_bar']  # Channel gain
        gamma_m = (p_m * np.linalg.norm(h_m, axis=1) ** 2) / (b_m * self.B * self.N0)  # SNR
        
        return gamma_m

    def calculate_uplink_rate(self, b_m, p_m, user_id):
        """
        Calculate the uplink data rate for a given user.

        Parameters:
        - b_m (float): Bandwidth allocation
        - p_m (float): Transmission power
        - user_id (int): ID of the user

        Returns:
        - R_m (float): Uplink data rate in bits/second
        """
        gamma_m = self.calculate_gamma_m(b_m, p_m, user_id)  # Calculate the SINR for the m-th user
        V_m = 1 - (1 / (1 + gamma_m) ** 2)  # Intermediate variable for rate calculation
        Q_inv = np.sqrt(2) * erfcinv(2 * self.epsilon)  # Calculate the inverse of the Q-function for the outage probability
        R_m = (self.B / np.log(2)) * ((b_m * np.log(1 + gamma_m)) - ((np.sqrt((b_m * V_m) / (self.phi * self.B))) * Q_inv))  # Uplink data rate

        return R_m

    def calculate_delay(self, alpha_m, cache_hit, R_m, D_m, f_ue_m, f_es_m, f_ue_est, f_es_est, eta_m):
        """
        Calculate the end-to-end delay for a given task.

        Parameters:
        - alpha_m (float): Offloading decision
        - cache_hit (int): Split factor (0 or 1)
        - R_m (float): Uplink data rate in bits/second
        - D_m (int): Data size
        - f_ue_m (float): Computation capability of the user device
        - f_es_m (float): Computation capability of the edge server
        - f_ue_est (float): Estimation error for the user device's computation capability
        - f_es_est (float): Estimation error for the edge server's computation capability
        - eta_m (float): Computational intensity

        Returns:
        - T_e2e (float): End-to-end delay in seconds
        """
        actual_f_ue_m = f_ue_m - f_ue_est  # Actual processing rate of the user device

        if cache_hit == 1:
            T_es = self.calculate_server_processing_delay(alpha_m, cache_hit, D_m, f_es_m, f_es_est, eta_m)  # Only edge server processing delay
            T_e2e = T_es

        else:
            T_ue = (alpha_m * eta_m * D_m) / actual_f_ue_m  # User device processing delay
            T_tr = self.calculate_transmission_delay(alpha_m, R_m, D_m)  # Transmission delay
            T_es = self.calculate_server_processing_delay(alpha_m, cache_hit, D_m, f_es_m, f_es_est, eta_m)  # Edge server processing delay
            T_e2e = T_ue + T_tr + T_es  # Total end-to-end delay

        return T_e2e

    def calculate_transmission_delay(self, alpha_m, R_m, D_m):
        """
        Calculate the transmission delay for a given task.

        Parameters:
        - alpha_m (float): Offloading decision
        - R_m (float): Uplink data rate in bits/second
        - D_m (int): Data size
        - user_id (int): ID of the user

        Returns:
        - T_co (float): Transmission delay in seconds
        """
        T_co =  ((1 - alpha_m) * (D_m * 8)) / R_m   # Transmission delay calculation based on task size and uplink rate

        return T_co

    def calculate_server_processing_delay(self, alpha_m, cache_hit, D_m, f_es_m, f_es_est, eta_m):
        """
        Calculate the processing delay at the edge server for a given task.

        Parameters:
        - alpha_m (float): Offloading decision
        - D_m (int): Data size
        - cache_hit (0,1): 1 = Exist in cache and 0 not exist in cache
        - f_es_m (float): Computation capability of the edge server
        - f_es_est (float): Estimation error for the edge server's computation capability
        - eta_m (float): Computational intensity

        Returns:
        - T_es (float): Processing delay at the edge server in seconds
        """

        actual_f_es_m = f_es_m - f_es_est  # Actual processing rate of the Edge server

        if cache_hit == 0:
            T_es = ((1 - alpha_m) * eta_m * D_m) / actual_f_es_m  # Processing delay at the edge server

        else:
            T_es = (eta_m * D_m) / actual_f_es_m
        return T_es

    def calculate_energy_consumption(self, s_m, R_m, alpha_m, p_m, D_m, f_ue_m, f_ue_est, eta_m):
        """
        Calculate the energy consumption for a given task.

        Parameters:
        - alpha_m (float): Offloading decision
        - R_m (float): Uplink data rate in bits/second
        - s_m (int): Split factor (0 or 1)
        - f_ue_m (float): Computation capability of the user device
        - p_m (float): Transmission power
        - f_ue_est (float): Estimation error for the user device's computation capability
        - eta_m (float): Computational intensity

        Returns:
        - E_total (float): Total energy consumption in Joules
        """

        actual_f_ue_m = f_ue_m - f_ue_est  # Calculate the actual processing rate of the UE

        E_ue = alpha_m * (self.theta / 2) * eta_m * D_m * (actual_f_ue_m ** 2)  # Energy consumption at the user device
        E_tx = ((D_m * 8) * p_m) / R_m  # Transmission energy

        if s_m == 1:  # Task is in cache
            E_total = 0  # No energy consumed when task is in cache
        else:
            E_total = E_ue + E_tx  # Total energy consumption

        return E_total

    def manage_cache(self, task_info, task_delay, cache_hit_model):
        """
        Manage the cache for storing and retrieving tasks.

        Parameters:
        - task_info (tuple): Task parameters to identify the task
        - task_delay (float): Delay of the task

        Returns:
        - bool: True if the task is found in the cache, False otherwise
        """
        # Check for cache hit first
        cache_hit = any(task_info == task[0] for task in self.cache)
        
        if task_delay == 0:
            return cache_hit  # Return True if found, False otherwise
        
        task_size = task_info['D_m'] * 8  # Task size
        Server_Max_Capacity = self.S_max_es  # Server maximum capacity
        if Server_Max_Capacity == 0 :
            return
        
        if self.is_training :
            # During training
            if cache_hit:
                self.current_cache_size -= task_info['D_m'] * 8
                self.cache = [task for task in self.cache if task[0] != task_info]

            # During training, always update cache based on task delay
            self.cache.append((task_info, task_delay))  # Add task to cache
            self.current_cache_size += task_size  # Update cache size
        
            if self.current_cache_size >= Server_Max_Capacity:
                sorted_cache = sorted(self.cache, key=lambda x: x[1], reverse=True)  # Sort tasks by delay in descending order

                while (self.current_cache_size) > Server_Max_Capacity:
                    if not sorted_cache:
                        break  # Exit loop if sorted_cache is empty
                    last_task = sorted_cache.pop()  # Remove the last task from sorted_cache
                    self.cache.remove(last_task)  # Remove the task from the cache
                    self.current_cache_size -= last_task[0]['D_m'] * 8  # Update current cache size

        if self.is_training is False:
            # During testing, follow the model's prediction
            if cache_hit_model == 0 :
                if cache_hit:
                    self.current_cache_size -= task_info['D_m'] * 8
                    self.cache = [task for task in self.cache if task[0] != task_info]
            else:
                if cache_hit:
                    self.current_cache_size -= task_info['D_m'] * 8
                    self.cache = [task for task in self.cache if task[0] != task_info]

                if (task_size + self.current_cache_size) <= Server_Max_Capacity:
                    self.cache.append((task_info, task_delay))  # Add task to cache
                    self.current_cache_size += task_size  # Update cache size

                else:
                    sorted_cache = sorted(self.cache, key=lambda x: x[1], reverse=True)  # Sort tasks by delay in descending order

                    while (task_size + self.current_cache_size) > Server_Max_Capacity:
                        if not sorted_cache:
                            break  # Exit loop if sorted_cache is empty
                        last_task = sorted_cache.pop()  # Remove the last task from sorted_cache
                        self.cache.remove(last_task)  # Remove the task from the cache
                        self.current_cache_size -= last_task[0]['D_m'] * 8  # Update current cache size

                    self.cache.append((task_info, task_delay))  # Add task to cache
                    self.current_cache_size += task_size  # Update cache size


    def step(self, action):
        """
        Perform a simulation step for the given action.

        Parameters:
        - action (array): Array of action for each user
        - tasks (array): Array of task for each user

        Returns:
        - tuple: (task_rewards, next_state, done)
        """
        done = False

        alpha_m = action[0]
        b_m = action[1]
        p_m = action[2]
        f_ue_m = action[3]
        f_es_m = action[4]
        cache_hit_model = action[5]

        task = self.current_task

        user_id = task.pop('user_id',0)
        task.pop('order')
        
        # Determine if the task is a cache hit or miss
        cache_hit = 1 if self.manage_cache(task, 0, cache_hit_model) else 0

        f_ue_est = f_ue_m * self.f_ue_dev  
        f_es_est = f_es_m * self.f_es_dev  

        # Calculate the uplink data rate for the user
        R_m = self.calculate_uplink_rate(b_m, p_m, user_id)

        # Calculate the end-to-end delay for the task
        delay = self.calculate_delay(
            alpha_m, cache_hit, R_m,
            task['D_m'], f_ue_m, f_es_m, f_ue_est,
            f_es_est, task['eta_m']
        )

        delay = np.round(delay[0],6) if isinstance(delay, np.ndarray) else np.round(delay,6)

        # Calculate the energy consumption for the task
        energy = self.calculate_energy_consumption(
            cache_hit, R_m, alpha_m, p_m, task['D_m'], f_ue_m,
            f_es_est, task['eta_m']
        )

        energy = np.round(energy[0],6) if isinstance(energy, np.ndarray) else np.round(energy,6)

        # Manage task transmission and processing times
        if cache_hit == 0:
            transmission_end_time = self.current_time + self.calculate_transmission_delay(alpha_m, R_m, task['D_m'])
            processing_end_time = transmission_end_time + self.calculate_server_processing_delay(alpha_m, cache_hit, task['D_m'], f_es_m, f_es_est, task['eta_m'])

            self.transmitting_tasks.append((self.current_time, transmission_end_time, b_m))
            process = f_es_m * (1 - alpha_m)
            self.processing_tasks.append((transmission_end_time, processing_end_time, process))

        else:
            # For cache hit, only processing delay is considered
            processing_end_time = self.current_time + self.calculate_server_processing_delay(alpha_m, cache_hit, task['D_m'], f_es_m, f_es_est, task['eta_m'])
            self.processing_tasks.append((self.current_time, processing_end_time, f_es_m))

        # Update cache with the task if it becomes eligible
        self.manage_cache(task, delay, cache_hit_model)
            
        # Calculate total bandwidth and computation resource usage at current time
        self.total_bandwidth = sum(b for _, end_time, b in self.transmitting_tasks if end_time > self.current_time)
        self.total_computation = sum(f for _, end_time, f in self.processing_tasks if end_time > self.current_time)

        # Free resources for tasks that have completed transmission or processing
        self.transmitting_tasks = [(start_time, end_time, b) for start_time, end_time, b in self.transmitting_tasks if end_time > self.current_time]
        self.processing_tasks = [(start_time, end_time, f) for start_time, end_time, f in self.processing_tasks if end_time > self.current_time]

        # Check The Cache hit of model is right or not 
        cache_hit = 1 if self.manage_cache(task, 0, cache_hit_model) else 0
        cache_hit_right = 1 if cache_hit == cache_hit_model else 0

        x = (((1/self.M) - b_m) ** 2) * 10000

        # Calculate reward
        reward  = (-energy - delay)*1e4 
        # print(reward)
        reward -= x
        # print(action)
        # print(delay)
        # print(energy)
        
        # Apply penalties for exceeding resource limits
        if delay > task['T_max']:
            reward -= self.penalty
            done = True
            self.penalties[0] += 1
        if cache_hit_right == 0:
            reward -= self.penalty
            self.penalties[1] += 1
        if R_m < self.R_min:
            reward -= self.penalty
            done = True
            self.penalties[2] += 1
        if energy > self.E_max:
            reward -= self.penalty
            done = True
            self.penalties[3] += 1
        if self.total_bandwidth > 1:
            reward -= self.penalty
            done = True
            self.penalties[4] += 1
        if self.total_computation > self.F_max_es:
            reward -= self.penalty
            done = True
            self.penalties[5] += 1

        # if b_m <= 1/self.M and R_m >= self.R_min*3:
        #     reward += self.penalty*3
            
            
        # if b_m >= 1/self.M:
        #     reward -= self.penalty*3

        # print(reward)
        state_info = [
            delay,
            energy,
            task['eta_m'],   # task complexity
            self.total_bandwidth,
            self.total_computation,
            cache_hit_right,
            self.user_device_params[user_id]['d']
        ]

        self.Task_processed += 1

        return reward, state_info, done

    # Increment current simulation time
    def increase_time(self):
        self.current_time += self.T_max
        
    def reset(self):
        """
        Reset the environment to its initial state.
        """
        self.cache = [] 
        self.current_cache_size = 0
        self.Task_processed = 0  
        self.transmitting_tasks = [] 
        self.processing_tasks = [] 
        self.current_time = 0.0  
        self.user_device_params = []
        self.initialize_user_device_params()
        self.total_bandwidth = 0  
        self.total_computation = 0  

    def render(self):
        print(self.penalties)
        # print(f"Number of Users: {self.M}")
        # print(f"Number of Task Processed: {self.Task_processed}")
        # print(f"Total Bandwidth Used: {self.total_bandwidth}")
        # print(f"Total Computation Used: {self.total_computation}")
        # print(f"Current Cache Size: {np.round(self.current_cache_size/1000,2)} Kb")

In [3]:
# Double Deep Q-Network (DDQN) Agent
class DDQNAgent:
    def __init__(self, env, alpha=0.001, gamma=0.95, epsilon=1.0, batch_size=64, max_steps_per_episode=1, update_target_freq=25):
        """
        Create the agent, its replay memory and its two Q-networks.

        Parameters:
        - env: environment the agent acts in (read for ``M`` and resource limits)
        - alpha (float): learning rate of the optimizer
        - gamma (float): discount factor for future rewards
        - epsilon (float): initial exploration rate
        - batch_size (int): number of transitions per replay update
        - max_steps_per_episode (int): maximum steps per episode
        - update_target_freq (int): episodes between target-network updates
        """
        # Environment and the sizes derived from it
        self.env = env
        self.num_users = env.M
        self.num_tasks = env.M

        # Hyper-parameters supplied by the caller
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.max_steps_per_episode = max_steps_per_episode
        self.update_target_freq = update_target_freq

        # Fixed settings
        self.save_interval = 1000  # episodes between weight checkpoints
        self.num_candidates = 1000  # candidate actions scored per greedy decision
        self.state_dim = 5
        self.action_dim = 6
        self.is_training = True
        self.all = 0  # running total of tasks handled across episodes

        # Full discretised action grid (10**5 * 2 = 200000 rows)
        self.all_action = self.sample_all_action()

        # Replay memory and networks
        self.memory = deque(maxlen=2000)
        self.model = self.build_model()
        self.target_model = self.build_model()
        # Blend the online weights into the freshly built target network.
        self.update_target_network()

    def build_model(self):
        """
        Build and compile the Q-network.

        The network takes a state vector and an action vector as two separate
        inputs, concatenates them, passes them through three 64-unit ReLU
        layers and outputs a single linear value for the (state, action) pair.

        Returns:
        - a compiled Keras model (MSE loss, Adam with gradient value clipping)
        """
        state_in = layers.Input(shape=(self.state_dim,))
        action_in = layers.Input(shape=(self.action_dim,))

        hidden = layers.Concatenate()([state_in, action_in])
        for _ in range(3):
            hidden = layers.Dense(64, activation='relu')(hidden)
        q_value = layers.Dense(1, activation='linear')(hidden)

        network = models.Model(inputs=[state_in, action_in], outputs=q_value)
        network.compile(
            loss='mse',
            optimizer=optimizers.Adam(learning_rate=self.alpha, clipvalue=1.0),
        )
        return network

    def update_target_network(self):
        tau = 0.125
        weights = self.model.get_weights() # give weights
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * tau + target_weights[i] * (1 - tau) # update weights
        self.target_model.set_weights(target_weights) # add weights that updated
        #self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def update_epsilon(self, episode, total_episodes):
        """
        Adjust the epsilon value based on the episode index.
        The decay rate changes based on different stages of the training process.
        """
        # Define three different decay rates for three stages of training
        if episode < 0.25 * total_episodes:  # Stage 1 (0–25% of episodes)
            decay_rate = 0.9992  # Slow decay for broad exploration
        elif 0.25 * total_episodes <= episode < 0.75 * total_episodes:  # Stage 2 (25–75%)
            decay_rate = 0.9980  # Moderate decay
        else:  # Stage 3 (75–100%)
            decay_rate = 0.9930  # Faster decay to focus on exploitation

        # Update epsilon value
        min_epsilon = 0.01  # Set a minimum epsilon value
        self.epsilon = max(min_epsilon, self.epsilon * decay_rate)

    def act(self, state):
        if np.random.rand() <= self.epsilon and self.is_training:
            # Choose a random action from the predefined set (exploration)
            action = self.sample_random_action()

        else:
            if self.is_training:
                # Sample a subset of actions from all actions for training
                candidate_actions = np.array([self.sample_random_action() for _ in range(self.num_candidates)])

            else:
                # Use all actions during testing
                candidate_actions = self.all_action #np.array([self.sample_random_action() for _ in range(1000)]) 

            # Prepare the state array for batch prediction
            state_batch = np.tile(state, (candidate_actions.shape[0], 1))

            # Normalize actions
            normalized_action = np.array([self.normalize_action(action) for action in candidate_actions])

            # Predict the reward for each candidate action
            predicted_rewards = self.model.predict([state_batch, normalized_action])

            # Select the action with the highest predicted reward
            best_action_index = np.argmax(predicted_rewards)
            action = candidate_actions[best_action_index]

            #if self.is_training==False:                                                                                                                 #
            print(action)
        return action

    def sample_random_action(self):
        # Generate random values for the action parameters
        # Generate a random value from a discrete set of 10 values ​​(for discretization). Discretization reduces the operation space, improving performance with minimal impact on results.
        alpha = np.round(np.random.choice(np.linspace(0, 1, 10)),3)
        b = np.round(np.random.choice(np.linspace(0.005, (2 / self.num_users), 10)),3)  # The minimum is set to 0.01 to avoid zero values, and the upper limit ensures fair distribution across users.
        p = np.round(np.random.choice(np.linspace(0.005, self.env.P_max, 10)),3)  # The lower limit is 0.01 to avoid zero values, which are impractical and can cause calculation errors.
        f_ue = np.round(np.random.choice(np.linspace(1e6, self.env.F_max_ue, 10))) # The minimum is set to 1 MHz to avoid unrealistic values, ensuring reasonable usage.
        f_es = np.round(np.random.choice(np.linspace(1e6, ((2 * self.env.F_max_es) / self.num_users), 10))) # The lower limit prevents unrealistic values, while the upper limit ensures fair resource distribution.
        cache_hit = np.random.choice([0, 1])  # Generate a random cache hit value, either 0 (no cache hit) or 1 (cache hit).
        
        return np.array([alpha, b, p, f_ue, f_es, cache_hit])
    
    def sample_all_action(self):
        # Generate all action
        alpha = np.round(np.linspace(0, 1, 10),3) 
        b = np.round(np.linspace(0.005, (2 / self.num_users), 10),3) 
        p = np.round(np.linspace(0.005, self.env.P_max, 10),3) 
        f_ue = np.round(np.linspace(1e6, self.env.F_max_ue, 10))
        f_es = np.round(np.linspace(1e6, ((2 * self.env.F_max_es) / self.num_users), 10))
        cache_hit = [0, 1]

        samples = []

        for i in alpha:
            for j in b:
                for k in p:
                    for l in f_ue:
                        for m in f_es:
                            for n in cache_hit:
                                samples.append(np.array([i, j, k, l, m, n]))

        samples = np.array(samples)

        return samples

    def replay(self):
        if len(self.memory) < self.batch_size:
            return

        # Prioritized sampling based on rewards
        minibatch = random.sample(self.memory, self.batch_size)

        # Vectorized state, action, next_state extraction and reshaping
        states = np.array([sample[0] for sample in minibatch])
        actions = np.array([sample[1] for sample in minibatch])
        rewards = np.array([sample[2] for sample in minibatch])
        next_states = np.array([sample[3] for sample in minibatch])
        dones = np.array([sample[4] for sample in minibatch])

        states = states.reshape(self.batch_size, -1)
        actions = actions.reshape(self.batch_size, -1)
        next_states = next_states.reshape(self.batch_size, -1)

        # Sample a subset of actions from all actions for training
        candidate_actions = np.array([self.sample_random_action() for _ in range(self.num_candidates)])
        normalized_candidate_actions = np.array([self.normalize_action(action) for action in candidate_actions])

        # Expand dimensions to match state_batch and normalized_candidate_actions
        state_batch = np.repeat(next_states, self.num_candidates, axis=0)
        action_batch = np.tile(normalized_candidate_actions, (self.batch_size, 1))

        # Predict Q-values ​​for each next_state-action pair
        predicted_q_values = self.model.predict([state_batch, action_batch])
        max_q_values = np.max(predicted_q_values.reshape(self.batch_size, self.num_candidates), axis=1)
        print("predicted_q_values shape:", predicted_q_values.shape)

        # Compute target for the Q-learning update
        targets = rewards + self.gamma * max_q_values * (1 - dones)

        # Predict current Q-values ​​and update them
        target_f = self.model.predict([states, actions])

        for i in range(self.batch_size):
            target_f[i, 0] = targets[i]
        
        # Train the model on the updated Q-values
        self.model.fit([states, actions], target_f, epochs=1, verbose=0)

    def normalize_state(self, state):
        # Normalize states between values ​​0 and 1
        normalized_state = np.array([
                state[0] / 300,  # Normalizing task complexity (eta_m)
                state[1],  # Bandwidth (assuming it's already normalized to [0, 1])
                state[2] / self.env.F_max_es,  # Normalizing computation
                state[3], # cache hit
                state[4] / (self.env.area_size/2) # distance
        ])
        return normalized_state
    
    def normalize_action(self, action):
        # Normalize actions between values ​​0 and 1
        normalized_action = np.array([
                action[0] ,  # alpha
                action[1] / (2 / self.num_users),  # bandwidth
                action[2] / self.env.P_max,  # power transmission
                action[3] / self.env.F_max_ue,  # Normalizing computation of user
                action[4] / ((2 * self.env.F_max_es) / self.num_users),  # Normalizing computation of server
                action[5] # cache hit
        ])
        return normalized_action

    def load(self, name):
        self.model.load_weights(f'{name}model.weights.h5')
        self.target_model.load_weights(f'{name}target_model.weights.h5')

    def save(self, name):
        self.model.save_weights(f'{name}model.weights.h5')
        self.target_model.save_weights(f'{name}target_model.weights.h5')

    def train(self, num_episodes):
        """Train the agent for ``num_episodes`` episodes and plot the results.

        Every episode resets the environment and runs at most
        ``self.max_steps_per_episode`` steps. In each step a new set of tasks
        is created and up to ``self.num_users`` decisions are taken; every
        transition (normalized state, normalized action, reward, normalized
        next state, done) is stored through ``self.remember``. An episode stops
        early as soon as the environment reports ``done``.

        After each episode the method, in this order: renders the environment,
        adds the episode's task count to ``self.all``, updates epsilon, calls
        ``self.replay()`` once, saves the weights every ``self.save_interval``
        episodes and refreshes the target network every
        ``self.update_target_freq`` episodes.

        Args:
            num_episodes: Number of training episodes to run.

        Returns:
            None. Per-episode averages are printed and finally handed to
            ``self.plot_results``.
        """
        # Per-episode averages collected for the final plots
        avg_delays = []
        avg_energies = []
        avg_rewards = []

        self.env.is_training = True
        self.is_training = True

        for episode in range(num_episodes):

            self.env.reset()

            # Accumulators for this episode
            total_delay = 0
            total_energy = 0
            total_reward = 0
            num_all_tasks = 0
            actual_steps = 0

            done_step = False

            for _ in range(self.max_steps_per_episode):

                if done_step:
                    break

                self.env.create_task()

                for _ in range(self.num_users):

                    state = np.array(self.env.get_state())
                    normalized_state = np.array(self.normalize_state(state))

                    action = self.act(normalized_state)

                    # Execute the action in the environment
                    reward, next_state_info, done = self.env.step(action)

                    # The first two entries are the delay and energy of this
                    # decision; what remains is the next state itself.
                    delay = next_state_info.pop(0)
                    energy = next_state_info.pop(0)
                    next_state = np.array(next_state_info)

                    normalized_action = self.normalize_action(action)
                    normal_nextstate = self.normalize_state(next_state)

                    self.remember(normalized_state, normalized_action, reward, normal_nextstate, done)

                    total_delay += delay
                    total_energy += energy
                    total_reward += reward
                    num_all_tasks += 1

                    if done:
                        # Finish the current step, then leave the episode
                        done_step = True
                        break

                self.env.increase_time()
                actual_steps += 1

            self.env.render()
            self.all += num_all_tasks

            # Averages for the episode. An episode without any task (zero
            # steps or zero users) reports 0.0 instead of dividing by zero.
            if num_all_tasks:
                avg_delay = (total_delay / num_all_tasks) * 1000  # Convert to milliseconds
                avg_energy = total_energy / num_all_tasks
                avg_reward = total_reward / num_all_tasks
            else:
                avg_delay = avg_energy = avg_reward = 0.0
            avg_delays.append(avg_delay)
            avg_energies.append(avg_energy)
            avg_rewards.append(avg_reward)

            # Update epsilon for the epsilon-greedy strategy
            self.update_epsilon(episode, num_episodes)

            self.replay()

            # Periodic checkpoint; the episode number becomes part of the file prefix
            if (episode + 1) % self.save_interval == 0:
                self.save(f'Model{episode+1}')

            if (episode + 1) % self.update_target_freq == 0:
                self.update_target_network()

            # Print the episode's results
            print(f"Train : Episode {episode + 1}/{num_episodes} - Steps Count {actual_steps} - Tasks Count {num_all_tasks} - Avg Delay: {avg_delay}, Avg Energy: {avg_energy}, Avg Reward: {avg_reward}")
            print("-" * 100)

        # Plot the learning curves once training has finished
        self.plot_results(avg_delays, avg_energies, avg_rewards)

    def test(self, num_test_steps=1):
        # Initialize total delay, alpha values, and rewards for the test
        total_delay = 0
        total_alpha = 0
        total_energy = 0
        total_reward = 0
        
        done_step = False

        # Counter for actual steps
        num_all_tasks = 0
        actual_steps = 0
        
        # Set epsilon to 0 for testing (no exploration)
        self.epsilon = 0

        self.env.is_training = False
        self.is_training = False
        
        # Reset the environment 
        self.env.reset()

        for step in range(num_test_steps):

            if done_step:
                    break

            self.env.create_task()

            for item in range(self.num_users):

                state = np.array(self.env.get_state())

                normalized_state = self.normalize_state(state)

                action = self.act(normalized_state)
                
                # Execute the actions in the environment
                reward, next_state_info, done = self.env.step(action)

                # Extract delay and energy values from the next device information
                delay = next_state_info.pop(0)
                energy = next_state_info.pop(0)

                # Accumulate delay and alpha values
                total_delay += delay
                total_energy += energy
                total_alpha += 1 - action[0]

                num_all_tasks += 1

                if done:
                    done_step = True
                    # Exit the loop if the episode is done
                    break

            self.env.increase_time()

            # Increment the actual steps counter
            actual_steps += 1

        self.env.render()

        # Calculate and return the average delay and alpha for the test
        total_delay = total_delay * 1000  # Convert to milliseconds
        avg_energy = (total_energy / num_all_tasks) * 1000  # Convert to milliJoule 
        avg_alpha = total_alpha / num_all_tasks
        avg_reward = total_reward / num_all_tasks


        # Print the episode's results
        print(f"Test : Steps Count {actual_steps} - Tasks Count {num_all_tasks} - Delay: {total_delay}, Avg Energy: {avg_energy}, Avg Reward: {avg_reward}, Avg Alpha: {avg_alpha}")
        print("-" * 100)

        # if num_all_tasks == 15:
        #     return avg_delay, avg_alpha

        return total_delay, avg_alpha


    def plot_results(self, avg_delays, avg_energies, avg_rewards):
        """Show the per-episode training curves in three side-by-side panels.

        Args:
            avg_delays: Average delay of each episode.
            avg_energies: Average energy of each episode.
            avg_rewards: Average reward of each episode.

        The x axis is the 1-based episode number, derived from the length of
        ``avg_delays``.
        """
        episodes = np.arange(1, len(avg_delays) + 1)

        # (series, legend label, y-axis label, panel title) for each panel, left to right
        panels = (
            (avg_delays, 'Avg Delay', 'Average Delay', 'Average Delay per Episode'),
            (avg_energies, 'Avg Energy', 'Average Energy', 'Average Energy per Episode'),
            (avg_rewards, 'Avg Reward', 'Average Reward', 'Average Reward per Episode'),
        )

        plt.figure(figsize=(12, 6))

        for position, (series, legend_label, y_label, panel_title) in enumerate(panels, start=1):
            plt.subplot(1, 3, position)
            plt.plot(episodes, series, label=legend_label)
            plt.xlabel('Episode')
            plt.ylabel(y_label)
            plt.title(panel_title)
            plt.legend()

        plt.tight_layout()
        plt.show()

In [4]:
# Build the edge-computing environment with its default parameters
# (EdgeComputingEnvironment is defined in an earlier cell).
env = EdgeComputingEnvironment()

# Create the DDQNAgent that will be trained against this environment
agent = DDQNAgent(env)


# Load the model if you want to continue training.
# load() appends 'model.weights.h5' / 'target_model.weights.h5' to the given prefix.
#agent.load("Model2000")

# Train the agent (train() also checkpoints periodically and plots the curves at the end)
num_episodes = 2500  # Adjust the number of episodes as needed
agent.train(num_episodes)

# Save the final model
# NOTE(review): save() already appends 'model.weights.h5', so a prefix ending in
# ".h5" would produce file names like "dqn_model.h5model.weights.h5".
# agent.save("dqn_model.h5")

# Sketch of training in batches with a session reset in between (disabled).
# NOTE(review): `K` is not among the imports visible at the top of the notebook;
# it would presumably need `from tensorflow.keras import backend as K` - confirm
# before enabling. The ".h5" prefix caveat above applies here as well.
# for batch in range(3):

#     agent.train(num_episodes)  # Train in batches of episodes
#     agent.save(f'Model-{batch}.h5')
    
#     # Clear the session to free memory
#     K.clear_session()

#     # Reload the model for the next batch
#     agent.load(f'Model-{batch}.h5')

[1, 2, 1, 0, 0, 0]
Train : Episode 1/2500 - Steps Count 1 - Tasks Count 3 - Avg Delay: 8.840333333333334, Avg Energy: 0.0009283333333333334, Avg Reward: -132.89777777777778
----------------------------------------------------------------------------------------------------
[2, 4, 1, 0, 0, 0]
Train : Episode 2/2500 - Steps Count 1 - Tasks Count 2 - Avg Delay: 165.96800000000002, Avg Energy: 0.001448, Avg Reward: -1696.6161111111112
----------------------------------------------------------------------------------------------------
[2, 5, 2, 0, 0, 0]
Train : Episode 3/2500 - Steps Count 1 - Tasks Count 4 - Avg Delay: 0.48149999999999993, Avg Energy: 0.0009385, Avg Reward: -32.46361111111111
----------------------------------------------------------------------------------------------------
[3, 10, 2, 0, 0, 0]
Train : Episode 4/2500 - Steps Count 1 - Tasks Count 8 - Avg Delay: 20.698, Avg Energy: 0.00041874999999999996, Avg Reward: -226.8806944444445
--------------------------------------

KeyboardInterrupt: 

In [5]:
# Run five separate one-step evaluations of the agent; test() sets epsilon to 0,
# resets the environment and prints a summary line for each run.
for i in range(5):
    agent.test(num_test_steps=1)

[1.00000000e+00 5.00000000e-03 5.00000000e-03 5.00666667e+08
 4.00000000e+09 1.00000000e+00]
[822, 5170, 2613, 2063, 22, 0]
Test : Steps Count 1 - Tasks Count 1 - Delay: 0.367, Avg Energy: 0.159, Avg Reward: 0.0, Avg Alpha: 0.0
----------------------------------------------------------------------------------------------------
[1.00000000e+00 5.00000000e-03 4.80000000e-02 5.00666667e+08
 4.00000000e+09 1.00000000e+00]
[822, 5170, 2614, 2063, 22, 0]
Test : Steps Count 1 - Tasks Count 1 - Delay: 0.58, Avg Energy: 0.252, Avg Reward: 0.0, Avg Alpha: 0.0
----------------------------------------------------------------------------------------------------
[1.000e+00 4.800e-02 5.000e-03 1.500e+09 1.334e+09 1.000e+00]
[822, 5170, 2614, 2064, 22, 0]
Test : Steps Count 1 - Tasks Count 1 - Delay: 0.273, Avg Energy: 4.35, Avg Reward: 0.0, Avg Alpha: 0.0
----------------------------------------------------------------------------------------------------
[1.00000000e+00 5.00000000e-03 1.35000000e-01 