In [None]:
import numpy as np
import tensorflow as tf
from collections import deque
import matplotlib.pyplot as plt

# Custom SumTree for Prioritized Experience Replay
class SumTree:
    """Binary sum tree used as the storage for prioritized experience replay.

    The tree is stored in a flat array of ``2 * capacity - 1`` nodes. The last
    ``capacity`` nodes are leaves holding one priority each; every internal
    node holds the sum of its two children, so the root is the total priority.
    Payloads live in a parallel ring buffer ``data`` and are overwritten in
    insertion order once the buffer is full.
    """

    def __init__(self, capacity):
        """Create an empty tree able to hold ``capacity`` entries.

        Raises:
            ValueError: if ``capacity`` is smaller than 1.
        """
        if capacity < 1:
            raise ValueError("capacity must be at least 1")
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        # Unfilled slots keep the initial value 0 (not a transition tuple).
        self.data = np.zeros(capacity, dtype=object)
        self.data_pointer = 0
        # Number of stored entries; saturates at ``capacity``. Tracked
        # separately because ``data_pointer`` wraps back to 0 when full.
        self.n_entries = 0

    def add(self, priority, data):
        """Store ``data`` with ``priority`` in the next ring-buffer slot."""
        index = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        self.update(index, priority)
        self.data_pointer = (self.data_pointer + 1) % self.capacity
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def update(self, index, priority):
        """Set the priority at tree node ``index`` and fix up all ancestors.

        ``index`` is a position in ``self.tree`` (as returned by
        ``get_leaf``), not a position in ``self.data``.
        """
        change = priority - self.tree[index]
        self.tree[index] = priority
        # Walk up to the root, adding the delta to every ancestor's sum.
        while index != 0:
            index = (index - 1) // 2
            self.tree[index] += change

    def get_leaf(self, value):
        """Find the leaf whose cumulative-priority interval contains ``value``.

        Returns:
            A tuple ``(leaf_idx, priority, data)`` where ``leaf_idx`` is the
            position in ``self.tree``.
        """
        parent_idx = 0
        while True:
            left_child = 2 * parent_idx + 1
            right_child = left_child + 1
            if left_child >= len(self.tree):
                # No children: ``parent_idx`` is a leaf.
                leaf_idx = parent_idx
                break
            if value <= self.tree[left_child]:
                parent_idx = left_child
            else:
                # Skip the left subtree's mass and search the right subtree.
                value -= self.tree[left_child]
                parent_idx = right_child
        data_index = leaf_idx - self.capacity + 1
        return leaf_idx, self.tree[leaf_idx], self.data[data_index]

    def total_priority(self):
        """Return the sum of all leaf priorities (the root node)."""
        return self.tree[0]

    def size(self):
        """Return the number of stored entries, at most ``capacity``."""
        return self.n_entries

# Mobile Edge Computing Environment
class MECEnvironment:
    """Toy mobile edge computing environment for task-offloading decisions.

    The system has ``num_md`` mobile devices, ``num_es`` edge servers and
    ``num_tasks`` tasks. One call to ``step`` takes a decision for every task
    (0 = process on the task's mobile device, 1 = offload to an edge server)
    and charges the energy cost against the corresponding battery.
    """

    def __init__(self, num_md, num_es, num_tasks):
        """Draw the static system parameters and reset the episode state."""
        self.num_md = num_md
        self.num_es = num_es
        self.num_tasks = num_tasks

        # Initialize system parameters
        self.md_compute = np.random.uniform(1.0, 2.5, num_md)  # GHz
        self.es_compute = np.random.uniform(10.0, 15.0, num_es)  # GHz
        self.task_requirements = np.random.uniform(1.0, 5.0, num_tasks)  # c_k
        self.md_battery = np.random.uniform(3000, 5000, num_md)  # mAh
        self.es_battery = np.random.uniform(10000, 20000, num_es)  # mAh
        self.task_to_md = np.random.randint(0, num_md, num_tasks)  # Assign tasks to MDs
        self.reset()

    def reset(self):
        """Start a new episode and return the initial state vector.

        Batteries and the per-task δ_k values are re-drawn; compute
        capacities, c_k and the task-to-device mapping are kept.
        """
        self.md_battery = np.random.uniform(3000, 5000, self.num_md)
        self.es_battery = np.random.uniform(10000, 20000, self.num_es)
        self.tasks = np.column_stack([
            np.random.uniform(2.0, 5.0, self.num_tasks),  # δ_k
            self.task_requirements  # c_k
        ])
        self.remaining_tasks = self.num_tasks
        self.total_energy_consumed = 0
        self.constraint_violations = 0
        self.completed_tasks = 0
        return self._get_state()

    def _get_state(self):
        """Return the state as one flat vector of length
        ``2 * (num_md + num_es + num_tasks)``, each part scaled by the upper
        bound of the range it is sampled from."""
        # Normalized state vector
        return np.concatenate([
            self.md_battery / 5000,
            self.es_battery / 20000,
            self.md_compute / 2.5,
            self.es_compute / 15.0,
            self.tasks[:, 0] / 5.0,  # δ_k
            self.tasks[:, 1] / 5.0    # c_k
        ])

    def step(self, action):
        """Apply one offloading decision per task.

        Args:
            action: iterable with one entry per task; 0 processes the task
                locally, 1 offloads it. Any other value is skipped and the
                task stays uncompleted.

        Returns:
            ``(next_state, reward, done)``. ``done`` is True when a battery
            cannot cover a task's energy cost (constraint violation) or when
            every task was completed in this step.
        """
        # The per-step counters are restarted on every call, so after the
        # call they describe this step only.
        self.total_energy_consumed = 0
        self.constraint_violations = 0
        self.completed_tasks = 0
        done = False

        for task_idx, a in enumerate(action):
            if a not in {0, 1}:
                continue  # Skip invalid actions

            md_idx = self.task_to_md[task_idx]
            # NOTE(review): both branches derive the processing time from
            # column 0 (δ_k); column 1 (c_k) is never used here - confirm
            # this is the intended cost model.
            if a == 0:  # Local processing
                t = self.tasks[task_idx, 0] / self.md_compute[md_idx]
                e = t * (1.01 + 0.08 * (md_idx % 3))
                if self.md_battery[md_idx] < e:
                    self.constraint_violations += 1
                    done = True
                    break
                self.md_battery[md_idx] -= e
                self.total_energy_consumed += e
            else:  # Offload to Edge Server
                # Tasks are spread over the servers round-robin by index.
                es_idx = task_idx % self.num_es
                t = self.tasks[task_idx, 0] / self.es_compute[es_idx]
                e_server = t * (0.61 + 0.08 * (es_idx % 3))
                e_transmit = self.tasks[task_idx, 0] * 0.1
                total_e = e_server + e_transmit
                if self.es_battery[es_idx] < total_e:
                    self.constraint_violations += 1
                    done = True
                    break
                self.es_battery[es_idx] -= total_e
                self.total_energy_consumed += total_e

            self.completed_tasks += 1

        # Keep the remaining-task count in sync with what this step actually
        # completed; without this the completion bonus below could never
        # fire and an episode could only end by draining a battery.
        self.remaining_tasks = self.num_tasks - self.completed_tasks
        if self.remaining_tasks == 0:
            done = True

        # Reward function
        energy_penalty = 0.1 * self.total_energy_consumed  # Linear energy penalty
        task_bonus = 10 * self.completed_tasks  # Small bonus per task
        if self.remaining_tasks == 0:
            task_bonus += 500  # Large bonus for full completion
        constraint_penalty = -50 * self.constraint_violations  # Moderate penalty

        reward = task_bonus - energy_penalty + constraint_penalty
        return self._get_state(), reward, done

# DQN Agent with Prioritized Experience Replay
class DQNAgent:
    """DQN agent with a sum-tree prioritized replay buffer.

    An action is a vector with one binary decision per task (``action_size``
    entries, each 0 or 1). The network therefore outputs a Q-value table of
    shape ``(action_size, 2)``: one row per task, one column per choice. Each
    task's decision is the argmax of its row, and every row is trained
    against the shared scalar reward.
    """

    def __init__(self, state_size, action_size):
        """Build the online and target networks and an empty replay buffer.

        Args:
            state_size: length of the flat state vector.
            action_size: number of binary decisions in one action.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = SumTree(10000)
        self.batch_size = 32
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _build_model(self):
        """Return a compiled MLP mapping a state to ``(action_size, 2)`` Q-values."""
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(24, activation='relu', input_shape=(self.state_size,)),
            tf.keras.layers.Dense(24, activation='relu'),
            # Two Q-values (local / offload) for every task.
            tf.keras.layers.Dense(self.action_size * 2, activation='linear'),
            tf.keras.layers.Reshape((self.action_size, 2))
        ])
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def act(self, state):
        """Return an epsilon-greedy action: ``action_size`` values in {0, 1}."""
        if np.random.rand() <= self.epsilon:
            return np.random.randint(0, 2, self.action_size)
        q_values = self.model.predict(state.reshape(1, -1), verbose=0)[0]
        # Greedy choice per task, so the result has the same shape as the
        # exploratory branch above.
        return np.argmax(q_values, axis=1)

    def remember(self, state, action, reward, next_state, done):
        """Store a transition, prioritized by its current absolute TD error.

        Raises:
            ValueError: if ``action`` does not hold ``action_size`` entries.
        """
        action = np.asarray(action, dtype=int).reshape(-1)
        if action.shape != (self.action_size,):
            raise ValueError(
                f"action must have {self.action_size} entries, got {action.shape[0]}"
            )
        q_values = self.model.predict(state.reshape(1, -1), verbose=0)[0]
        next_q_values = self.target_model.predict(next_state.reshape(1, -1), verbose=0)[0]
        # Terminal transitions must not bootstrap from the next state.
        if done:
            targets = np.full(self.action_size, reward, dtype=float)
        else:
            targets = reward + self.gamma * next_q_values.max(axis=1)
        chosen_q = q_values[np.arange(self.action_size), action]
        td_error = float(np.abs(targets - chosen_q).mean())
        # Small constant keeps every stored transition sampleable.
        self.memory.add(td_error + 1e-5, (state, action, reward, next_state, done))

    def replay(self):
        """Train on one prioritized mini-batch and decay epsilon.

        Does nothing until the buffer holds at least ``batch_size`` entries.
        After the fit, the sampled transitions' priorities are refreshed with
        their TD errors (computed from the pre-fit predictions).
        """
        if self.memory.size() < self.batch_size:
            return
        total_priority = self.memory.total_priority()
        segment = total_priority / self.batch_size
        leaf_indices = []
        batch = []
        # Stratified sampling: one draw from each equal slice of the total
        # priority mass.
        for i in range(self.batch_size):
            value = np.random.uniform(segment * i, segment * (i + 1))
            index, _, data = self.memory.get_leaf(value)
            if not isinstance(data, tuple):
                # Unfilled buffer slot (still the initial 0); skip it.
                continue
            leaf_indices.append(index)
            batch.append(data)
        if not batch:
            return

        states, actions, rewards, next_states, dones = zip(*batch)
        states = np.array(states)
        next_states = np.array(next_states)
        actions = np.array(actions, dtype=int)
        rewards = np.array(rewards, dtype=float)
        dones = np.array(dones, dtype=bool)

        # Copy so the prediction can be edited in place as the fit target.
        q_values = np.array(self.model.predict(states, verbose=0))
        next_q_values = self.target_model.predict(next_states, verbose=0)

        bootstrap = self.gamma * next_q_values.max(axis=2)
        targets = rewards[:, None] + np.where(dones[:, None], 0.0, bootstrap)

        # Index the Q-value of the chosen option for every (sample, task).
        rows = np.arange(len(batch))[:, None]
        cols = np.arange(self.action_size)[None, :]
        td_errors = np.abs(targets - q_values[rows, cols, actions]).mean(axis=1)
        q_values[rows, cols, actions] = targets

        self.model.fit(states, q_values, batch_size=self.batch_size, verbose=0)

        for index, td_error in zip(leaf_indices, td_errors):
            self.memory.update(index, float(td_error) + 1e-5)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Initialize environment and agent
env = MECEnvironment(num_md=10, num_es=4, num_tasks=20)

# The state vector holds two values per mobile device, per edge server and
# per task, hence the factor of two.
state_size = 2 * (env.num_md + env.num_es + env.num_tasks)
agent = DQNAgent(state_size=state_size, action_size=env.num_tasks)

NUM_EPISODES = 1000
# Safety cap so a policy that never reaches a terminal state cannot stall
# training in a single episode.
MAX_STEPS_PER_EPISODE = 5000
# How often (in episodes) the target network is re-synchronised with the
# online network; without this it would keep its initial weights forever.
TARGET_SYNC_INTERVAL = 10

# Track metrics
energy_history = []
reward_history = []

for episode in range(NUM_EPISODES):
    state = env.reset()
    total_reward = 0
    episode_energy = 0.0
    done = False
    steps = 0
    while not done and steps < MAX_STEPS_PER_EPISODE:
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        # env.total_energy_consumed is reset inside every step(), so it has
        # to be accumulated here to get the episode total.
        episode_energy += env.total_energy_consumed
        steps += 1

    agent.replay()
    if (episode + 1) % TARGET_SYNC_INTERVAL == 0:
        agent.update_target_model()

    energy_history.append(episode_energy)
    reward_history.append(total_reward)

    # Log progress
    print(f"Episode: {episode}, Energy: {episode_energy:.2f}, Reward: {total_reward:.2f}")

# Plot results
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(energy_history)
plt.xlabel("Episode")
plt.ylabel("Energy Consumed")
plt.title("Energy Consumption Over Episodes")

plt.subplot(1, 2, 2)
plt.plot(reward_history)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("Reward Over Episodes")
plt.show()

Episode: 0, Energy: 63.34, Reward: 116133.22
Episode: 1, Energy: 21.73, Reward: 166578.76
Episode: 2, Energy: 58.28, Reward: 132901.99
Episode: 3, Energy: 28.21, Reward: 206960.78
Episode: 4, Energy: 18.38, Reward: 178385.42
Episode: 5, Energy: 17.18, Reward: 201813.30
Episode: 6, Energy: 14.04, Reward: 183921.76
Episode: 7, Energy: 20.77, Reward: 231743.01
Episode: 8, Energy: 35.93, Reward: 216264.80
Episode: 9, Energy: 11.14, Reward: 199294.02
Episode: 10, Energy: 26.40, Reward: 201484.27
Episode: 11, Energy: 20.48, Reward: 150921.34
Episode: 12, Energy: 50.62, Reward: 136989.66
Episode: 13, Energy: 45.23, Reward: 200549.86
Episode: 14, Energy: 28.92, Reward: 223754.06
Episode: 15, Energy: 14.74, Reward: 176610.93
Episode: 16, Energy: 43.82, Reward: 183735.29
Episode: 17, Energy: 61.98, Reward: 141460.56
Episode: 18, Energy: 50.53, Reward: 219211.59
Episode: 19, Energy: 19.66, Reward: 210278.10
Episode: 20, Energy: 56.54, Reward: 241550.42
Episode: 21, Energy: 13.73, Reward: 200213.2

KeyboardInterrupt: 