In [None]:
import numpy as np
import tensorflow as tf

# Custom SumTree Implementation for Prioritized Experience Replay
class SumTree:
    """Binary sum tree over a fixed-size circular experience buffer.

    The tree is stored as a flat array of ``2 * capacity - 1`` nodes. The last
    ``capacity`` nodes are leaves holding one priority per buffer slot; every
    other node holds the sum of its two children, so the root holds the total
    priority. Priorities are expected to be non-negative.
    """

    def __init__(self, capacity):
        """Create an empty tree able to hold ``capacity`` experiences."""
        if capacity < 1:
            raise ValueError("capacity must be a positive integer")
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)  # Sum tree
        self.data = np.zeros(capacity, dtype=object)  # Experience buffer
        self.data_pointer = 0  # Next slot to (over)write
        # Number of slots actually filled. Kept separately because
        # data_pointer wraps back to 0 once the buffer is full.
        self.n_entries = 0

    def add(self, priority, data):
        """Add new experience with priority, overwriting the oldest when full"""
        index = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data  # Store transition
        self.update(index, priority)  # Update tree
        self.data_pointer = (self.data_pointer + 1) % self.capacity  # Circular overwrite
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def update(self, index, priority):
        """Set the priority of tree node ``index`` (a leaf) and refresh its ancestors"""
        self.tree[index] = priority
        # Recompute each ancestor from its children instead of adding a delta:
        # repeated delta updates let rounding error accumulate in the sums.
        while index != 0:
            index = (index - 1) // 2
            self.tree[index] = self.tree[2 * index + 1] + self.tree[2 * index + 2]

    def get_leaf(self, value):
        """Retrieve the experience whose cumulative-priority interval contains ``value``.

        ``value`` is expected to lie in ``[0, total_priority()]``.

        Returns:
            Tuple ``(leaf_index, priority, data)`` where ``leaf_index`` is the
            position in the tree array (usable with ``update``).
        """
        first_leaf = self.capacity - 1
        node = 0
        while node < first_leaf:
            left = 2 * node + 1
            right = left + 1
            left_sum = self.tree[left]
            right_sum = self.tree[right]
            # Never walk into a subtree with no priority mass while its sibling
            # has some: that would land on an unused slot (value == 0, or a
            # value nudged past the subtree sum by rounding).
            if right_sum <= 0 or (left_sum > 0 and value <= left_sum):
                node = left
            else:
                value -= left_sum
                node = right
        data_index = node - first_leaf
        return node, self.tree[node], self.data[data_index]

    def total_priority(self):
        """Return sum of all priorities"""
        return self.tree[0]

    def size(self):
        """Return the number of stored experiences (at most ``capacity``)"""
        return self.n_entries


# Mobile Edge Computing Environment
class MECEnvironment:
    """Simulated offloading problem with mobile devices, edge servers and tasks.

    Every task is either run on a mobile device or offloaded to an edge
    server; both draw energy from randomly initialised batteries.
    """

    def __init__(self, num_md, num_es, num_tasks):
        self.num_md, self.num_es, self.num_tasks = num_md, num_es, num_tasks
        self.reset()

    def reset(self):
        """Draw fresh battery levels and task sizes, then return the state"""
        self.md_battery = np.random.uniform(3000, 5000, self.num_md)
        self.es_battery = np.random.uniform(10000, 20000, self.num_es)
        # Task data size
        self.tasks = np.random.uniform(2.0, 5.0, self.num_tasks)
        return self._get_state()

    def _get_state(self):
        """Return batteries and task sizes, each divided by its upper sampling bound"""
        scaled_parts = (
            self.md_battery / 5000,
            self.es_battery / 20000,
            self.tasks / 5.0,
        )
        return np.concatenate(scaled_parts)

    def step(self, action):
        """Run one decision per task and return (next_state, reward, done).

        ``action[i] == 0`` processes task ``i`` locally; any other value
        offloads it. If the selected battery cannot cover a task's energy, the
        step stops there with reward -1000 and ``done=True``; otherwise the
        reward is the negated total energy spent.
        """
        consumed = 0
        for task_idx, choice in enumerate(action):
            if choice == 0:
                # Local processing on the mobile device assigned round-robin
                batteries, slot = self.md_battery, task_idx % self.num_md
                size = self.tasks[task_idx]
                duration = size / (1.0 + 2.5 * np.random.rand())
                cost = duration * (1.01 + 0.08 * np.random.rand())
            else:
                # Offload to the edge server assigned round-robin
                batteries, slot = self.es_battery, task_idx % self.num_es
                size = self.tasks[task_idx]
                duration = size / (10.0 + 5.0 * np.random.rand())
                cost = duration * (0.61 + 0.08 * np.random.rand()) + size * (0.1 * np.random.rand())

            if batteries[slot] < cost:
                # Battery depleted: end the episode with a fixed penalty
                return self._get_state(), -1000, True

            batteries[slot] -= cost
            consumed += cost

        # Minimize energy consumption
        return self._get_state(), -consumed, False


# Deep Q-Network Agent with PER
class DQNAgent:
    """Deep Q-learning agent with a prioritized replay buffer.

    An action is a vector with one binary decision per task (0 = process
    locally, anything else = offload), which is how ``MECEnvironment.step``
    reads it. The network therefore emits a pair of Q-values for every task,
    shaped ``(action_size, 2)``, and each task's decision is the argmax of its
    pair. The same scalar reward is used as the learning signal for every task.
    """

    # Choices available per task: 0 = local processing, 1 = offload.
    _NUM_CHOICES = 2
    # Added to every priority so a transition with zero TD error keeps a
    # non-zero chance of being sampled.
    _PRIORITY_EPSILON = 1e-3

    def __init__(self, state_size, action_size):
        """Build online/target networks for ``action_size`` per-task decisions."""
        self.state_size = state_size
        self.action_size = action_size
        self.memory = SumTree(capacity=10000)
        self.batch_size = 32
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _build_model(self):
        """Build the deep Q-learning model with output shape (action_size, 2)"""
        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(self.state_size,)),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(self.action_size * self._NUM_CHOICES, activation='linear'),
            # One row of Q-values per task, one column per choice.
            tf.keras.layers.Reshape((self.action_size, self._NUM_CHOICES)),
        ])
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy weights to target network"""
        self.target_model.set_weights(self.model.get_weights())

    def act(self, state):
        """Choose one binary decision per task with an epsilon-greedy policy.

        Both branches return an integer array of length ``action_size``.
        """
        if np.random.rand() <= self.epsilon:
            return np.random.randint(0, self._NUM_CHOICES, self.action_size)
        q_values = self.model.predict(state.reshape(1, -1), verbose=0)[0]
        return np.argmax(q_values, axis=1)

    def _td_targets(self, rewards, next_states, dones):
        """Return per-task targets r + (1 - done) * gamma * max_a Q_target(s', task, a).

        ``rewards`` and ``dones`` are float arrays of shape (batch,);
        the result has shape (batch, action_size).
        """
        next_q = self.target_model.predict(next_states, verbose=0)
        best_next = np.max(next_q, axis=2)
        return rewards[:, None] + (1.0 - dones[:, None]) * self.gamma * best_next

    def remember(self, state, action, reward, next_state, done):
        """Store experience in priority replay buffer.

        The initial priority is the mean absolute TD error over all tasks,
        plus a small constant.
        """
        action = np.asarray(action).astype(int).reshape(-1)

        q_values = self.model.predict(state.reshape(1, -1), verbose=0)[0]
        # Q-value of the choice actually taken for every task.
        chosen_q = q_values[np.arange(self.action_size), action]

        targets = self._td_targets(
            np.array([reward], dtype=float),
            next_state.reshape(1, -1),
            np.array([done], dtype=float),
        )[0]
        td_error = float(np.mean(np.abs(targets - chosen_q)))

        self.memory.add(td_error + self._PRIORITY_EPSILON,
                        (state, action, reward, next_state, done))

    def replay(self):
        """Sample experiences by priority, train the network, refresh priorities"""
        if self.memory.size() < self.batch_size:
            return

        total = self.memory.total_priority()
        samples = [self.memory.get_leaf(np.random.uniform(0, total))
                   for _ in range(self.batch_size)]
        # Unwritten buffer slots hold a non-tuple placeholder; drop them if a
        # draw ever lands on one.
        samples = [s for s in samples if isinstance(s[2], tuple)]
        if not samples:
            return

        leaf_indices = [s[0] for s in samples]
        states, actions, rewards, next_states, dones = zip(*(s[2] for s in samples))

        states = np.array(states)
        next_states = np.array(next_states)
        actions = np.array(actions, dtype=int)
        rewards = np.array(rewards, dtype=float)
        dones = np.array(dones, dtype=float)

        q_values = self.model.predict(states, verbose=0)
        targets = self._td_targets(rewards, next_states, dones)

        # Index (sample, task, chosen choice) so only the Q-values of the
        # decisions that were actually taken are moved toward their targets.
        rows = np.arange(len(samples))[:, None]
        cols = np.arange(self.action_size)[None, :]
        td_errors = np.abs(targets - q_values[rows, cols, actions]).mean(axis=1)
        q_values[rows, cols, actions] = targets

        self.model.fit(states, q_values, epochs=1, verbose=0, batch_size=self.batch_size)

        # Re-prioritise the replayed transitions with their latest TD error.
        for leaf_idx, err in zip(leaf_indices, td_errors):
            self.memory.update(leaf_idx, float(err) + self._PRIORITY_EPSILON)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


# Training Loop
env = MECEnvironment(num_md=10, num_es=4, num_tasks=50)
# The state concatenates every device battery, server battery and task size;
# the agent makes one decision per task.
agent = DQNAgent(
    state_size=env.num_md + env.num_es + env.num_tasks,
    action_size=env.num_tasks,
)

for episode in range(1000):
    # Each episode starts from freshly sampled batteries and tasks.
    state, total_reward, done = env.reset(), 0, False
    while not done:
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        total_reward += reward
        state = next_state

    # Learn once per episode, then sync the target network.
    agent.replay()
    agent.update_target_model()
    print(f"Episode: {episode}, Total Reward: {total_reward}, Epsilon: {agent.epsilon}")


Episode: 0, Total Reward: -35788.948161118766, Epsilon: 0.995
Episode: 1, Total Reward: -40143.77761301275, Epsilon: 0.990025
Episode: 2, Total Reward: -36472.81289585969, Epsilon: 0.985074875
Episode: 3, Total Reward: -37610.867000146456, Epsilon: 0.9801495006250001
Episode: 4, Total Reward: -42274.79858466075, Epsilon: 0.9752487531218751
Episode: 5, Total Reward: -33132.36250415337, Epsilon: 0.9703725093562657
Episode: 6, Total Reward: -36837.02337372617, Epsilon: 0.9655206468094844
Episode: 7, Total Reward: -35181.711959293956, Epsilon: 0.960693043575437
Episode: 8, Total Reward: -36545.0040040704, Epsilon: 0.9558895783575597
Episode: 9, Total Reward: -39835.21130040179, Epsilon: 0.9511101304657719
Episode: 10, Total Reward: -34962.263442687785, Epsilon: 0.946354579813443
Episode: 11, Total Reward: -37535.673883639414, Epsilon: 0.9416228069143757
Episode: 12, Total Reward: -33954.19118927559, Epsilon: 0.9369146928798039
Episode: 13, Total Reward: -33026.92464813577, Epsilon: 0.93223

In [5]:
# Diagnostic cell: print the imported module object to show which installed
# `sumtree` package the import resolves to.
# NOTE(review): nothing else in the visible code uses this module; the SumTree
# class defined earlier in this file is hand-written. Confirm whether this
# cell is still needed.
import sumtree
print(sumtree)


<module 'sumtree' from 'c:\\Python312\\Lib\\site-packages\\sumtree\\__init__.py'>
