In [3]:
import numpy as np
import tensorflow as tf
from collections import deque
import matplotlib.pyplot as plt

# -------------------------------
# Prioritized Replay Buffer
# -------------------------------
class PrioritizedReplayBuffer:
    """Fixed-capacity ring buffer with priority-proportional sampling.

    Experiences are stored in insertion order and overwritten oldest-first
    once ``capacity`` is reached. Each slot carries a priority; sampling
    probability is proportional to ``priority ** alpha``.

    Attributes:
        capacity: Maximum number of stored experiences.
        buffer: List holding the stored experiences.
        priorities: Array of length ``capacity`` with one priority per slot.
        pos: Index of the slot the next ``add`` writes to.
        max_priority: Largest priority seen so far; assigned to new entries
            so that fresh experiences are likely to be sampled at least once.
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of experiences to keep (must be > 0).

        Raises:
            ValueError: If ``capacity`` is not positive.
        """
        if capacity <= 0:
            raise ValueError("capacity must be a positive integer")
        self.capacity = capacity
        self.buffer = []
        self.priorities = np.zeros(capacity)
        self.pos = 0
        self.max_priority = 1.0

    def add(self, experience):
        """Store ``experience``, overwriting the oldest entry when full.

        The new entry receives the current ``max_priority``.
        """
        if len(self.buffer) < self.capacity:
            self.buffer.append(experience)
        else:
            self.buffer[self.pos] = experience
        self.priorities[self.pos] = self.max_priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, alpha=0.6, beta=0.5):
        """Draw a batch (with replacement) proportionally to priority.

        Args:
            batch_size: Number of experiences to draw.
            alpha: Exponent applied to priorities; 0 gives uniform sampling.
            beta: Exponent of the importance-sampling correction. The default
                of 0.5 reproduces the previously hard-coded ``** (-0.5)``.

        Returns:
            Tuple ``(samples, indices, weights)`` where ``samples`` is a list
            of stored experiences, ``indices`` the sampled slot indices, and
            ``weights`` a float32 array of importance-sampling weights
            normalized so the largest weight is 1.

        Raises:
            ValueError: If the buffer is empty.
        """
        size = len(self.buffer)
        if size == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        scaled = self.priorities[:size] ** alpha
        total = scaled.sum()
        if total > 0:
            probs = scaled / total
        else:
            # Every stored priority is zero: dividing would yield NaNs, so
            # fall back to uniform sampling instead.
            probs = np.full(size, 1.0 / size)

        indices = np.random.choice(size, batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]
        weights = (size * probs[indices]) ** (-beta)
        weights /= weights.max()
        return samples, indices, np.array(weights, dtype=np.float32)

    def update_priorities(self, indices, priorities):
        """Overwrite the priorities of the given slots.

        Also raises ``max_priority`` when a larger value is stored, so that
        subsequently added experiences enter at the highest known priority.

        Args:
            indices: Slot indices, as returned by ``sample``.
            priorities: New priority for each index, in the same order.
        """
        for idx, priority in zip(indices, priorities):
            self.priorities[idx] = priority
            # Read the value back from the array so the comparison always
            # uses a plain scalar, whatever type the caller passed in.
            self.max_priority = max(self.max_priority, float(self.priorities[idx]))

# -------------------------------------------
# Mobile Edge Computing Environment (MECEnv) - FIXED
# -------------------------------------------
class MECEnvironment:
    """Episodic task-offloading environment with mobile devices and edge servers.

    An episode walks through ``num_tasks`` tasks in order. For each task the
    agent chooses action 0 (run on the task's own mobile device) or any other
    value (offload to a randomly chosen edge server). Energy is drawn from the
    corresponding battery; a task that cannot be paid for counts as a
    violation.

    The observation is a 6-element vector describing the task that is about
    to be processed: normalized battery and compute of its mobile device,
    mean normalized battery and compute of the edge servers, the normalized
    task requirement, and the episode progress ``current_task / num_tasks``.
    """

    # Length of the observation vector built by ``_get_state``.
    STATE_SIZE = 6

    def __init__(self, num_md=5, num_es=3, num_tasks=20):
        """Draw the static parameters and start the first episode.

        Args:
            num_md: Number of mobile devices.
            num_es: Number of edge servers.
            num_tasks: Number of tasks per episode.
        """
        self.num_md = num_md
        self.num_es = num_es
        self.num_tasks = num_tasks

        # Static parameters: sampled once and shared by every episode.
        self.md_compute = np.random.uniform(1.0, 2.5, num_md)
        self.es_compute = np.random.uniform(10.0, 15.0, num_es)
        self.task_requirements = np.random.uniform(1.0, 5.0, num_tasks)
        self.task_to_md = np.random.randint(0, num_md, num_tasks)

        self.reset()

    def reset(self):
        """Re-sample batteries, clear the counters and return the first state."""
        # Dynamic state components
        self.md_battery = np.random.uniform(3000, 5000, self.num_md)
        self.es_battery = np.random.uniform(10000, 20000, self.num_es)

        # Task tracking
        self.current_task = 0
        self.completed_tasks = 0
        self.total_energy = 0
        self.violations = 0

        return self._get_state()

    def _get_state(self):
        """Build the observation for the task at ``self.current_task``.

        Must only be called while ``current_task < num_tasks``.
        """
        md_idx = self.task_to_md[self.current_task]
        return np.concatenate([
            [self.md_battery[md_idx] / 5000],
            [self.es_battery.mean() / 20000],
            [self.md_compute[md_idx] / 2.5],
            [self.es_compute.mean() / 15.0],
            [self.task_requirements[self.current_task] / 5.0],
            [self.current_task / self.num_tasks]
        ])

    def _settle(self, battery, idx, energy):
        """Charge ``energy`` to ``battery[idx]`` and return the step reward.

        On success the battery is debited, the energy and completion counters
        are updated and the reward is ``5 - energy / 100``. If the battery
        cannot cover the cost it is drained to zero, a violation is recorded
        and the reward is ``-10``.
        """
        if battery[idx] >= energy:
            battery[idx] -= energy
            self.total_energy += energy
            self.completed_tasks += 1
            return 5 - energy / 100
        self.violations += 1
        battery[idx] = 0
        return -10

    def step(self, action):
        """Process the current task with ``action`` and advance to the next.

        Args:
            action: 0 for local processing; any other value offloads the task
                to a randomly selected edge server.

        Returns:
            Tuple ``(next_state, reward, done, info)``. ``next_state``
            describes the NEXT task to be decided on (all zeros once the
            episode has ended), ``done`` is True after the last task, and
            ``info`` is an empty dict.
        """
        task = self.current_task
        demand = self.task_requirements[task]

        if action == 0:  # Local processing
            md_idx = self.task_to_md[task]
            required_time = demand / self.md_compute[md_idx]
            energy_cost = required_time * (1.01 + 0.08 * (md_idx % 3))
            reward = self._settle(self.md_battery, md_idx, energy_cost)
        else:  # Offloading
            es_idx = np.random.randint(self.num_es)
            comp_time = demand / self.es_compute[es_idx]
            tx_energy = demand * 0.1
            server_energy = comp_time * (0.61 + 0.08 * (es_idx % 3))
            reward = self._settle(self.es_battery, es_idx, tx_energy + server_energy)

        # Advance first, then observe: the returned state must describe the
        # task the agent will act on next. Building it before the increment
        # would hand back the features of the task that was just processed,
        # leaving every later decision one task out of step.
        self.current_task += 1
        done = self.current_task >= self.num_tasks

        if done:
            # Terminal bonus for a perfect episode, penalty per violation.
            if self.completed_tasks == self.num_tasks:
                reward += 20
            reward -= 2 * self.violations
            next_state = np.zeros(self.STATE_SIZE)
        else:
            next_state = self._get_state()

        return next_state, reward, done, {}

# -------------------------------
# Dueling DQN Agent (Unchanged)
# -------------------------------
# NOTE: the DQNAgent class is not defined in this listing. It must be defined
# (unchanged from the previously corrected version) before train_agent() is called.

# -------------------------------
# Training and Evaluation
# -------------------------------
def _plot_training_curves(rewards, completion_rates, energy_consumptions):
    """Plot the three per-episode training histories side by side."""
    panels = (
        (rewards, 'Training Rewards', 'Total Reward'),
        (completion_rates, 'Task Completion Rate', 'Completion Rate'),
        (energy_consumptions, 'Energy Consumption', 'Total Energy'),
    )

    plt.figure(figsize=(15, 5))
    for position, (series, title, ylabel) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        plt.plot(series)
        plt.title(title)
        plt.xlabel('Episode')
        plt.ylabel(ylabel)

    plt.tight_layout()
    plt.show()


def train_agent(episodes=500, num_tasks=20, target_update_interval=10):
    """Train a DQNAgent on MECEnvironment and plot the learning curves.

    The agent is expected to provide ``act``, ``remember``, ``replay`` and
    ``update_target_network`` methods plus ``memory.buffer``, ``batch_size``
    and ``epsilon`` attributes, since those are what this loop uses.

    Args:
        episodes: Number of training episodes.
        num_tasks: Number of tasks per episode passed to the environment.
        target_update_interval: The target network is synchronized on every
            episode whose zero-based index is a multiple of this value.

    Returns:
        Tuple ``(rewards, completion_rates, energy_consumptions)`` of
        per-episode lists.
    """
    env = MECEnvironment(num_tasks=num_tasks)
    agent = DQNAgent(state_size=6, action_size=2)

    rewards = []
    completion_rates = []
    energy_consumptions = []

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

            # Only learn once the replay memory holds more than one batch.
            if len(agent.memory.buffer) > agent.batch_size:
                agent.replay()

        if episode % target_update_interval == 0:
            agent.update_target_network()

        rewards.append(total_reward)
        completion_rates.append(env.completed_tasks / env.num_tasks)
        energy_consumptions.append(env.total_energy)

        print(f"Episode {episode+1}/{episodes}")
        # Report against the environment's real task count instead of a
        # hard-coded 20, so the message stays correct for other sizes.
        print(f"Tasks completed: {env.completed_tasks}/{env.num_tasks}")
        print(f"Total reward: {total_reward:.2f}")
        print(f"Epsilon: {agent.epsilon:.3f}\n")

    _plot_training_curves(rewards, completion_rates, energy_consumptions)

    return rewards, completion_rates, energy_consumptions

# Start training with the default settings when this code is executed
# directly rather than imported as a module.
if __name__ == "__main__":
    train_agent()

Episode 1/500
Tasks completed: 20/20
Total reward: 119.74
Epsilon: 1.000

Episode 2/500
Tasks completed: 20/20
Total reward: 119.77
Epsilon: 1.000

Episode 3/500
Tasks completed: 20/20
Total reward: 119.79
Epsilon: 1.000

Episode 4/500
Tasks completed: 20/20
Total reward: 119.70
Epsilon: 0.923

Episode 5/500
Tasks completed: 20/20
Total reward: 119.82
Epsilon: 0.835

Episode 6/500
Tasks completed: 20/20
Total reward: 119.75
Epsilon: 0.755

Episode 7/500
Tasks completed: 20/20
Total reward: 119.75
Epsilon: 0.683

Episode 8/500
Tasks completed: 20/20
Total reward: 119.84
Epsilon: 0.618

Episode 9/500
Tasks completed: 20/20
Total reward: 119.83
Epsilon: 0.559

Episode 10/500
Tasks completed: 20/20
Total reward: 119.85
Epsilon: 0.506

Episode 11/500
Tasks completed: 20/20
Total reward: 119.84
Epsilon: 0.458

Episode 12/500
Tasks completed: 20/20
Total reward: 119.86
Epsilon: 0.414

Episode 13/500
Tasks completed: 20/20
Total reward: 119.79
Epsilon: 0.374

Episode 14/500
Tasks completed: 20

KeyboardInterrupt: 