In [None]:
from collections import deque
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from config import *

class DQN:
    """Deep Q-Network agent with a target network and experience replay.

    The agent keeps two identically shaped Keras models: ``model`` (trained
    on every call to :meth:`train`) and ``target_model`` (a periodically
    refreshed copy used to compute the bootstrap targets).

    Configuration is read from the names imported from ``config``:
    ``UPDATE_FREQ``, ``REPLAY_SIZE``, ``NumberOfUsers``, ``NumberOfUAVs``,
    ``LEARNING_RATE``, ``BATCH_SIZE`` and ``GAMMA``.

    NOTE(review): :meth:`take_action_NOMA` reads ``self.Power_unit`` and
    writes ``self.Power_allocation_list``, but nothing in this class assigns
    them. They must be set on the instance by the caller before that method
    is used -- confirm where they are meant to come from.
    """

    def __init__(self):
        """Set up the replay buffer, the action space and both networks."""
        self.update_freq = UPDATE_FREQ
        self.replay_size = REPLAY_SIZE
        self.step = 0
        self.replay_queue = deque(maxlen=self.replay_size)

        # Calculate action space.
        # power_number stays a float (true division) for backward
        # compatibility; action_number is the integer count actually used
        # for sampling and for the network's output width.
        # NOTE(review): the factor 7 is presumably the number of movement
        # options per UAV -- confirm against the environment.
        self.power_number = 3 ** (NumberOfUsers / NumberOfUAVs)
        self.action_number = 7 * int(self.power_number)

        # Create models
        self.model = self._create_model()
        self.target_model = self._create_model()
        # Start the target network from the same weights as the online
        # network; otherwise the first UPDATE_FREQ training steps bootstrap
        # from an unrelated, randomly initialised network.
        self.target_model.set_weights(self.model.get_weights())

    def _create_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled ``Sequential`` model mapping a state vector of length
            ``NumberOfUAVs * 3 + NumberOfUsers`` to one Q-value per action.
        """
        STATE_DIM = NumberOfUAVs * 3 + NumberOfUsers
        # Use the integer action count: the previous ``7 * self.power_number``
        # was a float, which is not a valid layer width and could disagree
        # with the range sampled in choose_action().
        ACTION_DIM = self.action_number

        model = models.Sequential([
            layers.Dense(40, input_dim=STATE_DIM, activation='relu'),
            layers.Dense(ACTION_DIM, activation="linear")
        ])
        model.compile(
            loss='mean_squared_error',
            optimizer=optimizers.Adam(learning_rate=LEARNING_RATE)
        )
        return model

    def choose_action(self, state, epsilon):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: Input passed directly to ``model.predict``; it must
                already carry the batch dimension the model expects.
            epsilon: Probability of picking a uniformly random action.

        Returns:
            Index of the chosen action in ``[0, action_number)``.
        """
        if np.random.uniform() < epsilon:
            return np.random.choice(self.action_number)
        # verbose=0 keeps the per-step progress bar off the console,
        # matching the silent fit() call in train().
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values)

    def remember(self, state, action, next_state, reward):
        """Store one transition in the replay buffer.

        The tuple layout ``(state, action, next_state, reward)`` is what
        :meth:`train` unpacks.
        """
        self.replay_queue.append((state, action, next_state, reward))

    def train(self, batch_size=BATCH_SIZE, gamma=GAMMA):
        """Run one training step on a random minibatch of stored transitions.

        Does nothing until the replay buffer has been completely filled.

        Args:
            batch_size: Number of transitions to sample. Clamped to the
                number of stored transitions.
            gamma: Discount factor applied to the best next-state Q-value.
        """
        stored = len(self.replay_queue)
        if stored < self.replay_size or stored == 0:
            return

        self.step += 1

        # Update target network periodically
        if self.step % self.update_freq == 0:
            self.target_model.set_weights(self.model.get_weights())

        # Sample batch from replay memory. random.sample raises ValueError
        # when asked for more items than exist, so never request more than
        # the buffer holds.
        batch = random.sample(self.replay_queue, min(batch_size, stored))
        states = np.array([x[0] for x in batch])
        next_states = np.array([x[2] for x in batch])

        # Get Q-values for current and next states
        current_q = self.model.predict(states, verbose=0)
        next_q = self.target_model.predict(next_states, verbose=0)

        # Update Q-values using the Bellman equation. Only the entry of the
        # action actually taken is replaced, so the other outputs contribute
        # zero error to the MSE loss.
        for i, (_, action, _, reward) in enumerate(batch):
            current_q[i][action] = reward + gamma * np.max(next_q[i])

        # Train the model
        self.model.fit(states, current_q, verbose=0)

    def take_action_NOMA(self, acting_UAV, User_asso_list, ChannelGain_list):
        """Split the UAV's power between its first two associated users.

        The user with the lower channel gain receives 3/4 of
        ``self.Power_unit`` and the other receives 1/4; on a tie the second
        user found is treated as the lower-gain one. Results are written
        into row 0 of ``self.Power_allocation_list``.

        Args:
            acting_UAV: Value identifying the UAV, compared against row 0 of
                ``User_asso_list``.
            User_asso_list: Object supporting ``.iloc``; row 0 holds the UAV
                associated with each user.
            ChannelGain_list: Object supporting ``.iloc``; row 0 holds each
                user's channel gain.

        Raises:
            IndexError: If fewer than two users are associated with
                ``acting_UAV``.
        """
        acting_user_list = np.where(User_asso_list.iloc[0, :] == acting_UAV)[0]
        if len(acting_user_list) < 2:
            raise IndexError(
                "take_action_NOMA needs at least two users associated with "
                f"UAV {acting_UAV!r}, found {len(acting_user_list)}"
            )
        First_user = acting_user_list[0]
        Second_user = acting_user_list[1]

        first_user_CG = ChannelGain_list.iloc[0, First_user]
        second_user_CG = ChannelGain_list.iloc[0, Second_user]

        # NOMA power allocation based on channel conditions
        if first_user_CG >= second_user_CG:
            User0 = Second_user  # Far user (higher power)
            User1 = First_user   # Near user (lower power)
        else:
            User0 = First_user
            User1 = Second_user

        self.Power_allocation_list.iloc[0, User0] = self.Power_unit * 3 / 4
        self.Power_allocation_list.iloc[0, User1] = self.Power_unit * 1 / 4