# In [None]:
import random
from collections import deque

import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment
from sklearn.cluster import KMeans
from tensorflow.keras import models, layers, optimizers

from .action_masking import ActionMasking

class DQN:
    """Deep Q-network agent with action masking and K-means user association.

    The agent keeps an online network (``model``) that is trained on sampled
    replay transitions and a target network (``target_model``) that supplies
    the bootstrap values and is refreshed every ``update_freq`` training
    steps. Action validity is delegated to ``ActionMasking``.
    """

    def __init__(self, NumberOfUAVs, NumberOfUsers):
        """Build the replay buffer, the action masker and both networks.

        Args:
            NumberOfUAVs: Number of UAVs; contributes ``3 * NumberOfUAVs``
                entries to the state vector.
            NumberOfUsers: Number of users; contributes one state entry each.
        """
        self.update_freq = 2000  # Training steps between target-network syncs
        self.replay_size = 20000  # Replay capacity; training starts once full
        self.step = 0  # Number of training steps performed so far
        self.replay_queue = deque(maxlen=self.replay_size)

        # State and action dimensions.
        # NOTE(review): presumably 3 coordinates per UAV plus one value per
        # user -- confirm against the environment that builds the state.
        self.state_dim = NumberOfUAVs * 3 + NumberOfUsers
        self.action_dim = 126  # Total actions

        # Action masking restricts choices to those valid for a cluster size.
        self.action_masker = ActionMasking(self.action_dim)

        # Online and target networks share one architecture.
        self.model = self.create_model()
        self.target_model = self.create_model()
        # Start the target network from the online weights so the first
        # ``update_freq`` steps do not bootstrap from an unrelated random net.
        self.target_model.set_weights(self.model.get_weights())

    def create_model(self):
        """Create and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model mapping a state of size
            ``state_dim`` to ``action_dim`` linear Q-value outputs.
        """
        model = models.Sequential([
            layers.Dense(90, input_dim=self.state_dim, activation='relu'),
            layers.Dense(self.action_dim, activation="linear"),
        ])
        model.compile(
            loss='mean_squared_error',
            optimizer=optimizers.Adam(learning_rate=0.001),
        )
        return model

    def Choose_action(self, s, epsilon, acting_UAV, User_asso_list):
        """Choose an action with an epsilon-greedy policy and action masking.

        Args:
            s: State passed straight to ``model.predict``; the first row of
                the prediction is used, so a batch of one state is expected.
            epsilon: Probability of picking a random valid action.
            acting_UAV: Index of the UAV that is acting.
            User_asso_list: One-row DataFrame whose entries are the UAV index
                serving each user.

        Returns:
            The selected action index.
        """
        # Cluster size = number of users currently associated with this UAV.
        cluster_size = int((User_asso_list.iloc[0, :] == acting_UAV).sum())

        if np.random.uniform() < epsilon:
            # Explore: random action among those valid for this cluster size.
            return self.action_masker.get_random_valid_action(cluster_size)

        # Exploit: greedy action over the masked Q-values.
        q_values = self.model.predict(s, verbose=0)[0]
        mask = self.action_masker.get_mask(cluster_size)
        masked_q_values = self.action_masker.apply_mask(q_values, mask)
        return np.argmax(masked_q_values)

    def remember(self, s, a, next_s, reward):
        """Store one transition ``(s, a, next_s, reward)`` in replay memory."""
        self.replay_queue.append((s, a, next_s, reward))

    def train(self, batch_size=128, lr=1, factor=1):
        """Run one training step on a random replay batch.

        Does nothing until the replay buffer holds ``replay_size``
        transitions.

        Args:
            batch_size: Number of transitions sampled from replay memory.
            lr: Blend factor between the old Q-value and the Bellman target
                (1 replaces the old value entirely).
            factor: Discount factor applied to the next-state value.
        """
        if len(self.replay_queue) < self.replay_size:
            return
        self.step += 1

        if self.step % self.update_freq == 0:
            self.target_model.set_weights(self.model.get_weights())

        replay_batch = random.sample(self.replay_queue, batch_size)
        # Flatten each stored state to one row so the batch is always
        # (batch, features). This is a no-op for states stored flat and also
        # accepts the (1, state_dim) form that Choose_action's predict call
        # takes, which would otherwise yield a 3-D batch.
        s_batch = self._stack_states([replay[0] for replay in replay_batch])
        next_s_batch = self._stack_states([replay[2] for replay in replay_batch])

        Q = self.model.predict(s_batch, verbose=0)
        Q_next = self.target_model.predict(next_s_batch, verbose=0)

        # Update Q values using the Bellman equation.
        for i, (_, a, _, reward) in enumerate(replay_batch):
            # The next-state mask is derived from the action taken.
            # NOTE(review): this assumes the cluster size is unchanged between
            # s and next_s -- confirm with the ActionMasking encoding.
            cluster_size = self.action_masker.get_cluster_size_from_action(a)
            mask = self.action_masker.get_mask(cluster_size)
            masked_next_q = self.action_masker.apply_mask(Q_next[i], mask)
            target = reward + factor * np.amax(masked_next_q)
            Q[i][a] = (1 - lr) * Q[i][a] + lr * target

        # Train the model; only the entries of the taken actions were changed.
        self.model.fit(s_batch, Q, verbose=0)

    @staticmethod
    def _stack_states(states):
        """Stack a list of states into a 2-D ``(len(states), features)`` array."""
        return np.asarray(states).reshape(len(states), -1)

    @staticmethod
    def _associate_by_cluster(uav_xy, cluster_centers, user_labels):
        """Return, per user, the index of the UAV assigned to the user's cluster.

        UAVs are matched one-to-one to cluster centers by minimizing the total
        Euclidean distance (Hungarian algorithm).

        Args:
            uav_xy: Array of shape ``(n_uavs, 2)`` with UAV coordinates.
            cluster_centers: Array of shape ``(n_uavs, 2)`` with cluster centers.
            user_labels: Integer cluster label of each user.

        Returns:
            Integer array with one UAV index per user.
        """
        # Pairwise distances: rows are UAVs, columns are cluster centers.
        distances = np.linalg.norm(
            uav_xy[:, np.newaxis, :] - cluster_centers[np.newaxis, :, :],
            axis=2,
        )
        uav_idx, cluster_idx = linear_sum_assignment(distances)

        # Invert the assignment into a cluster -> UAV lookup table.
        uav_of_cluster = np.empty(len(cluster_centers), dtype=int)
        uav_of_cluster[cluster_idx] = uav_idx
        return uav_of_cluster[user_labels]

    def User_association(self, UAV_Position, User_Position, UAVsnumber, Usersnumber):
        """Perform user association using K-means clustering.

        Users are clustered into ``UAVsnumber`` groups by position, each
        cluster is matched to one UAV with the Hungarian algorithm, and every
        user is associated with the UAV matched to its cluster.

        Args:
            UAV_Position: DataFrame whose rows 0 and 1 are read as the two
                planar coordinates of each UAV (one column per UAV).
            User_Position: DataFrame whose rows 0 and 1 are read as the two
                planar coordinates of each user (one column per user).
            UAVsnumber: Number of UAVs, also used as the number of clusters.
            Usersnumber: Number of users.

        Returns:
            A one-row integer DataFrame with columns ``0..Usersnumber-1``
            holding the index of the UAV serving each user.
        """
        # Convert user positions to an (n_users, 2) array for clustering.
        user_xy = np.zeros((Usersnumber, 2))
        user_xy[:, 0] = User_Position.iloc[0, :]
        user_xy[:, 1] = User_Position.iloc[1, :]

        # Perform K-means clustering.
        k_means = KMeans(n_clusters=UAVsnumber).fit(user_xy)

        # Create the (n_uavs, 2) UAV position array.
        uav_xy = np.zeros((UAVsnumber, 2))
        uav_xy[:, 0] = UAV_Position.iloc[0, :]
        uav_xy[:, 1] = UAV_Position.iloc[1, :]

        association = self._associate_by_cluster(
            uav_xy, k_means.cluster_centers_, k_means.labels_
        )

        return pd.DataFrame(
            association.reshape(1, -1),
            columns=list(range(Usersnumber)),
        )