# In [None]:
import tensorflow as tf
import numpy as np
import gym
import cv2
import os

class DDDQNAgent:
    """Dueling Double Deep Q-Network agent trained on folders of image frames.

    The agent owns two identically shaped dueling networks (``policy_net`` and
    ``target_net``), a bounded replay memory, and an Adam optimizer.  ``train``
    walks a dataset directory, treats every sub-folder that contains ``.jpg``
    files as one episode, and performs one learning step per frame.
    """

    # Used by ``train`` when no ``dataset_folder`` argument is supplied.
    DEFAULT_DATASET_FOLDER = '/Users/martinprabhu/Downloads/UAVDT/M0205'

    def __init__(self, state_dim, num_actions, discount_factor=0.99, learning_rate=0.0001,
                 memory_size=20000):
        """Create the networks, the optimizer and an empty replay memory.

        Args:
            state_dim: Input shape of one state without the batch axis, e.g. ``(84, 84, 1)``.
            num_actions: Number of discrete actions (width of the Q-value output).
            discount_factor: Discount applied to bootstrapped future values.
            learning_rate: Learning rate passed to the Adam optimizer.
            memory_size: Maximum number of transitions kept in the replay memory;
                once full, the oldest transition is overwritten.

        Raises:
            ValueError: If ``memory_size`` is smaller than 1.
        """
        if memory_size < 1:
            raise ValueError("memory_size must be at least 1")

        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.state_dim = state_dim
        self.num_actions = num_actions

        # Build the policy and target networks; both start from the same weights.
        self.policy_net = self.build_network()
        self.target_net = self.build_network()
        self.target_net.set_weights(self.policy_net.get_weights())

        self.optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=self.learning_rate)

        # Replay memory is a list used as a ring buffer: it gives O(1) random
        # access for sampling while keeping memory usage bounded.
        self.memory = []
        self.memory_size = memory_size
        self._memory_pos = 0  # slot that the next stored transition will occupy

    def build_network(self):
        """Return a dueling Q-network mapping a batch of states to Q-values.

        A shared convolutional trunk feeds a value stream (one output) and an
        advantage stream (``num_actions`` outputs).  They are combined as
        ``V + (A - mean(A))``.
        """
        inputs = tf.keras.layers.Input(shape=self.state_dim)
        x = tf.keras.layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu')(inputs)
        x = tf.keras.layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu')(x)
        x = tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), activation='relu')(x)
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(512, activation='relu')(x)

        # Advantage stream
        adv_stream = tf.keras.layers.Dense(256, activation='relu')(x)
        adv_stream = tf.keras.layers.Dense(self.num_actions, activation='linear')(adv_stream)

        # Value stream
        val_stream = tf.keras.layers.Dense(256, activation='relu')(x)
        val_stream = tf.keras.layers.Dense(1, activation='linear')(val_stream)

        # Combine the streams to get Q-values; subtracting the mean advantage
        # removes the degree of freedom shared between the two streams.
        q_values = val_stream + (adv_stream - tf.reduce_mean(adv_stream, axis=1, keepdims=True))

        return tf.keras.Model(inputs=inputs, outputs=q_values)

    def select_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: One state with a leading batch axis of size 1.
            epsilon: Probability of choosing a uniformly random action.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if np.random.rand() < epsilon:
            return int(np.random.choice(self.num_actions))
        # Call the model directly: ``predict`` has a large per-call overhead and
        # prints a progress bar, which is wasteful for a single input.
        q_values = self.policy_net(state, training=False).numpy()
        return int(np.argmax(q_values[0]))

    def store_transition(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest one when memory is full."""
        transition = (state, action, reward, next_state, done)
        if len(self.memory) < self.memory_size:
            self.memory.append(transition)
        else:
            self.memory[self._memory_pos] = transition
        self._memory_pos = (self._memory_pos + 1) % self.memory_size

    def _double_q_targets(self, rewards, dones, next_q_online, next_q_target):
        """Compute Double-DQN targets ``r + gamma * Q_target(s', argmax_a Q_online(s', a))``.

        Args:
            rewards: Sequence of rewards, one per sampled transition.
            dones: Sequence of terminal flags; terminal transitions do not bootstrap.
            next_q_online: Policy-network Q-values for the next states, shape ``(batch, actions)``.
            next_q_target: Target-network Q-values for the next states, same shape.

        Returns:
            A ``float32`` array with one target value per transition.
        """
        rewards = np.asarray(rewards, dtype=np.float32)
        not_done = 1.0 - np.asarray(dones, dtype=np.float32)
        # The online network selects the action, the target network evaluates it.
        best_actions = np.argmax(next_q_online, axis=1)
        next_values = np.asarray(next_q_target)[np.arange(len(best_actions)), best_actions]
        targets = rewards + self.discount_factor * next_values * not_done
        return targets.astype(np.float32)

    def learn(self, batch_size=128):
        """Run one gradient step on a random minibatch from the replay memory.

        Args:
            batch_size: Number of transitions to sample (without replacement).

        Returns:
            The mean squared TD error of the batch as a ``float``, or ``None``
            when the memory does not yet hold ``batch_size`` transitions.
        """
        if len(self.memory) < batch_size:
            return None

        indices = np.random.choice(len(self.memory), size=batch_size, replace=False)
        states, actions, rewards, next_states, dones = zip(*(self.memory[i] for i in indices))

        # Each stored state carries its own batch axis, so stacking vertically
        # yields a (batch, height, width, channels) array.
        states = np.vstack(states)
        next_states = np.vstack(next_states)

        next_q_online = self.policy_net(next_states, training=False).numpy()
        next_q_target = self.target_net(next_states, training=False).numpy()
        # Targets are built outside the tape so no gradient flows through them.
        targets = tf.constant(
            self._double_q_targets(rewards, dones, next_q_online, next_q_target)
        )
        actions_one_hot = tf.one_hot(
            np.asarray(actions, dtype=np.int32), self.num_actions, dtype=tf.float32
        )

        with tf.GradientTape() as tape:
            q_values = self.policy_net(states, training=True)
            selected_action_q_values = tf.reduce_sum(q_values * actions_one_hot, axis=1)
            loss = tf.reduce_mean(tf.square(targets - selected_action_q_values))

        grads = tape.gradient(loss, self.policy_net.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.policy_net.trainable_variables))
        return float(loss)

    def update_target_network(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.set_weights(self.policy_net.get_weights())

    @staticmethod
    def _list_episodes(dataset_folder):
        """Return one sorted list of ``.jpg`` paths per folder that contains any.

        Only paths are collected; images are read lazily during training so the
        whole dataset never has to fit in memory at once.
        """
        episodes = []
        for root, dirs, files in os.walk(dataset_folder):
            dirs.sort()  # in-place sort makes the traversal order deterministic
            frame_paths = [
                os.path.join(root, filename)
                for filename in sorted(files)
                if filename.endswith(".jpg")  # Adjust the file extension as needed
            ]
            if frame_paths:
                episodes.append(frame_paths)
        return episodes

    @staticmethod
    def _preprocess_frame(frame, target_shape=(84, 84)):
        """Resize, grayscale and normalize a BGR frame into a network input.

        Args:
            frame: Image array as returned by ``cv2.imread``.
            target_shape: ``(width, height)`` passed to ``cv2.resize``.

        Returns:
            A ``float32`` array of shape ``(1, height, width, 1)`` with values in [0, 1].
        """
        frame = cv2.resize(frame, target_shape)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        frame = frame.astype(np.float32) / 255.0
        # Add a batch dimension in front and a channel dimension at the end.
        frame = np.expand_dims(frame, axis=0)
        frame = np.expand_dims(frame, axis=-1)
        return frame

    @staticmethod
    def _iter_states(frame_paths, target_shape):
        """Yield preprocessed states for every readable image in ``frame_paths``."""
        for filepath in frame_paths:
            frame = cv2.imread(filepath)
            if frame is not None:  # cv2.imread returns None for unreadable files
                yield DDDQNAgent._preprocess_frame(frame, target_shape)

    @staticmethod
    def _calculate_reward(state, done):
        """Placeholder reward: +1.0 per step, -1.0 on the terminal step.

        ``state`` is accepted so a task-specific reward can be derived from the
        frame later; it is currently unused.
        """
        return -1.0 if done else 1.0

    def train(self, num_episodes, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.995,
              dataset_folder=None, target_update_interval=500):
        """Train the agent on frame sequences found under ``dataset_folder``.

        Every folder containing ``.jpg`` files is one episode.  Consecutive
        frames form ``(state, next_state)`` pairs and the last readable frame of
        a folder is the terminal transition.  Because the frames come from a
        fixed recording, the chosen action does not influence the next state.

        Args:
            num_episodes: Maximum number of episodes (folders) to train on.
            epsilon_start: Exploration rate used for the first episode.
            epsilon_end: Lower bound on the exploration rate.
            epsilon_decay: Multiplicative decay applied to epsilon once per episode.
            dataset_folder: Root directory to scan; defaults to ``DEFAULT_DATASET_FOLDER``.
            target_update_interval: Number of steps between target-network syncs.

        Raises:
            ValueError: If ``target_update_interval`` is smaller than 1.
        """
        if target_update_interval < 1:
            raise ValueError("target_update_interval must be at least 1")
        if dataset_folder is None:
            dataset_folder = self.DEFAULT_DATASET_FOLDER

        # cv2.resize expects (width, height) while state_dim is (height, width, channels).
        target_shape = (self.state_dim[1], self.state_dim[0])

        episode = 0  # counts only folders that produced at least one readable frame
        global_step = 0

        for frame_paths in self._list_episodes(dataset_folder):
            if episode >= num_episodes:
                break

            states = self._iter_states(frame_paths, target_shape)
            state = next(states, None)
            if state is None:
                continue  # no readable image in this folder

            print(f"Training on episode: {episode + 1}")

            epsilon = max(epsilon_start * (epsilon_decay ** episode), epsilon_end)
            total_reward = 0

            while state is not None:
                # Look one frame ahead so the true next state and the terminal
                # flag are known before the transition is stored.
                upcoming = next(states, None)
                done = upcoming is None

                action = self.select_action(state, epsilon)
                reward = self._calculate_reward(state, done)

                # The terminal next state is never bootstrapped from, so the
                # current state is reused as a correctly shaped stand-in.
                next_state = state if done else upcoming
                self.store_transition(state, action, reward, next_state, done)

                self.learn()

                # Sync only periodically so the target network lags the policy
                # network and provides stable bootstrap targets.
                global_step += 1
                if global_step % target_update_interval == 0:
                    self.update_target_network()

                total_reward += reward
                state = upcoming

            print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")
            episode += 1

        print("Training finished")

# Instantiate the agent for single-channel 84x84 inputs and run training.
state_dim = (84, 84, 1)  # must match the shape produced by frame preprocessing
num_actions = 2  # size of the discrete action space; set this to match the task
agent = DDDQNAgent(state_dim=state_dim, num_actions=num_actions)
agent.train(num_episodes=200)  # raise or lower the episode count as needed

# Persist the trained policy network to disk.
agent.policy_net.save("model.h5")
