In [None]:
import numpy as np
import tensorflow as tf
from collections import deque
import random

# Simplified DQNAgent
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=1000)
        self.gamma = 0.9  # Discount factor for future rewards. Balances the importance of immediate vs. future rewards.
        self.epsilon = 1.0  # Exploration rate for the ε-greedy policy. Starts high (1.0) to encourage exploration and decays over time.
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.95
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):  # Creates a simple feed-forward neural network model.
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(16, input_shape=(self.state_size,), activation='relu'),   # Input layer
            tf.keras.layers.Dense(16, activation='relu'),                                   # Hidden layer
            tf.keras.layers.Dense(self.action_size, activation='linear')                    # Output layer
        ])
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        # Mean Squared Error (MSE), used to minimize the difference between predicted and target Q-values.
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
      # With probability ε, choose a random action (exploration). Otherwise, select the action with the highest Q-value (exploitation).
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(np.array([state]), verbose=0)
        return np.argmax(q_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward if done else reward + self.gamma * np.amax(self.model.predict(np.array([next_state]), verbose=0)[0])
            target_f = self.model.predict(np.array([state]), verbose=0)
            target_f[0][action] = target
            self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Simplified environment simulation
def initialize_simulation():
    return np.zeros(4)  # Simplified state with fewer dimensions

def step_simulation(action):
    next_state = np.random.rand(4)  # Random next state for simplicity
    reward = 1 if action == np.random.randint(0, 2) else -1
    done = np.random.rand() > 0.95  # Randomly end the episode
    return next_state, reward, done, {}

# Main training loop
state_size = 4
action_size = 2  # Fewer actions for faster training
agent = DQNAgent(state_size, action_size)

episodes = 10  # Reduce number of episodes
max_steps = 20  # Shorten the episode length
batch_size = 16  #  number of samples processed together in one iteration of training

for e in range(episodes):
    state = initialize_simulation()
    done = False
    total_reward = 0

    for step in range(max_steps):
        action = agent.act(state)
        next_state, reward, done, _ = step_simulation(action)
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        if done:
            break

        # Train the agent every step for quicker learning
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

    print(f"Episode {e+1}/{episodes}, Total Reward: {total_reward}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode 1/10, Total Reward: 0
Episode 2/10, Total Reward: -4
Episode 3/10, Total Reward: -4
Episode 4/10, Total Reward: -2
Episode 5/10, Total Reward: 12
Episode 6/10, Total Reward: -2
Episode 7/10, Total Reward: -2
Episode 8/10, Total Reward: 2
Episode 9/10, Total Reward: -3
Episode 10/10, Total Reward: -2


In [1]:
import numpy as np
import tensorflow as tf
from collections import deque
import random
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import ipywidgets as widgets

# Simplified DQNAgent
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=500)
        self.gamma = 0.9  # Discount rate
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.9
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(8, input_shape=(self.state_size,), activation='relu'),
            tf.keras.layers.Dense(8, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(np.array([state]), verbose=0)
        return np.argmax(q_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward if done else reward + self.gamma * np.amax(self.model.predict(np.array([next_state]), verbose=0)[0])
            target_f = self.model.predict(np.array([state]), verbose=0)
            target_f[0][action] = target
            self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Environment simulation
def initialize_simulation():
    return np.zeros(4)

def step_simulation(action):
    next_state = np.random.rand(4)
    reward = 1 if action == np.random.randint(0, 2) else -1
    done = np.random.rand() > 0.9
    return next_state, reward, done, {}

# GUI
def train_agent_colab(episodes=5, max_steps=10, batch_size=8):
    state_size = 4
    action_size = 2
    agent = DQNAgent(state_size, action_size)
    rewards = []

    # Widgets for visualization
    output_widget = widgets.Output()
    progress = widgets.IntProgress(value=0, min=0, max=episodes, description="Training:")
    display(progress, output_widget)

    with output_widget:
        for e in range(episodes):
            state = initialize_simulation()
            done = False
            total_reward = 0

            for step in range(max_steps):
                action = agent.act(state)
                next_state, reward, done, _ = step_simulation(action)
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward
                if done:
                    break

                if len(agent.memory) > batch_size:
                    agent.replay(batch_size)

            rewards.append(total_reward)
            progress.value += 1

            # Visualization in Colab
            clear_output(wait=True)
            plt.figure(figsize=(8, 5))
            plt.plot(rewards, label='Rewards per Episode')
            plt.xlabel('Episodes')
            plt.ylabel('Total Reward')
            plt.title('Training Progress')
            plt.legend()
            plt.grid(True)
            plt.show()

        print("Training Complete!")
        return rewards

# Run Training with Visualization in Colab
train_agent_colab(episodes=5, max_steps=10, batch_size=8)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


IntProgress(value=0, description='Training:', max=5)

Output()

[2, 0, 4, -3, -2]