{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DDQN Agent for gym-idsgame (PyTorch Version)\n",
    "\n",
    "This notebook demonstrates how to use the DDQN (Double Deep Q-Network) agent with the gym-idsgame environment. We'll cover:\n",
    "\n",
    "1. Installation of required packages\n",
    "2. Setting up the environment\n",
    "3. Configuring the DDQN agent\n",
    "4. Training the agent\n",
    "5. Evaluating the agent\n",
    "6. Visualizing the results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Installation of Required Packages\n",
    "\n",
    "First, let's install the necessary packages:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install gym-idsgame==1.0.12\n",
    "!pip install torch\n",
    "!pip install matplotlib\n",
    "!pip install numpy\n",
    "!pip install gym"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Import DDQN Implementation\n",
    "\n",
    "Let's import our DDQN implementation and other necessary libraries:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import necessary libraries\n",
    "import numpy as np\n",
    "import torch\n",
    "import matplotlib.pyplot as plt\n",
    "import gym\n",
    "import time\n",
    "import os\n",
    "import datetime\n",
    "\n",
    "# Import our DDQN implementation\n",
    "from DDQN import DDQNConfig, DDQNAgent, create_ddqn_agent\n",
    "\n",
    "# Check if GPU is available\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "print(f\"Using device: {device}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Set up the Environment\n",
    "\n",
    "Now, let's create the gym-idsgame environment:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set random seed for reproducibility\n",
    "random_seed = 42\n",
    "np.random.seed(random_seed)\n",
    "torch.manual_seed(random_seed)\n",
    "if torch.cuda.is_available():\n",
    "    torch.cuda.manual_seed(random_seed)\n",
    "    torch.cuda.manual_seed_all(random_seed)\n",
    "    torch.backends.cudnn.deterministic = True\n",
    "    torch.backends.cudnn.benchmark = False\n",
    "\n",
    "# Create output directory for results\n",
    "current_time = datetime.datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
    "result_dir = f\"./results/ddqn_pytorch_{current_time}\"\n",
    "if not os.path.exists(result_dir):\n",
    "    os.makedirs(result_dir)\n",
    "\n",
    "# Create environment\n",
    "env_name = \"idsgame-minimal_defense-v2\"\n",
    "env = gym.make(env_name, save_dir=result_dir)\n",
    "\n",
    "# Print environment information\n",
    "print(f\"Environment: {env_name}\")\n",
    "print(f\"Action space: {env.action_space}\")\n",
    "print(f\"Observation space: {env.observation_space}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Configure the DDQN Agent\n",
    "\n",
    "Next, let's configure our DDQN agent with appropriate parameters:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure DDQN agent\n",
    "ddqn_config = DDQNConfig(\n",
    "    gamma=0.99,                       # Discount factor\n",
    "    lr=0.00005,                       # Learning rate\n",
    "    batch_size=64,                    # Batch size for training\n",
    "    epsilon=1.0,                      # Initial exploration rate\n",
    "    epsilon_decay=0.999,              # Decay rate for epsilon\n",
    "    min_epsilon=0.01,                 # Minimum value for epsilon\n",
    "    target_network_update_freq=100,   # Frequency of target network updates\n",
    "    replay_memory_size=10000,         # Size of replay buffer\n",
    "    num_episodes=5000,                # Number of episodes to train for\n",
    "    eval_frequency=500,               # Frequency of evaluations during training\n",
    "    eval_episodes=50,                 # Number of episodes for each evaluation\n",
    "    train_log_frequency=50,           # Frequency of logging during training\n",
    "    eval_log_frequency=1,             # Frequency of logging during evaluation\n",
    "    eval_render=False,                # Whether to render during evaluation\n",
    "    render=False,                     # Whether to render during training\n",
    "    attacker=True,                    # Whether the agent is an attacker\n",
    "    defender=False,                   # Whether the agent is a defender\n",
    "    save_dir=result_dir,              # Directory to save results\n",
    "    save_frequency=500,               # Frequency to save the model during training\n",
    "    hidden_layers=[128, 128],         # Hidden layer sizes for the neural network\n",
    "    device=device                     # Device to run the neural network on\n",
    ")\n",
    "\n",
    "# Create DDQN agent\n",
    "agent = DDQNAgent(env, ddqn_config)\n",
    "\n",
    "# Print agent information\n",
    "print(f\"Agent state dimension: {agent.state_dim}\")\n",
    "print(f\"Agent action dimension: {agent.num_actions}\")\n",
    "print(f\"Neural network hidden layers: {ddqn_config.hidden_layers}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Train the DDQN Agent\n",
    "\n",
    "Now, let's train our DDQN agent:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the agent\n",
    "print(\"Starting training...\")\n",
    "start_time = time.time()\n",
    "\n",
    "# Train for specified number of episodes\n",
    "train_result = agent.train()\n",
    "\n",
    "# Calculate training time\n",
    "training_time = time.time() - start_time\n",
    "print(f\"Training completed in {training_time:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Evaluate the Trained Agent\n",
    "\n",
    "Now that we have trained our agent, let's evaluate its performance:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final evaluation with more episodes\n",
    "print(\"Performing final evaluation...\")\n",
    "agent.config.eval_episodes = 100  # Increase number of evaluation episodes\n",
    "eval_result = agent.eval_model()\n",
    "\n",
    "# Calculate average metrics\n",
    "avg_reward = sum(eval_result.episode_rewards) / len(eval_result.episode_rewards)\n",
    "avg_steps = sum(eval_result.episode_steps) / len(eval_result.episode_steps)\n",
    "\n",
    "print(f\"Final Evaluation Results:\")\n",
    "print(f\"Average Reward: {avg_reward:.2f}\")\n",
    "print(f\"Average Steps: {avg_steps:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Visualize the Results\n",
    "\n",
    "Let's visualize the training progress and the agent's performance:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot training rewards\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.subplot(1, 2, 1)\n",
    "plt.plot(agent.train_result.episode_rewards)\n",
    "plt.title('Episode Rewards')\n",
    "plt.xlabel('Episode')\n",
    "plt.ylabel('Reward')\n",
    "plt.grid(True)\n",
    "\n",
    "# Plot running average of rewards\n",
    "plt.subplot(1, 2, 2)\n",
    "window_size = 100\n",
    "running_avg = [np.mean(agent.train_result.episode_rewards[max(0, i-window_size):i+1]) \n",
    "               for i in range(len(agent.train_result.episode_rewards))]\n",
    "plt.plot(running_avg)\n",
    "plt.title(f'Running Average Reward (Window Size: {window_size})')\n",
    "plt.xlabel('Episode')\n",
    "plt.ylabel('Average Reward')\n",
    "plt.grid(True)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(os.path.join(result_dir, 'training_rewards.png'))\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot action distribution during training\n",
    "plt.figure(figsize=(12, 6))\n",
    "\n",
    "# Get action counts\n",
    "actions = list(agent.train_result.action_counts.keys())\n",
    "counts = list(agent.train_result.action_counts.values())\n",
    "\n",
    "# Sort by action index\n",
    "action_counts = sorted(zip(actions, counts))\n",
    "actions, counts = zip(*action_counts)\n",
    "\n",
    "# Plot action distribution\n",
    "plt.bar(actions, counts)\n",
    "plt.title('Action Distribution During Training')\n",
    "plt.xlabel('Action')\n",
    "plt.ylabel('Count')\n",
    "plt.grid(True, axis='y')\n",
    "plt.savefig(os.path.join(result_dir, 'action_distribution.png'))\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Save and Load the Trained Model\n",
    "\n",
    "Let's save our trained model for future use and demonstrate how to load it:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the trained model\n",
    "save_path = os.path.join(result_dir, \"final_model\")\n",
    "if not os.path.exists(save_path):\n",
    "    os.makedirs(save_path)\n",
    "\n",
    "# Save policy network\n",
    "torch.save(agent.policy_network.state_dict(), os.path.join(save_path, \"policy_network.pt\"))\n",
    "\n",
    "# Save training metrics\n",
    "np.save(os.path.join(save_path, \"training_metrics.npy\"), {\n",
    "    \"episode_rewards\": agent.train_result.episode_rewards,\n",
    "    \"avg_episode_rewards\": agent.avg_episode_rewards,\n",
    "    \"action_counts\": agent.train_result.action_counts\n",
    "})\n",
    "\n",
    "print(f\"Model saved to: {save_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example of loading the model\n",
    "def load_model(model_path, env):\n",
    "    # Create a new agent\n",
    "    config = DDQNConfig(\n",
    "        attacker=True,\n",
    "        defender=False,\n",
    "        save_dir=result_dir,\n",
    "        hidden_layers=[128, 128],\n",
    "        device=device\n",
    "    )\n",
    "    \n",
    "    loaded_agent = DDQNAgent(env, config)\n",
    "    \n",
    "    # Load model weights\n",
    "    policy_path = os.path.join(model_path, \"policy_network.pt\")\n",
    "    loaded_agent.policy_network.load_state_dict(torch.load(policy_path, map_location=device))\n",
    "    loaded_agent.target_network.load_state_dict(torch.load(policy_path, map_location=device))\n",
    "    \n",
    "    # Set to evaluation mode\n",
    "    loaded_agent.policy_network.eval()\n",
    "    loaded_agent.target_network.eval()\n",
    "    \n",
    "    return loaded_agent\n",
    "\n",
    "# Uncomment to load the model\n",
    "# loaded_agent = load_model(save_path, env)\n",
    "# test_result = loaded_agent.eval_model()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Running a Demo with the Trained Agent\n",
    "\n",
    "Let's run a demo with our trained agent and see how it performs in the environment:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_demo(agent, episodes=5, render=True, sleep_time=0.1):\n",
    "    \"\"\"Run a demo of the trained agent\"\"\"\n",
    "    print(\"\\nRunning demo with trained agent...\")\n",
    "    \n",
    "    for episode in range(episodes):\n",
    "        state = env.reset()\n",
    "        if isinstance(state, tuple):\n",
    "            state = state[0]\n",
    "            \n",
    "        episode_reward = 0\n",
    "        episode_steps = 0\n",
    "        done = False\n",
    "        \n",
    "        print(f\"\\nEpisode {episode+1}:\")\n",
    "        \n",
    "        while not done:\n",
    "            if render:\n",
    "                env.render()\n",
    "            if sleep_time > 0:\n",
    "                time.sleep(sleep_time)\n",
    "            \n",
    "            # Select best action\n",
    "            state_tensor = torch.FloatTensor(state).to(device)\n",
    "            with torch.no_grad():\n",
    "                q_values = agent.policy_network(state_tensor)\n",
    "            action = q_values.argmax().item()\n",
    "            \n",
    "            next_state, reward, done, _ = env.step(action)\n",
    "            \n",
    "            print(f\"Step {episode_steps}: Action={action}, Reward={reward}\")\n",
    "            \n",
    "            state = next_state\n",
    "            episode_reward += reward\n",
    "            episode_steps += 1\n",
    "            \n",
    "            if hasattr(env, 'idsgame_config') and episode_steps >= env.idsgame_config.game_config.max_steps:\n",
    "                done = True\n",
    "        \n",
    "        print(f\"Episode {episode+1} finished with total reward: {episode_reward}\")\n",
    "    \n",
    "    if render:\n",
    "        env.close()\n",
    "\n",
    "# Uncomment to run the demo with the trained agent\n",
    "# run_demo(agent, episodes=3, render=True, sleep_time=0.1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Conclusion\n",
    "\n",
    "In this notebook, we have:\n",
    "1. Implemented a DDQN agent using PyTorch\n",
    "2. Configured and trained the agent in the gym-idsgame environment\n",
    "3. Evaluated the agent's performance\n",
    "4. Visualized the training results\n",
    "5. Demonstrated how to save and load the trained model\n",
    "6. Created a demo to showcase the trained agent's behavior\n",
    "\n",
    "The DDQN agent learns to make strategic decisions in the intrusion detection game by balancing exploration and exploitation through an epsilon-greedy policy, and using experience replay to learn from past experiences. The Double DQN approach helps to prevent overestimation of Q-values, leading to more stable and effective learning."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

In [None]:
pip install -e src/gym-idsgame

In [19]:
!pip uninstall gymnasium
!pip install gym==0.21.0
!pip install scikit-learn

!pip install gym-idsgame==1.0.12

[0mCollecting gym==0.21.0
  Using cached gym-0.21.0.tar.gz (1.5 MB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[3 lines of output][0m
  [31m   [0m error in gym setup command: 'extras_require' must be a dictionary whose values are strings or lists of strings containing valid project/version requirement specifiers.
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Collecting gym-idsgame==1.0.12
  Using cached gym_idsgame-1.0.12-py3-none-

In [60]:
import gymnasium
from gymnasium.core import Env

def patched_reset(self):
    return self.reset()

Env.reset = patched_reset


In [61]:
import os
import gym
import gym_idsgame
import numpy as np
import torch
# src/environment/idsgame_wrapper.py
# from src.agents.dqn_agent import DQNAgent
from src.environment.compatibility_wrapper import GymCompatibilityWrapper
from src.utils.utils import print_summary
from src.utils.plotting import plot_results
# from src.utils import create_artefact_dirs

print('DONE IMPORTING')

DONE IMPORTING


In [73]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np

from src.environment.explorer import IDSGameExplorer
explorer = IDSGameExplorer()
# explorer.run_comprehensive_exploration()

In [74]:
import src.agents.ddqn_agent
print(src.agents.ddqn_agent.__file__)

/Users/krishnaasrinivas/Desktop/AI-Agent-for-Cyber-Security/src/agents/ddqn_agent.py


In [87]:
import os
import gymnasium as gym
import warnings
from datetime import datetime

# from gym_idsgame.agents.training_agents.q_learning.q_agent_config import QAgentConfig
# from gym_idsgame.agents.training_agents.q_learning.dqn.dqn_config import DQNConfig
from experiments.util import util
from experiments.util.plotting_util import read_and_plot_results
# from src.agents.ddqn_agent import DDQNAgent
from src.utils.utils import get_output_dir, print_summary
from src.environment.compatibility_wrapper import GymCompatibilityWrapper
from src.utils.plotting import plot_results

warnings.filterwarnings('ignore')
print('done')

done


In [88]:
env_name = "idsgame-random_attack-v8"
output_dir = os.getcwd()
random_seed = 42
env = gym.make(env_name, save_dir=output_dir + "results/data/" + str(random_seed))
# env = GymCompatibilityWrapper(env)

env = GymCompatibilityWrapper(env)
env = env.unwrapped

print("\nEnvironment Information:")
print(f"Observation Space: {env.observation_space}")
print(f"Action Space: {env.action_space}")


Environment Information:
Observation Space: Box(0, 9, (1, 11), int32)
Action Space: Discrete(30)


In [89]:
from src.agents.ddqn_agent import DDQNConfig, DDQNAgent, create_ddqn_agent
import numpy as np
import torch
import matplotlib.pyplot as plt
import gym
import time
import os
import datetime

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [90]:
# Set random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Create output directory for results
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
result_dir = f"./results/ddqn_pytorch_{current_time}"
if not os.path.exists(result_dir):
    os.makedirs(result_dir)

# # Create environment
# env_name = "idsgame-random_defense-v0"
# env = gym.make(env_name, save_dir=result_dir)

# Print environment information
print(f"Environment: {env_name}")
print(f"Action space: {env.action_space}")
print(f"Observation space: {env.observation_space}")

Environment: idsgame-random_attack-v8
Action space: Discrete(30)
Observation space: Box(0, 9, (1, 11), int32)


In [91]:

from src.agents.ddqn_agent import DDQNConfig, DDQNAgent, create_ddqn_agent

# Configure DDQN agent\n",
ddqn_config = DDQNConfig(
       gamma=0.99,                       # Discount factor\n",
        lr=0.00005,                       # Learning rate\n",
        batch_size=64,                    # Batch size for training\n",
      epsilon=1.0,                      # Initial exploration rate\n",
       epsilon_decay=0.999,              # Decay rate for epsilon\n",
 min_epsilon=0.01,                 # Minimum value for epsilon\n",
   target_network_update_freq=100,   # Frequency of target network updates\n",
    replay_memory_size=10000,         # Size of replay buffer\n",
   num_episodes=5000,                # Number of episodes to train for\n",
  eval_frequency=500,               # Frequency of evaluations during training\n",
  eval_episodes=50,                 # Number of episodes for each evaluation\n",
  train_log_frequency=50,           # Frequency of logging during training\n",
  eval_log_frequency=1,             # Frequency of logging during evaluation\n",
 eval_render=False,                # Whether to render during evaluation\n",
 render=False,                     # Whether to render during training\n",
 attacker=True,                    # Whether the agent is an attacker\n",
defender=False,                   # Whether the agent is a defender\n",
 save_dir=result_dir,              # Directory to save results\n",
 save_frequency=500,               # Frequency to save the model during training\n",
 hidden_layers=[128, 128],         # Hidden layer sizes for the neural network\n",
 device=device                     # Device to run the neural network on\n",
    )
agent = DDQNAgent(env, ddqn_config)

# Print agent information
print(f"Agent state dimension: {agent.state_dim}")
print(f"Agent action dimension: {agent.num_actions}")
print(f"Neural network hidden layers: {ddqn_config.hidden_layers}")

Agent state dimension: 30
Agent action dimension: 30
Neural network hidden layers: [128, 128]


In [92]:
def print_summary(result, title=""):
    print(f"\n{title} Summary:")
    print("-"*50)
    print(f"- Average Reward: {np.mean(result.episode_rewards):.2f} ± {np.std(result.episode_rewards):.2f}")
    print(f"- Max Reward: {np.max(result.episode_rewards):.2f}")
    print(f"- Min Reward: {np.min(result.episode_rewards):.2f}")
    print(f"- Avg Episode Length: {np.mean(result.episode_steps):.2f} steps")

    # Call this after training
    print_summary(agent.train_result, "Training")
    print_summary(agent.eval_result, "Evaluation")

    print(f"- Cumulative Reward: {sum(result.episode_rewards)}")

def plot_training(agent):
    """Plot training progress"""
    plt.figure(figsize=(12,6))
    
    # Smoothed rewards
    rewards = agent.train_result.episode_rewards
    window_size = 50
    smooth_rewards = [np.mean(rewards[max(0,i-window_size):i+1]) 
                     for i in range(len(rewards))]
    
    plt.plot(smooth_rewards)
    plt.title("Training Progress (Smoothed Rewards)")
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.show()


In [95]:
try:
    config = DDQNConfig(
        batch_size=64,
        lr=1e-4,
        gamma=0.99,
        tau =0.005,  # Test soft update parameter
        clip_norm=10.0  # Test gradient clipping parameter
    )
    print("DDQNConfig initialized successfully with tau and clip_norm.")
except TypeError as e:
    print(f"TypeError: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

agent = DDQNAgent(env, config)

# 2. Train Agent
train_agent(agent, episodes=1000)  # Modified training loop

# 3. Final Evaluation
final_eval_result = agent.eval_model()
print_summary(final_eval_result, "Final Evaluation")

# 4. Save Results
agent.save_model()
plot_training(agent)

TypeError: DDQNConfig.__init__() got an unexpected keyword argument 'tau'


NameError: name 'train_agent' is not defined

In [71]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
import gym
import gym_idsgame
import time
import os
from datetime import datetime
from src.agents.ddqn_agent import DDQNConfig, DDQNAgent, create_ddqn_agent

# Set random seed
np.random.seed(42)
torch.manual_seed(42)

def extract_attacker_obs(obs):
    """Extract and flatten attacker observation"""
    if isinstance(obs, tuple):
        obs = obs[0]  # Handle gym>=0.26.0 reset() returning (obs, info)
    return np.array(obs).flatten()


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# # Create environment
# env_name = "idsgame-minimal_defense-v2"
# env = gym.make(env_name)

# Create output directory
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
result_dir = f"./results/ddqn_{timestamp}"
if not os.path.exists(result_dir):
    os.makedirs(result_dir)


sample_obs = env.reset()[0] if isinstance(env.reset(), tuple) else env.reset()
state_dim = np.array(sample_obs).flatten().shape[0]
action_dim = env.attacker_action_space.n

print(f"Environment: {env_name}")
print(f"State dimension: {state_dim}")
print(f"Action dimension: {action_dim}")


# Create DDQN configuration
config = DDQNConfig(
    gamma=0.99,                       # Discount factor
    lr=1e-3,                          # Learning rate
    batch_size=32,                    # Reduce batch size to improve stability
    epsilon=1.0,                      # Initial exploration rate
    epsilon_decay=0.995,              # Decay rate for epsilon
    min_epsilon=0.01,                 # Minimum value for epsilon
    target_network_update_freq=10,    # Update target network frequency
    replay_memory_size=10000,         # Size of replay buffer
    num_episodes=10000,               # Number of episodes to train for
    train_log_frequency=100,          # Log progress every 100 episodes
    eval_frequency=1000,              # Evaluate every 1000 episodes
    save_dir=result_dir,              # Directory to save results
    hidden_layers=[64, 64]            # Smaller hidden layers for stability
)
agent = DDQNAgent(env, config)

# Training parameters
num_episodes = 20000
max_steps = 100
log_frequency = 500

# For tracking progress
reward_history = []
episode_lengths = []
start_time = time.time()

print(f"Starting training for {num_episodes} episodes...")

# Training loop
for ep in range(num_episodes):
    # Reset environment
    obs = env.reset()
    if isinstance(obs, tuple):
        obs = obs[0]
        
    state = extract_attacker_obs(obs)  # Make sure state is properly formatted
    
    total_reward = 0
    steps = 0
    done = False
    
    # Episode loop
    for t in range(max_steps):
        # Select attacker action
        action = agent.select_action(state)
        
        # Get defender action (random)
        defense_action = env.defender_action_space.sample()
        
        # Combine actions for the environment
        full_action = (action, defense_action)
        
        # Take step in environment
        next_obs, reward, terminated, truncated, info = env.step(full_action)
        done = terminated or truncated
        
        # Get attacker reward
        attacker_reward = reward[0] if isinstance(reward, tuple) else reward
        
        # Extract and format next state
        next_state = extract_attacker_obs(next_obs)
        
        # Store transition in replay buffer
        agent.replay_buffer.push(state, action, attacker_reward, next_state, bool(done))
        
        # Update state and metrics
        state = next_state
        total_reward += attacker_reward
        steps += 1
        
        if done:
            break
            
    # Store episode metrics
    reward_history.append(total_reward)
    episode_lengths.append(steps)
    
    # Log progress
    if (ep + 1) % log_frequency == 0 or ep == 0:
        avg_reward = np.mean(reward_history[-min(log_frequency, len(reward_history)):]) 
        elapsed_time = time.time() - start_time
        print(f"[Ep {ep+1}/{num_episodes}] AvgReward (last {min(log_frequency, len(reward_history))}): {avg_reward:.2f} | "
              f"Total: {total_reward:.2f} | Epsilon: {agent.epsilon:.3f} | "
              f"Steps: {steps} | Time: {elapsed_time:.1f}s")

# Agent training should have been updating automatically through the 
# agent's internal train() method, if implemented properly

training_time = time.time() - start_time
print(f"\nTraining completed in {training_time:.1f} seconds")

# Calculate evaluation metrics
result = {
    "average_reward": np.mean(reward_history),
    "reward_std": np.std(reward_history),
    "max_reward": np.max(reward_history),
    "min_reward": np.min(reward_history),
    "average_episode_length": np.mean(episode_lengths),
    "episode_length_std": np.std(episode_lengths),
    "max_episode_length": np.max(episode_lengths),
    "min_episode_length": np.min(episode_lengths),
    "average_hack_probability": 0.0,
    "hack_probability_std": 0.0,
    "max_hack_probability": 0.0,
    "min_hack_probability": 0.0,
    "cumulative_reward": int(np.sum(reward_history)),
}

# Print evaluation summary
print("\n📊 Final DDQN Training Performance:")
print(env_name)
print('Results: ', result)
print(f"- Average Reward: {result['average_reward']:.2f} ± {result['reward_std']:.2f}")
print(f"- Max-Min Reward: {result['max_reward']:.2f} - {result['min_reward']:.2f}")
print(f"- Average Episode Length: {result['average_episode_length']:.2f} ± {result['episode_length_std']:.2f}")
print(f"- Cumulative Reward: {result['cumulative_reward']}")

# Save final model if agent has save_model method
if hasattr(agent, 'save_model'):
    agent.save_model()

Using device: cpu
Environment: idsgame-random_attack-v8
State dimension: 33
Action dimension: 30
Starting training for 20000 episodes...
[Ep 1/20000] AvgReward (last 1): -1.00 | Total: -1.00 | Epsilon: 1.000 | Steps: 2 | Time: 0.0s
[Ep 500/20000] AvgReward (last 500): -0.10 | Total: 0.17 | Epsilon: 1.000 | Steps: 9 | Time: 0.6s
[Ep 1000/20000] AvgReward (last 500): -0.06 | Total: -1.00 | Epsilon: 1.000 | Steps: 2 | Time: 1.2s
[Ep 1500/20000] AvgReward (last 500): -0.08 | Total: -1.00 | Epsilon: 1.000 | Steps: 4 | Time: 1.8s
[Ep 2000/20000] AvgReward (last 500): -0.12 | Total: -1.00 | Epsilon: 1.000 | Steps: 6 | Time: 2.4s
[Ep 2500/20000] AvgReward (last 500): -0.08 | Total: 1.00 | Epsilon: 1.000 | Steps: 3 | Time: 3.0s
[Ep 3000/20000] AvgReward (last 500): -0.00 | Total: -1.00 | Epsilon: 1.000 | Steps: 8 | Time: 3.9s
[Ep 3500/20000] AvgReward (last 500): -0.04 | Total: -1.00 | Epsilon: 1.000 | Steps: 9 | Time: 4.5s
[Ep 4000/20000] AvgReward (last 500): 0.03 | Total: 1.00 | Epsilon: 1.0

2025-04-11 01:04:34,713 - DDQN_Agent - INFO - Model saved to: ./results/ddqn_2025-04-11_01-04-10/model
2025-04-11 01:04:34,713 - DDQN_Agent - INFO - Model saved to: ./results/ddqn_2025-04-11_01-04-10/model
2025-04-11 01:04:34,713 - DDQN_Agent - INFO - Model saved to: ./results/ddqn_2025-04-11_01-04-10/model
2025-04-11 01:04:34,713 - DDQN_Agent - INFO - Model saved to: ./results/ddqn_2025-04-11_01-04-10/model
2025-04-11 01:04:34,713 - DDQN_Agent - INFO - Model saved to: ./results/ddqn_2025-04-11_01-04-10/model
2025-04-11 01:04:34,713 - DDQN_Agent - INFO - Model saved to: ./results/ddqn_2025-04-11_01-04-10/model
2025-04-11 01:04:34,713 - DDQN_Agent - INFO - Model saved to: ./results/ddqn_2025-04-11_01-04-10/model
2025-04-11 01:04:34,713 - DDQN_Agent - INFO - Model saved to: ./results/ddqn_2025-04-11_01-04-10/model
2025-04-11 01:04:34,713 - DDQN_Agent - INFO - Model saved to: ./results/ddqn_2025-04-11_01-04-10/model
2025-04-11 01:04:34,713 - DDQN_Agent - INFO - Model saved to: ./results/d

[Ep 20000/20000] AvgReward (last 500): -0.08 | Total: 1.00 | Epsilon: 1.000 | Steps: 2 | Time: 24.2s

Training completed in 24.2 seconds

📊 Final DDQN Training Performance:
idsgame-random_attack-v8
Results:  {'average_reward': -0.04738668110484288, 'reward_std': 1.4632312443832924, 'max_reward': 4.0, 'min_reward': -1.0, 'average_episode_length': 4.6359, 'episode_length_std': 3.5468621611221374, 'max_episode_length': 27, 'min_episode_length': 1, 'average_hack_probability': 0.0, 'hack_probability_std': 0.0, 'max_hack_probability': 0.0, 'min_hack_probability': 0.0, 'cumulative_reward': -947}
- Average Reward: -0.05 ± 1.46
- Max-Min Reward: 4.00 - -1.00
- Average Episode Length: 4.64 ± 3.55
- Cumulative Reward: -947


In [40]:
sample_obs = env.reset()[0] if isinstance(env.reset(), tuple) else env.reset()
state_dim = np.array(sample_obs).flatten().shape[0]
action_dim = env.attacker_action_space.n

print(f"Environment: {env_name}")
print(f"State dimension: {state_dim}")
print(f"Action dimension: {action_dim}")

agent = DDQNAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    buffer_capacity=10000,     # Replay buffer size
    gamma=0.99,                # Discount factor
    lr=1e-3,                   # Learning rate
    batch_size=64,             # Training batch size
    epsilon_start=1.0,         # Initial exploration rate
    epsilon_end=0.01,          # Final exploration rate
    epsilon_decay=0.995,       # Exploration decay rate
    update_target_every=10,    # Target network update frequency
    device=device
)

# Training parameters
num_episodes = 10000
max_steps = 100
log_frequency = 500

# For tracking progress
reward_history = []
loss_history = []
episode_lengths = []
start_time = time.time()

print(f"Starting training for {num_episodes} episodes...")

# Training loop
for ep in range(num_episodes):
    # Reset environment
    state = extract_attacker_obs(env.reset())
    total_reward = 0
    episode_loss = 0
    steps = 0
    done = False
    
    # Episode loop
    for t in range(max_steps):
        # Select attacker action
        action = agent.select_action(state)
        
        # Select random defender action
        defense_action = env.defender_action_space.sample()
        
        # Combine actions and take step
        full_action = (action, defense_action)
        next_obs, reward, terminated, truncated, info = env.step(full_action)
        done = terminated or truncated
        
        # Extract attacker reward
        attacker_reward = reward[0] if isinstance(reward, tuple) else reward
        next_state = extract_attacker_obs(next_obs)
        
        # Store transition in replay buffer
        agent.store(state, action, attacker_reward, next_state, done)
        
        # Update agent
        loss = agent.update()
        if loss:
            episode_loss += loss
        
        # Update state and metrics
        state = next_state
        total_reward += attacker_reward
        steps += 1
        
        if done:
            break
            
    # Store episode metrics
    reward_history.append(total_reward)
    loss_history.append(episode_loss / max(1, steps))
    episode_lengths.append(steps)
    
    # Log progress
    if (ep + 1) % log_frequency == 0 or ep == 0:
        avg_reward = np.mean(reward_history[-min(log_frequency, len(reward_history)):]) 
        avg_loss = np.mean(loss_history[-min(log_frequency, len(loss_history)):])/max(1, min(log_frequency, len(loss_history)))
        elapsed_time = time.time() - start_time
        print(f"[Ep {ep+1}/{num_episodes}] AvgReward (last {min(log_frequency, len(reward_history))}): {avg_reward:.2f} | "
              f"Total: {total_reward:.2f} | Epsilon: {agent.epsilon:.3f} | Loss: {avg_loss:.4f} | "
              f"Steps: {steps} | Time: {elapsed_time:.1f}s")

training_time = time.time() - start_time
print(f"\nTraining completed in {training_time:.1f} seconds")

# Calculate evaluation metrics
result = {
    "average_reward": np.mean(reward_history),
    "reward_std": np.std(reward_history),
    "max_reward": np.max(reward_history),
    "min_reward": np.min(reward_history),
    "average_episode_length": np.mean(episode_lengths),
    "episode_length_std": np.std(episode_lengths),
    "max_episode_length": np.max(episode_lengths),
    "min_episode_length": np.min(episode_lengths),
    "average_hack_probability": 0.0,
    "hack_probability_std": 0.0,
    "max_hack_probability": 0.0,
    "min_hack_probability": 0.0,
    "cumulative_reward": int(np.sum(reward_history)),
}

# Print evaluation summary
print("\n📊 Final DDQN Training Performance:")
print('Results: ', result)
print(f"- Average Reward: {result['average_reward']:.2f} ± {result['reward_std']:.2f}")
print(f"- Max-Min Reward: {result['max_reward']:.2f} - {result['min_reward']:.2f}")
print(f"- Average Episode Length: {result['average_episode_length']:.2f} ± {result['episode_length_std']:.2f}")
print(f"- Cumulative Reward: {result['cumulative_reward']}")

# Save the trained agent
save_path = "./ddqn_model.pt"
agent.save(save_path)

Environment: idsgame-random_defense-v0
State dimension: 33
Action dimension: 30


TypeError: DDQNAgent.__init__() got an unexpected keyword argument 'state_dim'

In [43]:
# Import DDQN agent and necessary libraries
import numpy as np
import torch
import gym
import gym_idsgame
import time
# from DDQN import DDQNAgent, extract_attacker_obs, set_random_seeds

# Set random seed
random_seeds = 42

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# # Create environment
# env_name = "idsgame-minimal_defense-v2"
# env = gym.make(env_name)

# Get dimensions
sample_obs = env.reset()[0] if isinstance(env.reset(), tuple) else env.reset()
state_dim = np.array(sample_obs).flatten().shape[0]
action_dim = env.attacker_action_space.n

print(f"Environment: {env_name}")
print(f"State dimension: {state_dim}")
print(f"Action dimension: {action_dim}")

# Create DDQN agent - adjusting to match your implementation
agent = DDQNAgent(
    expected_state_dim=state_dim,  # Assuming this is the parameter name in your code
    action_dim=action_dim,
    buffer_capacity=10000,     # Replay buffer size
    gamma=0.99,                # Discount factor
    lr=1e-3,                   # Learning rate
    batch_size=64,             # Training batch size
    epsilon_start=1.0,         # Initial exploration rate
    epsilon_end=0.01,          # Final exploration rate
    epsilon_decay=0.995,       # Exploration decay rate
    device=device
)

# Training parameters
num_episodes = 10000
max_steps = 100
log_frequency = 500

# For tracking progress
reward_history = []
loss_history = []
episode_lengths = []
start_time = time.time()

print(f"Starting training for {num_episodes} episodes...")

# Training loop
for ep in range(num_episodes):
    # Reset environment
    state = extract_attacker_obs(env.reset())
    total_reward = 0
    episode_loss = 0
    steps = 0
    done = False
    
    # Episode loop
    for t in range(max_steps):
        # Select attacker action
        action = agent.select_action(state)
        
        # Select random defender action
        defense_action = env.defender_action_space.sample()
        
        # Combine actions and take step
        full_action = (action, defense_action)
        next_obs, reward, terminated, truncated, info = env.step(full_action)
        done = terminated or truncated
        
        # Extract attacker reward
        attacker_reward = reward[0] if isinstance(reward, tuple) else reward
        next_state = extract_attacker_obs(next_obs)
        
        # Store transition in replay buffer
        agent.store(state, action, attacker_reward, next_state, done)
        
        # Update agent
        loss = agent.update()
        if loss:
            episode_loss += loss
        
        # Update state and metrics
        state = next_state
        total_reward += attacker_reward
        steps += 1
        
        if done:
            break
            
    # Store episode metrics
    reward_history.append(total_reward)
    loss_history.append(episode_loss / max(1, steps))
    episode_lengths.append(steps)
    
    # Log progress
    if (ep + 1) % log_frequency == 0 or ep == 0:
        avg_reward = np.mean(reward_history[-min(log_frequency, len(reward_history)):]) 
        avg_loss = np.mean(loss_history[-min(log_frequency, len(loss_history)):])
        elapsed_time = time.time() - start_time
        print(f"[Ep {ep+1}/{num_episodes}] AvgReward (last {min(log_frequency, len(reward_history))}): {avg_reward:.2f} | "
              f"Total: {total_reward:.2f} | Epsilon: {agent.epsilon:.3f} | Loss: {avg_loss:.4f} | "
              f"Steps: {steps} | Time: {elapsed_time:.1f}s")

training_time = time.time() - start_time
print(f"\nTraining completed in {training_time:.1f} seconds")

# Calculate evaluation metrics
result = {
    "average_reward": np.mean(reward_history),
    "reward_std": np.std(reward_history),
    "max_reward": np.max(reward_history),
    "min_reward": np.min(reward_history),
    "average_episode_length": np.mean(episode_lengths),
    "episode_length_std": np.std(episode_lengths),
    "max_episode_length": np.max(episode_lengths),
    "min_episode_length": np.min(episode_lengths),
    "average_hack_probability": 0.0,
    "hack_probability_std": 0.0,
    "max_hack_probability": 0.0,
    "min_hack_probability": 0.0,
    "cumulative_reward": int(np.sum(reward_history)),
}

# Print evaluation summary
print("\n📊 Final DDQN Training Performance:")
print('Results: ', result)
print(f"- Average Reward: {result['average_reward']:.2f} ± {result['reward_std']:.2f}")
print(f"- Max-Min Reward: {result['max_reward']:.2f} - {result['min_reward']:.2f}")
print(f"- Average Episode Length: {result['average_episode_length']:.2f} ± {result['episode_length_std']:.2f}")
print(f"- Cumulative Reward: {result['cumulative_reward']}")

# Save the trained agent if it has a save method
if hasattr(agent, 'save'):
    save_path = "./ddqn_model.pt"
    agent.save(save_path)

Using device: cpu
Environment: idsgame-random_defense-v0
State dimension: 33
Action dimension: 30


TypeError: DDQNAgent.__init__() got an unexpected keyword argument 'expected_state_dim'

In [36]:
num_episodes = 10000
max_steps = 100
log_frequency = 500

# For tracking progress
reward_history = []
loss_history = []
episode_lengths = []
start_time = time.time()

print(f"Starting training for {num_episodes} episodes...")
for ep in range(num_episodes):
    # Reset environment
    state = extract_attacker_obs(env.reset())
    total_reward = 0
    episode_loss = 0
    steps = 0
    done = False
    
    # Episode loop
    for t in range(max_steps):
        # Select attacker action
        action = agent.select_action(state)
        
        # Select random defender action
        defense_action = env.defender_action_space.sample()
        
        # Combine actions and take step
        full_action = (action, defense_action)
        next_obs, reward, terminated, truncated, info = env.step(full_action)
        done = terminated or truncated
        
        # Extract attacker reward
        attacker_reward = reward[0] if isinstance(reward, tuple) else reward
        next_state = extract_attacker_obs(next_obs)
        
        # Store transition in replay buffer
        agent.store(state, action, attacker_reward, next_state, done)
        
        # Update agent
        loss = agent.update()
        if loss:
            episode_loss += loss
        
        # Update state and metrics
        state = next_state
        total_reward += attacker_reward
        steps += 1
        
        if done:
            break
            
    # Store episode metrics
    reward_history.append(total_reward)
    loss_history.append(episode_loss / max(1, steps))
    episode_lengths.append(steps)
    
    # Log progress
    if (ep + 1) % log_frequency == 0 or ep == 0:
        avg_reward = np.mean(reward_history[-min(log_frequency, len(reward_history)):]) 
        avg_loss = np.mean(loss_history[-min(log_frequency, len(loss_history)):])/max(1, min(log_frequency, len(loss_history)))
        elapsed_time = time.time() - start_time
        print(f"[Ep {ep+1}/{num_episodes}] AvgReward (last {min(log_frequency, len(reward_history))}): {avg_reward:.2f} | "
              f"Total: {total_reward:.2f} | Epsilon: {agent.epsilon:.3f} | Loss: {avg_loss:.4f} | "
              f"Steps: {steps} | Time: {elapsed_time:.1f}s")

training_time = time.time() - start_time
print(f"\nTraining completed in {training_time:.1f} seconds")

# Calculate evaluation metrics
result = {
    "average_reward": np.mean(reward_history),
    "reward_std": np.std(reward_history),
    "max_reward": np.max(reward_history),
    "min_reward": np.min(reward_history),
    "average_episode_length": np.mean(episode_lengths),
    "episode_length_std": np.std(episode_lengths),
    "max_episode_length": np.max(episode_lengths),
    "min_episode_length": np.min(episode_lengths),
    "average_hack_probability": 0.0,
    "hack_probability_std": 0.0,
    "max_hack_probability": 0.0,
    "min_hack_probability": 0.0,
    "cumulative_reward": int(np.sum(reward_history)),
}

# Print evaluation summary
print("\n📊 Final DDQN Training Performance:")
print('Results: ', result)
print(f"- Average Reward: {result['average_reward']:.2f} ± {result['reward_std']:.2f}")
print(f"- Max-Min Reward: {result['max_reward']:.2f} - {result['min_reward']:.2f}")
print(f"- Average Episode Length: {result['average_episode_length']:.2f} ± {result['episode_length_std']:.2f}")
print(f"- Cumulative Reward: {result['cumulative_reward']}")

# Save the trained agent
save_path = "./ddqn_model.pt"
agent.save(save_path)RetryClaude can make mistakes. Please double-check responses.

SyntaxError: invalid syntax (1082262925.py, line 97)

In [34]:
# Train the agent
print("Starting training...")
start_time = time.time()

# Train for specified number of episodes
train_result = agent.train()

# Calculate training time
training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

2025-04-11 00:31:11,464 - DDQN_Agent - INFO - Starting training


Starting training...


TypeError: cannot unpack non-iterable int object

In [None]:
# Final evaluation with more episodes
print("Performing final evaluation...")
agent.config.eval_episodes = 100  # Increase number of evaluation episodes
eval_result = agent.eval_model()

# Calculate average metrics
avg_reward = sum(eval_result.episode_rewards) / len(eval_result.episode_rewards)
avg_steps = sum(eval_result.episode_steps) / len(eval_result.episode_steps)

print(f"Final Evaluation Results:")
print(f"Average Reward: {avg_reward:.2f}")
print(f"Average Steps: {avg_steps:.2f}")

In [None]:
# Plot training rewards
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(agent.train_result.episode_rewards)
plt.title('Episode Rewards')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.grid(True)

# Plot running average of rewards
plt.subplot(1, 2, 2)
window_size = 100
running_avg = [np.mean(agent.train_result.episode_rewards[max(0, i-window_size):i+1]) 
               for i in range(len(agent.train_result.episode_rewards))]
plt.plot(running_avg)
plt.title(f'Running Average Reward (Window Size: {window_size})')
plt.xlabel('Episode')
plt.ylabel('Average Reward')
plt.grid(True)

plt.tight_layout()
plt.savefig(os.path.join(result_dir, 'training_rewards.png'))
plt.show()

In [None]:
# Plot action distribution during training
plt.figure(figsize=(12, 6))

# Get action counts
actions = list(agent.train_result.action_counts.keys())
counts = list(agent.train_result.action_counts.values())

# Sort by action index
action_counts = sorted(zip(actions, counts))
actions, counts = zip(*action_counts)

# Plot action distribution
plt.bar(actions, counts)
plt.title('Action Distribution During Training')
plt.xlabel('Action')
plt.ylabel('Count')
plt.grid(True, axis='y')
plt.savefig(os.path.join(result_dir, 'action_distribution.png'))
plt.show()