In [1]:
# Set GPU runtime first: Runtime > Change Runtime Type > GPU

# Check GPU
!nvidia-smi

Sat Jul  5 17:00:50 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   58C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Install specific compatible versions
!pip install gymnasium==0.29.1
!pip install ale-py==0.9.0
!pip install stable-baselines3==2.3.2
!pip install autorom[accept-rom-license]==0.6.1

Collecting gymnasium==0.29.1
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gymnasium
  Attempting uninstall: gymnasium
    Found existing installation: gymnasium 1.2.0
    Uninstalling gymnasium-1.2.0:
      Successfully uninstalled gymnasium-1.2.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dopamine-rl 4.1.2 requires gymnasium>=1.0.0, but you have gymnasium 0.29.1 which is incompatible.[0m[31m
[0mSuccessfully installed gymnasium-0.29.1
Collecting ale-py==0.9.0
  Downloading ale_py-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Downloading ale_py-0.9.0-cp311-cp311-manylinux_2_17_x86_64.

In [3]:
# Install ROMs with proper acceptance
!AutoROM --accept-license --install-dir /usr/local/lib/python3.11/dist-packages/ale_py/roms

AutoROM will download the Atari 2600 ROMs.
They will be installed to:
	/usr/local/lib/python3.11/dist-packages/ale_py/roms

Existing ROMs will be overwritten.


In [4]:
# Test basic imports first
try:
    import gymnasium as gym
    print(f"✅ Gymnasium {gym.__version__} imported successfully")
except Exception as e:
    print(f"❌ Gymnasium import failed: {e}")

try:
    import ale_py
    print(f"✅ ALE-py {ale_py.__version__} imported successfully")
except Exception as e:
    print(f"❌ ALE-py import failed: {e}")

try:
    import stable_baselines3
    print(f"✅ Stable-Baselines3 {stable_baselines3.__version__} imported successfully")
except Exception as e:
    print(f"❌ Stable-Baselines3 import failed: {e}")

✅ Gymnasium 0.29.1 imported successfully
✅ ALE-py 0.9.0 imported successfully
✅ Stable-Baselines3 2.3.2 imported successfully


In [5]:
import gymnasium as gym
import ale_py

print("🔍 Testing Asteroids with compatible versions...")

try:
    # Register ALE environments
    gym.register_envs(ale_py)
    print("✅ ALE environments registered")

    # Check available environments
    all_envs = list(gym.envs.registry.keys())
    asteroid_envs = [env for env in all_envs if 'asteroid' in env.lower()]
    print(f"Asteroid environments found: {asteroid_envs}")

    # Try to create Asteroids environment
    if asteroid_envs:
        env_name = asteroid_envs[0]
        env = gym.make(env_name)
        print(f"✅ Successfully created: {env_name}")

        obs, info = env.reset()
        print(f"✅ Environment reset successful")
        print(f"  Observation shape: {obs.shape}")
        print(f"  Action space: {env.action_space}")

        # Test one step
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        print(f"✅ Environment step successful")

        env.close()
        working_env = env_name

    else:
        # Try direct ALE approach
        ale = ale_py.ALEInterface()
        available_roms = ale.getAvailableRoms()
        print(f"Available ROMs: {available_roms}")

        if 'asteroids' in available_roms:
            ale.loadROM('asteroids')
            print("✅ Direct ALE Asteroids loading successful")
            working_env = 'direct_ale'
        else:
            working_env = None

except Exception as e:
    print(f"❌ Environment test failed: {e}")
    working_env = None

print(f"\n🎯 Working environment: {working_env}")

🔍 Testing Asteroids with compatible versions...
✅ ALE environments registered
Asteroid environments found: ['Asteroids-v0', 'AsteroidsDeterministic-v0', 'AsteroidsNoFrameskip-v0', 'Asteroids-v4', 'AsteroidsDeterministic-v4', 'AsteroidsNoFrameskip-v4', 'Asteroids-ram-v0', 'Asteroids-ramDeterministic-v0', 'Asteroids-ramNoFrameskip-v0', 'Asteroids-ram-v4', 'Asteroids-ramDeterministic-v4', 'Asteroids-ramNoFrameskip-v4', 'ALE/Asteroids-v5', 'ALE/Asteroids-ram-v5']
✅ Successfully created: Asteroids-v0
✅ Environment reset successful
  Observation shape: (210, 160, 3)
  Action space: Discrete(14)
✅ Environment step successful

🎯 Working environment: Asteroids-v0


  logger.deprecation(


## DQN Code for Asteroids

In [None]:
import os
import random
import time
from dataclasses import dataclass
from typing import Optional

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from stable_baselines3.common.atari_wrappers import (
    ClipRewardEnv,
    EpisodicLifeEnv,
    FireResetEnv,
    MaxAndSkipEnv,
    NoopResetEnv,
)
from stable_baselines3.common.buffers import ReplayBuffer
from torch.utils.tensorboard import SummaryWriter


@dataclass
class Args:
    """Hyperparameters for Custom Asteroids DQN"""
    exp_name: str = "custom_asteroids_dqn"
    seed: int = 1
    torch_deterministic: bool = True
    cuda: bool = True

    # Environment - use the working Asteroids environment!
    env_id: str = "AsteroidsNoFrameskip-v4"  # Use NoFrameskip version for training

    # Training parameters
    total_timesteps: int = 1000000  # 1M timesteps (~90 minutes)
    learning_rate: float = 1e-4
    buffer_size: int = 100000
    gamma: float = 0.99
    target_network_frequency: int = 1000
    batch_size: int = 32
    start_e: float = 1.0
    end_e: float = 0.01
    exploration_fraction: float = 0.10
    learning_starts: int = 100000
    train_frequency: int = 4


class CustomQNetwork(nn.Module):
    """Custom CNN for Asteroids - you can modify this architecture!"""

    def __init__(self, env, hidden_size=512):
        super().__init__()
        self.env = env

        # CNN Feature Extractor - CUSTOMIZE THIS!
        self.network = nn.Sequential(
            # First conv layer: detect basic shapes
            nn.Conv2d(4, 32, 8, stride=4),
            nn.ReLU(),

            # Second conv layer: detect movement patterns
            nn.Conv2d(32, 64, 4, stride=2),
            nn.ReLU(),

            # Third conv layer: complex spatial relationships
            nn.Conv2d(64, 64, 3, stride=1),
            nn.ReLU(),

            nn.Flatten(),
        )

        # Calculate conv output size
        with torch.no_grad():
            sample_input = torch.zeros(1, 4, 84, 84)
            conv_output_size = self.network(sample_input).shape[1]

        # Value head - CUSTOMIZE THIS!
        self.value_head = nn.Sequential(
            nn.Linear(conv_output_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Linear(hidden_size // 2, env.single_action_space.n)
        )

    def forward(self, x):
        features = self.network(x / 255.0)  # Normalize pixels
        return self.value_head(features)


def make_env(env_id, seed, idx, capture_video, run_name):
    """Create Asteroids environment with proper wrappers"""
    def thunk():
        if capture_video and idx == 0:
            env = gym.make(env_id, render_mode="rgb_array")
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        else:
            env = gym.make(env_id)

        print(f"✅ Successfully created environment: {env_id}")

        env = gym.wrappers.RecordEpisodeStatistics(env)
        env = NoopResetEnv(env, noop_max=30)
        env = MaxAndSkipEnv(env, skip=4)
        env = EpisodicLifeEnv(env)

        if "FIRE" in env.unwrapped.get_action_meanings():
            env = FireResetEnv(env)

        env = ClipRewardEnv(env)
        env = gym.wrappers.ResizeObservation(env, (84, 84))
        env = gym.wrappers.GrayScaleObservation(env)
        env = gym.wrappers.FrameStack(env, 4)

        env.action_space.seed(seed)
        return env
    return thunk


def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
    """Linear decay for exploration"""
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)


def train_asteroids_dqn():
    """Main training function"""
    args = Args()

    # Setup
    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
    writer = SummaryWriter(f"runs/{run_name}")

    # Seeding
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Environment
    envs = gym.vector.SyncVectorEnv([
        make_env(args.env_id, args.seed, 0, True, run_name)
    ])

    # Networks
    q_network = CustomQNetwork(envs, hidden_size=512).to(device)
    optimizer = optim.Adam(q_network.parameters(), lr=args.learning_rate)
    target_network = CustomQNetwork(envs, hidden_size=512).to(device)
    target_network.load_state_dict(q_network.state_dict())

    # Experience Replay
    rb = ReplayBuffer(
        args.buffer_size,
        envs.single_observation_space,
        envs.single_action_space,
        device,
        handle_timeout_termination=False,
    )

    # Training Loop
    obs, _ = envs.reset(seed=args.seed)
    episode_rewards = []
    episode_lengths = []

    for global_step in range(args.total_timesteps):
        # Exploration rate
        epsilon = linear_schedule(
            args.start_e, args.end_e,
            args.exploration_fraction * args.total_timesteps,
            global_step
        )

        # Action selection
        if random.random() < epsilon:
            actions = np.array([envs.single_action_space.sample()])
        else:
            q_values = q_network(torch.Tensor(obs).to(device))
            actions = torch.argmax(q_values, dim=1).cpu().numpy()

        # Environment step
        next_obs, rewards, terminations, truncations, infos = envs.step(actions)

        # Track metrics
        if "final_info" in infos:
            for info in infos["final_info"]:
                if info and "episode" in info:
                    episode_rewards.append(info["episode"]["r"])
                    episode_lengths.append(info["episode"]["l"])
                    writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                    writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
                    print(f"Step {global_step}, Episode reward: {info['episode']['r']}")

        # Store experience
        real_next_obs = next_obs.copy()
        for idx, d in enumerate(terminations):
            if d:
                real_next_obs[idx] = infos["final_observation"][idx]

        rb.add(obs, real_next_obs, actions, rewards, terminations, infos)
        obs = next_obs

        # Training
        if global_step > args.learning_starts:
            if global_step % args.train_frequency == 0:
                data = rb.sample(args.batch_size)

                with torch.no_grad():
                    target_max, _ = target_network(data.next_observations).max(dim=1)
                    td_target = data.rewards.flatten() + args.gamma * target_max * (1 - data.dones.flatten())

                old_val = q_network(data.observations).gather(1, data.actions).squeeze()
                loss = F.mse_loss(td_target, old_val)

                if global_step % 1000 == 0:
                    writer.add_scalar("losses/td_loss", loss, global_step)
                    writer.add_scalar("losses/q_values", old_val.mean().item(), global_step)
                    writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

                # Optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Update target network
        if global_step % args.target_network_frequency == 0:
            target_network.load_state_dict(q_network.state_dict())

    # Save model
    model_path = f"models/{run_name}.pt"
    os.makedirs("models", exist_ok=True)
    torch.save(q_network.state_dict(), model_path)

    envs.close()
    writer.close()

    return q_network, episode_rewards, model_path


def evaluate_model(model_path, num_episodes=10):
    """Evaluate trained model on Asteroids"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create environment
    env = gym.vector.SyncVectorEnv([make_env("AsteroidsNoFrameskip-v4", 42, 0, False, "eval")])
    q_network = CustomQNetwork(env).to(device)
    q_network.load_state_dict(torch.load(model_path))
    q_network.eval()

    print(f"✅ Evaluating on: AsteroidsNoFrameskip-v4")

    # Evaluate
    obs, _ = env.reset()
    episode_rewards = []
    episode_lengths = []
    current_reward = 0
    current_length = 0

    for step in range(50000):  # Max steps for evaluation
        with torch.no_grad():
            q_values = q_network(torch.Tensor(obs).to(device))
            actions = torch.argmax(q_values, dim=1).cpu().numpy()

        obs, rewards, terminations, truncations, infos = env.step(actions)
        current_reward += rewards[0]
        current_length += 1

        if terminations[0] or truncations[0]:
            episode_rewards.append(current_reward)
            episode_lengths.append(current_length)
            print(f"Episode {len(episode_rewards)}: {current_reward} points, {current_length} steps")

            current_reward = 0
            current_length = 0
            obs, _ = env.reset()

            if len(episode_rewards) >= num_episodes:
                break

    env.close()

    # Results
    avg_reward = np.mean(episode_rewards)
    avg_length = np.mean(episode_lengths)

    print(f"\n📊 Evaluation Results:")
    print(f"Average Score: {avg_reward:.1f}")
    print(f"Average Length: {avg_length:.0f}")
    print(f"Best Score: {max(episode_rewards)}")
    print(f"Episodes: {len(episode_rewards)}")

    return episode_rewards, episode_lengths


if __name__ == "__main__":
    start_time = time.time()
    print("🚀 Starting Custom Asteroids DQN Training...")

    # Train the model
    model, rewards, model_path = train_asteroids_dqn()

    print(f"✅ Training completed! Model saved to: {model_path}")
    print(f"Training time: {(time.time() - start_time)/60:.1f} minutes")

    # Evaluate the model
    print("\n🎯 Evaluating trained model...")
    eval_rewards, eval_lengths = evaluate_model(model_path)

🚀 Starting Custom Asteroids DQN Training...
✅ Successfully created environment: AsteroidsNoFrameskip-v4


  real_next_obs[idx] = infos["final_observation"][idx]
  """


Moviepy - Building video /content/videos/AsteroidsNoFrameskip-v4__custom_asteroids_dqn__1__1751735191/rl-video-episode-0.mp4.
Moviepy - Writing video /content/videos/AsteroidsNoFrameskip-v4__custom_asteroids_dqn__1__1751735191/rl-video-episode-0.mp4



  real_next_obs[idx] = infos["final_observation"][idx]


Moviepy - Done !
Moviepy - video ready /content/videos/AsteroidsNoFrameskip-v4__custom_asteroids_dqn__1__1751735191/rl-video-episode-0.mp4
Step 538, Episode reward: [480.]
Moviepy - Building video /content/videos/AsteroidsNoFrameskip-v4__custom_asteroids_dqn__1__1751735191/rl-video-episode-1.mp4.
Moviepy - Writing video /content/videos/AsteroidsNoFrameskip-v4__custom_asteroids_dqn__1__1751735191/rl-video-episode-1.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/AsteroidsNoFrameskip-v4__custom_asteroids_dqn__1__1751735191/rl-video-episode-1.mp4
Step 1287, Episode reward: [530.]
Step 2179, Episode reward: [680.]
Step 3627, Episode reward: [1180.]
Step 4253, Episode reward: [830.]
Step 4806, Episode reward: [610.]
Step 6743, Episode reward: [1300.]
Step 7237, Episode reward: [230.]
Moviepy - Building video /content/videos/AsteroidsNoFrameskip-v4__custom_asteroids_dqn__1__1751735191/rl-video-episode-8.mp4.
Moviepy - Writing video /content/videos/AsteroidsNoFrameskip-v4__custom_asteroids_dqn__1__1751735191/rl-video-episode-8.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/AsteroidsNoFrameskip-v4__custom_asteroids_dqn__1__1751735191/rl-video-episode-8.mp4
Step 7733, Episode reward: [180.]
Step 9281, Episode reward: [1490.]
