<a href="https://colab.research.google.com/github/rumeshsmrr/reinforcement-learning-lunarlander/blob/main/DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install -y swig ffmpeg
!pip install gymnasium[box2d] stable-baselines3[extra]


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 34 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (1,552 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 126333 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1u

In [2]:
import os
from base64 import b64encode

import gymnasium as gym
from gymnasium.wrappers import RecordVideo
from IPython.display import HTML
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor


In [3]:
def display_video(video_path):
    """Build an inline HTML5 video player for an MP4 file.

    The file's bytes are base64-encoded into a ``data:`` URL, so the returned
    ``HTML`` object carries the whole video and needs no external file.

    Args:
        video_path: Path to the MP4 file to embed.

    Returns:
        An ``IPython.display.HTML`` object wrapping a ``<video>`` element, or
        ``None`` (after printing a message) when ``video_path`` does not exist.
    """
    if os.path.exists(video_path):
        with open(video_path, 'rb') as video_file:
            encoded = b64encode(video_file.read()).decode()
        src = "data:video/mp4;base64," + encoded
        markup = f"""
    <video width=600 controls>
        <source src="{src}" type="video/mp4">
    </video>
    """
        return HTML(markup)

    # Missing file: report it and fall through to an implicit None result.
    print(f"Video file {video_path} not found.")
    return None



In [4]:
# Training schedule
total_timesteps = 2_000_000  # overall training budget: 2 million timesteps
record_every_timesteps = 500_000  # train in chunks of 500,000 steps, recording a video after each

# Destination for the recorded episodes; created up front if it does not exist
video_folder = "./dqn_training_videos"
os.makedirs(video_folder, exist_ok=True)


In [5]:
# Fixed seed so that Restart & Run All repeats the same training run
# (GPU kernels may still introduce small nondeterminism).
SEED = 0

# Main environment for training
train_env = gym.make("LunarLander-v3")

# DQN agent with the default MLP policy. Passing `seed` lets Stable-Baselines3
# seed its random number generators instead of starting from an arbitrary state.
model = DQN("MlpPolicy", train_env, verbose=1, seed=SEED)


  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [6]:
# Training loop: alternate between one training chunk and one recorded
# evaluation episode, then show that episode's video inline.
for step in range(0, total_timesteps, record_every_timesteps):
    # Train for another chunk. reset_num_timesteps=False keeps the model's
    # timestep counter running across calls instead of restarting it at zero.
    model.learn(total_timesteps=record_every_timesteps, reset_num_timesteps=False)

    # Cumulative number of training steps completed so far; used to label the video.
    steps_done = step + record_every_timesteps
    video_prefix = f"dqn_step_{steps_done}"

    # Fresh rendering environment whose every episode is recorded to video_folder.
    eval_env = gym.make("LunarLander-v3", render_mode="rgb_array")
    eval_env = RecordVideo(
        eval_env,
        video_folder=video_folder,
        name_prefix=video_prefix,
        episode_trigger=lambda x: True,
    )

    try:
        # Play a single episode greedily (deterministic actions).
        obs, _ = eval_env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, _, terminated, truncated, _ = eval_env.step(action)
            done = terminated or truncated
    finally:
        # Always release the environment, even if the rollout raised.
        eval_env.close()

    # Pick the video recorded for THIS chunk. Sorting all file names
    # alphabetically does not work: "dqn_step_500000..." sorts after
    # "dqn_step_1000000...", so the first chunk's video would be shown every
    # time, and leftovers from earlier runs could be picked up as well.
    chunk_videos = [
        os.path.join(video_folder, name)
        for name in os.listdir(video_folder)
        if name.startswith(video_prefix) and name.endswith('.mp4')
    ]
    if chunk_videos:
        # Newest file wins in case several files share the prefix.
        latest_video = max(chunk_videos, key=os.path.getmtime)
        display(display_video(latest_video))
    else:
        print(f"No video with prefix {video_prefix} found in {video_folder}.")


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 118      |
|    ep_rew_mean      | -171     |
|    exploration_rate | 0.991    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 387      |
|    time_elapsed     | 1        |
|    total_timesteps  | 474      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 3.54     |
|    n_updates        | 93       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 102      |
|    ep_rew_mean      | -182     |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 533      |
|    time_elapsed     | 1        |
|    total_timesteps  | 813      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.66     |
|    n_updates      

  logger.warn(
  """


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 429      |
|    ep_rew_mean      | 81.4     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 892      |
|    fps              | 777      |
|    time_elapsed     | 2        |
|    total_timesteps  | 501847   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.445    |
|    n_updates        | 125436   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 430      |
|    ep_rew_mean      | 84.4     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 896      |
|    fps              | 706      |
|    time_elapsed     | 5        |
|    total_timesteps  | 503627   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.247    |
|    n_updates      

  logger.warn(


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 489      |
|    ep_rew_mean      | 229      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2024     |
|    fps              | 741      |
|    time_elapsed     | 1        |
|    total_timesteps  | 1001251  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.08     |
|    n_updates        | 250287   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 492      |
|    ep_rew_mean      | 228      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2028     |
|    fps              | 762      |
|    time_elapsed     | 4        |
|    total_timesteps  | 1003345  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.345    |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 582      |
|    ep_rew_mean      | 180      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2888     |
|    fps              | 678      |
|    time_elapsed     | 1        |
|    total_timesteps  | 1501200  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.131    |
|    n_updates        | 375274   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 560      |
|    ep_rew_mean      | 188      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 2892     |
|    fps              | 730      |
|    time_elapsed     | 3        |
|    total_timesteps  | 1502513  |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.139    |
|    n_updates      

In [7]:
# Evaluate on a fresh environment instead of reusing the raw training env.
# Wrapping it in Monitor lets evaluate_policy read the recorded episode
# returns; without it Stable-Baselines3 warns that results may be inaccurate.
final_eval_env = Monitor(gym.make("LunarLander-v3"))
try:
    mean_reward, std_reward = evaluate_policy(model, final_eval_env, n_eval_episodes=10)
finally:
    # Release the evaluation environment whether or not evaluation succeeded.
    final_eval_env.close()
print(f"Final DQN agent performance: {mean_reward:.2f} +/- {std_reward:.2f}")




Final DQN agent performance: 217.48 +/- 64.70
