In [1]:
import gymnasium
import numpy as np
import pandas as pd
import torch
from stable_baselines3 import DQN
# import gym

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [2]:
# -------------------------------------------------------
# 1. Load Stable-Baselines3 Pretrained DQN LunarLander Model
# -------------------------------------------------------

# Path to the pretrained checkpoint, relative to the working directory.
# NOTE(review): the layout looks like an rl-baselines3-zoo checkout — confirm it
# is present before running. The checkpoint is named LunarLander-v2 while the
# rollout cell creates LunarLander-v3 — confirm the agent is still valid there.
model_path = "rl-baselines3-zoo/rl-trained-agents/dqn/LunarLander-v2_1/LunarLander-v2.zip"

# Load the model
# NOTE(review): the recorded output of this cell shows
# "'bytes' object cannot be interpreted as an integer" exceptions during loading;
# presumably some pickled attributes failed to deserialize — check whether
# DQN.load needs a `custom_objects` override for this checkpoint.
model = DQN.load(model_path)

# Retrieve the underlying PyTorch Q-network
policy = model.policy
q_net = policy.q_net   # This is a torch.nn.Module that maps state → Q-values

Exception: 'bytes' object cannot be interpreted as an integer
Exception: 'bytes' object cannot be interpreted as an integer
Exception: 'bytes' object cannot be interpreted as an integer


In [18]:
# -------------------------------------------------------
# 2. Run 10,000 Episodes and Record Every State
# -------------------------------------------------------
# For every visited state, store the observation together with the Q-values the
# pretrained network assigns to each action. Actions are epsilon-greedy: the
# greedy action of the model, replaced by a uniformly sampled action with
# probability EXPLORATION_RATE so that off-policy states are visited too.

SEED = 0                 # fixes all randomness so re-running the cell reproduces the dataset
EXPLORATION_RATE = 0.15  # probability of taking a random action instead of the greedy one
num_episodes = 10_000

env = gymnasium.make("LunarLander-v3")
env.action_space.seed(SEED)        # makes env.action_space.sample() reproducible
rng = np.random.default_rng(SEED)  # drives the explore/exploit coin flip

# Build input tensors on the device that holds the network weights; feeding a
# CPU tensor to a network living on another device raises a device-mismatch error.
q_net_device = next(q_net.parameters()).device

# Storage: each entry will be a dict
# {
#     "state": np.array([...]),
#     "q_values": np.array([...])  # shape: (num_actions,)
# }
records = []

# try/finally guarantees the environment is released even if the long loop
# raises or is interrupted from the keyboard.
try:
    for ep in range(num_episodes):
        # Seed only the first reset: this initialises the environment RNG once,
        # and later resets continue the same random stream.
        state, _ = env.reset(seed=SEED if ep == 0 else None)
        done = False

        while not done:
            # Convert state to a batch of one for the Q-net
            state_tensor = torch.tensor(
                state, dtype=torch.float32, device=q_net_device
            ).unsqueeze(0)

            # -------------------------------------------------------
            # 3. Compute Q-values for this state
            # -------------------------------------------------------
            with torch.no_grad():
                q_values = q_net(state_tensor).cpu().numpy().squeeze()

            # Store state and q-values (copies, so later steps cannot alias them)
            records.append({
                "state": state.copy(),
                "q_values": q_values.copy()
            })

            # Epsilon-greedy action selection; the greedy prediction is only
            # computed when it is actually going to be used.
            if rng.random() < EXPLORATION_RATE:
                action = env.action_space.sample()
            else:
                action, _ = model.predict(state, deterministic=True)

            # Step environment; the reward is not needed for the dataset
            next_state, _, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            state = next_state
finally:
    env.close()

In [9]:
# Report how many (state, Q-values) samples the rollout collected.
num_records = len(records)
print(num_records)

67165


In [19]:
# Print the first two samples as a sanity check of the record layout.
for idx in (0, 1):
    print(records[idx])

{'state': array([-0.00667114,  1.4150207 , -0.67573935,  0.18222623,  0.00773706,
        0.15306488,  0.        ,  0.        ], dtype=float32), 'q_values': array([74.40999, 73.49079, 71.75399, 74.92645], dtype=float32)}
{'state': array([-0.01327419,  1.418535  , -0.6662064 ,  0.15614559,  0.01357337,
        0.11673716,  0.        ,  0.        ], dtype=float32), 'q_values': array([74.35034, 73.48212, 71.96946, 74.8025 ], dtype=float32)}


In [20]:
# Collect every recorded observation into one 2-D array (one row per sample)
# and label its 8 components before writing them to disk.
states_arry = np.array([record["state"] for record in records])
states_df = pd.DataFrame(
    states_arry,
    columns=[
        "x", "y", "x'", "y'",
        "angle", "angular_velocity",
        "left_contact", "right_contact",
    ],
)
# The row index is written as the first CSV column (pandas default), so rows can
# be matched by position with the Q-value export.
states_df.to_csv("ll_states.csv")

In [21]:
# Despite the "rewards" naming, these are the Q-values recorded for each state:
# one row per sample, one column per action.
rewards_arry = np.array([record["q_values"] for record in records])
rewards_df = pd.DataFrame(
    rewards_arry,
    columns=["Nothing", "Left", "Main", "Right"],
)
rewards_df.to_csv("ll_outcomes.csv")