In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym

SEED = 42
rng = np.random.default_rng(SEED)

In [2]:
env_id = "FrozenLake-v1"
env_kwargs = {"is_slippery": False}
env = gym.make(env_id, **env_kwargs)
state, info = env.reset(seed=SEED)

n_states = env.observation_space.n
n_actions = env.action_space.n

print(f"Environment: {env_id}")
print(f"States: {n_states}, Actions: {n_actions}")
print("Initial observation:", state)

Environment: FrozenLake-v1
States: 16, Actions: 4
Initial observation: 0


## Environment run
Run a handful of greedy rollouts and optionally render the grid to check the learned policy.

In [3]:
# 1. Initialize a container for your logs
transitions = []

# 3. Run for 100 steps
for step in range(100):
    # RANDOM POLICY: Pick a random action (0, 1, or 2 for MountainCar)
    # We ignore the state and just sample from the action space
    action = env.action_space.sample()
    
    # Step the environment
    # next_state: where we ended up
    # reward: what we got for this step
    # terminated: did we reach the goal?
    # truncated: did time run out?
    next_state, reward, terminated, truncated, info = env.step(action)
    
    # LOGGING: Save the "transition"
    transitions.append({
        "step": step,
        "state": state,
        "action": action,
        "reward": reward,
        "next_state": next_state,
        "terminated": terminated
    })
    
    # Update state for the next step
    state = next_state
    
    # If the episode ends, reset it so we can keep gathering 100 steps
    if terminated or truncated:
        state, info = env.reset()

# 4. View the first few logs to understand the data
print(f"Collected {len(transitions)} transitions.")
print("Sample transition:", transitions[0])

Collected 100 transitions.
Sample transition: {'step': 0, 'state': 0, 'action': np.int64(2), 'reward': 0, 'next_state': 1, 'terminated': False}


In [14]:
env.close()