#### `Angle-Based Policy`

In [1]:
import gymnasium as gym

# Angle-based heuristic: push the cart toward the side the pole is leaning to.
# Runs a single rendered episode.
env = gym.make("CartPole-v1", render_mode="human")
try:
    state, _ = env.reset()

    done = False
    while not done:
        angle = state[2]  # Pole's angle
        action = 0 if angle < 0 else 1  # Move left if angle < 0, else right
        # step() reports failure (terminated) and time-limit cut-off (truncated)
        # separately; the episode is over as soon as either one is set.
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        # No explicit env.render(): with render_mode="human" the environment
        # draws itself on every step.
finally:
    # Always release the render window, even if the loop is interrupted.
    env.close()

#### `Position-Based Policy`

In [2]:
# Position-based heuristic: choose the push direction from the sign of the
# cart's position. Runs a single rendered episode.
env = gym.make("CartPole-v1", render_mode="human")
try:
    state, _ = env.reset()

    done = False
    while not done:
        position = state[0]  # Cart's position
        action = 0 if position < 0 else 1  # Move left if position < 0, else right
        # step() reports failure (terminated) and time-limit cut-off (truncated)
        # separately; the episode is over as soon as either one is set.
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        # No explicit env.render(): with render_mode="human" the environment
        # draws itself on every step.
finally:
    # Always release the render window, even if the loop is interrupted.
    env.close()

#### `Velocity-Based Policy`

In [3]:
# Velocity-based heuristic: choose the push direction from the sign of the
# cart's velocity. Runs a single rendered episode.
env = gym.make("CartPole-v1", render_mode="human")
try:
    state, _ = env.reset()

    done = False
    while not done:
        velocity = state[1]  # Cart's velocity
        action = 0 if velocity < 0 else 1  # Move left if velocity < 0, else right
        # step() reports failure (terminated) and time-limit cut-off (truncated)
        # separately; the episode is over as soon as either one is set.
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        # No explicit env.render(): with render_mode="human" the environment
        # draws itself on every step.
finally:
    # Always release the render window, even if the loop is interrupted.
    env.close()

#### `Combined Policy`

In [4]:
import random

# Combined heuristic: act deterministically only when the pole angle and the
# cart velocity agree in sign; otherwise fall back to a random action.
# Runs a single rendered episode.
env = gym.make("CartPole-v1", render_mode="human")
try:
    state, _ = env.reset()

    done = False
    while not done:
        angle, velocity = state[2], state[1]
        if angle < 0 and velocity < 0:
            action = 0  # Move left
        elif angle > 0 and velocity > 0:
            action = 1  # Move right
        else:
            action = random.choice([0, 1])  # Random action
        # step() reports failure (terminated) and time-limit cut-off (truncated)
        # separately; the episode is over as soon as either one is set.
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        # No explicit env.render(): with render_mode="human" the environment
        # draws itself on every step.
finally:
    # Always release the render window, even if the loop is interrupted.
    env.close()

#### `DQN Agent Implementation`

In [5]:
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

2025-01-02 10:08:30.597970: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-02 10:08:30.625548: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735792710.657824   48106 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735792710.667814   48106 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-02 10:08:30.704367: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [6]:
# Hyperparameters for the DQN run, named so they are easy to find and tune.
SEED = 42                 # fixed seed so repeated runs start from the same state
LEARNING_RATE = 0.001
BUFFER_SIZE = 50000       # replay buffer capacity (number of transitions)
TOTAL_TIMESTEPS = 10000   # training budget in environment steps

# Create the CartPole-v1 environment (no render_mode: training runs headless)
env = gym.make("CartPole-v1")

# Initialize the DQN model.
# NOTE(review): seeding makes runs repeatable on the same setup; bit-exact
# reproducibility on a CUDA device is not guaranteed — confirm if it matters.
model = DQN(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=LEARNING_RATE,
    buffer_size=BUFFER_SIZE,
    seed=SEED,
)

# Train the model for TOTAL_TIMESTEPS environment steps
model.learn(total_timesteps=TOTAL_TIMESTEPS)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 41.2     |
|    ep_rew_mean      | 41.2     |
|    exploration_rate | 0.843    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 256      |
|    time_elapsed     | 0        |
|    total_timesteps  | 165      |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.223    |
|    n_updates        | 16       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 37.9     |
|    ep_rew_mean      | 37.9     |
|    exploration_rate | 0.712    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 314      |
|    time_elapsed     | 0        |
|    total_timesteps  | 303      |
| train/              |       

<stable_baselines3.dqn.dqn.DQN at 0x78ff8ff57d40>

In [7]:
# Score the trained agent: run it for 10 evaluation episodes and report the
# mean and standard deviation of the per-episode reward.
# NOTE(review): the recorded output of this cell shows a mean reward of about 9,
# lower than the ~40 logged at the start of training — the training budget may
# be too small for the policy to converge; confirm before relying on this agent.
mean_reward, std_reward = evaluate_policy(
    model=model,
    env=env,
    n_eval_episodes=10,
)
print(f"Mean reward: {mean_reward}, Std: {std_reward}")

Mean reward: 9.2, Std: 0.7483314773547883




In [9]:
# Visualize the trained agent: replay 20 rendered episodes using the greedy
# (deterministic) action from the trained model.
env = gym.make("CartPole-v1", render_mode='human')
try:
    for episode in range(20):
        obs, _ = env.reset()  # Reset the environment; the info dict is not needed
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)  # Predict action
            # An episode ends on failure (terminated) OR on the time limit
            # (truncated). Checking only the first flag would keep stepping a
            # truncated episode, so a policy that never drops the pole would
            # never finish its episode.
            obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            # No explicit env.render(): with render_mode='human' the
            # environment draws itself on every step.
finally:
    # Always release the render window, even if playback is interrupted.
    env.close()