# Cartpole
`Planner.__init__` expects a reward and transition matrix P: a nested dictionary, in the style of
[OpenAI Gym](https://gymnasium.farama.org/) discrete environments, where
P[state][action] is a list of tuples (probability, next state, reward, terminal).

The gym cartpole environment does not include this matrix, so we'll use the CartpoleWrapper class to create a wrapped gym environment, which modifies the observation space and includes P.  

In [None]:
!pip install bettermdptools

In [1]:
import gymnasium as gym
from bettermdptools.envs.cartpole_wrapper import CartpoleWrapper
from bettermdptools.utils.test_env import TestEnv
from bettermdptools.algorithms.planner import Planner
from bettermdptools.algorithms.rl import RL
import numpy as np

In [None]:
# Build the stock gym CartPole environment and wrap it; the wrapped
# environment is what provides the transition model `cartpole.P` used below.
base_env = gym.make("CartPole-v1", render_mode=None)
cartpole = CartpoleWrapper(base_env)


def _evaluate(policy):
    """Test `policy` on the wrapped environment and print its mean score.

    Calls TestEnv.test_env with n_iters=100, rendering and user input
    disabled, prints the mean of the returned scores, and returns the
    scores unchanged.
    """
    scores = TestEnv.test_env(
        env=cartpole,
        n_iters=100,
        render=False,
        pi=policy,
        user_input=False,
    )
    print(np.mean(scores))
    return scores


# --- Value iteration: plan directly on the transition model ---
planner = Planner(cartpole.P)
V, V_track, pi = planner.value_iteration()
test_scores = _evaluate(pi)

# --- Q-learning: learn from the wrapped environment itself ---
# NOTE: `V`, `pi` and `test_scores` are rebound here, replacing the
# value-iteration results from above.
agent = RL(cartpole)
Q, V, pi, Q_track, pi_track, rewards = agent.q_learning()
test_scores = _evaluate(pi)



runtime = 41.46 seconds
9.4


runtime = 175.35 seconds
103.45
