In [12]:
import gymnasium as gym
env = gym.make('CartPole-v1')
print(env.action_space)
print(env.action_space.n)
print([env.action_space.sample() for _ in range(10)])
print(env.observation_space)
print(env.observation_space.shape)

Discrete(2)
2
[1, 1, 0, 1, 1, 0, 1, 0, 1, 1]
Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)
(4,)


In [13]:
print(env.reset(seed=100))
print(env.step(0))
print(env.step(1))

(array([ 0.03349816,  0.0096554 , -0.02111368, -0.04570484], dtype=float32), {})
(array([ 0.03369127, -0.18515752, -0.02202777,  0.24024247], dtype=float32), 1.0, False, False, {})
(array([ 0.02998812,  0.01027205, -0.01722292, -0.05930644], dtype=float32), 1.0, False, False, {})


In [14]:
class RandomAgent:
    def __init__(self):
        self.env = gym.make('CartPole-v1')
    def play(self, episodes=1):
        self.trewards = list()
        for e in range(episodes):
            self.env.reset()
            for step in range(1, 100):
                a = self.env.action_space.sample()
                state, reward, done, trunc, info = self.env.step(a)
                if done:
                    self.trewards.append(step)
                    break

ra = RandomAgent()
ra.play(15)
print(ra.trewards)
print(round(sum(ra.trewards) / len(ra.trewards), 2))


[18, 22, 19, 30, 26, 17, 59, 24, 23, 10, 20, 11, 19, 20, 17]
22.33


In [15]:
import os
import random
import warnings
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import deque
from keras.layers import Dense
from keras.models import Sequential
from keras import optimizers
from tensorflow.python.data.ops import range_op
import gymnasium as gym

print("TensorFlow version:", tf.__version__)
print("Eager execution enabled:", tf.executing_eagerly())

warnings.simplefilter('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['PYTHONHASHSEED'] = '0'

tf.compat.v1.enable_eager_execution()

opt = optimizers.Adam(learning_rate=0.0001)

random.seed(100)
tf.random.set_seed(100)

class DQLAgent:
    def __init__(self):
        self.epsilon = 1.0
        self.epsilon_decay = 0.9975
        self.epsilon_min = 0.1
        self.memory = deque(maxlen=2000)
        self.batch_size = 32
        self.gamma = 0.9
        self.trewards = list()
        self.max_treward = 0 
        self._create_model()
        self.env = gym.make('CartPole-v1')
    def _create_model(self):
        self.model = Sequential()
        self.model.add(Dense(24, activation='relu', input_dim=4))
        self.model.add(Dense(24, activation='relu'))
        self.model.add(Dense(2, activation='linear'))
        self.model.compile(loss='mse', optimizer=opt)

class DQLAgent(DQLAgent):
    def act(self, state):
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.model.predict(state)[0])
    def replay(self):
        batch = random.sample(self.memory, self.batch_size)
        for state, action, next_state, reward, done in batch:
            if not done:
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)[0])
                target = self.model.predict(state)
                target[0, action] = reward
                self.model.fit(state, target, epochs=2, verbose=False)
                if self.epsilon > self.epsilon_min:
                    self.epsilon *= self.epsilon_decay

class DQLAgent(DQLAgent):
    def learn(self, episodes):
        for e in range(1, episodes + 1):
            state, _ = self.env.reset()
            state = np.reshape(state, [1, 4])
            for f in range(1, 5000):
                action = self.act(state)
                next_state, reward, done, trunc, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, 4])
                self.memory.append([state, action, next_state, reward, done])
                state = next_state
                if done or trunc:
                    self.trewards.append(f)
                    self.max_treward = max(self.max_treward, f)
                    templ = f'episode={e:4d} | treward={f:4d}'
                    templ += f' | max={self.max_treward:4d}'
                    print(templ, end='\r')
                    break
            if len(self.memory) > self.batch_size:
                self.replay()
        print()

class DQLAgent(DQLAgent):
    def test(self, episodes):
        for e in range(1, episodes + 1):
            state, _ = self.env.reset()
            state = np.reshape(state, [1, 4])
            for f in range(1, 5001):
                state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
                action = np.argmax(self.model(state_tensor).numpy()[0])
                state, reward, done, trunc, _ = self.env.step(action)
                state = np.reshape(state, [1, 4])
                if done or trunc:
                    print(f, end=' ')
                    break

agent = DQLAgent()

TensorFlow version: 2.18.0
Eager execution enabled: True


In [17]:
print(agent.epsilon)

0.09997053357470892


In [16]:
agent.learn(500)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13m

In [18]:
reward = agent.test(15)
print(reward)

9 9 10 9 9 9 10 10 8 9 9 10 10 9 9 None
