In [39]:
%pip install numpy 
%pip install gym 
%pip install moviepy
%pip install tensorflow 
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.

Collecting moviepy
  Downloading moviepy-1.0.3.tar.gz (388 kB)
     ---------------------------------------- 0.0/388.3 kB ? eta -:--:--
     - -------------------------------------- 10.2/388.3 kB ? eta -:--:--
     -- ---------------------------------- 30.7/388.3 kB 660.6 kB/s eta 0:00:01
     -------------- ----------------------- 143.4/388.3 kB 1.4 MB/s eta 0:00:01
     -------------------------------------- 388.3/388.3 kB 2.7 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting decorator<5.0,>=4.0.2 (from moviepy)
  Downloading decorator-4.4.2-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting imageio<3.0,>=2.5 (from moviepy)
  Downloading imageio-2.34.1-py3-none-any.whl.metadata (4.9 kB)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy)
  Downloading imageio_ffmpeg-0.4.9-py3-none-win_amd64.whl.metadata (1.7 kB)
Collecting tqdm<5.0,>=4.11.2 (fr

In [1]:
import numpy as np
import gym
import setuptools.dist
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Input
from tensorflow.keras.optimizers import Adam
from matplotlib import pyplot as plt

In [2]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.n_actions = action_size

        self.lr = 0.001
        self.gamma = 0.99
        self.epsilon = 1
        self.epsilon_decay = 0.005

        self.batch_size = 32
        self.memory_buffer = list()
        self.max_memory_buffer = 2000

        self.model = Sequential([
            Input(shape=(state_size,)),
            Dense(units=24, activation='relu'),
            Dense(units=24, activation='relu'),
            Dense(units=action_size, activation='linear')
        ])

        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.lr))

    def compute_action(self, current_state):
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.choice(range(self.n_actions))
        else:
            q_values = self.model.predict(current_state)[0]
            return np.argmax(q_values)
        
    def update_epsilon(self):
        self.epsilon = self.epsilon * np.exp(-self.epsilon_decay)

    def store_episode(self, current_state, action, reward, next_state, done):
        self.memory_buffer.append({
            "current_state": current_state,
            "action": action,
            "reward": reward,
            "next_state": next_state,
            "done": done
        })

        if len(self.memory_buffer) > self.max_memory_buffer:
            self.memory_buffer.pop(0)

    def train(self):
        np.random.shuffle(self.memory_buffer)
        batch_sample = self.memory_buffer[0:self.batch_size]

        for experience in batch_sample:
            q_current_state = self.model.predict(experience["current_state"])[0]
            
            #Calculate Q by Bellman
            if not experience["done"]:
                q_target = experience["reward"] + self.gamma * np.max(self.model.predict(experience["next_state"])[0])
            else:
                q_target = experience["reward"]

            q_current_state[experience["action"]] = q_target

            # treinar o modelo
            self.model.fit(experience["current_state"], np.array([q_current_state]), verbose=0)

In [12]:
# Rodar o modelo
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
print(state_size)
action_size = env.action_space.n

n_episodes = 400
max_iteration_ep = 500

agent = DQNAgent(state_size, action_size)
total_steps = 0

4


In [13]:
for episode in range(n_episodes):
    print(f"Episódio {episode}...")
    current_state = env.reset()
    current_state = np.array([current_state[0]])

    for iteration in range(max_iteration_ep):
        total_steps += 1

        action = agent.compute_action(current_state)
        next_state, reward, done, _, _ = env.step(action)
        next_state = np.array([next_state])

        agent.store_episode(current_state, action, reward, next_state, done)

        if done:
            agent.update_epsilon()
            break

        current_state = next_state

    if total_steps >= agent.batch_size and total_steps % 5 == 0:
        agent.train()

Episódio 0...
Episódio 1...
Episódio 2...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


Episódio 3...
Episódio 4...
Episódio 5...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Episódio 6...
Episódio 7...
Episódio 8...
Episódio 9...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Episódio 10...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step

  if not isinstance(terminated, (bool, np.bool8)):


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18

In [16]:
def make_video():
    import time
    temp_env = gym.make('CartPole-v1', render_mode="rgb_array")
    env = gym.wrappers.RecordVideo(env=temp_env, video_folder='videos', name_prefix="test-video", episode_trigger=lambda x: x % 2 == 0)
    rewards = 0
    steps = 0
    done = False
    state = env.reset()
    state = np.array([state[0]])

    while not done:
        action = agent.compute_action(state)
        state, reward, done, _, _ = env.step(action)
        state = np.array([state])            
        steps += 1
        rewards += reward

    print(rewards)
    env.close()

make_video()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15

                                                               

Moviepy - Done !
Moviepy - video ready d:\Projetos_Dev\IA\Q-learning\dqn-cart\videos\test-video-episode-0.mp4
107.0


