In [1]:
!pip install gym


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import gym
import numpy as np

# Create the environment
env = gym.make('CartPole-v1')

# Set random seed for producibility
np.random.seed(42)
env.reset(seed=42)

(array([ 0.0273956 , -0.00611216,  0.03585979,  0.0197368 ], dtype=float32),
 {})

### Define the DQN model

In [5]:
import warnings
warnings.filterwarnings('ignore')

def warn(*args, **kwargs):
    pass
warnings.warn = warn

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

def build_model(state_size, action_size):
    model = Sequential()
    model.add(Dense(24, input_dim=state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
    return model

state_size_ = env.observation_space.shape[0]
action_size_ = env.action_space.n
model_ = build_model(state_size_, action_size_)

### Implement the replay buffer

In [6]:
from collections import deque
import random

memory = deque(maxlen=2000)
def remember(state, action, reward, next_state, done):
    memory.append((state, action, reward, next_state, done))

In [7]:
epsilon = 1.
epsilon_min = 0.01
epsilon_decay = 0.995

def act(state):
    if np.random.rand() <= epsilon:
        return random.randrange(action_size_)
    q_values = model_.predict(state)
    return np.argmax(q_values[0])

In [10]:
# Define params training
episodes_ = 50 # More episodes to ensure sufficient training
batch_size_ = 32 # Mini-batch size for replay training
gamma = .95 # Discount factor for future rewards


def replay(batch_size):
    global epsilon
    minibatch = random.sample(memory, batch_size)
    for state_, action_, reward_, next_state_, done_ in minibatch:
        target = reward_
        if not done_:
            target = reward_ + gamma * np.amax(model_.predict(next_state_)[0])
            target_f = model_.predict(state_)
            target_f[0][action_] = target
            model_.fit(state_, target_f, epochs=1, verbose=0)
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

In [None]:
for e in range(episodes_):
    state = env.reset()
    if isinstance(state, tuple):
        state = state[0]
    state = np.reshape(state, [1, state_size_])

    for time in range(200): # Max steps per episode
        # Chose action using epsilon-greedy policy
        action = act(state)

        # Perform action in the environment
        result = env.step(action)
        if len(result) == 4: # Handle 4-value output
            next_state, reward, done, _ = result
        else: # Handle 5-value output
            next_state, reward, done, _, _ = result

        if isinstance(next_state, tuple): # Handle tuple next_state
            next_state = next_state[0]
        next_state = np.reshape(next_state, [1, state_size_])

        # Store experience in memory
        remember(state, action, reward, next_state, done)

        # Update state
        state = next_state

        if done: # If episode ends
            print(f"Episode: {e+1}/{episodes_}, Score: {time}, Epsilon: {epsilon:.2}")
            break

    # Train the model using replay memory
    if len(memory) > batch_size_:
        replay(batch_size_)

env.close()

Episode: 1/50, Score: 32, Epsilon: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━