In [23]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

env = gym.make('CartPole-v1')
GAMMA = 0.95
LEARNING_RATE = 0.001

MEMORY_SIZE = 1000000
BATCH_SIZE = 20

EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995

class DQNSolver:

    def __init__(self, observation_space, action_space):
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
dqn_solver = DQNSolver(observation_space, action_space)

for i_episode in range(200):
    observation = env.reset()
    observation = np.reshape(observation, [1, observation_space])
    for t in range(700):
        action = dqn_solver.act(observation)
        observation_next, reward, done, info = env.step(action)
        reward = reward if not done else -reward
        observation_next = np.reshape(observation_next, [1, observation_space])
        dqn_solver.remember(observation, action, reward, observation_next, done)
        observation = observation_next
        dqn_solver.experience_replay()
        if done:
            print(f"Run: {i_episode}, exploration_rate: {dqn_solver.exploration_rate}, score: {t}")
            break
env.close()

Episode finished after 20 timesteps
Run: 0, exploration_rate: 0.995, score: 19
19
Episode finished after 25 timesteps
Run: 1, exploration_rate: 0.8778091417340573, score: 24
24


KeyboardInterrupt: 

In [14]:
env.action_space.n

2

In [18]:
env.observation_space.__dict__

{'dtype': dtype('float32'),
 'shape': (4,),
 'low': array([-4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38],
       dtype=float32),
 'high': array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38],
       dtype=float32),
 'bounded_below': array([ True,  True,  True,  True]),
 'bounded_above': array([ True,  True,  True,  True]),
 'np_random': RandomState(MT19937) at 0x10E92C940}

In [None]:
from gym import spaces
space = spaces.Discrete(8)

In [None]:
space.sample()

In [None]:
env.observation_space.high