In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque
import random

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Build the gym environment used throughout the notebook.
ENV_ID = 'CartPole-v0'
env = gym.make(ENV_ID)

In [3]:
# Smoke-test the environment by driving it with uniformly random actions.
env.reset()
try:
    for t in range(10000):
        random_action = env.action_space.sample()
        step_result = env.step(random_action)
        env.render()

        # Stepping an environment after its episode has ended is undefined,
        # so start a fresh episode as soon as the current one finishes.
        # The slice covers both step() layouts: (obs, reward, done, info) and
        # (obs, reward, terminated, truncated, info).
        if any(bool(flag) for flag in step_result[2:-1]):
            env.reset()
finally:
    # Always release the render window, even if the loop is interrupted.
    env.close()



In [4]:
class Agent:
    """Deep Q-learning agent with an experience-replay buffer.

    The agent keeps a bounded memory of past transitions, picks actions with
    an epsilon-greedy policy, and fits a small fully connected Keras network
    to one-step Q-learning targets sampled from that memory.
    """

    def __init__(self, state_size, action_size):
        """Set hyperparameters and build the Q-network.

        Args:
            state_size: Number of input features of the network.
            action_size: Number of discrete actions (network outputs).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions are dropped first
        self.gamma = .95  # discount factor applied to the next-state value
        self.epsilon = 1.0  # 100% exploration in the beginning
        self.epsilon_decay = .995  # multiplicative decay applied per train() call
        self.epsilon_min = .01  # decay stops once epsilon is no longer above this
        self.learning_rate = .0001
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model with two hidden layers of 24
            ReLU units and one linear output per action.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # Use the configured learning rate instead of a duplicated literal.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with the epsilon-greedy rule.

        Args:
            state: Input passed unchanged to ``self.model.predict``.

        Returns:
            A random action index with probability ``epsilon``; otherwise the
            index of the largest value in the first row of the prediction.
        """
        if np.random.rand() <= self.epsilon:
            # Explore: uniformly random action.
            return random.randrange(self.action_size)
        # Exploit: action with the highest predicted Q-value.
        return np.argmax(self.model.predict(state)[0])

    def train(self, batch_size=32):
        """Fit the network on a random minibatch drawn from replay memory.

        Does nothing while the memory holds fewer than ``batch_size``
        transitions. After a completed pass, ``epsilon`` is multiplied by
        ``epsilon_decay`` if it is still above ``epsilon_min``.

        Args:
            batch_size: Number of stored transitions to sample and fit on.
        """
        if len(self.memory) < batch_size:
            # Not enough experience yet to draw a full minibatch.
            return

        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            if not done:
                # One-step target: reward plus discounted best next-state value.
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            else:
                # Terminal transition: there is no future value to bootstrap.
                target = reward

            # Overwrite only the taken action's value so the loss for the
            # other outputs is zero.
            target_f = self.model.predict(state)
            target_f[0][action] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to ``name``."""
        self.model.save_weights(name)
                
            

## Training the DQN agent