In [0]:
import gym
import numpy as np
import random
from collections import deque

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import os

##### Set parameters

In [0]:
env = gym.make('CartPole-v0')

In [6]:
state_size = env.observation_space.shape[0]
state_size

4

In [10]:
action_size = env.action_space.n
action_size

2

In [0]:
batch_size = 32

In [0]:
n_episodes = 1000
output_dir = 'output_dir/model/'

In [0]:
if not os.path.exists(output_dir):
  os.makedirs(output_dir)

##### Define agent

In [0]:
class DQNAgent:
  
  def __init__(self, state_size, action_size):
    self.state_size = state_size
    self.action_size = action_size
    
    self.memory = deque(maxlen=2000)      # interested in lastest 2000 entries
    self.gamma = 0.95                     # discount factor
    self.epsilon = 1.0                    #exploration/exploit rate initially only explore
    self.epsilon_decay = 0.995            #shift exploration to eploitation gradually
    self.epsilon_min = 0.01               # still explore 1% of time evven after learning exploit
    
    self.learning_rate = 0.001
    self.model = self._build_model()
    
  def _build_model(self):
    model = Sequential()
    model.add(Dense(24, input_dim=self.state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))
    
    model.add(Dense(self.action_size, activation='linear'))  #linear activation coz we wantmodel to output direct actions
    model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
    return model
  
  def remember(self, state, action, reward, next_state, done):
    self.memory.append((state, action, reward, next_state, done))
    
  def act(self, state):
    if np.random.randn() <= self.epsilon:              # Exploration
      return random.randrange(self.action_size)
    # Exploit
    act_values = self.model.predict(state)
    return np.argmax(act_values[0])
  
  def replay(self, batch_size): #revise this method
    minibatch = random.sample(self.memory, batch_size)
    
    for state, action, reward, next_state, done in minibatch:
      target = reward
      
      if not done:
        target = (reward + self.gamma * np.argmax(self.model.predict(next_state)[0]))
        
      target_f = self.model.predict(state)
      target_f[0][action] = target
      
      self.model.fit(state, target_f, epochs=1, verbose=0)
      
      if self.epsilon > self.epsilon_min:
        self.epsilon = self.epsilon * self.epsilon_decay
        
  def load(self, name):
    self.model.load_weights(name)
    
  def save(self, name):
    self.model.save_weights(name)

In [0]:
agent = DQNAgent(state_size, action_size)

##### Interact with environment

socastic gradient assent

In [52]:
done = False

for e in range(n_episodes):
  
  state = env.reset()
  #print(state)
  state = np.reshape(state, (1, state_size))
  
  for time in range(500): # cartpole v0 only has 200 time steps any thing greater than 200 is fine
    
    #env.render()
    action = agent.act(state)
    
    next_state, reward, done, _ = env.step(action)
    reward = reward if not done else -10
    next_state = np.reshape(next_state, (1, state_size))
    
    agent.remember(state, action, reward, next_state, done)
    
    state = next_state
    
    if done:
      print('Episode: {}/{}, Score: {}, e: {:.2}'.format(e, n_episodes, time, agent.epsilon))
      print(agent.epsilon_decay *  agent.epsilon)
      break
      
  if len(agent.memory) > batch_size:
    agent.replay(batch_size)
    
  if e%50 == 0:
    agent.save(output_dir + 'weigths_' + '{:04d}'.format(e) + '.hdf5')

Episode: 0/1000, Score: 9, e: 1.0
0.995
Episode: 1/1000, Score: 37, e: 1.0
0.995
Episode: 2/1000, Score: 17, e: 0.85
0.8475428503023453
Episode: 3/1000, Score: 13, e: 0.73
0.7219385759785162
Episode: 4/1000, Score: 19, e: 0.62
0.6149486215357263
Episode: 5/1000, Score: 30, e: 0.53
0.5238143793828016
Episode: 6/1000, Score: 10, e: 0.45
0.446186062443672
Episode: 7/1000, Score: 24, e: 0.38
0.3800621177172763
Episode: 8/1000, Score: 15, e: 0.33
0.3237376186352221
Episode: 9/1000, Score: 10, e: 0.28
0.2757603055760701
Episode: 10/1000, Score: 48, e: 0.24
0.23489314109365644
Episode: 11/1000, Score: 27, e: 0.2
0.2000824143909432
Episode: 12/1000, Score: 34, e: 0.17
0.17043057265153258
Episode: 13/1000, Score: 46, e: 0.15
0.1451730787173275
Episode: 14/1000, Score: 25, e: 0.12
0.12365869841532712
Episode: 15/1000, Score: 39, e: 0.11
0.10533270926593409
Episode: 16/1000, Score: 54, e: 0.09
0.08972259762946533
Episode: 17/1000, Score: 47, e: 0.077
0.07642587550895225
Episode: 18/1000, Score: 1

KeyboardInterrupt: ignored

To render openai gym

In [0]:
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!pip install gym pyvirtualdisplay > /dev/null 2>&1

In [23]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(400, 300))
display.start()

xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '400x300x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '400x300x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>