In [3]:
import gym

In [4]:
# Build the CartPole-v0 environment used by the cells that follow.
ENV_ID='CartPole-v0'
env=gym.make(ENV_ID)

In [5]:
# Reset the environment; as the last expression of the cell, the returned
# initial observation (an array of four floats) is displayed as the output.
env.reset()

array([ 0.01314811, -0.04619591,  0.00447113, -0.03369146])

In [6]:
# Baseline: play a handful of episodes with purely random actions and report
# how many steps each one lasted before the environment signalled `done`.
episode_count=20
max_steps=50
for episode in range(episode_count):
    observation=env.reset()
    step=0
    while step<max_steps:
        env.render()
        random_action=env.action_space.sample()
        observation,reward,done,other=env.step(random_action)
        if done:
            print("Game Episode {}/{} High Score {}".format(episode+1,episode_count,step))
            break
        step+=1
env.close()
print("All 20 episodes done")

Game Episode 1/20 High Score 19
Game Episode 2/20 High Score 20
Game Episode 3/20 High Score 12
Game Episode 4/20 High Score 17
Game Episode 5/20 High Score 18
Game Episode 6/20 High Score 11
Game Episode 7/20 High Score 21
Game Episode 8/20 High Score 13
Game Episode 9/20 High Score 14
Game Episode 10/20 High Score 17
Game Episode 11/20 High Score 9
Game Episode 12/20 High Score 25
Game Episode 13/20 High Score 11
Game Episode 14/20 High Score 15
Game Episode 15/20 High Score 12
Game Episode 16/20 High Score 31
Game Episode 17/20 High Score 46
Game Episode 18/20 High Score 41
Game Episode 19/20 High Score 18
Game Episode 20/20 High Score 24
All 20 episodes done


In [7]:
import numpy as np
import random
import os
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque
import matplotlib.pyplot as plt
class Agent:
    """Deep Q-learning agent with experience replay and an epsilon-greedy policy.

    A small fully connected Keras network maps a state of ``state_size``
    features to one estimated Q-value per action.
    """

    def __init__(self, state_size, action_size):
        """Set the hyper-parameters and build the Q-network.

        Args:
            state_size: Number of features in one observation.
            action_size: Number of discrete actions to choose from.
        """
        self.state_size = state_size
        self.action_size = action_size
        # Replay buffer; the oldest experiences are dropped beyond 2000 entries.
        self.memory = deque(maxlen=2000)
        # Discount factor applied to the estimated future reward.
        self.gamma = 0.95
        # Exploration rate. It starts at 100%: with no past knowledge yet,
        # every decision is random.
        self.epsilon = 1.0
        # Epsilon is multiplied by this factor (a 0.5% reduction) once per
        # train() call, gradually shifting trust from exploration to the model.
        self.epsilon_decay = 0.995
        # Floor for epsilon, so some exploration always remains.
        self.epsilon_min = 0.01
        # Learning rate of the neural network optimizer.
        self.learning_rate = 0.001
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the Q-network (two hidden ReLU layers of 24 units)."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        # Linear output: one unbounded Q-value per action.
        model.add(Dense(self.action_size, activation='linear'))
        # Use the configured learning rate rather than a duplicated literal.
        # NOTE: recent Keras releases name this argument `learning_rate`.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: Observation in the form the model's ``predict`` accepts.

        Returns:
            A random action index with probability ``epsilon``; otherwise the
            index of the largest predicted Q-value.
        """
        # Draw from the uniform [0, 1) distribution so that epsilon really is
        # the probability of exploring (a standard-normal draw is <= 0 about
        # half of the time, which would keep exploration at 50% or more).
        if np.random.rand() < self.epsilon:
            # Take a random action (exploration).
            return random.randrange(self.action_size)
        # The prediction has the form [[q_0, ..., q_n]]; index 0 is the row of
        # Q-values for the single input state.
        q_values = self.model.predict(state)[0]
        return np.argmax(q_values)

    def train(self, batch_size=32):
        """Fit the Q-network on a random minibatch drawn from the replay buffer.

        Does nothing while the buffer holds fewer than ``batch_size``
        experiences. After the minibatch has been processed, epsilon is
        decayed once (while it is above ``epsilon_min``).

        Args:
            batch_size: Number of stored experiences to sample and fit on.
        """
        if len(self.memory) < batch_size:
            # random.sample would raise ValueError on an under-filled buffer.
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            if done:
                # Terminal transition: there is no future reward to add.
                target = reward
            else:
                # Bellman target: immediate reward plus the discounted best
                # Q-value predicted for the next state.
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            # Start from the current predictions so that only the Q-value of
            # the action actually taken contributes to the loss.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        # Decay once per training call, not once per sampled experience.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save the network weights to the file ``name``."""
        self.model.save_weights(name)

In [None]:
# Train the DQN agent on the environment, checkpointing weights periodically.
n_episodes=1000
batch_size=32
output_dir='cartpole_model/'
state_size=4
action_size=2
# Make sure the checkpoint directory exists before the first save.
os.makedirs(output_dir,exist_ok=True)
agent=Agent(state_size=state_size,action_size=action_size)
for e in range(n_episodes):
    state=env.reset()
    # The network expects a batch dimension: shape (1, state_size).
    state=np.reshape(state,[1,state_size])
    for t in range(500):
        env.render()
        action=agent.act(state)
        #action is 0 or 1
        next_state,reward,done,other_info=env.step(action)
        # Penalise the step that ends the episode.
        reward=reward if not done else -10
        next_state=np.reshape(next_state,[1,state_size])
        agent.remember(state,action,reward,next_state,done)
        # Advance to the new observation so the next action and the next
        # stored transition are based on the current state of the game.
        state=next_state
        if done:
            print("Game Episode {}/{} High Score {}".format(e+1,n_episodes,t))
            break
        #train the model
        if len(agent.memory)>batch_size:
            agent.train(batch_size)
    # Checkpoint once per 50 episodes (not on every step of such an episode).
    if e%50==0:
        agent.save(output_dir+"weights_"+'{:04d}'.format(e)+'.hdf5')
print("Deep Q Learner Model trained")
env.close()

In [None]:
# The Keras network is stored on the agent; no top-level `model` exists in this notebook.
agent.model.summary()