In [3]:
import gym

In [4]:
# Build the CartPole-v0 environment; the resulting `env` object is reused by the later cells.
env = gym.make('CartPole-v0')

In [5]:
# Start a new episode; the returned initial observation is shown as the cell output.
env.reset()

array([-0.00041587,  0.01791534, -0.00104807,  0.00288219])

In [6]:
# Inspect the action space object of the environment.
env.action_space

Discrete(2)

In [7]:
# Number of discrete actions the agent can choose from.
env.action_space.n

2

In [8]:
# First dimension of the observation space shape, i.e. the length of one observation vector.
env.observation_space.shape[0]

4

In [9]:
# Smoke test: drive the environment for 1000 steps with uniformly random actions.
env.reset()
for t in range(1000):
    random_action = env.action_space.sample()
    observation,reward,done,other_info = env.step(random_action)
    env.render()
    if done:
        # Calling step() on a finished episode is not meaningful, so begin a
        # new episode before taking the next random action.
        env.reset()
env.close()
# Leave the environment in a freshly reset state; the returned observation is
# the value displayed as this cell's output.
env.reset()



array([ 0.01590937, -0.0041906 ,  0.03429063, -0.00164548])

In [10]:
# Play 20 episodes with a purely random policy, each capped at 50 steps, and
# report the step index at which every episode ended.
for e in range(20):
    observation = env.reset()  # begin a new episode
    for t in range(50):
        env.render()
        action = env.action_space.sample()
        observation,reward,done,other_info = env.step(action)

        if not done:
            continue
        # Game Episode is over: report it and move on to the next episode
        print("Game Episode :{}/{} High Score :{}".format(e,20,t))
        break

env.close()
print("All 20 episodes over!")

Game Episode :0/20 High Score :25
Game Episode :1/20 High Score :12
Game Episode :2/20 High Score :16
Game Episode :3/20 High Score :22
Game Episode :4/20 High Score :20
Game Episode :5/20 High Score :18
Game Episode :6/20 High Score :23
Game Episode :7/20 High Score :32
Game Episode :8/20 High Score :28
Game Episode :9/20 High Score :29
Game Episode :10/20 High Score :16
Game Episode :11/20 High Score :36
Game Episode :12/20 High Score :32
Game Episode :13/20 High Score :19
Game Episode :14/20 High Score :27
Game Episode :15/20 High Score :12
Game Episode :16/20 High Score :19
Game Episode :17/20 High Score :17
Game Episode :18/20 High Score :18
Game Episode :19/20 High Score :21
All 20 episodes over!


In [11]:
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random

In [12]:
class Agent:
    """Deep Q-learning agent with an experience-replay memory.

    The agent keeps a bounded memory of past transitions, picks actions with an
    epsilon-greedy policy, and fits a small dense network that maps a state to
    one Q-value per action.
    """

    def __init__(self,state_size,action_size):
        """Set up hyper-parameters, the replay memory and the Q-network.

        Args:
            state_size: number of input features of the network.
            action_size: number of actions, i.e. number of network outputs.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions fall out once 2000 are stored
        self.gamma = 0.95 # Discount Factor
        self.epsilon = 1.0 # Exploration Rate
        self.epsilon_decay = 0.995  # multiplicative decay applied once per train() call
        self.epsilon_min = 0.01     # floor for the exploration rate
        self.learning_rate = 0.001
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the network used to approximate the Q-value function.

        Returns:
            A compiled Sequential model: two hidden ReLU layers of 24 units and
            a linear output layer with one unit per action, trained with MSE.
        """
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu'))
        model.add(Dense(24,activation='relu'))
        model.add(Dense(self.action_size,activation='linear'))
        # NOTE(review): `lr` is the older Keras keyword; newer releases spell it
        # `learning_rate` - confirm against the installed Keras version.
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one (state, action, reward, next_state, done) transition to memory."""
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
        """Choose an action for `state` with an epsilon-greedy policy.

        With probability `epsilon` a uniformly random action index is returned;
        otherwise the index of the largest predicted Q-value is returned.
        `state` is passed straight to `model.predict`, and only the first row of
        the prediction is used.
        """
        if np.random.rand()<=self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def train(self,batch_size=32):
        """Fit the network on a random minibatch of remembered transitions.

        For every sampled transition the target for the taken action is the
        reward (terminal step) or the reward plus the discounted best predicted
        Q-value of the next state; the other action outputs keep their current
        predictions. After the minibatch, `epsilon` is decayed but never below
        `epsilon_min`.

        Does nothing when fewer than `batch_size` transitions are stored.

        Args:
            batch_size: number of transitions to sample from memory.
        """
        # random.sample raises ValueError when asked for more items than are
        # available, so skip training until enough experience is collected.
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory,batch_size)
        for state,action,reward,next_state,done in minibatch:
            if done:
                target = reward
            else:
                target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
            # Only the Q-value of the action actually taken is moved to the target.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state,target_f,epochs=1,verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot the configured floor.
            self.epsilon = max(self.epsilon_min,self.epsilon*self.epsilon_decay)

    def load(self,name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self,name):
        """Write the current network weights to the file `name`."""
        self.model.save_weights(name)


In [13]:
# Training configuration.
n_episodes = 1000               # number of training episodes
output_dir = "Cartpole_Model/"  # weight checkpoints are written into this directory

# Make sure the checkpoint directory exists before any weights are saved into it.
os.makedirs(output_dir, exist_ok=True)

In [14]:
# Read the problem dimensions from the environment rather than hard-coding them,
# and define them before they are used to build the agent.
state_size = env.observation_space.shape[0]   # 4 for CartPole-v0
action_size = env.action_space.n              # 2 for CartPole-v0
batch_size = 32                               # transitions sampled per agent.train() call
agent = Agent(state_size=state_size,action_size=action_size)
done = False

In [15]:
# Train a fresh agent for n_episodes episodes, checkpointing weights every 50 episodes.
agent = Agent(state_size, action_size)
done = False
try:
    for e in range(n_episodes):
        state = env.reset()
        # The network is queried with a single-row 2-D array, so add a leading axis.
        state = np.reshape(state,[1,state_size])

        for time in range(5000):
            env.render()
            action = agent.act(state)
            next_state,reward,done,other_info = env.step(action)
            # The step that ends the episode is stored with a reward of -10.
            # NOTE(review): `done` is presumably also set when the environment's
            # own step limit is reached, which would be penalised too - confirm.
            reward = reward if not done else -10
            next_state = np.reshape(next_state,[1,state_size])
            agent.remember(state,action,reward,next_state,done)
            state = next_state

            if done:
                print("Game Episode :{}/{}, High Score:{},Exploration Rate:{:.2}".format(e,n_episodes,time,agent.epsilon))
                break

        # One replay-training pass per episode once enough transitions are stored.
        if len(agent.memory)>batch_size:
            agent.train(batch_size)

        if e%50==0:
            agent.save(output_dir+"weights_"+'{:04d}'.format(e)+".hdf5")
    print("Deep Q-Learner Model Trained.")
finally:
    # Always release the render window, even when training is interrupted
    # (e.g. by a KeyboardInterrupt) or fails part-way through.
    env.close()

Game Episode :0/1000, High Score:18,Exploration Rate:1.0
Game Episode :1/1000, High Score:18,Exploration Rate:1.0
Game Episode :2/1000, High Score:39,Exploration Rate:0.99
Game Episode :3/1000, High Score:39,Exploration Rate:0.99
Game Episode :4/1000, High Score:18,Exploration Rate:0.99
Game Episode :5/1000, High Score:24,Exploration Rate:0.98
Game Episode :6/1000, High Score:25,Exploration Rate:0.98
Game Episode :7/1000, High Score:19,Exploration Rate:0.97
Game Episode :8/1000, High Score:12,Exploration Rate:0.97
Game Episode :9/1000, High Score:17,Exploration Rate:0.96
Game Episode :10/1000, High Score:50,Exploration Rate:0.96
Game Episode :11/1000, High Score:13,Exploration Rate:0.95
Game Episode :12/1000, High Score:36,Exploration Rate:0.95
Game Episode :13/1000, High Score:12,Exploration Rate:0.94
Game Episode :14/1000, High Score:27,Exploration Rate:0.94
Game Episode :15/1000, High Score:12,Exploration Rate:0.93
Game Episode :16/1000, High Score:10,Exploration Rate:0.93
Game Epis

Game Episode :139/1000, High Score:27,Exploration Rate:0.5
Game Episode :140/1000, High Score:18,Exploration Rate:0.5
Game Episode :141/1000, High Score:30,Exploration Rate:0.5
Game Episode :142/1000, High Score:15,Exploration Rate:0.49
Game Episode :143/1000, High Score:21,Exploration Rate:0.49
Game Episode :144/1000, High Score:30,Exploration Rate:0.49
Game Episode :145/1000, High Score:20,Exploration Rate:0.49
Game Episode :146/1000, High Score:19,Exploration Rate:0.48
Game Episode :147/1000, High Score:21,Exploration Rate:0.48
Game Episode :148/1000, High Score:45,Exploration Rate:0.48
Game Episode :149/1000, High Score:104,Exploration Rate:0.48
Game Episode :150/1000, High Score:31,Exploration Rate:0.47
Game Episode :151/1000, High Score:24,Exploration Rate:0.47
Game Episode :152/1000, High Score:37,Exploration Rate:0.47
Game Episode :153/1000, High Score:35,Exploration Rate:0.47
Game Episode :154/1000, High Score:58,Exploration Rate:0.46
Game Episode :155/1000, High Score:22,Expl

Game Episode :276/1000, High Score:91,Exploration Rate:0.25
Game Episode :277/1000, High Score:131,Exploration Rate:0.25
Game Episode :278/1000, High Score:199,Exploration Rate:0.25
Game Episode :279/1000, High Score:173,Exploration Rate:0.25
Game Episode :280/1000, High Score:171,Exploration Rate:0.25
Game Episode :281/1000, High Score:199,Exploration Rate:0.25
Game Episode :282/1000, High Score:199,Exploration Rate:0.24
Game Episode :283/1000, High Score:199,Exploration Rate:0.24
Game Episode :284/1000, High Score:199,Exploration Rate:0.24
Game Episode :285/1000, High Score:199,Exploration Rate:0.24
Game Episode :286/1000, High Score:180,Exploration Rate:0.24
Game Episode :287/1000, High Score:178,Exploration Rate:0.24
Game Episode :288/1000, High Score:166,Exploration Rate:0.24
Game Episode :289/1000, High Score:199,Exploration Rate:0.24
Game Episode :290/1000, High Score:185,Exploration Rate:0.23
Game Episode :291/1000, High Score:139,Exploration Rate:0.23
Game Episode :292/1000, H

KeyboardInterrupt: 