In [25]:
import numpy as np
import pandas as pd 
import tensorflow as tf 
import gym 
import os
import copy
from collections import deque

import matplotlib.pyplot as plt

In [168]:
class DQN() :
    """Deep Q-Network agent built around a gym environment and a Keras model.

    The agent keeps a bounded replay memory of transitions, chooses actions
    epsilon-greedily, and fits the network one sampled transition at a time
    after every episode.

    Reward shaping and score aggregation are pluggable: ``getReward`` and
    ``getScore`` are static hooks that may be replaced per instance, e.g.
    ``agent.getReward = my_reward`` where ``my_reward(state, reward, done)``
    is a plain function.
    """

    def __init__(self , envName = "CartPole" ) :
        """Create the environment, the replay memory and a fresh Q-network.

        Args:
            envName: gym environment name without the ``-v0`` suffix.

        A previously saved model is NOT loaded automatically; call
        ``loadModel()`` explicitly to resume from ``self.modelPath``.
        """
        self.env = gym.make("{}-v0".format(envName))
        self.envName = envName
        self.prefix = "DQN"
        self.modelName = "{}-{}".format(self.prefix,self.envName)
        self.modelPath = "./model/{}-{}.h5".format(self.prefix,self.envName)
        # Bounded replay buffer: the oldest transitions fall out past 3000.
        self.memory = deque(maxlen = 3000)

        # Discount applied to the bootstrapped next-state value in replay().
        self.gamma = 0.9

        # Epsilon-greedy exploration schedule (decayed once per replay() call).
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

        self._buildModel()
        # Per-episode scores appended by train().
        self.history = {"score":[]}

    def _asBatch(self, state) :
        """Return ``state`` reshaped to (-1, observation_dim) for the network."""
        return np.asarray(state).reshape(-1,self.env.observation_space.shape[0])

    def loadModel(self,path=None) :
        """Load a saved Keras model and switch to minimal exploration.

        Args:
            path: file to load; defaults to ``self.modelPath``.

        Raises:
            AssertionError: if ``path`` does not exist.
        """
        if path is None :
            path = self.modelPath
        assert os.path.exists(path), "model file not found: {}".format(path)
        self.model = tf.keras.models.load_model(path)
        # A loaded model is treated as trained, so exploration drops to the floor.
        self.epsilon = self.epsilon_min
        print("load model {} in {} success".format(self.modelName , path))

    def saveModel(self,path=None) :
        """Save the current model.

        Args:
            path: destination file; defaults to ``self.modelPath``.

        The parent directory is created when missing, so the default
        ``./model/...`` path works on a fresh checkout.
        """
        if path is None :
            path = self.modelPath
        directory = os.path.dirname(path)
        if directory :
            os.makedirs(directory, exist_ok=True)
        self.model.save(path)
        print("save model {} success.".format(self.modelName ))

    def _buildModel(self) :
        """Build and compile the Q-network (64-128-128 ReLU, linear output per action)."""
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(64 , input_shape = self.env.observation_space.shape ,activation="relu" ))
        model.add(tf.keras.layers.Dense(128 ,activation="relu" ))
        model.add(tf.keras.layers.Dense(128 ,activation="relu" ))
        model.add(tf.keras.layers.Dense(self.env.action_space.n ,activation="linear" ))
        model.compile(loss="mse", optimizer="adam" )
        model.summary()
        self.model = model
        print("build model {} success".format(self.modelName))

    def act(self,state):
        """Pick an action epsilon-greedily.

        With probability ``self.epsilon`` a random action is sampled from the
        environment; otherwise the argmax of the network output is returned.
        """
        if np.random.random() <= self.epsilon :
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(self._asBatch(state)))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append( (state, action, reward, next_state, done) )

    def update_epsilon(self):
        """Decay epsilon by ``epsilon_decay`` without going below ``epsilon_min``.

        An epsilon that is already at or below the floor is left untouched.
        """
        if self.epsilon > self.epsilon_min :
            self.epsilon = max(self.epsilon_min , self.epsilon * self.epsilon_decay)

    def replay(self , batch_size) :
        """Fit the network on transitions sampled from the replay memory.

        Args:
            batch_size: number of transitions to sample (capped at the memory
                size). Sampling is with replacement, and each sampled
                transition triggers its own single-sample ``fit`` call.

        Epsilon is decayed once per call. Nothing happens when the memory is
        empty.
        """
        if not self.memory :
            return
        batch_size = min(batch_size , len(self.memory))
        indices = np.random.choice(len(self.memory) , batch_size )
        for i in indices :
            state, action, reward, next_state, done = self.memory[int(i)]
            # Terminal transitions use the raw reward; otherwise bootstrap
            # from the best predicted value of the next state.
            target_q = reward
            if not done :
                target_q = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            # Only the taken action's output is moved; the others keep their
            # current predictions so they contribute no loss.
            target = self.model.predict(state)
            target[0][action] = target_q
            self.model.fit(state, target, epochs=1, verbose=0)
        self.update_epsilon()

    @staticmethod
    def getReward(state, reward, done) :
        """Default reward hook: 100 when ``done`` is true, else the env reward.

        NOTE(review): in environments where ``done`` signals failure this
        rewards the terminal step -- confirm this is intended.
        """
        return 100 if done else reward

    @staticmethod
    def getScore(score,reward) :
        """Default score hook: running sum of rewards."""
        return score+reward

    def train(self, epochs , render = False) :
        """Play episodes, store transitions and learn after each episode.

        Args:
            epochs: last episode index; ``epochs + 1`` episodes are played
                (indices 0..epochs inclusive).
            render: render the environment on every step when true.

        The model is saved every 100th episode (including episode 0) and the
        environment is closed when training ends, even on interruption.
        """
        try :
            for epoch in range(epochs+1) :
                state = self._asBatch(self.env.reset())
                done = False
                score = 0
                while not done :
                    if render :
                        self.env.render()
                    action = self.act(state)
                    next_state, reward, done, _ = self.env.step(action)
                    next_state = self._asBatch(next_state)

                    # NOTE(review): the hook receives the state the action was
                    # taken FROM, not the resulting state -- confirm intended.
                    reward = self.getReward(state , reward , done)
                    score = self.getScore(score,reward)

                    self.remember(state, action, reward, next_state, done)
                    state = copy.deepcopy(next_state)

                self.history["score"].append(score)
                print("epoch {} / {} played {} scores.".format(epoch ,epochs ,  score) )
                self.replay(32)
                if epoch%100 == 0 :
                    self.saveModel()
        finally :
            self.env.close()

    def play(self , epochs , render = False) :
        """Run greedy episodes with the current model (no exploration, no learning).

        Args:
            epochs: number of episodes to play.
            render: render the environment on every step when true.

        The environment is closed when play ends, even on interruption.
        """
        try :
            for epoch in range(epochs) :
                state = self.env.reset()
                score = 0
                done = False
                while not done :
                    if render :
                        self.env.render()
                    state = self._asBatch(state)
                    action = np.argmax(self.model.predict(state))
                    next_state , reward , done , _ = self.env.step(action)
                    reward = self.getReward(state , reward , done)
                    score = self.getScore(score,reward)
                    state = copy.deepcopy(next_state)
                print( "epoch {} / {} played {} scores.".format(epoch ,epochs ,  score) )
        finally :
            self.env.close()

In [171]:
def getReward(state, reward, done) :
    """Shaped reward computed from ``state[0,0]`` only.

    The environment's own ``reward`` and the ``done`` flag are ignored.
    Returns ``state[0,0] + 10`` once that value reaches 0.5, otherwise
    ``state[0,0] + 0.5``.

    Presumably ``state[0,0]`` is the car position of a (1, 2) MountainCar
    observation and 0.5 is the goal position -- verify against the env docs.
    """
    if state[0,0] >= 0.5 :
        # Large bonus once the threshold is reached.
        return state[0,0] + 10
    return state[0,0] + 0.5
def getScore(score,reward) :
    """Episode score hook: keep the larger of the running score and the new reward."""
    return reward if reward > score else score
# Fresh MountainCar agent with the custom reward/score hooks defined above
# swapped in as per-instance overrides.
agent = DQN(envName="MountainCar")
agent.getScore = getScore
agent.getReward = getReward
# Uncomment to resume from the saved checkpoint instead of training from scratch:
# agent.loadModel()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_155 (Dense)            (None, 64)                192       
_________________________________________________________________
dense_156 (Dense)            (None, 128)               8320      
_________________________________________________________________
dense_157 (Dense)            (None, 128)               16512     
_________________________________________________________________
dense_158 (Dense)            (None, 3)                 387       
Total params: 25,411
Trainable params: 25,411
Non-trainable params: 0
_________________________________________________________________
build model DQN-MountainCar success


In [None]:
# Long-running cell: episodes are indexed 0..1000 inclusive.
agent.train(epochs=1000)

epoch 0 / 1000 played 0.13592327320495018 scores.
save model DQN-MountainCar success.
epoch 1 / 1000 played 0.04789348201520749 scores.
epoch 2 / 1000 played 0.05766140668208758 scores.
epoch 3 / 1000 played 0.15085886476386484 scores.
epoch 4 / 1000 played 0.05056157875464973 scores.
epoch 5 / 1000 played 0.18572913698947668 scores.
epoch 6 / 1000 played 0.19986448079930486 scores.
epoch 7 / 1000 played 0.04640440348248365 scores.
epoch 8 / 1000 played 0.06652054856517509 scores.
epoch 9 / 1000 played 0.06458720902111953 scores.
epoch 10 / 1000 played 0.12835219754071708 scores.
epoch 11 / 1000 played 0.05086030688574017 scores.
epoch 12 / 1000 played 0.09393878640188535 scores.
epoch 13 / 1000 played 0.21012153671142514 scores.
epoch 14 / 1000 played 0.09249482497614347 scores.
epoch 15 / 1000 played 0.07199195835187372 scores.
epoch 16 / 1000 played 0.06700439595313712 scores.
epoch 17 / 1000 played 0.014482848695038852 scores.
epoch 18 / 1000 played 0.06603871863046212 scores.
epoc

epoch 159 / 1000 played 0.1255673491997904 scores.
epoch 160 / 1000 played 0.32199031555420876 scores.
epoch 161 / 1000 played 0.22694580083950922 scores.
epoch 162 / 1000 played 0.2365457726423164 scores.
epoch 163 / 1000 played 0.13135896895751065 scores.
epoch 164 / 1000 played 0.09121686879866503 scores.
epoch 165 / 1000 played 0.21549546215144233 scores.
epoch 166 / 1000 played 0.0856806789405189 scores.
epoch 167 / 1000 played 0.27387327301900344 scores.
epoch 168 / 1000 played 0.2678408445804599 scores.
epoch 169 / 1000 played 0.21665913909255552 scores.
epoch 170 / 1000 played 0.11231284339983921 scores.
epoch 171 / 1000 played 0.11335674228551978 scores.
epoch 172 / 1000 played 0.09765102023118633 scores.
epoch 173 / 1000 played 0.1744089775568814 scores.
epoch 174 / 1000 played 0.16132374819495648 scores.
epoch 175 / 1000 played 0.11076324303156793 scores.
epoch 176 / 1000 played 0.11264215668350996 scores.
epoch 177 / 1000 played 0.16924266095356266 scores.
epoch 178 / 1000 

epoch 317 / 1000 played 0.21515458213822752 scores.
epoch 318 / 1000 played 0.18218548290965308 scores.
epoch 319 / 1000 played 0.19298222924407604 scores.
epoch 320 / 1000 played 0.2698690055075693 scores.
epoch 321 / 1000 played 0.2974798774271516 scores.
epoch 322 / 1000 played 0.20504929967265612 scores.
epoch 323 / 1000 played 0.16689641659150695 scores.
epoch 324 / 1000 played 0.2049284799277608 scores.
epoch 325 / 1000 played 0.1749187574185147 scores.
epoch 326 / 1000 played 0.22387551825825447 scores.
epoch 327 / 1000 played 0.21815633300677595 scores.
epoch 328 / 1000 played 0.23515109348933133 scores.
epoch 329 / 1000 played 0.2969108048362531 scores.
epoch 330 / 1000 played 0.20859555500712185 scores.
epoch 331 / 1000 played 0.14719112432301357 scores.
epoch 332 / 1000 played 0.235754275620843 scores.
epoch 333 / 1000 played 0.162107490077608 scores.
epoch 334 / 1000 played 0.17041973121867743 scores.
epoch 335 / 1000 played 0.1568459976094827 scores.
epoch 336 / 1000 playe

epoch 476 / 1000 played 0.15428285277676346 scores.
epoch 477 / 1000 played 0.3115599732046821 scores.
epoch 478 / 1000 played 0.335959988071757 scores.
epoch 479 / 1000 played 0.16359714774776257 scores.
epoch 480 / 1000 played 0.32382059351835313 scores.
epoch 481 / 1000 played 0.259535866815231 scores.
epoch 482 / 1000 played 0.2784335592046936 scores.
epoch 483 / 1000 played 0.17520909197678347 scores.
epoch 484 / 1000 played 0.23914090069237143 scores.
epoch 485 / 1000 played 0.31408555160149665 scores.
epoch 486 / 1000 played 0.2610513282500223 scores.
epoch 487 / 1000 played 0.1820420363092629 scores.
epoch 488 / 1000 played 0.34300544883175077 scores.
epoch 489 / 1000 played 0.11485551179248349 scores.
epoch 490 / 1000 played 0.05973374693080252 scores.
epoch 491 / 1000 played 0.2297026484905325 scores.
epoch 492 / 1000 played 0.2752796049034558 scores.
epoch 493 / 1000 played 0.21008275944988924 scores.
epoch 494 / 1000 played 0.1271002784469592 scores.
epoch 495 / 1000 played

epoch 634 / 1000 played 0.27689561294590365 scores.
epoch 635 / 1000 played 0.15548253777005322 scores.
epoch 636 / 1000 played 0.05974570770219395 scores.
epoch 637 / 1000 played 0.30354822925923075 scores.
epoch 638 / 1000 played 0.32484715631956745 scores.
epoch 639 / 1000 played 0.1965955643421513 scores.
epoch 640 / 1000 played 0.22279546242532522 scores.
epoch 641 / 1000 played 0.1985638028204872 scores.
epoch 642 / 1000 played 0.19826892002931218 scores.
epoch 643 / 1000 played 0 scores.
epoch 644 / 1000 played 0.13548303892305763 scores.
epoch 645 / 1000 played 0.19317397799170732 scores.
epoch 646 / 1000 played 0.2851373436107145 scores.
epoch 647 / 1000 played 0.2440679343025795 scores.
epoch 648 / 1000 played 0.24613364293987355 scores.
epoch 649 / 1000 played 0.27052499710125577 scores.
epoch 650 / 1000 played 0.14963531370235994 scores.
epoch 651 / 1000 played 0.31240412108092586 scores.
epoch 652 / 1000 played 0.20971281716506435 scores.
epoch 653 / 1000 played 0.28648919

epoch 793 / 1000 played 0.3321271895768979 scores.
epoch 794 / 1000 played 0.2579258207633384 scores.
epoch 795 / 1000 played 0.28418714263118805 scores.
epoch 796 / 1000 played 0.6528924293634106 scores.
epoch 797 / 1000 played 0.29742381218724695 scores.
epoch 798 / 1000 played 0.22799166399809667 scores.
epoch 799 / 1000 played 0.12442635614398723 scores.
epoch 800 / 1000 played 0.16535377228624049 scores.
save model DQN-MountainCar success.
epoch 801 / 1000 played 0.2766915351389482 scores.
epoch 802 / 1000 played 0.17543798377286746 scores.
epoch 803 / 1000 played 0.3215345047995453 scores.
epoch 804 / 1000 played 0.19921497034514934 scores.
epoch 805 / 1000 played 0.271054381911578 scores.
epoch 806 / 1000 played 0.35054521215183765 scores.
epoch 807 / 1000 played 0.21480938657060372 scores.
epoch 808 / 1000 played 0.28799767546839117 scores.
epoch 809 / 1000 played 0.3155986569738862 scores.
epoch 810 / 1000 played 0.26505704519454554 scores.
epoch 811 / 1000 played 0.1764771449

In [None]:
# Watch the current policy for five rendered episodes.
agent.play(epochs=5, render=True)

In [None]:
# Learning curve: one point per training episode, in the order they were played.
fig, ax = plt.subplots()
ax.plot(agent.history["score"])
ax.set(title="DQN training score per episode", xlabel="episode", ylabel="score")
plt.show()