In [2]:
from __future__ import print_function, absolute_import, division

import numpy as np
import pandas as pd
pd.set_option("max_columns", None)
import matplotlib
from matplotlib import pyplot as plt
#import seaborn as sns
%precision 2

import os
import ast
import csv
import time
import datetime
import json
import random
import math
from time import ctime

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

import gym

from collections import namedtuple, deque

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

is_ipython = "inline" in matplotlib.get_backend()
if is_ipython:
    from IPython import display

'%.2f'

In [52]:
class DQN:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000
        self.batch_size = 64
        self.memory = deque(maxlen=2000)
        self.episodes = 1000
        self.train_start = 1000
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()
    
    def build_model(self):
        model = Sequential()
        model.add(Dense(32, input_dim=self.state_size, activation="relu", kernel_initializer="he_uniform"))
        model.add(Dense(16, activation="relu", kernel_initializer="he_uniform"))
        model.add(Dense(self.action_size, activation="linear", kernel_initializer="he_uniform"))
        model.compile(loss="mse",optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model
    
    def act(self, state):
        if self.epsilon<np.random.rand():
            return np.random.choice(np.arange(self.action_size))
        else:
            act_values = self.model.predict(state)
            return np.argmax(act_values[0])
    
    def remember(self,state, action, reward, next_state, done):
        if action == 2:
            action = 1
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
    
    def replay(self):
        if len(self.memory)<self.train_start:
            return
        batch_size = min(len(self.memory),self.batch_size)
        mini_batch = random.sample(self.memory, batch_size)
        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        for i in range(batch_size):
            state, action, reward, next_state, done = mini_batch[i]
            target = self.model.predict(state)[0]
            if done:
                target[action] = reward
            else:
                target[action] = reward + self.gamma* np.amax(self.target_model.predict(next_state)[0])
            update_input[i] = state
            update_target[i] = target
        self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
    
    def load_model(self, name):
        self.model.load_weights(name)
    
    def save_model(self, name):
        self.model.save_weights(name)
    
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

In [None]:
if __name__ == "__main__":
    env = gym.make('MountainCar-v0')
    state_size = env.observation_space.shape[0]
    action_size = 2
    agent = DQN(state_size, action_size)
    #agent.load_model("./save_model/MountainCar_DQN.h5")
    scores, episodes = [], []

    for e in range(agent.episodes):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        print(state)
        fake_action = 0
        action_count = 0
        while not done:
#             if agent.render:
#                 env.render()
            action_count = action_count + 1
            if action_count == 4:
                action = agent.act(state)
                action_count = 0
                if action == 0:
                    fake_action = 0
                elif action == 1:
                    fake_action = 2
            #print("fake_action ",fake_action, "action ", action)
            next_state, reward, done, info = env.step(fake_action)
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, fake_action, reward, next_state, done)
            agent.replay()
            score += reward
            state = next_state

            if done:
                env.reset()
                agent.update_target_model()
                scores.append(score)
                episodes.append(e)
                print("episode:", e, "  score:", score, "  memory length:", len(agent.memory),
                      "  epsilon:", agent.epsilon)
#         if e % 50 == 0:
#              agent.save_model("./save_model/MountainCar_DQN.h5")


I0506 22:19:35.848170 140423791109952 registration.py:117] Making new env: MountainCar-v0
[2019-05-06 22:19:35,848] Making new env: MountainCar-v0


Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_48 (Dense)             (None, 32)                96        
_________________________________________________________________
dense_49 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_50 (Dense)             (None, 2)                 34        
Total params: 658
Trainable params: 658
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_51 (Dense)             (None, 32)                96        
_________________________________________________________________
dense_52 (Dense)             (None, 16)                528       
________________________________

array([-0.5,  0. ])

episode: 0   score: -200.0   memory length: 200   epsilon: 0.9962000000000018
[[-0.6  0. ]]


array([-0.55,  0.  ])

episode: 1   score: -200.0   memory length: 400   epsilon: 0.9924000000000035
[[-0.58  0.  ]]


array([-0.59,  0.  ])

episode: 2   score: -200.0   memory length: 600   epsilon: 0.9886000000000053
[[-0.49  0.  ]]


array([-0.58,  0.  ])

episode: 3   score: -200.0   memory length: 800   epsilon: 0.984800000000007
[[-0.58  0.  ]]


array([-0.4,  0. ])

episode: 4   score: -200.0   memory length: 1000   epsilon: 0.9810000000000088
[[-0.43  0.  ]]


array([-0.5,  0. ])

episode: 5   score: -200.0   memory length: 1200   epsilon: 0.9772000000000105
[[-0.54  0.  ]]


array([-0.5,  0. ])

episode: 6   score: -200.0   memory length: 1400   epsilon: 0.9734000000000123
[[-0.4  0. ]]


array([-0.55,  0.  ])

episode: 7   score: -200.0   memory length: 1600   epsilon: 0.969600000000014
[[-0.51  0.  ]]


array([-0.56,  0.  ])

episode: 8   score: -200.0   memory length: 1800   epsilon: 0.9658000000000158
[[-0.57  0.  ]]


array([-0.41,  0.  ])

episode: 9   score: -200.0   memory length: 2000   epsilon: 0.9620000000000175
[[-0.54  0.  ]]


array([-0.41,  0.  ])

episode: 10   score: -200.0   memory length: 2000   epsilon: 0.9582000000000193
[[-0.41  0.  ]]


array([-0.54,  0.  ])

episode: 11   score: -200.0   memory length: 2000   epsilon: 0.954400000000021
[[-0.42  0.  ]]


array([-0.45,  0.  ])

episode: 12   score: -200.0   memory length: 2000   epsilon: 0.9506000000000228
[[-0.53  0.  ]]


array([-0.5,  0. ])

episode: 13   score: -200.0   memory length: 2000   epsilon: 0.9468000000000245
[[-0.44  0.  ]]


array([-0.55,  0.  ])

episode: 14   score: -200.0   memory length: 2000   epsilon: 0.9430000000000263
[[-0.53  0.  ]]


array([-0.43,  0.  ])

episode: 15   score: -200.0   memory length: 2000   epsilon: 0.939200000000028
[[-0.51  0.  ]]


array([-0.52,  0.  ])

episode: 16   score: -200.0   memory length: 2000   epsilon: 0.9354000000000298
[[-0.52  0.  ]]


array([-0.57,  0.  ])

episode: 17   score: -200.0   memory length: 2000   epsilon: 0.9316000000000315
[[-0.46  0.  ]]


array([-0.46,  0.  ])

episode: 18   score: -200.0   memory length: 2000   epsilon: 0.9278000000000333
[[-0.58  0.  ]]


array([-0.54,  0.  ])

episode: 19   score: -200.0   memory length: 2000   epsilon: 0.924000000000035
[[-0.47  0.  ]]


array([-0.58,  0.  ])

episode: 20   score: -200.0   memory length: 2000   epsilon: 0.9202000000000368
[[-0.49  0.  ]]


array([-0.51,  0.  ])

episode: 21   score: -200.0   memory length: 2000   epsilon: 0.9164000000000385
[[-0.53  0.  ]]


array([-0.53,  0.  ])

episode: 22   score: -200.0   memory length: 2000   epsilon: 0.9126000000000403
[[-0.49  0.  ]]


array([-0.46,  0.  ])

episode: 23   score: -200.0   memory length: 2000   epsilon: 0.908800000000042
[[-0.43  0.  ]]


array([-0.51,  0.  ])

episode: 24   score: -200.0   memory length: 2000   epsilon: 0.9050000000000438
[[-0.56  0.  ]]


array([-0.5,  0. ])

episode: 25   score: -200.0   memory length: 2000   epsilon: 0.9012000000000455
[[-0.42  0.  ]]


array([-0.4,  0. ])

episode: 26   score: -200.0   memory length: 2000   epsilon: 0.8974000000000473
[[-0.6  0. ]]


array([-0.59,  0.  ])

episode: 27   score: -200.0   memory length: 2000   epsilon: 0.893600000000049
[[-0.54  0.  ]]


array([-0.51,  0.  ])

episode: 28   score: -200.0   memory length: 2000   epsilon: 0.8898000000000508
[[-0.4  0. ]]


array([-0.49,  0.  ])

episode: 29   score: -189.0   memory length: 2000   epsilon: 0.8862090000000524
[[-0.59  0.  ]]


array([-0.45,  0.  ])

episode: 30   score: -200.0   memory length: 2000   epsilon: 0.8824090000000542
[[-0.49  0.  ]]


array([-0.53,  0.  ])

episode: 31   score: -200.0   memory length: 2000   epsilon: 0.8786090000000559
[[-0.45  0.  ]]


array([-0.58,  0.  ])

episode: 32   score: -200.0   memory length: 2000   epsilon: 0.8748090000000577
[[-0.45  0.  ]]


array([-0.59,  0.  ])

episode: 33   score: -200.0   memory length: 2000   epsilon: 0.8710090000000594
[[-0.4  0. ]]


array([-0.55,  0.  ])

episode: 34   score: -200.0   memory length: 2000   epsilon: 0.8672090000000612
[[-0.4  0. ]]


array([-0.46,  0.  ])

episode: 35   score: -200.0   memory length: 2000   epsilon: 0.8634090000000629
[[-0.52  0.  ]]


array([-0.56,  0.  ])

episode: 36   score: -200.0   memory length: 2000   epsilon: 0.8596090000000647
[[-0.41  0.  ]]


array([-0.43,  0.  ])

episode: 37   score: -200.0   memory length: 2000   epsilon: 0.8558090000000664
[[-0.51  0.  ]]


array([-0.57,  0.  ])

episode: 38   score: -200.0   memory length: 2000   epsilon: 0.8520090000000682
[[-0.44  0.  ]]


array([-0.57,  0.  ])

episode: 39   score: -200.0   memory length: 2000   epsilon: 0.8482090000000699
[[-0.46  0.  ]]


array([-0.54,  0.  ])

episode: 40   score: -200.0   memory length: 2000   epsilon: 0.8444090000000717
[[-0.53  0.  ]]


array([-0.53,  0.  ])

episode: 41   score: -200.0   memory length: 2000   epsilon: 0.8406090000000734
[[-0.52  0.  ]]


array([-0.43,  0.  ])

episode: 42   score: -200.0   memory length: 2000   epsilon: 0.8368090000000752
[[-0.57  0.  ]]


array([-0.46,  0.  ])

episode: 43   score: -200.0   memory length: 2000   epsilon: 0.8330090000000769
[[-0.58  0.  ]]


array([-0.6,  0. ])

episode: 44   score: -200.0   memory length: 2000   epsilon: 0.8292090000000787
[[-0.44  0.  ]]


array([-0.54,  0.  ])

episode: 45   score: -200.0   memory length: 2000   epsilon: 0.8254090000000804
[[-0.53  0.  ]]


array([-0.6,  0. ])

episode: 46   score: -200.0   memory length: 2000   epsilon: 0.8216090000000822
[[-0.57  0.  ]]


array([-0.45,  0.  ])

episode: 47   score: -200.0   memory length: 2000   epsilon: 0.8178090000000839
[[-0.54  0.  ]]


array([-0.52,  0.  ])

episode: 48   score: -200.0   memory length: 2000   epsilon: 0.8140090000000857
[[-0.58  0.  ]]


array([-0.43,  0.  ])

episode: 49   score: -200.0   memory length: 2000   epsilon: 0.8102090000000874
[[-0.41  0.  ]]


array([-0.59,  0.  ])

episode: 50   score: -200.0   memory length: 2000   epsilon: 0.8064090000000892
[[-0.48  0.  ]]


array([-0.56,  0.  ])

episode: 51   score: -200.0   memory length: 2000   epsilon: 0.8026090000000909
[[-0.59  0.  ]]


array([-0.56,  0.  ])

episode: 52   score: -200.0   memory length: 2000   epsilon: 0.7988090000000927
[[-0.55  0.  ]]


array([-0.55,  0.  ])

episode: 53   score: -200.0   memory length: 2000   epsilon: 0.7950090000000944
[[-0.5  0. ]]


array([-0.52,  0.  ])

episode: 54   score: -200.0   memory length: 2000   epsilon: 0.7912090000000962
[[-0.47  0.  ]]


array([-0.47,  0.  ])

episode: 55   score: -200.0   memory length: 2000   epsilon: 0.787409000000098
[[-0.44  0.  ]]


array([-0.54,  0.  ])

episode: 56   score: -200.0   memory length: 2000   epsilon: 0.7836090000000997
[[-0.53  0.  ]]


array([-0.57,  0.  ])

episode: 57   score: -200.0   memory length: 2000   epsilon: 0.7798090000001014
[[-0.48  0.  ]]


array([-0.51,  0.  ])

episode: 58   score: -170.0   memory length: 2000   epsilon: 0.7765790000001029
[[-0.45  0.  ]]


array([-0.49,  0.  ])

episode: 59   score: -200.0   memory length: 2000   epsilon: 0.7727790000001047
[[-0.47  0.  ]]


array([-0.52,  0.  ])

episode: 60   score: -200.0   memory length: 2000   epsilon: 0.7689790000001064
[[-0.4  0. ]]


array([-0.54,  0.  ])

episode: 61   score: -200.0   memory length: 2000   epsilon: 0.7651790000001082
[[-0.47  0.  ]]


array([-0.58,  0.  ])

episode: 62   score: -200.0   memory length: 2000   epsilon: 0.7613790000001099
[[-0.47  0.  ]]


array([-0.54,  0.  ])

episode: 63   score: -200.0   memory length: 2000   epsilon: 0.7575790000001117
[[-0.51  0.  ]]


array([-0.57,  0.  ])

episode: 64   score: -200.0   memory length: 2000   epsilon: 0.7537790000001134
[[-0.42  0.  ]]


array([-0.47,  0.  ])

episode: 65   score: -200.0   memory length: 2000   epsilon: 0.7499790000001152
[[-0.53  0.  ]]


array([-0.55,  0.  ])

episode: 66   score: -200.0   memory length: 2000   epsilon: 0.7461790000001169
[[-0.41  0.  ]]


array([-0.56,  0.  ])

episode: 67   score: -200.0   memory length: 2000   epsilon: 0.7423790000001187
[[-0.47  0.  ]]


array([-0.53,  0.  ])

episode: 68   score: -200.0   memory length: 2000   epsilon: 0.7385790000001204
[[-0.58  0.  ]]


array([-0.49,  0.  ])

episode: 69   score: -200.0   memory length: 2000   epsilon: 0.7347790000001222
[[-0.57  0.  ]]


array([-0.53,  0.  ])

episode: 70   score: -200.0   memory length: 2000   epsilon: 0.730979000000124
[[-0.6  0. ]]


array([-0.5,  0. ])

episode: 71   score: -200.0   memory length: 2000   epsilon: 0.7271790000001257
[[-0.46  0.  ]]


array([-0.56,  0.  ])

episode: 72   score: -200.0   memory length: 2000   epsilon: 0.7233790000001274
[[-0.55  0.  ]]


array([-0.5,  0. ])

episode: 73   score: -200.0   memory length: 2000   epsilon: 0.7195790000001292
[[-0.56  0.  ]]


array([-0.46,  0.  ])

episode: 74   score: -200.0   memory length: 2000   epsilon: 0.715779000000131
[[-0.48  0.  ]]


array([-0.55,  0.  ])

episode: 75   score: -200.0   memory length: 2000   epsilon: 0.7119790000001327
[[-0.43  0.  ]]
