In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from gekko import GEKKO
import math

# Construct MPC
m = GEKKO(remote=True)
m.time = [0,0.02,0.04,0.08,0.16,0.32,0.4] #np.arange(0,0.401,0.05)

# Parameters
masspole = m.Param(0.1)
polemass_length = m.Param(0.05)
total_mass = m.Param(1.1)
gravity = m.Param(9.8)
length = m.Param(0.5)

# Manipulated variable 
u = m.MV(1, lb=0, ub=1, name='u', integer=True)
u.STATUS = 1

# Controlled variable 
ni = ['x','xdot','theta','thetadot']
y = [m.CV(name=ni[i]) for i in range(4)]
x = y[0]
xdot = y[1]
theta = y[2]
thetadot = y[3]
x.STATUS = 1; x.FSTATUS = 1;
xdot.STATUS = 1; xdot.FSTATUS = 1;
theta.STATUS = 1; theta.FSTATUS = 1;
thetadot.STATUS = 1; thetadot.FSTATUS = 1;
m.Minimize(x**2 + theta**2)

# Process model
force = m.Intermediate(20*u-10)
costheta = m.Intermediate(m.cos(theta))
sintheta = m.Intermediate(m.sin(theta))
temp = m.Intermediate((force + polemass_length * thetadot * thetadot * sintheta)/total_mass)
thetaacc = m.Intermediate((gravity * sintheta - costheta * temp)/length * (4.0/3.0 - masspole * costheta * costheta / total_mass))
xacc = m.Intermediate(temp - polemass_length * thetaacc * costheta / total_mass)

m.Equation(x.dt() == xdot)
m.Equation(xdot.dt() == xacc)
m.Equation(theta.dt() == thetadot)
m.Equation(thetadot.dt() == thetaacc)

m.options.IMODE = 6
m.options.solver = 1


# Construct RL agent
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
#         if np.random.rand() <= self.epsilon:
#             return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

EPISODES = 50 # evaluation episodes
if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    agent.load("cartpole-dqn.h5")
    done = False
    batch_size = 32
    score_RL = []; score_MPC = []

    for e in range(EPISODES):
        state_init = env.reset()
        state_init = np.reshape(state_init, [1, state_size])
        
        state = state_init
        # DQNagent
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
#             agent.memorize(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode_RL: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break                    
        score_RL.append(time)
        
        state = state_init
        # MPC
        for time in range(500):
            # env.render()
            x.MEAS = state[0,0]; xdot.MEAS = state[0,1]; theta.MEAS = state[0,2]; thetadot.MEAS = state[0,3]
            print(m.path)
            m.solve(disp=True)
            action = u.NEWVAL
            print(action)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
#             agent.memorize(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode_MPC: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break                    
        score_MPC.append(time)

#             if len(agent.memory) > batch_size:
#                 agent.replay(batch_size)
        # if e % 10 == 0:
        #     agent.save("./save/cartpole-dqn.h5")

Using TensorFlow backend.
W0115 11:25:20.998301  8780 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



episode_RL: 0/50, score: 85, e: 1.0
C:\Users\rwer\AppData\Local\Temp\tmp1jiz0zxwgk_model0
apm 147.46.91.150_gk_model0 <br><pre> ----------------------------------------------------------------
 APMonitor, Version 0.9.2
 APMonitor Optimization Suite
 ----------------------------------------------------------------
 
 
 --------- APM Model Size ------------
 Each time step contains
   Objects      :            0
   Constants    :            0
   Variables    :           10
   Intermediates:            6
   Connections  :            0
   Equations    :           11
   Residuals    :            5
 
 Number of state variables:            342
 Number of total equations: -          336
 Number of slack variables: -            0
 ---------------------------------------
 Degrees of freedom       :              6
 
 ----------------------------------------------
 Dynamic Control with APOPT Solver
 ----------------------------------------------
Iter:     1 I:  0 Tm:      0.05 NLPi:   14 Dpth:    

--Integer Solution:   1.64E+02 Lowest Leaf:   1.41E+02 Gap:   1.47E-01
Iter:    69 I:  0 Tm:      0.02 NLPi:    6 Dpth:    6 Lvs:   28 Obj:  4.10E+02 Gap:  1.47E-01
--Integer Solution:   1.64E+02 Lowest Leaf:   1.57E+02 Gap:   4.13E-02
Iter:    70 I:  0 Tm:      0.02 NLPi:    6 Dpth:    6 Lvs:   27 Obj:  3.33E+02 Gap:  4.13E-02
--Integer Solution:   1.64E+02 Lowest Leaf:   1.57E+02 Gap:   4.13E-02
Iter:    71 I:  0 Tm:      0.02 NLPi:    5 Dpth:    6 Lvs:   26 Obj:  2.20E+02 Gap:  4.13E-02
--Integer Solution:   1.64E+02 Lowest Leaf:   1.57E+02 Gap:   4.13E-02
Iter:    72 I:  0 Tm:      0.02 NLPi:    6 Dpth:    6 Lvs:   25 Obj:  4.65E+02 Gap:  4.13E-02
--Integer Solution:   1.64E+02 Lowest Leaf:   1.57E+02 Gap:   4.13E-02
Iter:    73 I:  0 Tm:      0.02 NLPi:    6 Dpth:    6 Lvs:   24 Obj:  4.65E+02 Gap:  4.13E-02
--Integer Solution:   1.64E+02 Lowest Leaf:   1.59E+02 Gap:   2.56E-02
Iter:    74 I:  0 Tm:      0.02 NLPi:    5 Dpth:    6 Lvs:   23 Obj:  2.20E+02 Gap:  2.56E-02
Iter:    7

AssertionError: 1.0 (<class 'float'>) invalid

In [None]:
print(np.mean(score_RL))

In [None]:
print(action_size)

In [None]:
state[0,1]