In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style(style="dark")

Using TensorFlow backend.


### Build Agent

In [2]:
def build_Deep_Q_Agent():
    model = Sequential()
    model.add(Dense(32, input_dim=state_size, activation='relu'))
    model.add(Dense(32, input_dim=state_size, activation='relu'))
    model.add(Dense(action_size, activation='linear')) # LEFT OR RIGHT
    model.compile(loss='mse', optimizer=Adam(lr=learning_rate), metrics=['acc'])
    return model

### Perform action from given state

In [3]:
def perform_action(model, state, epsilon):
    action = -1
    
    if np.random.rand() <= epsilon:
        action = random.randrange(3)
    else:
        action = agent.predict(state.reshape(-1, 2))
        action = np.argmax(action)
        
    return action

### Remember transition

In [4]:
def remember(state, action, reward, next_state, done):
    memory.append((state, action, reward, next_state, done))

### Offline - Pre-train agent

In [5]:
def replay(batch_size, epsilon, done):
    
    # Get a minibatch of previous training memories
    minibatch = random.sample(memory, batch_size)
    
    #print("Replay: Train model on sample of previous memories")

    # For each memory
    for state, action, reward, next_state, done in minibatch:
        
        if not done:
            # Predict future disconted reward if not a terminating step
            target = (reward + gamma * np.amax(agent.predict(next_state)))
        else:
            # Terminating step, no prediction needed
            target = reward
        
        # Map an approximation between current state to the future discounted reward
        target_f = agent.predict(state)
        target_f[0][action] = target
        
        # Train model to find map function from state to target
        # This will maximize the future reward based on current state
        agent.fit(state, target_f, epochs=1, verbose=0)
    
    # Decrease exploration successively
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay
    
    return epsilon

### Load and Save

In [6]:
def load(model, name):
    return model.load_weights(name)

def save(model, name):
    model.save_weights(name)

### Params

In [None]:
env = gym.make('MountainCar-v0')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n # left, right
memory = deque(maxlen=1000)
gamma = 0.95    # discount rate
epsilon = 1  # exploration rate
epsilon_min = 0.01 # min exploration rate
epsilon_decay = 0.995 # decrease exploration successively
learning_rate = 0.005
episodes = 1000
batch_size = 10
max_score = -1.2
max_score_ep = 0

### Offline mode

#### Training

In [135]:
gaming_score_threshold = -195
nbr_of_games = 1000
nbr_of_steps_per_game = 200
        
env.reset()

def train_agent():
    
    game_memory = []
    scores = []
    
    for game in range(nbr_of_games):
        
        env.reset()
        game_score = 0
        memory_current_game = []
        state = []
        next_state = [] 
        
        for step in range(nbr_of_steps_per_game):
            
            action = random.randrange(0, 3)
            next_state, reward, done, info = env.step(action)
            
            if len(state) > 0:
                memory_current_game.append((state, action))

            if next_state[0] > -0.25:
                reward = 0.5
                
            next_state = np.reshape(next_state, [1, state_size])    
            state = next_state
            game_score += reward
            
            if done: 
                #print(game_score)
                break
 
        if game_score >= gaming_score_threshold:
        
            scores.append(game_score)
            for action in memory_current_game:
                if action[1] == 0:
                    target = [1, 0, 0]
                elif action[1] == 1:
                    target = [0, 1, 0]
                elif action[1] == 2:
                    target = [0, 0, 1]
                game_memory.append((action[0], target))
                #print("Added")
        
    return scores, game_memory

In [136]:
scores, game_memory = train_agent()

In [137]:
X = np.array([m[0] for m in game_memory]).reshape(-1, 2)
y = np.array([m[1] for m in game_memory]).reshape(-1, 3)
agent = build_Deep_Q_Agent()
agent.fit(X, y, epochs=5)
agent.summary()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_130 (Dense)            (None, 64)                192       
_________________________________________________________________
dense_131 (Dense)            (None, 32)                2080      
_________________________________________________________________
dense_132 (Dense)            (None, 3)                 99        
Total params: 2,371
Trainable params: 2,371
Non-trainable params: 0
_________________________________________________________________


#### Evaluation

In [None]:
done = False
results = []
epsilon = 0.25

for episode in range(episodes):

    state = env.reset()
    state = np.reshape(state, [1, state_size])
    max_score = 0
    score = 0

    for trial in range(200):
        
        env.render()
        
        action = perform_action(agent, state, epsilon)
        next_state, reward, done, info = env.step(action)
        state = next_state
        
        if done and trial < 199: 
            print("Victory!")
            break
        if done:
            print("Loss!")
            break
    
    env.reset()
    results.append(score)

    # p: [[-0.4041538  0.       ]]

### Online Mode

In [7]:
env = gym.make('MountainCar-v0')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n # left, right
memory = deque(maxlen=20000)
gamma = 0.95    # discount rate
epsilon = 1  # exploration rate
epsilon_min = 0.01 # min exploration rate
epsilon_decay = 0.995 # decrease exploration successively
learning_rate = 0.005
episodes = 1000
batch_size = 10
max_score = -1.2
max_score_ep = 0

In [None]:
agent = build_Deep_Q_Agent()
#agent = load(agent, 'weights/cartpole-dqn.h5')
#epsilon = 0.01
done = False
results = []

for episode in range(episodes):

    state = env.reset()
    state = np.reshape(state, [1, state_size])
    max_score = 0
    score = 0
    ep_score = 0
    ep_mem = []

    for trial in range(200):
        
        env.render()
        
        action = perform_action(agent, state, epsilon)
        next_state, reward, done, info = env.step(action)

        state_pos = next_state[0]   
        
        if state_pos > -0.25: # -1 far left, 0 far right
            reward = 0.5
            
        ep_score += state_pos
            
        next_state = np.reshape(next_state, [1, state_size])
        ep_mem.append((state, action, reward, next_state, done))
        state = next_state
        
        if done and trial < 199: 
            print("Victory!")
        
        if done:
            
            print("\nEpisode: {}/{}, Score: {}, Epsilon: {:.3}, Top score: {}".format(episode, episodes, score, float(epsilon), max_score))
            print("Trial: ", trial)
            
            if ep_score > -103 or ep_score < -108: # IF NO RENDERING, IF RENDER [103, 108]
                [remember(state, action, reward, next_state, done) for state, action, reward, next_state, done in ep_mem]

            print(ep_score)
            
            break
            
        # ---------------------------------------------------------
        # Sample random minibatches from memory and train model to 
        # maximize future reward from current state
        # ---------------------------------------------------------
    
        if len(memory) >= 1000:
            epsilon = replay(batch_size, epsilon, done)
            
    print("Size: ", len(memory))

    #if episode % 50 == 0:
    #    print("Save weights!")
    #    save(agent, "weights/mountain-car-dqn-" + str(episode) + ".h5")

Next episode!

Episode: 0/1000, Score: 0, Epsilon: 1.0, Top score: 0
Trial:  199
-104.40631064368071
Size:  0
Next episode!

Episode: 1/1000, Score: 0, Epsilon: 1.0, Top score: 0
Trial:  199
-104.27534386631173
Size:  0
Next episode!

Episode: 2/1000, Score: 0, Epsilon: 1.0, Top score: 0
Trial:  199
-105.52279607015083
Size:  0
Next episode!

Episode: 3/1000, Score: 0, Epsilon: 1.0, Top score: 0
Trial:  199
-105.77352532701339
Size:  0
Next episode!

Episode: 4/1000, Score: 0, Epsilon: 1.0, Top score: 0
Trial:  199
-106.79388662807949
Size:  0
Next episode!

Episode: 5/1000, Score: 0, Epsilon: 1.0, Top score: 0
Trial:  199
-105.53419483139369
Size:  0
Next episode!

Episode: 6/1000, Score: 0, Epsilon: 1.0, Top score: 0
Trial:  199
-98.06599309851799
Size:  200
Next episode!

Episode: 7/1000, Score: 0, Epsilon: 1.0, Top score: 0
Trial:  199
-104.03778637819957
Size:  200
Next episode!

Episode: 8/1000, Score: 0, Epsilon: 1.0, Top score: 0
Trial:  199
-104.95045782921733
Size:  200
Next 