In [6]:
#Hasan -> RL Agent trainer for LunarLander-v2
import random
import gym
import numpy as np
from collections import deque
import os
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.optimizers import Adam

EPISODES = 3000
export_episode = 50
mode_run = 'test' #test or train or retrain
train_every = 1  #Train every nth cycle
checkpoint = '\weights_bkup\lunar_model_800.h5'
max_time = 1000
print_episode = 10  #Print scores for every nth episode

In [7]:
class rlagent():
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=100000)
        self.gamma = 0.99    # future rewards discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995 #Epsilon decay rate
        self.learning_rate = 0.001
        self.tau = 0.001  # transition of weights: model to target_model( 1: Sharp transition, 0: No transition )
        
        #Declare main and target models
        self.model = self._build_model()
        self.target_model = self._build_model()
#         self.update_target_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(32, input_dim=self.state_size, activation='relu',kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu',kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size,activation='linear', kernel_initializer='he_uniform'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    #Soft update target model with model's weights
    def target_model_update(self):
        ## θ_target = τ*θ_local + (1 - τ)*θ_target
        model_weights = self.model.get_weights()
        target_model_weights = self.target_model.get_weights()
        
        #Loop and smoothen all weights of target model
        for i in range(len(target_model_weights)):
            target_model_weights[i] = model_weights[i] * self.tau + target_model_weights[i] * (1 - self.tau)
        
        #Set weights on target model
        self.target_model.set_weights(target_model_weights)
        
    #Soft update target model with model's weights        
#     def update_target_model(self):
#         self.target_model.set_weights(self.model.get_weights())        
    
    #Remember function
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def decay_epsilon(self):
        #Decay epsilon
        self.epsilon = max(self.epsilon* self.epsilon_decay, self.epsilon_min)
        
    #Decide next move
    def act(self, state, mode='train'):
        #For testing mode, just predict
        if mode == 'test':
            act_values = self.model.predict(state)
            return np.argmax(act_values[0])           
        
        #Exploration
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
        #Exploitation
            act_values = self.model.predict(state)            
            return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)

        arr_states = np.zeros((batch_size, self.state_size))
        arr_next_states = np.zeros((batch_size, self.state_size))
        arr_action, arr_reward, arr_done = [], [], []

        for i, _mb in enumerate(minibatch):
            state, action, reward, next_state, done = _mb
            
            arr_states[i] = state
            arr_next_states[i] = next_state
            arr_action.append(action)
            arr_reward.append(reward)            
            arr_done.append(done)

        #Predict Q values from states and next states
        target = self.model.predict(arr_states)  #( USE - model )
        target_val = self.target_model.predict(arr_next_states)  #( USE - target model )

        #Prepare Q value table
        for i, _mb in enumerate(minibatch):
            # Q Learning: get maximum Q value at s' from target model
            if arr_done[i]:
                target[i][arr_action[i]] = arr_reward[i]
            else:
                target[i][arr_action[i]] = arr_reward[i] + self.gamma * ( np.amax(target_val[i]) )

        #Train model to predict target
        self.model.fit(arr_states, target, batch_size=batch_size, epochs=1, verbose=0)


    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [8]:
env = gym.make('LunarLander-v2')
state_size = env.observation_space.shape
env.observation_space

Box(8,)

In [9]:
11 % 11

0

In [10]:
if __name__ == "__main__":
    env = gym.make('LunarLander-v2')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    print("Action size {}, State_size {}".format(action_size, state_size))
    agent = rlagent(state_size, action_size)
    
    #Load pre-trained model weights
    if checkpoint:
        print("Loaded pretrained model")
        agent.load(os.getcwd() + checkpoint)
        
    #if retrain set epsilon to 0.01
    if mode_run == 'retrain':
        agent.epsilon = 0.01
        
    done = False
    batch_size = 64
    scores = deque(maxlen=print_episode)
    
    #Train agent
    if mode_run == 'train':
        for e in range(EPISODES):
            state = env.reset()
            state = np.reshape(state, [1, state_size])
            
            #Initialize scores
            score = 0
            for time in range(max_time):
    #             env.render()  #Dont render environment in training mode

                #1- Agent model decides next move
                action = agent.act(state)

                #2- Agent takes next move and gets next state, reward, done flag from the environment
                next_state, reward, done, _ = env.step(action)
                next_state = np.reshape(next_state, [1, state_size])

                #3 - Remember this move
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                
                #Calculate score ( only for reporting )
                score += reward                
                  
                #4- Train model on memorized moves
                if len(agent.memory) > batch_size and time % train_every == 0:
                    agent.replay(batch_size)
                    #Update target model's weight
                    agent.target_model_update()

                #5- Export model
                if e % export_episode == 0:
                    agent.save(os.getcwd() + "\lunar_model_{}.h5".format(e))                                
                    
                #6- If done, move to next episode
                if done:
                    break
            
            #Decay epsilon after episode ends
            agent.decay_epsilon()
        
            #Print scores for every nth episode
            scores.append(score)
            if e % print_episode == 0:
                print("episode: {}/{}, average_score({} episodes): {}, eps: {:.2}"
                              .format(e, EPISODES, print_episode, np.mean(scores), agent.epsilon))
    else:
        #Test agent
        while True:
            state = env.reset()
            state = np.reshape(state, [1, state_size])
            while True:
                env.render()

                #1- Agent model decides next move
                action = agent.act(state, mode='test')

                #2- Agent takes next move and gets next state, reward, done flag from the environment
                next_state, reward, done, _ = env.step(action)
                next_state = np.reshape(next_state, [1, state_size])
                
                #3- Set next state
                state = next_state
                
                #4- Reset
                if done:
                    break

Action size 4, State_size 8
Loaded pretrained model


KeyboardInterrupt: 