# Doom with DRQN 

In [6]:
import tensorflow as tf
import numpy as np
import math
from vizdoom import *
import timeit
import math
import os
import sys

In [7]:
def get_input_shape(Image,Filter,Stride):
    """Return the size of one spatial dimension after three reduce-then-halve stages.

    Each stage first applies ``ceil((size - Filter + 1) / Stride)`` and then
    ``ceil(size / Stride)``. DRQN uses the result to size the flattened
    output of its convolution/pooling stack.

    Args:
        Image: size of the input dimension (height or width).
        Filter: filter size used in the first step of every stage.
        Stride: stride used by both steps of every stage.

    Returns:
        The resulting size as an ``int``.
    """
    size = Image

    for _ in range(3):
        # First step of the stage: shrink by the filter extent, then stride.
        size = math.ceil((size - Filter + 1) / Stride)
        # Second step of the stage: stride only.
        size = math.ceil(size / Stride)

    return int(size)

In [8]:
# Size left from a 256-pixel dimension with filter size 5 and stride 2.
get_input_shape(256, 5, 2)

3

In [15]:
class DRQN():
    """Deep Recurrent Q-Network built as a TensorFlow 1.x graph.

    The graph is: three convolution + ReLU + max-pool stages, dropout, a
    single recurrent step, dropout, and a fully connected layer producing
    one value per action. Constructing an instance adds all of these
    variables and ops to the current default graph.
    """

    def __init__(self, input_shape, num_actions, inital_learning_rate):
        """Initialise hyperparameters, variables, and the network graph.

        Args:
            input_shape: (height, width, channels) of a single input frame.
            num_actions: number of values the network outputs.
            inital_learning_rate: starting learning rate for the
                exponential-decay schedule.
        """

        self.tfcast_type = tf.float32

        # Input image shape (height, width, channels)
        self.input_shape = input_shape

        # Number of actions
        self.num_actions = num_actions

        # Learning rate (replaced further down by the decay-schedule tensor)
        self.learning_rate = inital_learning_rate

        # ---- CNN hyperparameters ----

        # Filter size
        self.filter_size = 5

        # Number of filters per convolutional layer
        self.num_filters = [16, 32, 64]

        # Stride
        self.stride = 2

        # Pool size
        self.poolsize = 2

        # Flattened size of the last pooling output:
        # reduced height * reduced width * filters in the last layer
        self.convolution_shape = (
            get_input_shape(input_shape[0], self.filter_size, self.stride)
            * get_input_shape(input_shape[1], self.filter_size, self.stride)
            * self.num_filters[2]
        )

        # ---- RNN hyperparameters ----

        # Number of neurons in the recurrent cell
        self.cell_size = 100

        # Number of hidden layers (not referenced elsewhere in this class)
        self.hidden_layer = 50

        # Values passed as the second argument of tf.nn.dropout below.
        # NOTE(review): in TF 1.x that positional argument is keep_prob, so
        # these act as keep probabilities despite the name -- confirm intent.
        self.dropout_probability = [0.3, 0.2]

        # Optimisation hyperparameters for the learning-rate schedule
        self.loss_decay_rate = 0.96
        self.loss_decay_steps = 180

        # ---- CNN variables ----

        # Input image (height, width, channels)
        self.input = tf.placeholder(shape = (self.input_shape[0], self.input_shape[1], self.input_shape[2]), dtype = self.tfcast_type)

        # Target vector (number of actions, 1)
        self.target_vector = tf.placeholder(shape = (self.num_actions, 1), dtype = self.tfcast_type)

        # Filter weights for each convolutional layer
        self.features1 = tf.Variable(initial_value = np.random.rand(self.filter_size, self.filter_size, input_shape[2], self.num_filters[0]),
                                     dtype = self.tfcast_type) # (5, 5, channels, 16)

        self.features2 = tf.Variable(initial_value = np.random.rand(self.filter_size, self.filter_size, self.num_filters[0], self.num_filters[1]),
                                     dtype = self.tfcast_type) # (5, 5, 16, 32)

        self.features3 = tf.Variable(initial_value = np.random.rand(self.filter_size, self.filter_size, self.num_filters[1], self.num_filters[2]),
                                     dtype = self.tfcast_type) # (5, 5, 32, 64)

        # ---- RNN variables ----

        # Initial hidden state. The attribute is rebound to the tanh output
        # in the RNN section below, so after construction self.h refers to
        # that tensor rather than to this variable.
        self.h = tf.Variable(initial_value = np.zeros((1, self.cell_size)), dtype = self.tfcast_type)

        # (input -> hidden) weights
        self.rW = tf.Variable(initial_value = np.random.uniform(
                                            low = -np.sqrt(6. / (self.convolution_shape + self.cell_size)),
                                            high = np.sqrt(6. / (self.convolution_shape + self.cell_size)),
                                            size = (self.convolution_shape, self.cell_size)),
                              dtype = self.tfcast_type)

        # (hidden -> hidden) weights
        self.rU = tf.Variable(initial_value = np.random.uniform(
                                            low = -np.sqrt(6. / (2 * self.cell_size)),
                                            high = np.sqrt(6. / (2 * self.cell_size)),
                                            size = (self.cell_size, self.cell_size)),
                              dtype = self.tfcast_type)

        # (hidden -> output) weights
        self.rV = tf.Variable(initial_value = np.random.uniform(
                                            low = -np.sqrt(6. / (2 * self.cell_size)),
                                            high = np.sqrt(6. / (2 * self.cell_size)),
                                            size = (self.cell_size, self.cell_size)),
                              dtype = self.tfcast_type)

        # Biases
        self.rb = tf.Variable(initial_value = np.zeros(self.cell_size), dtype = self.tfcast_type)
        self.rc = tf.Variable(initial_value = np.zeros(self.cell_size), dtype = self.tfcast_type)

        # ---- Fully connected variables ----

        # (RNN output -> FC) weights
        self.fW = tf.Variable(initial_value = np.random.uniform(
                                            low = -np.sqrt(6. / (self.cell_size + self.num_actions)),
                                            high = np.sqrt(6. / (self.cell_size + self.num_actions)),
                                            size = (self.cell_size, self.num_actions)),
                              dtype = self.tfcast_type)

        # Bias
        self.fb = tf.Variable(initial_value = np.zeros(self.num_actions), dtype = self.tfcast_type)

        # Learning-rate schedule.
        # The fourth argument of exponential_decay is decay_rate; it must be
        # loss_decay_rate (0.96), not loss_decay_steps.
        # NOTE(review): step_count is not incremented anywhere in this class
        # (apply_gradients below is called without global_step), so the
        # schedule only moves if a caller updates step_count.
        self.step_count = tf.Variable(initial_value = 0, dtype = self.tfcast_type)
        self.learning_rate = tf.train.exponential_decay(self.learning_rate,
                                                   self.step_count,
                                                   self.loss_decay_steps,
                                                   self.loss_decay_rate,
                                                   staircase = False)

        # ---- Network: CNN ----

        # First convolutional layer (input reshaped to a batch of one)
        self.conv1 = tf.nn.conv2d(input = tf.reshape(self.input,
                                                     shape = (1, self.input_shape[0], self.input_shape[1], self.input_shape[2])),
                                  filter = self.features1,
                                  strides = [1, self.stride, self.stride, 1],
                                  padding = "VALID")

        self.relu1 = tf.nn.relu(self.conv1)

        self.pool1 = tf.nn.max_pool(self.relu1,
                                    ksize = [1, self.poolsize, self.poolsize, 1],
                                    strides = [1, self.stride, self.stride, 1],
                                    padding = "SAME")

        # Second convolutional layer
        self.conv2 = tf.nn.conv2d(input = self.pool1,
                                  filter = self.features2,
                                  strides = [1, self.stride, self.stride, 1],
                                  padding = "VALID")

        self.relu2 = tf.nn.relu(self.conv2)

        self.pool2 = tf.nn.max_pool(self.relu2,
                                    ksize = [1, self.poolsize, self.poolsize, 1],
                                    strides = [1, self.stride, self.stride, 1],
                                    padding = "SAME")

        # Third convolutional layer
        self.conv3 = tf.nn.conv2d(input = self.pool2,
                                  filter = self.features3,
                                  strides = [1, self.stride, self.stride, 1],
                                  padding = "VALID")

        self.relu3 = tf.nn.relu(self.conv3)

        self.pool3 = tf.nn.max_pool(self.relu3,
                                    ksize = [1, self.poolsize, self.poolsize, 1],
                                    strides = [1, self.stride, self.stride, 1],
                                    padding = "SAME")

        # Dropout
        self.drop1 = tf.nn.dropout(self.pool3, self.dropout_probability[0])

        # Flatten to a single row vector
        self.reshaped_input = tf.reshape(self.drop1, shape = [1, -1])

        # ---- Network: RNN ----

        # The CNN output is the recurrent step's input
        self.h = tf.tanh(tf.matmul(self.reshaped_input, self.rW) + tf.matmul(self.h, self.rU) + self.rb)

        self.o = tf.nn.softmax(tf.matmul(self.h, self.rV) + self.rc)

        # Dropout
        self.drop2 = tf.nn.dropout(self.o, self.dropout_probability[1])

        # ---- Network: FC ----

        # The RNN output is the fully connected layer's input; the result is
        # reshaped to a column so it lines up with target_vector.
        self.output = tf.reshape(tf.matmul(self.drop2, self.fW) + self.fb, shape = [-1, 1])

        self.prediction = tf.argmax(self.output)

        # Loss: mean squared difference between target and output
        self.loss = tf.reduce_mean(tf.square(self.target_vector - self.output))

        # Optimiser
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)

        # Gradient computation and update op
        self.gradients = self.optimizer.compute_gradients(self.loss)
        self.update = self.optimizer.apply_gradients(self.gradients)

        self.parameters = (self.features1, self.features2, self.features3,
                           self.rW, self.rU, self.rV, self.rb, self.rc,
                           self.fW, self.fb)
        

In [16]:
class ExperienceReplay():
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, buffer_size):
        """Create an empty buffer.

        Args:
            buffer_size: maximum number of transitions kept at once.
        """
        # Stored transitions, oldest first
        self.buffer = []

        # Maximum number of transitions to keep
        self.buffer_size = buffer_size

    def appendToBuffer(self, memory_tuplet):
        """Append a transition, evicting the oldest ones beyond capacity.

        Args:
            memory_tuplet: the transition to store.
        """
        self.buffer.append(memory_tuplet)

        # Trim after appending so the buffer never holds more than
        # buffer_size items. Deleting by slice avoids list.remove(), which
        # searches by value.
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self, n):
        """Return ``n`` transitions drawn uniformly at random, with replacement.

        Args:
            n: number of transitions to draw.

        Returns:
            A list of ``n`` stored transitions.

        Raises:
            ValueError: if ``n`` is positive and the buffer is empty.
        """
        if n > 0 and not self.buffer:
            raise ValueError("cannot sample from an empty experience buffer")

        size = len(self.buffer)
        return [self.buffer[np.random.randint(0, size)] for _ in range(n)]

In [17]:
def train(num_episodes, episode_length, learning_rate, scenario = "/home/sohee/openai/ViZDoom/scenarios/deathmatch.cfg", map_path = 'map02', render = False):
    """Train a DRQN agent on a ViZDoom scenario.

    Args:
        num_episodes: number of episodes to play.
        episode_length: maximum number of frames per episode; the game's
            episode timeout is set to six times this value.
        learning_rate: initial learning rate passed to both networks.
        scenario: path to the ViZDoom scenario file.
        map_path: name of the map to load.
        render: whether to show the game window.

    Returns:
        A ``(rewards, losses)`` pair of lists holding one
        ``(episode, total)`` tuple per episode.
    """
    # Discount factor (gamma)
    discount_factor = .99

    # A stored transition is sampled and trained on every `sample` frames;
    # the current transition is stored every `store` frames.
    sample = 5
    store = 50

    # Running totals for the current episode
    total_reward = 0
    total_loss = 0
    old_q_value = 0

    # Per-episode reward and loss history
    rewards = []
    losses = []

    # ---- Environment ----
    game = DoomGame()

    # Scenario and map (the scenario path is machine-specific)
    game.set_doom_scenario_path(scenario)
    game.set_doom_map(map_path)

    # Screen resolution and pixel format
    game.set_screen_resolution(ScreenResolution.RES_256X160)
    game.set_screen_format(ScreenFormat.RGB24)

    # Rendering options
    game.set_render_hud(False)
    game.set_render_minimal_hud(False)
    game.set_render_crosshair(False)
    game.set_render_weapon(True)
    game.set_render_decals(False)
    game.set_render_particles(False)
    game.set_render_effects_sprites(False)
    game.set_render_messages(False)
    game.set_render_corpses(False)
    game.set_render_screen_flashes(True)

    # Binary (keyboard-like) buttons. Registration order fixes each
    # button's index in the action vectors built below.
    for button in (Button.MOVE_LEFT,
                   Button.MOVE_RIGHT,
                   Button.TURN_LEFT,
                   Button.TURN_RIGHT,
                   Button.MOVE_FORWARD,
                   Button.MOVE_BACKWARD,
                   Button.ATTACK):
        game.add_available_button(button)

    # Delta buttons emulate a mouse axis and take positive and negative
    # values; 90 is passed as the second argument of add_available_button.
    for delta_button in (Button.TURN_LEFT_RIGHT_DELTA, Button.LOOK_UP_DOWN_DELTA):
        game.add_available_button(delta_button, 90)

    # One one-hot action vector per available button
    actions = np.eye(game.get_available_buttons_size(), dtype = int).tolist()

    # Game variables exposed in the state: ammo, health, kill count
    for variable in (GameVariable.AMMO0, GameVariable.HEALTH, GameVariable.KILLCOUNT):
        game.add_available_game_variable(variable)

    # Episode timeout and start time
    game.set_episode_timeout(6 * episode_length)
    game.set_episode_start_time(10)
    game.set_window_visible(render)

    # Sound stays off
    game.set_sound_enabled(False)

    # No per-step living reward
    game.set_living_reward(0)

    # The agent itself plays the game
    game.set_mode(Mode.PLAYER)

    # Start the game instance
    game.init()

    try:
        # Main network and target network. The networks score two fewer
        # actions than there are buttons, so only the first
        # (buttons - 2) action vectors can be selected.
        actionDRQN = DRQN((160, 256, 3), game.get_available_buttons_size() - 2, learning_rate)
        targetDRQN = DRQN((160, 256, 3), game.get_available_buttons_size() - 2, learning_rate)

        # Experience buffer with capacity 1000
        experiences = ExperienceReplay(1000)

        # Saver for the action network's parameters.
        # NOTE(review): nothing in this function calls saver.save().
        saver = tf.train.Saver({v.name: v for v in actionDRQN.parameters}, max_to_keep = 1)

        # ---- Training ----
        with tf.Session() as sess:

            sess.run(tf.global_variables_initializer())

            for episode in range(num_episodes):

                game.new_episode()

                for frame in range(episode_length):

                    # Current screen buffer
                    state = game.get_state()
                    s = state.screen_buffer

                    # Greedy action from the action network
                    a = actionDRQN.prediction.eval(feed_dict = {actionDRQN.input: s})[0]
                    action = actions[a]

                    # Perform the action and collect the reward
                    reward = game.make_action(action)
                    total_reward += reward

                    # Stop once the episode is over
                    if game.is_episode_finished():
                        break

                    # Store the transition every `store` frames
                    if (frame % store) == 0:
                        experiences.appendToBuffer((s, action, reward))

                    # Train on one stored transition every `sample` frames
                    if (frame % sample) == 0:
                        memory = experiences.sample(1)
                        mem_frame = memory[0][0]
                        mem_reward = memory[0][2]

                        # Q1 is evaluated as in the original code but not used below.
                        Q1 = actionDRQN.output.eval(feed_dict = {actionDRQN.input: mem_frame})
                        Q2 = targetDRQN.output.eval(feed_dict = {targetDRQN.input: mem_frame})

                        # Current value of the learning-rate schedule
                        current_lr = actionDRQN.learning_rate.eval()

                        # Move the running Q estimate towards the discounted target
                        Qtarget = old_q_value + current_lr * (mem_reward + discount_factor * Q2 - old_q_value)
                        old_q_value = Qtarget

                        # Loss of the action network against the new target
                        loss = actionDRQN.loss.eval(feed_dict = {actionDRQN.target_vector: Qtarget, actionDRQN.input: mem_frame})
                        total_loss += loss

                        # Apply one update step to both networks
                        actionDRQN.update.run(feed_dict = {actionDRQN.target_vector: Qtarget, actionDRQN.input: mem_frame})
                        targetDRQN.update.run(feed_dict = {targetDRQN.target_vector: Qtarget, targetDRQN.input: mem_frame})

                rewards.append((episode, total_reward))
                losses.append((episode, total_loss))

                print("Episode %d - Reward = %.3f, Loss = %.3f." % (episode, total_reward, total_loss))

                total_reward = 0
                total_loss = 0
    finally:
        # Always shut the game instance down, even if training fails.
        game.close()

    return rewards, losses



In [28]:
# Run configuration read by the setup cell that follows.

# Show the game window while running
render = True

# Initial learning rate for the networks
learning_rate = 0.01

# Frames per episode and number of episodes
episode_length = 300
num_episodes = 1

# Map name and scenario file (the path is machine-specific)
map_path = 'map02'
scenario = "/home/sohee/openai/ViZDoom/scenarios/deathmatch.cfg"

In [29]:
# Build and start a ViZDoom game instance at notebook level, then create the
# two networks against it.
game = DoomGame()

# Scenario and map (the scenario path is machine-specific)
game.set_doom_scenario_path(scenario)
game.set_doom_map(map_path)

# Screen resolution and pixel format
game.set_screen_resolution(ScreenResolution.RES_256X160)
game.set_screen_format(ScreenFormat.RGB24)

# Rendering options
game.set_render_hud(False)
game.set_render_minimal_hud(False)
game.set_render_crosshair(False)
game.set_render_weapon(True)
game.set_render_decals(False)
game.set_render_particles(False)
game.set_render_effects_sprites(False)
game.set_render_messages(False)
game.set_render_corpses(False)
game.set_render_screen_flashes(True)

# Binary (keyboard-like) buttons, registered in this order
for button in (Button.MOVE_LEFT,
               Button.MOVE_RIGHT,
               Button.TURN_LEFT,
               Button.TURN_RIGHT,
               Button.MOVE_FORWARD,
               Button.MOVE_BACKWARD,
               Button.ATTACK):
    game.add_available_button(button)

# Delta buttons emulate a mouse axis and take positive and negative values
for delta_button in (Button.TURN_LEFT_RIGHT_DELTA, Button.LOOK_UP_DOWN_DELTA):
    game.add_available_button(delta_button, 90)

# One one-hot action vector per available button
button_count = game.get_available_buttons_size()
actions = np.eye(button_count, dtype = int).tolist()

# Game variables exposed in the state: ammo, health, kill count
for variable in (GameVariable.AMMO0, GameVariable.HEALTH, GameVariable.KILLCOUNT):
    game.add_available_game_variable(variable)

# Episode timeout and start time
game.set_episode_timeout(6 * episode_length)
game.set_episode_start_time(10)
game.set_window_visible(render)

# Sound stays off
game.set_sound_enabled(False)

# No per-step living reward
game.set_living_reward(0)

# The agent itself plays the game
game.set_mode(Mode.PLAYER)

# Start the game instance
game.init()

# Main network and target network
actionDRQN = DRQN((160, 256, 3), game.get_available_buttons_size() - 2, learning_rate)
targetDRQN = DRQN((160, 256, 3), game.get_available_buttons_size() - 2, learning_rate)

In [31]:
# Action count the network was built with (available buttons minus 2).
actionDRQN.num_actions

7

In [32]:
# Total number of buttons registered on the game instance.
game.get_available_buttons_size()

9

In [20]:
# Run one training episode of at most 300 frames with the game window visible.
train(num_episodes = 1, episode_length = 300, learning_rate = 0.01, render = True)

Episode 0 - Reward = 0.000, Loss = 0.798.


In [22]:
# Needs a module-level `game`; the NameError recorded below shows it did not
# exist when this cell was run.
actionDRQN = DRQN((160, 256, 3), game.get_available_buttons_size() - 2, learning_rate)

NameError: name 'game' is not defined