In [None]:
import sys, os

# Add the parent of the current working directory to the import path
# (presumably so the project packages imported in the next cell resolve
# when the notebook is run from its own sub-directory -- confirm).
# Guarded so that re-running this cell does not append duplicate entries.
_project_root = os.path.dirname(os.getcwd())
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [None]:
import numpy as np

from torch.utils.tensorboard import SummaryWriter

from ver2.dqn import DQNAgent
from ver2.model import LinearModel
from ver2.replay_buffer import ReplayBuffer
from simulator.simulator import Simulator

In [None]:
# ---------------------------------------------------------------------------
# Configuration: every tunable value used by the cells below.
# ---------------------------------------------------------------------------

# Network dimensions (both are passed to LinearModel).
sensor_count = 8
action_size = 6  # decoded in the training loop as 3 steering choices x 2 gears

# Frame handling.
skip_frame = 5    # number of simulator steps each chosen action is repeated for
stack_frame = 10  # passed to LinearModel and ReplayBuffer

# Optimisation.
# NOTE: this cell used to assign discount_factor twice (0.99, then 0.9). The
# second assignment silently overrode the first, so 0.9 is the value that was
# actually in effect and is the one kept here.
discount_factor = 0.9
learning_rate = 1e-4

# Exploration.
epsilon = 1.0  # initial epsilon handed to DQNAgent
decay = 1e-4   # argument passed to agent.epsilon_decay() once per episode

# Replay memory.
memory_start = 1000        # buffer size at which training (and epsilon decay) starts
memory_maxlen = int(1e+4)  # passed to ReplayBuffer

batch_size = 32  # passed to agent.train_model()

# The target network is updated every `update_target_episode` episodes.
update_target_episode = 1000

# Upper bound on agent decisions per episode.
max_step = 3000

# Checkpointing: a model is saved every `model_save_episode` episodes.
model_save_episode = 50
model_save_comment = None

# Checkpoint loading: only attempted when `model_load_comment` is truthy.
model_load_episode = None
model_load_comment = None

In [None]:
# Two networks built with identical constructor arguments; both are handed to
# the agent below (the second one as its target network).
model = LinearModel(sensor_count, stack_frame, action_size)
target_model = LinearModel(sensor_count, stack_frame, action_size)

# The agent receives both networks plus the discount factor, learning rate and
# initial epsilon from the configuration cell.
agent = DQNAgent(model, target_model, discount_factor, learning_rate, epsilon)

# Checkpoint loading is keyed on `model_load_comment` only: while it is
# None/empty nothing is loaded, even if `model_load_episode` is set.
if model_load_comment:
    agent.load_model(model_load_episode, model_load_comment, eval=False)

# Simulator and replay buffer used by the training loop in the next cell.
env = Simulator(map="rally_map.png", fps=5, reward_radius=100)
replay_buffer = ReplayBuffer(stack_frame, memory_maxlen)

In [None]:
# TensorBoard writer; closed in the `finally` clause at the bottom of the cell.
writer = SummaryWriter()

episode = 1

# Becomes True (and stays True) once the replay buffer has reached
# `memory_start` entries; epsilon is only decayed from then on.
is_decay = False

try:
    # No termination condition: the loop runs until the kernel is interrupted
    # or an exception is raised. Either way the writer is closed below.
    while True:
        step = 0
        gear_drive_cnt = 0
        gear_reverse_cnt = 0

        # Per-episode statistics.
        losses = []
        max_qs = []
        rewards = []

        obs, reward, _ = env.reset()
        gear = env.BREAK

        # NOTE(review): the replay buffer is reset at the start of every
        # episode. Presumably this only clears its frame stack and not the
        # stored transitions -- confirm against ReplayBuffer.reset().
        replay_buffer.reset()
        state = replay_buffer.get_state(obs)

        while not env.is_done and step < max_step:
            # env.render()

            action = agent.get_action(state)

            # Steering: action % 3 selects -max / 0 / +max steering angle.
            if action % 3 == 0:
                steering_deg = -env.max_steering_deg
            elif action % 3 == 1:
                steering_deg = 0
            elif action % 3 == 2:
                steering_deg = env.max_steering_deg

            # Gear: action // 3 selects drive (0) or reverse (1). The car is
            # reset whenever the selected gear differs from the current one.
            if action // 3 == 0:
                if gear != env.DRIVE:
                    env.car.reset()
                gear = env.DRIVE
                gear_drive_cnt += 1
            elif action // 3 == 1:
                if gear != env.REVERSE:
                    env.car.reset()
                gear = env.REVERSE
                gear_reverse_cnt += 1

            reward = 0
            done = False

            # Frame skipping: repeat the chosen control for up to `skip_frame`
            # simulator steps, summing the rewards and stopping early as soon
            # as the simulator reports the episode as done.
            for _ in range(skip_frame):
                _obs, _reward, _done = env.step(gear, steering_deg)
                reward += _reward
                if _done:
                    done = True
                    break
            # The next state is built from the last observation seen, whether
            # or not the skip loop ended early.
            next_state = replay_buffer.get_state(_obs)

            # Stored inverted: 0.0 when the episode ended, 1.0 otherwise
            # (presumably consumed as a continuation mask by train_model --
            # confirm).
            done_mask = 0.0 if done else 1.0

            replay_buffer.put((state, action, reward, next_state, done_mask))

            # Training starts once the buffer holds at least `memory_start`
            # entries; 10 updates are then run per agent decision.
            if replay_buffer.size() >= memory_start:
                is_decay = True
                for _ in range(10):
                    loss, max_q = agent.train_model(replay_buffer, batch_size)
                    losses.append(loss)
                    max_qs.append(max_q)

            rewards.append(reward)
            state = next_state
            step += 1

        # Episodes that finish before training has started have no losses.
        # np.mean([]) would return nan *and* emit a RuntimeWarning, so the
        # empty case is handled explicitly.
        mean_loss = np.mean(losses) if losses else float("nan")
        mean_max_q = np.mean(max_qs) if max_qs else float("nan")
        total_reward = np.sum(rewards)

        print("episode: {: >7d} | loss: {: >7.2f} | rewards: {: >7} | epsilon: {: >7.2f}% | drive/reverse: ({: >7d} / {: >7d})".format(
            episode, mean_loss, total_reward, agent._epsilon * 100, gear_drive_cnt, gear_reverse_cnt))

        # Update the target network.
        if episode % update_target_episode == 0:
            agent.update_target()

        # Save the model.
        if episode % model_save_episode == 0:
            agent.save_model(episode, model_save_comment)

        # Epsilon decay (only after training has started).
        if is_decay:
            agent.epsilon_decay(decay)

        # Log under the index of the episode that just finished, so that
        # TensorBoard steps line up with the console output and checkpoint
        # numbers. Loss / max-Q are skipped when no training step ran, rather
        # than writing NaN points.
        if losses:
            writer.add_scalar("DQN/loss", mean_loss, episode)
            writer.add_scalar("DQN/max_q", mean_max_q, episode)
        writer.add_scalar("DQN/reward", total_reward, episode)
        writer.add_scalar("DQN/epsilon", agent._epsilon, episode)
        writer.flush()

        episode += 1

finally:
    writer.close()