In [None]:
from torch.utils.tensorboard import SummaryWriter

from dqn import *
from simulator.simulator import Simulator

In [None]:
# ---------------------------------------------------------------------------
# Run configuration: every tunable value used by the cells below.
# ---------------------------------------------------------------------------

# Model input/output sizes. The training loop decodes the 6 actions as
# 3 steering choices (action % 3) x 2 gears (action // 3).
sensor_count = 8
action_size = 6

# Frame skipping / stacking settings handed to the agent.
skip_frame = 3
stack_frame = 5

# Optimiser and replay-memory settings.
learning_rate = 1e-4
memory_maxlen = 10 ** 6
batch_size = 512
discount_factor = 0.9

# Exploration: starting epsilon and the amount subtracted after each step.
epsilon_init = 0.3
epsilon_decay = 1e-4

# Episode-level schedule.
update_target_episode = 50  # target network is refreshed every N episodes
max_step = 1000             # per-episode step cap
# max_episode = 1000000000000  (disabled: the training loop runs until interrupted)

# Checkpoint saving.
model_save_episode = 50
model_save_comment = "break_-1_reverse_-1"

# Checkpoint loading; leave the comment as None to start from scratch.
model_load_episode = None
model_load_comment = None

In [None]:
# Build the model and the DQN agent that wraps it.
model = LinearModel(sensor_count, stack_frame, action_size)
agent = DQNAgent(
    model,
    learning_rate,
    epsilon_init,
    skip_frame,
    stack_frame,
    memory_maxlen,
)

# Restore a checkpoint only when a load comment has been configured
# (None or an empty string means: start from a fresh model).
if model_load_comment:
    agent.model_load(
        model_load_episode,
        model_load_comment,
        eval=False,
    )

# Simulator instance used as the training environment.
env = Simulator(
    map="rally_map.png",
    fps=5,
)

In [None]:
writer = SummaryWriter()

# Training begins once the replay memory holds `train_start_size` samples;
# from then on every environment step is followed by
# `train_updates_per_step` calls to agent.train_model.
train_start_size = batch_size * 5
train_updates_per_step = 10

episode = 0

# NOTE(review): `np` is not imported in this notebook's import cell; it is
# presumably provided by `from dqn import *` -- confirm.
try:
    while True:  # episode < max_episode (cap disabled: runs until interrupted)
        step = 0
        gear_drive_cnt = 0
        gear_reverse_cnt = 0

        # Per-episode metrics.
        losses = []
        max_qs = []
        rewards = []

        obs, _ = env.reset()
        gear = env.BREAK

        agent.reset(obs)

        state = agent.skip_stack_frame(obs)

        while not env.is_done and step < max_step:
            # env.render()

            action = agent.get_action(state)

            # Adjust steering angle: action % 3 picks -max / 0 / +max.
            steering_choice = action % 3
            if steering_choice == 0:
                steering_deg = -env.max_steering_deg
            elif steering_choice == 1:
                steering_deg = 0
            else:
                steering_deg = env.max_steering_deg

            # Adjust gear: action // 3 picks DRIVE (base reward +1) or
            # REVERSE (base reward -1).
            gear_choice = action // 3
            if gear_choice == 0:
                target_gear, gear_reward = env.DRIVE, 1
                gear_drive_cnt += 1
            elif gear_choice == 1:
                target_gear, gear_reward = env.REVERSE, -1
                gear_reverse_cnt += 1
            else:
                # Without this guard `reward` would be undefined (or stale from
                # the previous step) for an out-of-range action.
                raise ValueError(
                    "unexpected action {!r}: expected 0 <= action < 6".format(action)
                )

            if gear != target_gear:
                # Changing gear resets the car and costs an extra -1.
                env.car.reset()
                reward = gear_reward - 1
            else:
                reward = gear_reward
            gear = target_gear

            next_obs, done = env.step(gear, steering_deg)
            next_state = agent.skip_stack_frame(next_obs)

            # Extra penalty when the step ends the episode.
            if done:
                reward += -1

            done = 1 if done else 0

            agent.append_sample(state, action, reward, next_state, done)

            if len(agent.experience_memory) >= train_start_size:
                for _ in range(train_updates_per_step):
                    loss, max_q = agent.train_model(discount_factor, batch_size)
                    losses.append(loss)
                    max_qs.append(max_q)

            # Linear epsilon decay, clamped so it never drops below epsilon_min.
            if agent.epsilon > agent.epsilon_min:
                agent.epsilon = max(agent.epsilon_min, agent.epsilon - epsilon_decay)

            rewards.append(reward)
            state = next_state
            step += 1

        # No training happens until the replay memory is warm, so `losses` can
        # be empty; avoid np.mean([]) (NaN + RuntimeWarning) in that case.
        has_training_metrics = len(losses) > 0
        mean_loss = np.mean(losses) if has_training_metrics else float("nan")
        # Built-in sum keeps the total an int even for a zero-step episode,
        # which the "d" format code below requires.
        total_reward = sum(rewards)

        print("episode: {: >10d} | loss: {: >10.2f} | rewards: {: >7d} | epsilon: {: >7.2f} | drive/reverse: ({: >7d} / {: >7d})".format(
            episode, mean_loss, total_reward, agent.epsilon, gear_drive_cnt, gear_reverse_cnt))

        # Update the target network.
        if episode % update_target_episode == 0:
            agent.update_target()

        # Save the model, tagged with the current episode number (the save
        # interval constant would give every checkpoint the same tag).
        if episode % model_save_episode == 0:
            agent.model_save(episode, model_save_comment)

        # Incremented before logging, so TensorBoard steps are 1-based while
        # the console line above is 0-based.
        episode += 1

        # Only log training metrics for episodes that actually trained.
        if has_training_metrics:
            writer.add_scalar("DQN/loss", mean_loss, episode)
            writer.add_scalar("DQN/max_q", np.mean(max_qs), episode)
        writer.add_scalar("DQN/reward", total_reward, episode)
        writer.add_scalar("DQN/epsilon", agent.epsilon, episode)
        writer.flush()

finally:
    # Always release the event file, including on KeyboardInterrupt.
    writer.close()