In [3]:
from torch.utils.tensorboard import SummaryWriter

from dqn import *
from simulator.simulator import Simulator

In [4]:
skip_frame = 1
stack_frame = 5

epsilon_init = 0.3
epsilon_decay = 0.0001

learning_rate = 0.0001
discount_factor = 0.99
batch_size = 512

max_step = 5000
max_episode = 10000

save_episode = 10
update_target_episode = 20

comment = "break_-1_reverse_-1"

In [5]:
model = NN(8, stack_frame, 9)
target_model = NN(8, stack_frame, 9)
agent = DQNAgent(model, target_model, learning_rate, epsilon_init, skip_frame, stack_frame)

# agent.model_load(1750, eval=False)
# agent.epsilon = 0.278

env = Simulator()
writer = SummaryWriter(comment)

In [13]:
episode = 0

try:
    while episode < max_episode:
        step = 0

        losses = []
        max_qs = []
        rewards = []
        
        obs, _ = env.reset()
        agent.reset(obs)

        state = agent.skip_stack_frame(obs)

        while not env.is_done or step < max_step:
            env.render()

            action = agent.get_action(state)

            # 조향각 조정
            if action % 3 == 0:
                steering_deg = -env.max_steering_deg
            elif action % 3 == 1:
                steering_deg = 0
            elif action % 3 == 2:
                steering_deg = env.max_steering_deg
            
            # 기어 조정
            if action // 3 == 0:
                gear = env.DRIVE
                reward = 0
                print("DRIVE")
            elif action // 3 == 1:
                gear = env.BREAK
                reward = -1
            elif action // 3 == 2:
                gear = env.REVERSE
                print("REVERSE")
                reward = -2

            next_obs, done = env.step(gear, steering_deg)
            next_state = agent.skip_stack_frame(next_obs)
            done = 1 if done else 0

            agent.append_sample(state, action, reward, next_state, done)

            if len(agent.experience_memory) >= 1024:
                for _ in range(10):
                    loss, max_q = agent.train_model(discount_factor, batch_size)
                    losses.append(loss)
                    max_qs.append(max_q)

            if agent.epsilon > agent.epsilon_min:
                agent.epsilon -= epsilon_decay

            rewards.append(reward)
            state = next_state
            step += 1

        print("episode: {:d} | loss: {:.4f} | reward: {} | epsilon: {} | memory size: {:d}".format(
            episode, np.mean(losses), np.sum(rewards), agent.epsilon, len(agent.experience_memory)))

        writer.add_scalar("DQN/loss", np.mean(losses), episode)
        writer.add_scalar("DQN/max_q", np.mean(max_qs), episode)
        writer.add_scalar("DQN/reward", np.sum(rewards), episode)
        writer.add_scalar("DQN/epsilon", agent.epsilon, episode)
        writer.flush()

        episode += 1

        # target 네트워크 업데이트
        if episode % update_target_episode == 0:
            agent.update_target()
        
        # 모델 저장
        if episode % save_episode == 0:
            agent.model_save(episode, comment)

finally:
    writer.close()

DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
REVERSE
DRIVE
DRIVE
DRIVE
DRIVE
REVERSE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
REVERSE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
REVERSE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
REVERSE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
REVERSE
DRIVE
DRIVE
DRIVE
DRIVE
REVERSE
DRIVE
DRIVE
DRIVE
DRIVE
REVERSE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
REVERSE
REVERSE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
REVERSE
REVERSE
DRIVE
DRIVE
REVERSE
REVERSE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
DRIVE
