In [16]:
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random

In [17]:
GAMMA = 0.99
BATCH_SIZE = 32
BUFFER_SIZE = 50000
MIN_REPLAY_SIZE = 1000
EPSILON_START = 1.0
EPSILON_END = 0.02
EPSILON_DECAY = 10000
MAX_EP = 25000

REWARD_ACC = list()
LOSS_ACC = list()

torch.manual_seed(1234)
np.random.seed(1234)

In [18]:
class Network(nn.Module):
    def __init__(self,env):
        super().__init__()
        
        in_features = int(np.prod(env.observation_space.shape))
        
        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.Tanh(),
            nn.Linear(64,env.action_space.n)
        )
        
    def forward(self, x):
        return self.net(x)
    
    def act(self,state):
        
        state_t = torch.as_tensor(state, dtype=torch.float32)
        
        q_values = self.forward(state_t.unsqueeze(0))
        
        max_q_index = torch.argmax(q_values, dim=1)[0]
        
        action = max_q_index.detach().item()
        
        return action

In [19]:
env = gym.make('CartPole-v1')
replay_buffer = deque(maxlen=BUFFER_SIZE)
reward_buffer = deque([0.0], maxlen=100)
episode_reward = 0.0

target_net = Network(env)

optimizer = torch.optim.Adam(target_net.parameters(), lr=5e-4)

In [20]:
# Initialize Replay Buffer
# 최소 MIN_REPLAY_SIZE 만큼의 transition을 buffer에 넣고 시작한다.

state = env.reset()

for _ in range(MIN_REPLAY_SIZE):
    
    action = env.action_space.sample()
    new_state, reward, done,_ = env.step(action)
    transition = (state, action, reward, done, new_state)
    replay_buffer.append(transition)
    state = new_state
    
    if done:
        state = env.reset()

In [21]:
# Main Training Loop

state = env.reset()                                                             # state의 예시: [-0.01713841 -0.00705756 -0.04146662 -0.04927411]

for step in itertools.count():                                                  # step starts from 0 and increases by 1 until it meets a break condition. This is same as 'While True' loop

    epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END]) # epsilon value는 EPSILON_START에서 시작해서 EPSILON_END까지 step이 흘러갈수록 점점 더 감소한다. 

    random_sample = random.random()

    if random_sample <= epsilon:                                                # random_sample의 값이 epsilon보다 작으면, random한 action을 취하고, 그렇지 않다면 target_net에 현재 state를 넣어 가장 좋은 act를 가져온다. 
        action = env.action_space.sample()
    else:
        action = target_net.act(state)

    new_state, reward, done, _ = env.step(action)
    transition = (state, action, reward, done, new_state)
    replay_buffer.append(transition)                                            # step 1번에 transition 1번이 append된다. 
    state = new_state

    episode_reward  = episode_reward + reward

    if done:                                                                    # 게임이 끝나면(막대기가 넘어지면), done값이 True가 된다. 
        state = env.reset()                                                     # 끝난다면 env를 reset해주고, 
        reward_buffer.append(episode_reward)                                    # reward_buffer에 episode_reward를 append해준다. 
        episode_reward = 0.0

    # # -------------------------- TEST --------------------------
    # # After solved, watch it play
    # if len(reward_buffer) >= 100:
    #     if np.mean(reward_buffer) >= 195:
    #         while True:
    #             action = target_net.act(state)

    #             state, _, done, _ = env.step(action)
    #             env.render()
    #             if done:
    #                 env.reset()
    # # -------------------------- TEST --------------------------

    # Start Gradient Step
    transitions = random.sample(replay_buffer, BATCH_SIZE)                      # replay_buffer에서 batch_size만큼의 sample을 가져온다. 

    states     = np.asarray([t[0] for t in transitions])                        # len(states) == 32
    actions    = np.asarray([t[1] for t in transitions])                        # len(actions) == 32
    rewards    = np.asarray([t[2] for t in transitions])                        # len(rewards) == 32    
    dones      = np.asarray([t[3] for t in transitions])                        # len(dones) == 32    
    new_states = np.asarray([t[4] for t in transitions])                        # len(new_states) == 32

    # print("states -->", states[0])
    # print("actions -->", actions[0])
    # print("rewards -->", rewards[0])
    # print("dones -->", dones[0])
    # print("new_states -->", new_states[0])
    
    states_t     = torch.as_tensor(states, dtype=torch.float32)
    actions_t    = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
    rewards_t    = torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(-1)
    dones_t      = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
    new_states_t = torch.as_tensor(new_states, dtype=torch.float32)

    # Compute Targets
    target_q_values = target_net.forward(new_states_t)
    max_target_q_values = target_q_values.max(dim=1, keepdim=True)[0]           
    targets = rewards_t + GAMMA * (1 - dones_t) * max_target_q_values           # 32 targets are computed. See Algorithm 1 of Human-level control through deep reinforcement learning (Nature14236). 

    # Compute Loss
    q_values = target_net.forward(states_t)                             
    action_q_values = torch.gather(input=q_values, dim=1, index=actions_t)      # 32 action values are computed 
    loss = nn.functional.smooth_l1_loss(action_q_values, targets) 
    # smooth_l1_loss -> V 형태로 미분 불가능한 지점이 있지만 L2 Loss에 비해 outlier에 의한 영향이 적다.
    # 요소별 절대 오차가 베타 미만이면 제곱항을 사용하고 그렇지 않으면 l1항을 사용하는 기준을 생성함.
    # 이상값에 덜 민감하고 torch.nn.MSELoss 경우에 따라 그래디언트 폭발을 방지함.
    # print(targets, action_q_values)
 
    # Gradient Descent
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Logging
    if step % 1000 == 0:
        print()
        print('Step', step)
        print('Avg Reward', np.mean(reward_buffer))                             # maximum length of reward_buffer is 100. Therefore, np.mean(reward_buffer) averages lastest 100 rewards
        print('Loss', loss)
        REWARD_ACC.append(np.mean(reward_buffer))
        LOSS_ACC.append(loss.item())
        
    if step == MAX_EP:
        break


Step 0
Avg Reward 0.0
Loss tensor(0.5570, grad_fn=<SmoothL1LossBackward0>)

Step 1000
Avg Reward 22.522727272727273
Loss tensor(0.4691, grad_fn=<SmoothL1LossBackward0>)

Step 2000
Avg Reward 22.681818181818183
Loss tensor(0.4528, grad_fn=<SmoothL1LossBackward0>)

Step 3000
Avg Reward 22.34
Loss tensor(0.4947, grad_fn=<SmoothL1LossBackward0>)

Step 4000
Avg Reward 22.35
Loss tensor(0.4559, grad_fn=<SmoothL1LossBackward0>)

Step 5000
Avg Reward 24.81
Loss tensor(0.4244, grad_fn=<SmoothL1LossBackward0>)

Step 6000
Avg Reward 26.44
Loss tensor(0.4505, grad_fn=<SmoothL1LossBackward0>)

Step 7000
Avg Reward 28.18
Loss tensor(0.4290, grad_fn=<SmoothL1LossBackward0>)

Step 8000
Avg Reward 28.36
Loss tensor(0.4659, grad_fn=<SmoothL1LossBackward0>)

Step 9000
Avg Reward 30.52
Loss tensor(0.4485, grad_fn=<SmoothL1LossBackward0>)

Step 10000
Avg Reward 34.23
Loss tensor(0.4820, grad_fn=<SmoothL1LossBackward0>)

Step 11000
Avg Reward 36.14
Loss tensor(0.4669, grad_fn=<SmoothL1LossBackward0>)

Step

In [22]:
with open('DQN_EP.txt', 'w') as f:
    f.write(str(REWARD_ACC))
    f.write('\n')
    f.write(str(LOSS_ACC))