### 현재 상태에 다음 행동에 대한 보상값 뿐만 아니라 
### 현재 상태에서 모든 행동에 대한 보상값을 평균으로 Q-value를 업데이트 시킴

In [6]:
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random
from torch.nn import functional as F

In [7]:
GAMMA = 0.99
BATCH_SIZE = 32
BUFFER_SIZE = 50000
MIN_REPLAY_SIZE = 1000
EPSILON_START = 1.0
EPSILON_END = 0.02
EPSILON_DECAY = 10000
TARGET_UPDATE_FREQ = 1000
MAX_EP = 25000

REWARD_ACC = list()
LOSS_ACC = list()

torch.manual_seed(1234)
np.random.seed(1234)

In [8]:
class Network(nn.Module):
    def __init__(self, env):
        super().__init__()
        
        in_features = int(np.prod(env.observation_space.shape))
        
        self.layer_v1 = torch.nn.Linear(in_features, 64)
        self.layer_v2 = torch.nn.Linear(64, 1)
        
        self.layer_a1 = torch.nn.Linear(in_features, 64)
        self.layer_a2 = torch.nn.Linear(64, env.action_space.n)
        
    def forward(self, x):
        
        # 비선형 활성화 함수 (수정된 선형 단위 함수를 요소별로 적용)
        V = F.relu(self.layer_v1(x))
        V = self.layer_v2(V)
        
        A = F.relu(self.layer_a1(x))
        A = self.layer_a2(A)
        
        Q = V + (A - A.mean())
        
        return Q
    
    def act(self, state):
        
        state_t = torch.as_tensor(state, dtype = torch.float32)
        
        q_values = self.forward(state_t.unsqueeze(0))
        
        max_q_index = torch.argmax(q_values, dim=1)[0]
        
        action = max_q_index.detach().item()
        
        return action     

In [9]:
env = gym.make('CartPole-v1')
replay_buffer = deque(maxlen=BUFFER_SIZE)
reward_buffer = deque([0.0], maxlen=100)
episode_reward = 0.0

online_net = Network(env)
target_net = Network(env)

target_net.load_state_dict(online_net.state_dict())

optimizer = torch.optim.Adam(online_net.parameters(), lr=5e-4)

In [10]:
state = env.reset()

for _ in range(MIN_REPLAY_SIZE):
    
    action = env.action_space.sample()
    new_state, reward, done, _ = env.step(action)
    transition = (state, action, reward, done, new_state)
    replay_buffer.append(transition)
    state = new_state
    
    if done:
        state = env.reset()

In [13]:
state = env.reset()

for step in itertools.count():
    
    epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])
    
    random_sample = random.random()
    
    if random_sample <= epsilon:
        action = env.action_space.sample()
    else:
        action = online_net.act(state)
        
    new_state, reward, done, _ = env.step(action)
    transition = (state, action, reward, done, new_state)
    replay_buffer.append(transition)
    state = new_state
    
    episode_reward = episode_reward + reward
    
    if done:
        state = env.reset()
        reward_buffer.append(episode_reward)
        episode_reward = 0.0
        
    transitions = random.sample(replay_buffer, BATCH_SIZE)
    
    states     = np.asarray([t[0] for t in transitions])
    actions    = np.asarray([t[1] for t in transitions])
    rewards    = np.asarray([t[2] for t in transitions])
    dones      = np.asarray([t[3] for t in transitions])
    new_states = np.asarray([t[4] for t in transitions])
    
    states_t = torch.as_tensor(states, dtype = torch.float32)
    actions_t = torch.as_tensor(actions, dtype = torch.int64).unsqueeze(-1)
    rewards_t = torch.as_tensor(rewards, dtype = torch.float32).unsqueeze(-1)
    dones_t = torch.as_tensor(dones, dtype = torch.float32).unsqueeze(-1)
    new_states_t = torch.as_tensor(new_states, dtype = torch.float32)
    
    online_with_new_states = online_net.forward(new_states_t)
    argmax_online_with_new_states = online_with_new_states.argmax(dim=1, keepdim=True)
    
    offline_with_new_states = target_net.forward(new_states_t)
    target_q_vals = torch.gather(input=offline_with_new_states , dim=1, index=argmax_online_with_new_states)
    targets = rewards_t + GAMMA * (1 - dones_t) * target_q_vals
    
    q_values = online_net.forward(states_t)
    action_q_values = torch.gather(input=q_values, dim=1, index=actions_t)
    loss = nn.functional.smooth_l1_loss(action_q_values, targets)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if step % TARGET_UPDATE_FREQ == 0:
        target_net.load_state_dict(online_net.state_dict())
        
    if step % 1000 == 0:
        print()
        print('Step', step)
        print('Avg Reward', np.mean(reward_buffer))
        print('Loss', loss)
        REWARD_ACC.append(np.mean(reward_buffer))
        LOSS_ACC.append(loss.item())
        
    if step == MAX_EP:
        break

  allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass



Step 0
Avg Reward 0.0
Loss tensor(0.5401, grad_fn=<SmoothL1LossBackward0>)

Step 1000
Avg Reward 20.142857142857142
Loss tensor(0.0082, grad_fn=<SmoothL1LossBackward0>)

Step 2000
Avg Reward 21.844444444444445
Loss tensor(0.0075, grad_fn=<SmoothL1LossBackward0>)

Step 3000
Avg Reward 25.04
Loss tensor(0.0167, grad_fn=<SmoothL1LossBackward0>)

Step 4000
Avg Reward 29.2
Loss tensor(0.0042, grad_fn=<SmoothL1LossBackward0>)

Step 5000
Avg Reward 36.21
Loss tensor(0.1780, grad_fn=<SmoothL1LossBackward0>)

Step 6000
Avg Reward 43.17
Loss tensor(0.1601, grad_fn=<SmoothL1LossBackward0>)

Step 7000
Avg Reward 50.84
Loss tensor(0.0045, grad_fn=<SmoothL1LossBackward0>)

Step 8000
Avg Reward 58.65
Loss tensor(0.1691, grad_fn=<SmoothL1LossBackward0>)

Step 9000
Avg Reward 67.56
Loss tensor(0.0056, grad_fn=<SmoothL1LossBackward0>)

Step 10000
Avg Reward 75.3
Loss tensor(0.2262, grad_fn=<SmoothL1LossBackward0>)

Step 11000
Avg Reward 83.74
Loss tensor(0.4946, grad_fn=<SmoothL1LossBackward0>)

Step 1

In [14]:
with open('Dueling_EP_SN.txt', 'w') as f:
    f.write(str(REWARD_ACC))
    f.write("\n")
    f.write(str(LOSS_ACC))