#### 기존의 Q-learning은 state-action(s,a)에 해당하는 Q-value인 Q(s,a)를 테이블 형식으로 저장하여 학습함
#### 이러한 방식은 state space와 action space가 커지면 모든 Q-value를 저장해야해서 memory와 exploration time 이슈가 생김
#### 딥러닝을 이용하여 Q-table.에 해당하는 Q-function을 비선형함수로 근사시켜 모든 S-A에 대한 Q-value값을 찾거나 저장할 필요를 없앰
#### 이때 딥러닝으로 근사한 Q-function의 weight parameter들은 일반적으로 세타로 표현함

In [29]:
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random

In [30]:
GAMMA = 0.99
EPSILON_START = 1.0
EPSILON_END = 0.02
EPSILON_DECAY = 10000
MAX_EP = 25000

REWARD_ACC = list()
LOSS_ACC = list()

# pytorch에서는 random seed를 고정하기 위한 함수로 manual_seed를 제공함.
# random값을 고정하기 위해 수동으로 설정하는 것
torch.manual_seed(1234)
np.random.seed(1234)

In [31]:
class Network(nn.Module):
    def __init__(self,env):
        super().__init__()
        
        # 숫자 혹은 array을 넣으면 요소들의 곱을 얻을 수 있음.
        in_features = int(np.prod(env.observation_space.shape))
        
        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.Tanh(),
            nn.Linear(64, env.action_space.n)
        )
    
    def forward(self,x):
        return self.net(x)
    
    def act(self, state):
        
        # torch 타입에서 tensor로 타입 변환
        state_t = torch.as_tensor(state, dtype = torch.float32)
        
        # 1인 차원을 추가시켜줌(Q_values 는 두가지 벨류를 내뱉음 -> left or right)
        q_values = self.forward(state_t.unsqueeze(0))
        
        # 최대값에 해당되는 지수를 구한다
        max_q_index = torch.argmax(q_values, dim=1)[0]
        
        action = max_q_index.detach().item()
        
        return action

In [32]:
env = gym.make('CartPole-v1')
episode_reward = 0.0
episode = 0
reward_buffer = deque([0.0], maxlen=100)

net = Network(env)

optimizer = torch.optim.Adam(net.parameters(), lr = 1e-1)

In [35]:
# Main Training Loop

state = env.reset()

for step in itertools.count():
    
    epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])
    
    random_sample = random.random()
    
    # random_sample의 값이 epsilon보다 작으면, random한 action을 취한다.
    if random_sample <= epsilon:
        action = env.action_space.sample()
        
    else:
        action = net.act(state)
        
    new_state, reward, done, _ = env.step(action)
    
    state = new_state
    
    episode_reward = episode_reward + reward
    
    if done:
        # env reset
        state = env.reset()
        
        # reward_buffer에 reward를 더해준다
        reward_buffer.append(episode_reward)
        episode_reward = 0.0
        
    state_t = torch.as_tensor(state, dtype = torch.float32)
    action_t = torch.as_tensor(action, dtype=torch.int64).unsqueeze(-1)
    reward_t = torch.as_tensor(reward, dtype = torch.float32).unsqueeze(-1)
    done_t = torch.as_tensor(done, dtype = torch.float32).unsqueeze(-1)
    new_state_t = torch.as_tensor(new_state, dtype=torch.float32)
    
    # Compute Targets
    target_q_values = net.forward(new_state_t)
    max_target_q_values = target_q_values.max(dim=0, keepdim=True)[0]
    targets = reward_t + GAMMA * (1-done_t) * max_target_q_values
    
    q_values = net.forward(state_t)
    action_q_values = torch.gather(input=q_values, dim=0, index=action_t)
    
    loss = nn.functional.smooth_l1_loss(action_q_values, targets)
    
    # 0으로 설정하는 대신 등급을 없음으로 설정함
    optimizer.zero_grad()
    # 현재 텐서 wrt 그래프 리프의 기울기를 계산함
    loss.backward()
    optimizer.step()
    
    if step % 1000 == 0:
        print()
        print('Step', step)
        print('Avg Reward', np.mean(reward_buffer))
        print('Loss',loss)
        REWARD_ACC.append(np.mean(reward_buffer))
        LOSS_ACC.append(loss.item())
        
    if step == MAX_EP:
        break


Step 0
Avg Reward 0.0
Loss tensor(0.5948, grad_fn=<SmoothL1LossBackward0>)

Step 1000
Avg Reward 18.92156862745098
Loss tensor(0.5998, grad_fn=<SmoothL1LossBackward0>)

Step 2000
Avg Reward 21.063157894736843
Loss tensor(5.3067, grad_fn=<SmoothL1LossBackward0>)

Step 3000
Avg Reward 20.48
Loss tensor(0.3970, grad_fn=<SmoothL1LossBackward0>)

Step 4000
Avg Reward 17.7
Loss tensor(5.6945, grad_fn=<SmoothL1LossBackward0>)

Step 5000
Avg Reward 16.39
Loss tensor(0.2447, grad_fn=<SmoothL1LossBackward0>)

Step 6000
Avg Reward 15.43
Loss tensor(0.4341, grad_fn=<SmoothL1LossBackward0>)

Step 7000
Avg Reward 16.23
Loss tensor(1.0131, grad_fn=<SmoothL1LossBackward0>)

Step 8000
Avg Reward 12.43
Loss tensor(0.4443, grad_fn=<SmoothL1LossBackward0>)

Step 9000
Avg Reward 12.37
Loss tensor(0.2295, grad_fn=<SmoothL1LossBackward0>)

Step 10000
Avg Reward 10.62
Loss tensor(0.2475, grad_fn=<SmoothL1LossBackward0>)

Step 11000
Avg Reward 9.54
Loss tensor(0.0647, grad_fn=<SmoothL1LossBackward0>)

Step 12

In [None]:
with open('DQN.txt','w') as f:
    f.write(str(REWARD_ACC))
    f.write("\n")
    f.write(str(LOSS_ACC))