In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the gym_examples package from the mounted Drive into /content.
# The source path is relative, so this relies on the working directory
# being the one that contains the drive/ mount point.
!cp -r drive/MyDrive/Test/SKT/gym_examples/gym_examples /content

# DQN으로 Shooting Airplane Game 강화학습

먼저 여러가지 설정 변수 정의

In [None]:
import numpy as np
import gym
import torch
import torch.nn as nn
import torchvision.transforms as T

# Configuration parameters for the whole setup
seed = 42
# Apply the seed so that exploration, replay sampling and weight
# initialisation are repeatable across "Restart & Run All".
np.random.seed(seed)
torch.manual_seed(seed)

gamma = 0.99  # Discount factor for future rewards
epsilon = 1.0  # Current epsilon of the epsilon-greedy policy (decayed during training)
epsilon_min = 0.1  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
# Total amount by which epsilon is annealed (max -> min). The per-step
# decrement is this value divided by the number of exploration frames.
epsilon_interval = epsilon_max - epsilon_min
batch_size = 16  # Size of batch taken from replay buffer
max_steps_per_episode = 60
max_episodes = 5000

### 게임 환경 설정

상태(state) 정의
- 보드판의 모양: (8 * 8) 행렬 * 3 채널
- 채널 0: unseen
- 채널 1: hit
- 채널 2: miss

액션 정의
- 사격할 수 있는 칸의 위치 (8 * 8 = 64)

In [None]:
# Create the ShootingAirplane-v0 environment in text render mode.
# The "gym_examples:" prefix names the module gym should import so that the
# environment id gets registered before it is looked up.
env = gym.make('gym_examples:gym_examples/ShootingAirplane-v0', render_mode="text")

  and should_run_async(code)
  deprecation(
  deprecation(


env에서 정의한 action_space, observation_space 의 모양 확인
- action_space: 2개의 값의 튜플 (벡터)
- observation_space: HWC 형태의 이미지 (마지막 축이 단일 값인 8 * 8 * 1 텐서) -> pytorch를 사용할 경우 적절히 CHW 형태의 텐서로 수정 필요 (이 노트북에서는 one-hot 인코딩 후 3 * 8 * 8 텐서)

In [None]:
# Display the shape of the environment's action space.
env.action_space.shape

(2,)

In [None]:
# Display the shape of the raw observation returned by the environment.
env.observation_space.shape

(8, 8, 1)

### 네트워크 정의하기

참고: Conv2d 파라미터
* in_channels (int) – Number of channels in the input image
* out_channels (int) – Number of channels produced by the convolution
* kernel_size (int or tuple) – Size of the convolving kernel
* stride (int or tuple, optional) – Stride of the convolution. Default: 1
* padding (int, tuple or str, optional) – Padding added to all four sides of the input. Default: 0
* padding_mode (str, optional) – 'zeros', 'reflect', 'replicate' or 'circular'. Default: 'zeros'

In [None]:
import torch
import torch.nn as nn

# One discrete action per board cell (8 * 8).
num_actions = 64


class QModel(nn.Module):
    """Convolutional Q-network mapping a one-hot board to per-action Q-values.

    Architecture: two 3x3 'same'-padded convolutions (3 -> 16 -> 32 channels),
    one unpadded 3x3 convolution (32 -> 32, shrinking each spatial side by 2),
    then two fully connected layers (flattened -> 512 -> num_actions).
    Dropout (p=0.3) is applied after the second convolution and after the
    first fully connected layer.

    Args:
        num_actions: Number of Q-values produced per input board.
        board_size: Side length of the square input board. Defaults to 8,
            which yields the 1152 flattened features (32 * 6 * 6) this
            network was originally written for.

    Raises:
        ValueError: If ``board_size`` is smaller than the 3x3 kernel of the
            unpadded convolution.
    """

    def __init__(self, num_actions, board_size=8):
        super().__init__()
        if board_size < 3:
            raise ValueError("board_size must be at least 3, got {}".format(board_size))

        # The unpadded 3x3 convolution removes one cell on every border.
        reduced_side = board_size - 2
        flat_features = 32 * reduced_side * reduced_side

        # Attribute names and creation order are kept stable so that saved
        # models and seeded weight initialisation remain compatible.
        self.dropout = nn.Dropout(p=0.3)
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding='same')
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding='same')
        self.conv3 = nn.Conv2d(32, 32, kernel_size=3, stride=1)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(flat_features, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def forward(self, x):
        """Return Q-values of shape (batch, num_actions) for a (batch, 3, H, W) input."""
        x = nn.functional.relu(self.conv1(x))
        x = nn.functional.relu(self.conv2(x))
        x = self.dropout(x)
        x = nn.functional.relu(self.conv3(x))
        x = self.flatten(x)
        x = nn.functional.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

### 모델 빌딩 & 로스 및 최적화 계산기 만들기

In [None]:
# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The online model predicts the Q-values that are used to choose actions
# and is the only network updated by the optimizer.
model = QModel(num_actions)
model.to(device)

# Target model used to predict future rewards. Its weights are only refreshed
# periodically from the online model, so the target Q-values stay stable while
# the loss between the Q-values is computed.
model_target = QModel(num_actions)
# Start from the same weights as the online model; otherwise every target
# computed before the first periodic sync comes from unrelated random weights.
model_target.load_state_dict(model.state_dict())
model_target.to(device)
# The target network is never trained: keep it in eval mode so dropout does
# not inject noise into the TD targets.
model_target.eval()

loss_function = nn.SmoothL1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00025)

In [None]:
# Display which device was selected for the models.
device

device(type='cuda', index=0)

### Replay Buffer 정의

In [None]:
# --- Replay memory -----------------------------------------------------------
# Parallel lists: index i across all of them describes one stored transition.
state_history = []
action_history = []
rewards_history = []
state_next_history = []
action_mask_history = []
done_history = []

# --- Progress bookkeeping ----------------------------------------------------
episode_reward_history = []  # per-episode returns used for the running mean
running_reward = 0
episode_count = 0
frame_count = 0

# --- Schedules ---------------------------------------------------------------
# Frames of purely random actions (only referenced by a commented-out
# condition in the epsilon-greedy selection code)
epsilon_random_frames = 50_000
# Frames over which epsilon is annealed (kept as a float)
epsilon_greedy_frames = 200_000.0
# Replay capacity. The DeepMind paper suggests 1000000, but that causes
# memory issues.
max_memory_length = 500_000
# Run one network update every this many frames
update_after_actions = 4
# Copy the online weights into the target network every this many frames
update_target_network = 10_000

### 전처리

- env가 리턴하는 observation은 일단 np.array이니 torch.tensor로 캐스팅
- env가 리턴하는 상태가 (8, 8, 1)의 HWC 이미지 텐서이므로 이를 (3, 8, 8)의 CHW 이미지로 변환
- One-hot 인코딩도 필요

In [None]:
def preprocess_state(env_observ):
    """Turn a raw board observation into a one-hot CHW float tensor.

    Args:
        env_observ: NumPy array holding one class index (0, 1 or 2) per cell,
            with singleton axes allowed (e.g. shape (8, 8, 1)).

    Returns:
        A float32 tensor with the three one-hot planes on the leading axis,
        e.g. shape (3, 8, 8) for an (8, 8, 1) observation.
    """
    # Drop singleton axes and cast to int64, which one_hot requires.
    cell_classes = torch.from_numpy(env_observ).squeeze().to(torch.int64)
    # one_hot appends the class axis last (H, W, 3); move it to the front.
    planes = torch.nn.functional.one_hot(cell_classes, num_classes=3)
    return planes.permute(2, 0, 1).to(torch.float32)

### Epsilon-greedy 액션 선택 함수

학습시 에피소드 생성하면서 사용 (주의: 입력은 batch axis 없음)

In [None]:
def get_greedy_epsilon(model, state, mask):
    """Select an action with an epsilon-greedy policy and decay epsilon.

    With probability ``epsilon`` a valid action is drawn uniformly at random;
    otherwise the valid action with the highest Q-value is chosen. Every call
    lowers the global ``epsilon`` by ``epsilon_interval / epsilon_greedy_frames``
    until it reaches ``epsilon_min``.

    Args:
        model: The torch model computing action-state values (Q-values).
        state: Float32 tensor without a batch axis, as produced by
            ``preprocess_state`` (3 x 8 x 8), already on the model's device.
        mask: Flat NumPy array with one entry per action; 1 marks a valid action.

    Returns:
        The chosen action index as a plain Python ``int``.
    """
    global epsilon

    valid = np.asarray(mask) == 1

    if np.random.rand(1)[0] < epsilon:
        # Explore: uniform choice among the valid action indices.
        action = np.random.choice(np.flatnonzero(valid))
    else:
        # Exploit: evaluate in eval mode so dropout does not randomise the
        # Q-values, then restore whatever mode the model was in.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                # Add a batch axis for the forward pass, then drop it again.
                q_values = model(state.unsqueeze(0)).squeeze(0).to('cpu')
        finally:
            model.train(was_training)

        # Exclude invalid actions outright. Adding a finite bonus to the valid
        # ones would fail whenever Q-values differ by more than that bonus.
        masked_q = q_values.masked_fill(~torch.as_tensor(valid), float('-inf'))
        action = torch.argmax(masked_q)

    # Decay epsilon linearly down to its floor.
    epsilon -= epsilon_interval / epsilon_greedy_frames
    epsilon = max(epsilon, epsilon_min)

    # Both branches return the same plain type.
    return int(action)

### Greedy 액션 선택 함수

나중에 evaluation 시 사용

In [None]:
def get_greedy_action(model, state, mask):
    """Return the valid action with the highest predicted Q-value.

    Args:
        model: The torch model computing Q-values.
        state: Float32 tensor without a batch axis (3 x 8 x 8), already on
            the model's device.
        mask: Flat NumPy array with one entry per action; 1 marks a valid action.

    Returns:
        The chosen action index as a plain Python ``int``.
    """
    # Evaluate without dropout so the greedy choice is deterministic, then
    # restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Add a batch axis for the forward pass, then drop it again.
            q_values = model(state.unsqueeze(0)).squeeze(0).to('cpu')
    finally:
        model.train(was_training)

    # Exclude invalid actions outright. Adding a finite bonus to the valid
    # ones would fail whenever Q-values differ by more than that bonus.
    valid = torch.as_tensor(np.asarray(mask) == 1)
    return int(torch.argmax(q_values.masked_fill(~valid, float('-inf'))))

  and should_run_async(code)


### Update 파트

- Replay buffer 에서 batch하나를 샘플링하고,
- model을 update한다.

In [None]:
def sample_batch(_batch_size):
    """Draw a random minibatch of distinct transitions from the replay buffers.

    Args:
        _batch_size: Number of transitions to draw (without replacement).

    Returns:
        A tuple of numpy.ndarrays
        ``(states, next_states, rewards, actions, next_action_masks, dones)``
        whose leading axis has length ``_batch_size``.

    Raises:
        ValueError: If fewer than ``_batch_size`` transitions are stored.
    """
    # Passing the population size as an int lets NumPy build the index range
    # itself instead of converting a Python range object on every call.
    indices = np.random.choice(len(done_history), size=_batch_size, replace=False)

    state_sample = np.array([state_history[i].squeeze(0).numpy() for i in indices])
    state_next_sample = np.array([state_next_history[i].squeeze(0).numpy() for i in indices])
    rewards_sample = np.array([rewards_history[i] for i in indices], dtype=np.float32)
    # Stored actions may be NumPy ints or 0-d tensors; normalise to int64.
    action_sample = np.array([int(action_history[i]) for i in indices], dtype=np.int64)

    # The stored action mask describes the valid actions at the *next* state.
    action_mask_sample = np.array([action_mask_history[i] for i in indices])
    done_sample = np.array([float(done_history[i]) for i in indices])

    return state_sample, state_next_sample, rewards_sample, action_sample, action_mask_sample, done_sample

In [None]:
def update_network():
    """Run one DQN optimisation step on a minibatch from the replay buffers.

    Samples ``batch_size`` transitions, builds the one-step TD target
    ``r + gamma * max_a' Q_target(s', a')`` restricted to the actions that are
    valid in ``s'`` (no bootstrap term for terminal transitions), and takes one
    optimizer step on the loss between that target and ``Q(s, a)`` of the
    online model.
    """
    (state_sample, state_next_sample, rewards_sample,
     action_sample, action_mask_sample, done_sample) = sample_batch(batch_size)

    # Convert numpy arrays to PyTorch tensors on the training device
    state_sample = torch.tensor(state_sample, dtype=torch.float32).to(device)
    state_next_sample = torch.tensor(state_next_sample, dtype=torch.float32).to(device)
    action_sample = torch.tensor(action_sample, dtype=torch.int64).to(device)
    valid_next = torch.tensor(action_mask_sample, dtype=torch.int64).to(device) == 1
    rewards_sample = torch.tensor(rewards_sample, dtype=torch.float32).to(device)
    done_sample = torch.tensor(done_sample, dtype=torch.float32).to(device)

    # Compute the target Q-values; no gradients flow through the target network
    with torch.no_grad():
        next_q = model_target(state_next_sample)

        # Best next-state Q-value over valid actions only. Invalid actions are
        # excluded outright; adding a finite bonus to valid ones would break
        # whenever Q-values differ by more than that bonus.
        best_next_q = next_q.masked_fill(~valid_next, float('-inf')).max(dim=1).values

        # Terminal transitions (and rows with no valid next action, where the
        # masked max is -inf) must not bootstrap. Select 0 explicitly instead
        # of multiplying, since -inf * 0 would yield NaN.
        can_bootstrap = (done_sample == 0) & valid_next.any(dim=1)
        best_next_q = torch.where(can_bootstrap, best_next_q, torch.zeros_like(best_next_q))

        target_q_values = rewards_sample + gamma * best_next_q

    # Forward pass of the online model: Q-values of the actions actually taken
    q_values = model(state_sample)
    q_values_action = q_values.gather(dim=1, index=action_sample.unsqueeze(1)).squeeze(1)

    # Compute the loss
    loss = loss_function(q_values_action, target_q_values)

    # Perform the optimization step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Run DQN Training

In [None]:
# DQN training loop: play episodes with the epsilon-greedy policy, store every
# transition in the replay buffers and periodically update the online and
# target networks.
for _ in range(max_episodes):
    state, info = env.reset()
    state = preprocess_state(state)
    action_mask = info['action_mask'].reshape((-1,))
    episode_reward = 0

    # range(max_steps_per_episode) caps an episode at exactly
    # max_steps_per_episode steps (starting the range at 1 would allow one fewer).
    for _step in range(max_steps_per_episode):
        frame_count += 1

        # Select an action; normalise to a plain int so the environment and
        # the replay buffer always see the same type.
        action = int(get_greedy_epsilon(model, state.to(device), action_mask))

        # Take the selected action (flat index -> two board coordinates)
        state_next, reward, done, info = env.step((action // 8, action % 8))
        state_next = preprocess_state(state_next)
        # From here on the mask describes the valid actions in state_next
        action_mask = info['action_mask'].reshape((-1,))

        episode_reward += reward

        # Store the transition in the replay buffer
        action_history.append(action)
        action_mask_history.append(action_mask)
        state_history.append(state)
        state_next_history.append(state_next)
        rewards_history.append(reward)
        done_history.append(done)

        state = state_next

        # Update every update_after_actions frames, once the buffer holds
        # more than batch_size transitions
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            update_network()

        # Periodically copy the online weights into the target network
        if frame_count % update_target_network == 0:
            model_target.load_state_dict(model.state_dict())

        # Limit the replay history by dropping the oldest transition
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del action_mask_history[:1]
            del done_history[:1]

        if done:
            break

    episode_count += 1
    episode_reward_history.append(episode_reward)

    # Running reward = mean return over the last (at most) 100 episodes
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    if episode_count % 10 == 0:
        print(f"Episode: {episode_count}, Frame count: {frame_count}, Running reward: {running_reward}")

    # Checkpoint the whole model object every 5000 episodes
    if episode_count % 5000 == 0:
        torch.save(model, 'model.{}'.format(episode_count))


torch.save(model, 'model.final')

  and should_run_async(code)


Episode: 20, Frame count: 1261, Running reward: -39.25
Episode: 30, Frame count: 1838, Running reward: -38.93333333333333
Episode: 40, Frame count: 2423, Running reward: -39.225
Episode: 50, Frame count: 2964, Running reward: -38.48
Episode: 60, Frame count: 3545, Running reward: -38.61666666666667
Episode: 70, Frame count: 4133, Running reward: -38.871428571428574
Episode: 80, Frame count: 4711, Running reward: -38.9625
Episode: 90, Frame count: 5284, Running reward: -38.91111111111111
Episode: 100, Frame count: 5855, Running reward: -38.91
Episode: 110, Frame count: 6424, Running reward: -38.74
Episode: 120, Frame count: 7005, Running reward: -38.98
Episode: 130, Frame count: 7575, Running reward: -38.97
Episode: 140, Frame count: 8119, Running reward: -38.46
Episode: 150, Frame count: 8670, Running reward: -38.58
Episode: 160, Frame count: 9238, Running reward: -38.43
Episode: 170, Frame count: 9774, Running reward: -37.79
Episode: 180, Frame count: 10321, Running reward: -37.38
Epi

# Evaluation

In [None]:
import time, sys
from IPython.display import clear_output

# Play one episode with the purely greedy policy, redrawing the board after
# every move.
board, info = env.reset()
state = preprocess_state(board)
action_mask = info['action_mask'].reshape((-1,))
done = False
env.render()

# Evaluate without dropout; the previous mode is restored afterwards so a
# later training run is not silently affected.
was_training = model.training
model.eval()
try:
    while not done:
        action = int(get_greedy_action(model, state.to(device), action_mask))
        # Flat action index -> the two board coordinates passed to the env
        first_coord, second_coord = action // 8, action % 8
        print("action: ({}, {})".format(first_coord, second_coord))
        sys.stdout.flush()

        # Pause so the move can be read before the output area is cleared
        time.sleep(1.0)
        clear_output(wait=False)
        board, reward, done, info = env.step((first_coord, second_coord))
        state = preprocess_state(board)
        action_mask = info['action_mask'].reshape((-1,))
        env.render()
finally:
    model.train(was_training)

         |         
 M   H   |      H  
   HMH   |    H H  
   HHHH  |    HHHH 
  MH HM  |    H H  
   M H   |      H  
     M   |         
         |         

