In [1]:
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random

In [2]:
# --- Hyperparameters -------------------------------------------------------
GAMMA = 0.99             # presumably the discount factor; not referenced in the cells visible here
BATCH_SIZE = 32          # number of transitions drawn from the replay buffer per sample
BUFFER_SIZE = 50000      # maximum number of transitions kept in the replay buffer
MIN_REPLAY_SIZE = 1000   # random-action transitions collected before the main loop starts
EPSILON_START = 1.0      # exploration probability at step 0
EPSILON_END = 0.02       # exploration probability once EPSILON_DECAY steps have elapsed
EPSILON_DECAY = 10000    # number of steps over which epsilon is interpolated
MAX_EP = 25000           # NOTE(review): unused in the visible cells; presumably a cap on episodes/steps

# --- Accumulators ----------------------------------------------------------
REWARD_ACC = []
LOSS_ACC = []

# --- Reproducibility -------------------------------------------------------
# Seed every RNG this notebook draws from. The stdlib `random` module drives
# both the epsilon-greedy coin flip and the replay-buffer sampling, so it has
# to be seeded alongside torch and numpy for runs to be repeatable.
# NOTE(review): env.action_space.sample() may use gym's own RNG, which these
# seeds do not cover; confirm whether the environment needs seeding as well.
SEED = 1234
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [5]:
class Network(nn.Module):
    """Two-layer fully connected network producing one value per discrete action.

    Args:
        env: Environment-like object. Only ``env.observation_space.shape`` and
            ``env.action_space.n`` are read, to size the input and output layers.
    """

    def __init__(self, env):
        super().__init__()

        # Flatten the observation shape into a single input width.
        in_features = int(np.prod(env.observation_space.shape))

        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.Tanh(),
            nn.Linear(64, env.action_space.n),
        )

    def forward(self, x):
        """Return the output of the underlying ``nn.Sequential`` for input ``x``."""
        return self.net(x)

    def act(self, state):
        """Return the index of the highest-valued action for a single state.

        Args:
            state: A single observation (anything ``torch.as_tensor`` accepts);
                it is converted to float32 and given a leading batch dimension.

        Returns:
            int: Index of the arg-max entry of the network output.
        """
        state_t = torch.as_tensor(state, dtype=torch.float32)

        # Only a plain Python int leaves this method, so there is no reason to
        # record an autograd graph for the forward pass.
        with torch.no_grad():
            q_values = self.forward(state_t.unsqueeze(0))

        max_q_index = torch.argmax(q_values, dim=1)[0]

        return max_q_index.item()

In [6]:
# Running total of reward for the episode currently in progress.
episode_reward = 0.0

# Bounded stores: transitions for training, and finished-episode returns
# (the latter starts with a single 0.0 entry and keeps at most 100 values).
replay_buffer = deque([], maxlen=BUFFER_SIZE)
reward_buffer = deque((0.0,), maxlen=100)

# Environment and the network whose layer sizes are derived from it.
env = gym.make('CartPole-v1')
target_net = Network(env)

# NOTE(review): this is the only network created in the visible cells and it is
# the one handed to the optimizer, despite the "target" name — confirm intent.
optimizer = torch.optim.Adam(target_net.parameters(), lr=5e-4)

In [7]:
# Initialize Replay Buffer
# Fill the buffer with MIN_REPLAY_SIZE transitions gathered with uniformly
# sampled actions before the main loop starts.

state = env.reset()

for _ in range(MIN_REPLAY_SIZE):
    action = env.action_space.sample()
    new_state, reward, done, _ = env.step(action)

    # Stored layout: (state, action, reward, done, new_state).
    transition = (state, action, reward, done, new_state)
    replay_buffer.append(transition)

    # Continue from the successor state, or start a new episode if this one ended.
    state = env.reset() if done else new_state

In [None]:
state = env.reset()

for step in itertools.count():

    # Linearly interpolate epsilon from EPSILON_START to EPSILON_END over the
    # first EPSILON_DECAY steps; np.interp holds the end value afterwards.
    epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])

    random_sample = random.random()

    # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
    # (The comparison must use `epsilon`; `epsilone` is not defined anywhere.)
    if random_sample <= epsilon:
        action = env.action_space.sample()

    else:
        action = target_net.act(state)

    new_state, reward, done, _ = env.step(action)
    # Stored layout: (state, action, reward, done, new_state).
    transition = (state, action, reward, done, new_state)
    replay_buffer.append(transition)
    state = new_state

    episode_reward = episode_reward + reward

    if done:
        # Episode finished: start a new one and record the accumulated return.
        state = env.reset()
        reward_buffer.append(episode_reward)
        episode_reward = 0.0

    # Draw a minibatch of stored transitions (without replacement).
    transitions = random.sample(replay_buffer, BATCH_SIZE)

    # NOTE(review): this rebinds `state` (the current environment observation)
    # to the batch of sampled states. Unless it is reassigned later in this
    # loop (not visible here), the next iteration passes the whole batch to
    # target_net.act() and stores it in the transition — likely should be a
    # separate name such as `states`; confirm against the rest of the cell.
    state = np.asarray([t[0] for t in transitions])
    actions = np.asarray([t[1] for t in transitions])