In [0]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random

In [0]:
class DDPG_Mu(nn.Module):
    """Deterministic actor network (mu) for DDPG.

    Maps a 3-dimensional state to a single action value through one hidden
    layer of 512 ReLU units. The output passes through tanh and is scaled
    by 2, so every action lies in [-2, 2]. The module owns its own Adam
    optimizer (lr=1e-4).
    """

    def __init__(self):
        super(DDPG_Mu, self).__init__()
        self.fc1 = nn.Linear(3, 512)
        self.fc_mu = nn.Linear(512, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=0.0001)

    def forward(self, x):
        """Return the action for state tensor `x` (last dimension of size 3)."""
        x = F.relu(self.fc1(x))
        # tanh bounds the output to [-1, 1]; the factor 2 widens it to [-2, 2].
        mu = torch.tanh(self.fc_mu(x))*2
        return mu

    def train(self, loss=True):
        """Run one optimizer step on `loss`, or toggle training mode.

        This method shadows `nn.Module.train(mode)`. `nn.Module.eval()` is
        implemented as `self.train(False)`, so a bool argument (or no
        argument) is forwarded to the base class to keep `.train()` and
        `.eval()` working.

        Args:
            loss: a scalar loss tensor to minimize, or a bool training-mode
                flag.

        Returns:
            `self` when called with a bool (matching `nn.Module.train`),
            otherwise None.
        """
        if isinstance(loss, bool):
            return super(DDPG_Mu, self).train(loss)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
class DDPG_Q(nn.Module):
    """Critic network Q(s, a) for DDPG.

    The action (size 1) and the state (size 3) are each embedded by their own
    64-unit ReLU layer. The two embeddings are concatenated along dim 1,
    passed through a 128-unit ReLU layer, and projected to a single Q-value.
    Because of the concatenation on dim 1, both inputs must be batched 2-D
    tensors. The module owns its own Adam optimizer (lr=1e-3).
    """

    def __init__(self):
        super(DDPG_Q, self).__init__()
        self.fc_a = nn.Linear(1, 64)
        self.fc_s = nn.Linear(3, 64)
        self.fc_1 = nn.Linear(128, 128)
        self.fc_q = nn.Linear(128, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=0.001)

    def forward(self, x, a):
        """Return Q-values for state batch `x` (N, 3) and action batch `a` (N, 1)."""
        x1 = F.relu(self.fc_a(a))
        x2 = F.relu(self.fc_s(x))
        # Action embedding first, then state embedding: (N, 64) + (N, 64) -> (N, 128).
        x = torch.cat([x1, x2], dim=1)
        x = F.relu(self.fc_1(x))
        q = self.fc_q(x)
        return q

    def train(self, loss=True):
        """Run one optimizer step on `loss`, or toggle training mode.

        This method shadows `nn.Module.train(mode)`. `nn.Module.eval()` is
        implemented as `self.train(False)`, so a bool argument (or no
        argument) is forwarded to the base class to keep `.train()` and
        `.eval()` working.

        Args:
            loss: a scalar loss tensor to minimize, or a bool training-mode
                flag.

        Returns:
            `self` when called with a bool (matching `nn.Module.train`),
            otherwise None.
        """
        if isinstance(loss, bool):
            return super(DDPG_Q, self).train(loss)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [0]:
# Environment, then the online critic/actor (Q, Mu) and their target copies
# (Q_p, Mu_p). Construction order is Q, Q_p, Mu, Mu_p.
env = gym.make('Pendulum-v0')
Q = DDPG_Q()
Q_p = DDPG_Q()
Mu = DDPG_Mu()
Mu_p = DDPG_Mu()

# Hyperparameters.
GAMMA = 0.99  # discount factor
BATCH_SIZE = 32
BUFFER_SIZE = 30000  # replay buffer size
# May be swapped for another data structure (list, queue, dict, etc.).
replay_buffer = []
TAU = 0.01  # soft update parameter
PARAMETER_NOISE_COEF = 0.0005
ITER = 10  # number of learning iterations per call to training()

In [0]:
def training():
    """Run ITER DDPG update steps on mini-batches from the replay buffer.

    Each iteration:
      1. samples a mini-batch with make_minibatch();
      2. fits the critic Q to the one-step TD target computed from the
         target networks Q_p and Mu_p (smooth L1 loss);
      3. updates the actor Mu to maximize Q(s, Mu(s));
      4. soft-updates both target networks.

    Each stored transition is laid out as
    [state, action, reward, next_state, [done]], where state, action and
    next_state are torch tensors.
    """
    for _ in range(ITER):
        mini_batch = make_minibatch()
        # torch.stack avoids the slow "tensor from a list of ndarrays" path.
        state = torch.stack([t[0] for t in mini_batch]).float()
        action = torch.stack([t[1] for t in mini_batch]).float().view(-1, 1)
        reward = torch.tensor([t[2] for t in mini_batch], dtype=torch.float).view(-1, 1)
        next_state = torch.stack([t[3] for t in mini_batch]).float()
        done = torch.tensor([t[4] for t in mini_batch], dtype=torch.float).view(-1, 1)

        # The TD target is a constant for the critic loss, so no gradient is
        # tracked through the target networks. (1 - done) zeroes the
        # bootstrap term for transitions flagged as terminal.
        with torch.no_grad():
            target = reward + GAMMA * Q_p(next_state, Mu_p(next_state)) * (1 - done)

        evaluation = Q(state, action)
        Q.train(F.smooth_l1_loss(evaluation, target))

        # Actor objective: maximize the critic's value of the actor's own action.
        Mu.train(-Q(state, Mu(state)).mean())

        soft_target_update(Q, Q_p)
        soft_target_update(Mu, Mu_p)
    
def soft_target_update(model, model_p, tau=None):
    """Soft-update (Polyak-average) a target network toward its online network.

    Every target parameter becomes tau * online + (1 - tau) * target.
    Called from training().

    Args:
        model: the online network (Q or Mu).
        model_p: the target network (Q_p or Mu_p), modified in place.
        tau: interpolation factor; defaults to the global TAU when None.
    """
    if tau is None:
        tau = TAU
    # no_grad keeps the in-place copy out of autograd without going through
    # the discouraged `.data` attribute.
    with torch.no_grad():
        for source_param, target_param in zip(model.parameters(), model_p.parameters()):
            target_param.copy_(tau * source_param + (1 - tau) * target_param)
        
def init_target_param(model, model_p):
    """Initialize a target network as an exact copy of its online network.

    Loads the weights of `model` (online network) into `model_p` (target
    network). Intended to be called once, before learning starts.
    """
    source_weights = model.state_dict()
    model_p.load_state_dict(source_weights)
        
def parameter_noise(model, coef=None):
    """Perturb every parameter of `model` in place with Gaussian noise.

    Adds standard-normal noise scaled by `coef` to each parameter. The
    change is permanent: it modifies the weights themselves.

    Args:
        model: the network whose parameters are perturbed.
        coef: noise scale; defaults to the global PARAMETER_NOISE_COEF
            when None.
    """
    if coef is None:
        coef = PARAMETER_NOISE_COEF
    with torch.no_grad():
        for param in model.parameters():
            # Match the parameter's dtype/device so this also works for
            # models that do not live on the CPU.
            noise = torch.randn(param.size(), dtype=param.dtype, device=param.device)
            param.add_(noise * coef)
            
def store_transition(s, a, r, s_prime, done):
    """Append one transition to the global replay buffer (FIFO).

    The stored record is [s, a, scaled_reward, s_prime, [done_flag]], where
    done_flag is 1 when `done` is truthy (episode ended) and 0 otherwise.
    When the buffer grows past BUFFER_SIZE, the oldest record is dropped.

    Args:
        s: state at the current time step.
        a: action taken.
        r: raw reward; stored as (r + 8.1368022) / 8.1368022.
        s_prime: next state.
        done: whether the episode ended on this step.
    """
    # NOTE(review): 8.1368022 looks like half of Pendulum's largest reward
    # magnitude, which would map rewards into roughly [-1, 1] -- confirm.
    reward_scale = 8.1368022
    transition = [
        s,
        a,
        (r + reward_scale) / reward_scale,
        s_prime,
        [1] if done else [0],
    ]
    replay_buffer.append(transition)
    if len(replay_buffer) > BUFFER_SIZE:
        # Evict the oldest entry first (FIFO).
        del replay_buffer[0]
    
def make_minibatch():
    """Return up to BATCH_SIZE transitions sampled at random from replay_buffer.

    Sampling is without replacement: the buffer indices are randomly
    permuted and the first BATCH_SIZE of them are used. If the buffer holds
    fewer than BATCH_SIZE transitions, all of them are returned in random
    order. Called from training(); the `done` entry of each transition lets
    the caller zero the value of terminal states.
    """
    order = np.random.permutation(len(replay_buffer))
    return [replay_buffer[i] for i in order[:BATCH_SIZE]]

In [0]:
reward_sum = 0.0
reward_list = []
# Target networks start out as exact copies of the online networks.
init_target_param(Mu, Mu_p)
init_target_param(Q, Q_p)

for ep in range(20000):
    observation = env.reset()
    done = False
    # Roll out one full episode, recording every transition.
    while not done:
        state = torch.tensor(observation, dtype=torch.float)
        # Exploration: the actor's weights are perturbed before each action.
        parameter_noise(Mu)
        action = Mu(state).detach()
        observation, reward, done, _ = env.step([action.item()])
        reward_sum += reward
        next_state = torch.tensor(observation, dtype=torch.float)
        store_transition(state, action, reward, next_state, done)

    # Learning starts once the buffer holds at least 500 transitions.
    if len(replay_buffer) >= 500:
        training()

    # Every 20 episodes: report the mean episode reward and stop early once
    # it exceeds -200.
    if (ep + 1) % 20 == 0:
        mean_reward = reward_sum / 20.0
        print('Episode %d'%ep,', Reward mean : %f'%mean_reward)
        if mean_reward > -200.0:
            break
        reward_sum = 0.0

Episode 19 , Reward mean : -1602.609720
Episode 39 , Reward mean : -1708.350725
Episode 59 , Reward mean : -1570.443458
Episode 79 , Reward mean : -1539.465416
Episode 99 , Reward mean : -1534.141949
Episode 119 , Reward mean : -1547.988985
Episode 139 , Reward mean : -1383.283302
Episode 159 , Reward mean : -1147.604477
Episode 179 , Reward mean : -1129.390469
Episode 199 , Reward mean : -928.448008
Episode 219 , Reward mean : -876.730808
Episode 239 , Reward mean : -730.286245
Episode 259 , Reward mean : -664.381433
Episode 279 , Reward mean : -611.206902
Episode 299 , Reward mean : -422.814679
Episode 319 , Reward mean : -369.548631
Episode 339 , Reward mean : -211.543494
Episode 359 , Reward mean : -212.000850
Episode 379 , Reward mean : -208.511873
Episode 399 , Reward mean : -242.093678
Episode 419 , Reward mean : -251.851300
Episode 439 , Reward mean : -207.447564
Episode 459 , Reward mean : -404.888980
Episode 479 , Reward mean : -222.584052
Episode 499 , Reward mean : -223.297