## Cart-Pole 예제에서 Deep SARSA 에이전트 학습
### x : Cart의 가로상의 위치
### θ : Pole의 각도
### dx/dt : Cart의 속도
### dθ/dt : θ의 각속도
###### 에피소드 종료 조건 : θ가 15도 이상이 되거나, 원점으로부터 x의 거리가 2.4 이상이 되었을 때 에피소드가 종료된다

In [2]:
import numpy as np
import pandas as pd
import random
from collections import defaultdict
import gym

import torch
import torch.nn as nn
import torch.optim as optim

import matplotlib.pyplot as plt

In [3]:
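# Define the Deep SARSA agent class.
# The network has three linear layers; the two hidden layers use 32 units each.
# The optimizer is Adam.
# Activation function: maps the weighted sum of a layer's inputs to its output signal.
# ReLU -> outputs 0 for inputs below 0, and the input itself for inputs above 0.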

class DeepSARSA:
    """Deep SARSA agent with an epsilon-greedy policy and an MLP Q-function.

    The Q-network maps a state vector of length ``num_states`` to one
    Q-value per action. It is trained online, one transition at a time,
    with the SARSA temporal-difference target
    ``reward + gamma * Q(next_state, next_action)``.
    """

    def __init__(self, num_states, num_actions, *, alpha=0.001, gamma=0.99,
                 epsilon=1.0, epsilon_decay=0.99995, epsilon_min=0.01,
                 hidden_size=32):
        """Build the Q-network and its Adam optimizer.

        Args:
            num_states: Length of the state vector fed to the network.
            num_actions: Number of discrete actions (network outputs).
            alpha: Learning rate passed to Adam.
            gamma: Discount factor applied to the bootstrapped Q-value.
            epsilon: Initial exploration probability.
            epsilon_decay: Multiplicative decay applied to ``epsilon`` on
                every call to ``update``.
            epsilon_min: Floor that decay never pushes ``epsilon`` below.
            hidden_size: Width of each of the two hidden layers.
        """
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.model = nn.Sequential(
            nn.Linear(self.num_states, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, self.num_actions),
        )
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.alpha)

    def update(self, state, action, reward, next_state, next_action, done):
        """Run one SARSA temporal-difference step and return the squared error.

        Args:
            state: 1-D tensor for the current state.
            action: Index of the action taken in ``state``.
            reward: Scalar reward received for that action.
            next_state: 1-D tensor for the resulting state.
            next_action: Index of the action chosen in ``next_state``.
            done: Whether the episode ended on this transition; when true
                the bootstrapped future value is dropped from the target.

        Returns:
            The squared TD error for this transition as a Python float.
        """
        # Epsilon decays once per learning step, not once per episode.
        self.decrease_epsilon()

        q_value = self.model(state)[action]

        # The target is treated as a constant, so no graph is needed for it.
        with torch.no_grad():
            next_q_value = self.model(next_state)[next_action]

        q_target = reward + (1 - int(done)) * self.gamma * next_q_value
        q_error = (q_target - q_value) ** 2

        self.optimizer.zero_grad()
        q_error.backward()
        self.optimizer.step()

        return q_error.item()

    def decrease_epsilon(self):
        """Decay ``epsilon`` multiplicatively, never dropping below the floor."""
        # Only decay while above the floor, so an epsilon that was set below
        # ``epsilon_min`` on purpose (e.g. 0 for evaluation) is left alone.
        if self.epsilon > self.epsilon_min:
            # Clamp: a single multiplicative step could otherwise undershoot.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

    def act(self, state):
        """Pick an action for ``state`` with the epsilon-greedy policy.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if np.random.rand() < self.epsilon:
            # Cast so both branches return the same type.
            return int(np.random.choice(self.num_actions))

        # Action selection needs no gradients.
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()

In [4]:
import gym
from gym import wrappers
# Create the CartPole-v1 environment and wrap it in a Monitor that writes
# its recordings to ./video.
env = gym.make('CartPole-v1')
env = wrappers.Monitor(env, "./video", force=True)
observation = env.reset()

# Size the Q-network from the environment itself instead of hard-coding the
# state and action counts (4 and 2 for CartPole).
agent = DeepSARSA(env.observation_space.shape[0], env.action_space.n)

# Bare expression: in a notebook cell this displays the initial observation.
observation

array([-0.00431391,  0.02705465, -0.01444569, -0.03511632])

In [5]:
# Train the agent online with SARSA: each step uses the transition
# (obs, action, reward, next_obs, next_action) for one network update.
NUM_EPISODES = 500
LOG_INTERVAL = 10

rewards = []  # total reward collected in each episode
try:
    for ep in range(NUM_EPISODES):
        done = False
        obs = torch.FloatTensor(env.reset())
        action = agent.act(obs)

        ep_rewards = 0
        losses = []
        while not done:
            next_obs, reward, done, info = env.step(action)
            next_obs = torch.FloatTensor(next_obs)

            # SARSA is on-policy: the next action is chosen before the
            # update and is then actually executed on the next iteration.
            next_action = agent.act(next_obs)

            loss = agent.update(obs, action, reward, next_obs, next_action, done)
            losses.append(loss)

            ep_rewards += reward
            obs = next_obs
            action = next_action

        rewards.append(ep_rewards)
        # The loop body runs at least once per episode, so losses is non-empty.
        ep_loss = sum(losses) / len(losses)
        if (ep + 1) % LOG_INTERVAL == 0:
            print(f"episode : {ep + 1}, eps: {agent.epsilon:.3f}, "
                  f"loss : {ep_loss:.1f}, rewards: {ep_rewards}")
finally:
    # Close the environment even if training is interrupted or raises.
    env.close()

  allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass


episode : 10, eps: 0.991, loss : 1.6, rewards: 16.0
episode : 20, eps: 0.983, loss : 9.5, rewards: 19.0
episode : 30, eps: 0.971, loss : 8.1, rewards: 19.0
episode : 40, eps: 0.962, loss : 11.9, rewards: 23.0
episode : 50, eps: 0.953, loss : 5.5, rewards: 32.0
episode : 60, eps: 0.941, loss : 8.7, rewards: 45.0
episode : 70, eps: 0.933, loss : 30.1, rewards: 10.0
episode : 80, eps: 0.920, loss : 13.8, rewards: 17.0
episode : 90, eps: 0.911, loss : 11.3, rewards: 20.0
episode : 100, eps: 0.899, loss : 9.7, rewards: 20.0
episode : 110, eps: 0.889, loss : 12.1, rewards: 11.0
episode : 120, eps: 0.878, loss : 5.0, rewards: 30.0
episode : 130, eps: 0.867, loss : 2.7, rewards: 34.0
episode : 140, eps: 0.859, loss : 3.5, rewards: 10.0
episode : 150, eps: 0.846, loss : 8.4, rewards: 28.0
episode : 160, eps: 0.834, loss : 9.9, rewards: 21.0
episode : 170, eps: 0.819, loss : 7.9, rewards: 48.0
episode : 180, eps: 0.805, loss : 12.9, rewards: 27.0
episode : 190, eps: 0.794, loss : 7.3, rewards: 1

In [17]:
ls

'Deep SARSA_1.ipynb'   [0m[01;34mvideo[0m/
