In [2]:
import sys
# Extend the module search path so the `src.*` imports in the next cell can
# resolve (presumably this folder is the project root containing `src`).
# NOTE(review): this is an absolute, machine-specific Windows path; it must be
# adjusted on any other machine.
sys.path.append('c:\\users\\sy\\ReinforcementLearningAtoZ')

# In Jupyter, the folder path added here does not need a leading '/'.

In [3]:
import gym
import torch

from src.part3.MLP import MultiLayerPerceptron as MLP
from src.part4.PolicyGradient import REINFORCE
from src.common.train_utils import EMAMeter, to_tensor
from src.common.memory.episodic_memory import EpisodicMemory


In [4]:
import torch.nn as nn 
# nn = neural network
from torch.distributions.categorical import Categorical 
# Categorical is one of the Distribution types provided by torch.distributions.

# REINFORCE inherits from nn.Module.
class REINFORCE(nn.Module):
    """Monte-Carlo policy-gradient (REINFORCE) agent for discrete actions.

    Args:
        policy: network that maps a state tensor to unnormalised action
            logits (it must return logits, not probabilities).
        gamma: discount factor used by `update` when accumulating returns.
        lr: learning rate of the Adam optimiser over the policy parameters.
    """

    def __init__(self,
                 policy: nn.Module,
                 gamma: float = 1.0,
                 lr: float = 0.0002):
        super(REINFORCE, self).__init__()
        self.policy = policy  # make sure that 'policy' returns logits!
        self.gamma = gamma
        self.opt = torch.optim.Adam(params=self.policy.parameters(),
                                    lr=lr)
        # Small constant added to the standard deviation in `update_episodes`
        # so that return normalisation never divides by exactly zero.
        self._eps = 1e-25

    def get_action(self, state):
        """Sample an action from the softmax policy without tracking gradients.

        Args:
            state: state tensor accepted by `self.policy`.

        Returns:
            Tensor of sampled action indices (one per batch element of the
            logits produced by the policy).
        """
        with torch.no_grad():
            logits = self.policy(state)
            dist = Categorical(logits=logits)
            a = dist.sample()  # sample action from softmax policy
        return a

    @staticmethod
    def _pre_process_inputs(episode):
        """Reverse an episode in time so returns can be accumulated backwards.

        Declared as a static method: it takes only `episode`, yet it is
        invoked as `self._pre_process_inputs(episode)` from `update`. Without
        the decorator that call would pass the instance as an extra argument
        and raise TypeError.

        Args:
            episode: tuple `(states, actions, rewards)` of tensors whose first
                dimension is the time step, e.g. for Cart-Pole
                states [num_steps x 4], actions [num_steps],
                rewards [num_steps].

        Returns:
            The same three tensors, each flipped along dimension 0.
        """
        states, actions, rewards = episode  # stored as a tuple

        states = states.flip(dims=[0])
        actions = actions.flip(dims=[0])
        rewards = rewards.flip(dims=[0])

        return states, actions, rewards

    def update(self, episode):
        """Sample-by-sample REINFORCE update over one episode.

        One optimiser step is taken per time step, walking the episode from
        its last step to its first so the return `g` can be built
        incrementally. This version is computationally inefficient compared
        with `update_episodes`.

        Args:
            episode: tuple `(states, actions, rewards)`; see
                `_pre_process_inputs` for the expected layout.
        """
        states, actions, rewards = self._pre_process_inputs(episode)

        g = 0  # return accumulated from the end of the episode
        for s, a, r in zip(states, actions, rewards):
            g = r + self.gamma * g
            dist = Categorical(logits=self.policy(s))
            # log pi_theta(a|s); log_prob works on the normalised logits and
            # is therefore stable without adding an epsilon inside the log.
            log_prob = dist.log_prob(a)

            # Don't forget the leading '-': PyTorch optimisers minimise, while
            # the policy-gradient objective is to be maximised.
            pg_loss = -log_prob * g

            self.opt.zero_grad()
            pg_loss.backward()
            self.opt.step()

    def update_episodes(self, states, actions, returns, use_norm=False):
        """Batched REINFORCE update over all given steps at once.

        Args:
            states: state batch fed to the policy, [num_steps x state_dim].
            actions: action index taken at each step; flattened internally, so
                both [num_steps] and [num_steps x 1] are accepted.
            returns: return for each step; flattened internally, so both
                [num_steps] and [num_steps x 1] are accepted.
            use_norm: if True, standardise the returns (zero mean, unit std)
                before computing the loss.
        """
        # Flatten so that log-probabilities and returns align element-wise
        # instead of broadcasting to a [num_steps x num_steps] matrix.
        returns = returns.reshape(-1)
        actions = actions.reshape(-1)

        if use_norm:
            returns = (returns - returns.mean()) / (returns.std() + self._eps)

        dist = Categorical(logits=self.policy(states))
        log_prob = dist.log_prob(actions)  # [num_steps]

        self.opt.zero_grad()

        # Policy-gradient loss, averaged over all steps in the batch.
        pg_loss = -(log_prob * returns).mean()
        pg_loss.backward()

        self.opt.step()
        


In [5]:
# Train a REINFORCE agent on CartPole-v1 and report an exponential moving
# average of the episode return every `print_every` episodes.
env = gym.make('CartPole-v1')
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.n

# `net` plays the role of the policy (REINFORCE expects it to return logits).
net = MLP(s_dim, a_dim, [128])
agent = REINFORCE(net, lr = 0.001)
ema = EMAMeter()
memory = EpisodicMemory(max_size=100, gamma=1.0)

n_eps = 10000
update_every = 1   # run a policy update every `update_every` episodes
print_every = 500  # print the EMA of the return every `print_every` episodes

for ep in range(n_eps):
    s = env.reset()
    cum_r = 0

    while True:
        # Use the observation size read from the environment rather than a
        # hard-coded 4, so the loop stays consistent with `s_dim`.
        s = to_tensor(s, size=(1, s_dim))
        a = agent.get_action(s)
        ns, r, done, info = env.step(a.item())

        # preprocess data: wrap reward and done flag as [1 x 1] tensors
        r = torch.ones(1, 1) * r
        done = torch.ones(1, 1) * done

        memory.push(s, a, r, torch.tensor(ns), done)

        s = ns
        cum_r += r  # note: `r` is a [1 x 1] tensor here, so `cum_r` becomes one too
        if done:
            break

    ema.update(cum_r)
    if ep % print_every == 0:
        print("Episode {} || EMA: {} ".format(ep, ema.s))

    if ep % update_every == 0:
        # Distinct names keep the sampled batch from clobbering the rollout
        # variables `s`, `a` and `done` used above.
        batch_s, batch_a, _, _, _, batch_g = memory.get_samples()
        agent.update_episodes(batch_s, batch_a, batch_g, use_norm=False)
        memory.reset()




Episode 0 || EMA: tensor([[17.]]) 
Episode 500 || EMA: tensor([[258.4794]]) 
Episode 1000 || EMA: tensor([[424.2029]]) 
Episode 1500 || EMA: tensor([[301.2433]]) 
Episode 2000 || EMA: tensor([[489.7496]]) 
Episode 2500 || EMA: tensor([[500.]]) 
Episode 3000 || EMA: tensor([[497.8359]]) 
Episode 3500 || EMA: tensor([[500.]]) 
Episode 4000 || EMA: tensor([[500.]]) 
Episode 4500 || EMA: tensor([[500.]]) 
Episode 5000 || EMA: tensor([[500.0000]]) 
Episode 5500 || EMA: tensor([[482.1875]]) 
Episode 6000 || EMA: tensor([[384.2612]]) 
Episode 6500 || EMA: tensor([[435.6323]]) 
Episode 7000 || EMA: tensor([[500.]]) 
Episode 7500 || EMA: tensor([[500.]]) 
Episode 8000 || EMA: tensor([[492.5000]]) 
Episode 8500 || EMA: tensor([[500.]]) 
Episode 9000 || EMA: tensor([[499.9999]]) 
Episode 9500 || EMA: tensor([[500.]]) 


In [1]:
# Debug probe of the last sampled action. `a` is only bound by the training
# loop cell, so running this cell first on a fresh kernel raises NameError
# (as the recorded output below shows).
a.item()

NameError: name 'a' is not defined