<a href="https://colab.research.google.com/github/slcnvly/REINFORCE-Cartpole/blob/master/REINFORCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [5]:
# Hyperparameters
learning_rate = 0.0002
gamma = 0.98

class PolicyNet(nn.Module):
    def __init__(self):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 2) # CartPole Action: 0(왼), 1(오)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        # DQN은 그냥 값을 내보냈지만, REINFORCE는 '확률'을 내보내야 함
        prob = F.softmax(x, dim=1)
        return prob

In [6]:
class REINFORCE(nn.Module):
    def __init__(self):
        super(REINFORCE, self).__init__()
        self.model = PolicyNet()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.data = [] # 에피소드 동안의 데이터를 저장할 리스트

    def put_data(self, item):
        # (보상, 로그확률)을 저장
        self.data.append(item)

    def train_net(self):
        R = 0
        self.optimizer.zero_grad()
        loss = 0
        G_t = 0

        # G_t를 뒤에서부터 계산
        for r, log_prob in self.data[::-1]:
            # 여기에 구현해 주세요
            G_t += r * (gamma ** R)
            R += 1
            loss += -log_prob * G_t
            pass

        loss.backward()
        self.optimizer.step()
        self.data = [] # 학습했으면 데이터를 비워줌 (On-policy)

In [7]:
def main():
    env = gym.make('CartPole-v1')
    agent = REINFORCE()
    score = 0.0

    for n_epi in range(10000):
        s, _ = env.reset()
        done = False

        while not done:
            # =확률 분포(Categorical)를 이용해 샘플링
            s = torch.FloatTensor([s])
            probs = agent.model(s)
            m = Categorical(probs)
            a = m.sample()
            log_prob = m.log_prob(a)

            s_prime, r, done, truncated, _ = env.step(a.item())
            # 나중에 계산할 때 필요한 '보상'과 '우리가 선택한 확률의 로그값'만 저장
            agent.put_data((r, log_prob))

            s = s_prime
            score += r

            if done:
                break

        # DQN은 while문 안에서 학습했지만, REINFORCE는 에피소드가 끝나야 학습
        agent.train_net()

        if n_epi % 20 == 0 and n_epi != 0:
            print(f"Episode: {n_epi} | Score: {score/20.0:.1f}")
            score = 0.0

    env.close()

if __name__ == '__main__':
    main()

  s = torch.FloatTensor([s])


Episode: 20 | Score: 23.5
Episode: 40 | Score: 21.5
Episode: 60 | Score: 27.2
Episode: 80 | Score: 25.8
Episode: 100 | Score: 30.1
Episode: 120 | Score: 23.5
Episode: 140 | Score: 34.8
Episode: 160 | Score: 25.4
Episode: 180 | Score: 25.1
Episode: 200 | Score: 33.3
Episode: 220 | Score: 34.6
Episode: 240 | Score: 41.9
Episode: 260 | Score: 44.0
Episode: 280 | Score: 39.1
Episode: 300 | Score: 35.6
Episode: 320 | Score: 51.5
Episode: 340 | Score: 43.3
Episode: 360 | Score: 41.0
Episode: 380 | Score: 40.9
Episode: 400 | Score: 52.1
Episode: 420 | Score: 49.8
Episode: 440 | Score: 47.0
Episode: 460 | Score: 43.8
Episode: 480 | Score: 58.0
Episode: 500 | Score: 58.0
Episode: 520 | Score: 57.4
Episode: 540 | Score: 56.3
Episode: 560 | Score: 52.4
Episode: 580 | Score: 60.8
Episode: 600 | Score: 60.0
Episode: 620 | Score: 73.3
Episode: 640 | Score: 66.0
Episode: 660 | Score: 65.4
Episode: 680 | Score: 65.7
Episode: 700 | Score: 71.5
Episode: 720 | Score: 88.8
Episode: 740 | Score: 75.2
Episo

KeyboardInterrupt: 