- state : 하나의 float값 [-3,3]
- action : 네트워크가 예측한 y_hat, 실수
- reward : `-MSE(y_hat, y_true)` 또는 `-abs(y_hat - y_true)` 형태로 설정

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torch.distributions import Normal

In [6]:
# 다항식 함수 정의 : `y = x^5 + 1.5x^4 - 5x^3 - 7.5x^2 + 4x + 6`

coeffs = [1, 1.5, -5.0, -7.5, 4.0, 6.0]

def target_function(x):
    # x는 torch tensor
    powers = torch.stack([x ** i for i in reversed(range(1, 6 + 1))], dim=-1)
    coeffs_tensor = torch.tensor(coeffs, dtype=torch.float32).to(x.device)
    return (powers * coeffs_tensor).sum(dim=-1, keepdim=True)


In [7]:
class PolicyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(1, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.mu_head = nn.Linear(64, 1)
        self.log_std = nn.Parameter(torch.zeros(1))  # 학습 가능한 표준편차(log)

    def forward(self, x):
        features = self.net(x)
        mu = self.mu_head(features)
        std = self.log_std.exp() + 1e-5  # 안정성을 위해 epsilon 추가
        return mu, std


In [8]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())

2.7.0+cu118
11.8
True


In [9]:
def reward_fn(y_pred, y_true):
    return -((y_pred - y_true) ** 2)  # MSE 기반 보상

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("사용 중인 디바이스:", device)
print("CUDA 사용 가능 여부:", torch.cuda.is_available())
if device == 'cuda':
    print("사용 중인 GPU 이름:", torch.cuda.get_device_name(0))

policy = PolicyNet().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-3)

reward_history = []

for epoch in range(1000):  # 에포크 수 조절 가능
    log_probs = []
    rewards = []

    for _ in range(256):  # 배치 사이즈
        x = torch.FloatTensor(1).uniform_(-3, 3).unsqueeze(0).to(device)  # (1, 1)
        y_true = target_function(x).to(device)

        mu, std = policy(x)
        dist = Normal(mu, std)
        y_sample = dist.sample()
        log_prob = dist.log_prob(y_sample)

        reward = reward_fn(y_sample, y_true)

        log_probs.append(log_prob)
        rewards.append(reward)

    log_probs = torch.cat(log_probs)
    rewards = torch.cat(rewards)
    
    # 보상 정규화
    rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-6)

    loss = -(log_probs.squeeze() * rewards.squeeze()).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    avg_reward = rewards.mean().item()
    reward_history.append(avg_reward)

    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Avg Reward: {avg_reward:.4f}")


사용 중인 디바이스: cuda
CUDA 사용 가능 여부: True
사용 중인 GPU 이름: NVIDIA GeForce RTX 3070
Epoch 0, Avg Reward: 0.0000
Epoch 100, Avg Reward: -0.0000
Epoch 200, Avg Reward: 0.0000
Epoch 300, Avg Reward: -0.0000
Epoch 400, Avg Reward: 0.0000
Epoch 500, Avg Reward: -0.0000
Epoch 600, Avg Reward: -0.0000
Epoch 700, Avg Reward: -0.0000
Epoch 800, Avg Reward: -0.0000
Epoch 900, Avg Reward: 0.0000


In [None]:
policy.eval()

x_test = torch.linspace(-3, 3, 1000).unsqueeze(1).to(device)
with torch.no_grad():
    mu, _ = policy(x_test)
    y_pred = mu.cpu().numpy()

# 실제 값
y_true = np.polyval(coeffs, x_test.cpu().numpy())

# Plot
plt.figure(figsize=(8, 6))
plt.plot(x_test.cpu().numpy(), y_true, label="Target Function", color='red')
plt.scatter(x_test.cpu().numpy(), y_pred, s=1, label="REINFORCE Agent Output", color='blue')
plt.title("REINFORCE Agent vs Polynomial Function")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
plt.plot(reward_history)
plt.title("Reward Progress Over Training")
plt.xlabel("Epoch")
plt.ylabel("Average Reward")
plt.grid(True)
plt.show()
