# 策略梯度算法

In [2]:
import gymnasium as gym
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import rl_utils

定义策略网络 **PolicyNet**，其输入是某个状态，输出则是该状态下的动作概率分布

In [None]:
class PolicyNet(torch.nn.Module):
    """Two-layer fully connected policy network.

    Maps a batch of states to a probability distribution over actions:
    ``Linear -> ReLU -> Linear -> softmax``.

    Args:
        state_dim: Number of input features per state.
        hidden_dim: Width of the hidden layer.
        action_dim: Number of output action probabilities.
    """

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return action probabilities for ``x``.

        The softmax is taken over dimension 1, so ``x`` must carry a leading
        batch dimension.
        """
        hidden = self.fc1(x).relu()
        logits = self.fc2(hidden)
        return logits.softmax(dim=1)



In [11]:
class REINFORCE:
    """Monte-Carlo policy-gradient (REINFORCE) agent.

    Holds a ``PolicyNet`` and an Adam optimizer, samples actions from the
    policy, and performs one gradient step per finished episode.

    Args:
        state_dim: Number of features in a state.
        hidden_dim: Hidden-layer width of the policy network.
        action_dim: Number of discrete actions.
        learning_rate: Step size handed to the Adam optimizer.
        gamma: Discount factor used when accumulating returns.
        device: Device on which the network and all tensors are placed.
    """

    def __init__(self, state_dim, hidden_dim, action_dim,
                 learning_rate, gamma, device):
        self.policy_net = PolicyNet(
            state_dim=state_dim,
            hidden_dim=hidden_dim,
            action_dim=action_dim,
        ).to(device)

        # The optimizer takes the learning rate (not action_dim) and has no
        # ``.to()`` method; it follows the parameters it was given, which
        # already live on ``device``.
        self.optimizer = torch.optim.Adam(
            self.policy_net.parameters(), lr=learning_rate
        )

        self.gamma = gamma
        self.device = device

    def take_action(self, state):
        """Sample one action for a single (unbatched) state.

        Args:
            state: A single state (array-like); a batch dimension is added
                before it is fed to the policy network.

        Returns:
            The sampled action index as a Python int.
        """
        # as_tensor + unsqueeze avoids the slow list-of-ndarray conversion
        # that ``torch.tensor([state])`` triggers.
        state = torch.as_tensor(
            state, dtype=torch.float, device=self.device
        ).unsqueeze(0)
        # Sampling needs no autograd graph.
        with torch.no_grad():
            probs = self.policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        return action_dist.sample().item()

    def update(self, transitioin_dict):
        """Apply one REINFORCE gradient step from a completed episode.

        Args:
            transitioin_dict: Mapping with per-step sequences under the keys
                ``'rewards'``, ``'states'`` and ``'actions'``. The historical
                misspelling ``'acitons'`` is still accepted when
                ``'actions'`` is absent.

        Raises:
            KeyError: If a required key is missing.
        """
        reward_list = transitioin_dict['rewards']
        state_list = transitioin_dict['states']
        if 'actions' in transitioin_dict:
            action_list = transitioin_dict['actions']
        else:
            # Backward compatibility with callers using the old typo.
            action_list = transitioin_dict['acitons']

        if len(reward_list) == 0:
            # Nothing to learn from an empty episode.
            return

        G = 0
        self.optimizer.zero_grad()
        # Walk the episode backwards so the discounted return G can be
        # accumulated incrementally.
        steps = zip(
            reversed(reward_list), reversed(state_list), reversed(action_list)
        )
        for reward, step_state, step_action in steps:
            state = torch.as_tensor(
                step_state, dtype=torch.float, device=self.device
            ).unsqueeze(0)
            action = torch.tensor(
                [[step_action]], dtype=torch.long, device=self.device
            )

            log_prob = torch.log(self.policy_net(state).gather(1, action))
            G = self.gamma * G + reward
            loss = -G * log_prob
            # Gradients from every step accumulate before the single
            # optimizer step below.
            loss.backward()
        self.optimizer.step()


