In [55]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gymnasium as gym
from sklearn.preprocessing import normalize


# --- Environment settings -------------------------------------------------
map_size = '8x8'
slippery = False

# --- Hyperparameters --------------------------------------------------------
# advice_lr is the step size passed to the optimizer in the supervised
# update cell further down; training_lr and gamma are not used in the cells
# visible here.
training_lr = 0.9
advice_lr = 0.9
gamma = 1

# --- Environment and the sizes (`.n`) of its observation/action spaces ------
env = gym.make('FrozenLake-v1', map_name=map_size, is_slippery=slippery)
num_states = env.observation_space.n
num_actions = env.action_space.n

In [56]:
class Policy_Grad(nn.Module):
    """Single-hidden-layer network that maps a state encoding to action probabilities.

    Architecture: Linear(num_states -> hidden_1) -> ReLU ->
    Linear(hidden_1 -> num_actions) -> Softmax over the action axis.

    Args:
        num_states: Width of the input vector.
        num_actions: Number of probabilities produced per input.
        hidden_1: Number of units in the hidden layer.
    """

    def __init__(self, num_states, num_actions, hidden_1):
        super().__init__()

        self.fc1 = nn.Linear(num_states, hidden_1)
        self.act1 = nn.ReLU()

        self.out = nn.Linear(hidden_1, num_actions)
        # Normalise over the LAST axis (the actions). For a 1-D input this is
        # the same axis as dim=0, but for a batched (batch, num_states) input
        # dim=0 would normalise across the batch and the rows would no longer
        # be probability distributions.
        self.act_output = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return action probabilities for `x`.

        Args:
            x: Float tensor whose last dimension has size `num_states`;
                any leading dimensions are treated as batch dimensions.

        Returns:
            Tensor with the same leading dimensions as `x` and a last
            dimension of size `num_actions` that sums to 1.
        """
        hidden = self.act1(self.fc1(x))
        return self.act_output(self.out(hidden))

In [58]:
def state_to_tensor(state, n_states=None):
    """One-hot encode a state index as a 1-D float32 tensor.

    Args:
        state: Integer index of the state to mark.
        n_states: Length of the returned vector. When omitted, the
            notebook-level `num_states` is used, so existing
            `state_to_tensor(state)` calls behave as before.

    Returns:
        A float32 tensor of length `n_states` that is 0 everywhere except
        for a 1 at position `state`.

    Raises:
        IndexError: If `state` is outside `[0, n_states)`. Negative values
            are rejected explicitly because plain tensor indexing would
            wrap them around and silently mark the wrong state.
    """
    if n_states is None:
        n_states = num_states
    if not 0 <= state < n_states:
        raise IndexError(
            f"state {state} is out of range for {n_states} states"
        )
    state_tensor = torch.zeros(n_states, dtype=torch.float32)
    state_tensor[state] = 1
    return state_tensor

In [59]:
# Seed PyTorch's RNG immediately before construction so the randomly
# initialised layer weights (and therefore every probability printed below)
# are the same on each fresh run and each re-run of this cell.
torch.manual_seed(0)

# The hidden layer is given the same width as the input encoding.
model = Policy_Grad(num_states, num_actions, hidden_1=num_states)

In [60]:
# Record the policy's output for one fixed state before any optimisation
# step, so it can be compared with the post-update output later.
state = 14
state_tensor = state_to_tensor(state)
# Call the module itself rather than `.forward()`: `nn.Module.__call__` is the
# supported entry point and also runs any registered hooks.
untrained = model(state_tensor)
print(untrained)

tensor([0.2550, 0.2328, 0.2634, 0.2488], grad_fn=<SoftmaxBackward0>)


In [68]:
# One supervised update that pulls the policy's output for `state` towards a
# fixed target distribution.
state = 14
state_tensor = state_to_tensor(state)

# Target action distribution for this state.
fused_probs = torch.tensor([0.17, 0.17, 0.49, 0.17], dtype=torch.float32)

loss_fn = nn.MSELoss()
# NOTE(review): a fresh Adam optimizer is created every time this cell runs,
# so no optimizer state carries over between runs. The output recorded after
# this single step with `advice_lr` is a near one-hot distribution rather than
# something close to `fused_probs` -- confirm that step size is intended.
optimizer = optim.Adam(model.parameters(), lr=advice_lr)

optimizer.zero_grad()
# Call the module itself rather than `.forward()` so `nn.Module.__call__`
# (and any registered hooks) runs.
action_probs = model(state_tensor)
loss = loss_fn(action_probs, fused_probs)
loss.backward()
optimizer.step()

In [70]:
# Re-evaluate the policy for the same fixed state after the optimisation step.
state = 14
state_tensor = state_to_tensor(state)
# Call the module itself rather than `.forward()`: `nn.Module.__call__` is the
# supported entry point and also runs any registered hooks.
trained = model(state_tensor)
print(trained)

tensor([1.4569e-24, 1.5365e-24, 1.0000e+00, 8.2851e-25],
       grad_fn=<SoftmaxBackward0>)


In [64]:
# Per-action change in probability produced by the update (after minus before).
print(torch.sub(trained, untrained))

tensor([-0.2550, -0.2328,  0.7366, -0.2488], grad_fn=<SubBackward0>)
