In [2]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gymnasium as gym
# Create the FrozenLake environment and record the sizes (`.n`) of its
# observation and action spaces; later cells read these two globals.
env = gym.make('FrozenLake-v1')
num_states, num_actions = env.observation_space.n, env.action_space.n

In [3]:
class Policy_Grad(nn.Module):
    """Single-hidden-layer policy network producing action probabilities.

    The network is ``Linear -> ReLU -> Linear -> Softmax``.

    Args:
        num_states: Size of the input feature vector.
        num_actions: Number of outputs (one probability per action).
        hidden_1: Width of the hidden layer.
    """

    def __init__(self, num_states, num_actions, hidden_1):
        super().__init__()

        self.fc1 = nn.Linear(num_states, hidden_1)
        self.act1 = nn.ReLU()

        self.out = nn.Linear(hidden_1, num_actions)
        # Normalise over the last (action) axis. For a single 1-D state vector
        # this is the same as dim=0, but unlike dim=0 it also yields one valid
        # distribution per row when a (batch, num_states) tensor is passed.
        self.act_output = nn.Softmax(dim = -1)

    def forward(self, x):
        """Map input features to action probabilities.

        Args:
            x: Tensor whose last dimension has size ``num_states``; may be a
                single 1-D vector or carry leading batch dimensions.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``num_actions`` that sums to 1.
        """
        hidden = self.act1(self.fc1(x))
        return self.act_output(self.out(hidden))
    
# Global policy network; the hidden layer is given the same width as the input.
model = Policy_Grad(
    num_states = num_states,
    num_actions = num_actions,
    hidden_1 = num_states,
)

In [4]:
def state_to_tensor(state, size = None):
    """One-hot encode a discrete state index as a float32 tensor.

    Args:
        state: Integer index of the state to encode.
        size: Length of the returned vector. When omitted, the module-level
            ``num_states`` is used, so existing one-argument calls behave as
            before.

    Returns:
        A 1-D ``torch.float32`` tensor of length ``size`` holding 1 at
        position ``state`` and 0 everywhere else.

    Raises:
        IndexError: If ``state`` is outside ``[0, size)``.
    """
    if size is None:
        size = num_states
    # Reject out-of-range indices explicitly: plain tensor indexing would let
    # a negative index wrap around and silently encode the wrong state.
    if not 0 <= state < size:
        raise IndexError(f"state {state} is out of range for {size} states")
    state_tensor = torch.zeros(size, dtype = torch.float32)
    state_tensor[state] = 1
    return state_tensor

In [5]:
def run(envName):
    """Play one episode with uniformly sampled actions and print a summary.

    Args:
        envName: Environment id passed to ``gym.make`` (for example
            ``'FrozenLake-v1'``).

    Returns:
        None. The visited states, chosen actions, received rewards and the
        number of timesteps are printed.
    """
    ep_states = []
    ep_actions = []
    ep_rewards = []

    # Honour the caller's environment id; the argument used to be ignored in
    # favour of a hard-coded 'FrozenLake-v1'.
    env = gym.make(envName)

    try:
        state = env.reset()[0]
        terminated = False
        truncated = False

        while not (terminated or truncated):
            # Record the state the action is taken from, before stepping.
            ep_states.append(state)

            action = env.action_space.sample()
            ep_actions.append(action)

            state, reward, terminated, truncated, _info = env.step(action)
            ep_rewards.append(reward)
    finally:
        # Release the environment even if reset/step raises.
        env.close()

    print("EPISODE SUMMARY")
    print(f"States: {ep_states}")
    print(f"Actions: {ep_actions}")
    print(f"Rewards: {ep_rewards}")
    print(f"Total timesteps: {len(ep_rewards)}")

In [6]:
def run_NN(MAX_EPISODES = 1, render_mode = 'human'):
    """Roll out episodes on FrozenLake-v1, sampling actions from ``model``.

    Each step one-hot encodes the current state with ``state_to_tensor``,
    feeds it to the module-level ``model`` and samples an action from the
    returned probabilities.

    Args:
        MAX_EPISODES: Number of episodes to play.
        render_mode: Passed to ``gym.make``. Defaults to ``'human'`` as
            before; pass ``None`` to run without a window.

    Returns:
        A ``(total_reward, total_successes)`` tuple: the list of per-episode
        reward sums and the number of steps that yielded a reward of 1.
        A summary of the last episode played is also printed.
    """
    env = gym.make('FrozenLake-v1', render_mode = render_mode)

    total_reward = []
    total_successes = 0
    # Initialised up front so the summary still prints when MAX_EPISODES is 0.
    ep_states, ep_actions, ep_rewards = [], [], []

    try:
        for _ in range(MAX_EPISODES):
            state = env.reset()[0]
            ep_states, ep_actions, ep_rewards = [], [], []
            total_ep_rewards = 0
            terminated, truncated = False, False

            # gather trajectory
            while not terminated and not truncated:
                # encode state in tensor
                state_tensor = state_to_tensor(state)

                # add state to ep_states list
                ep_states.append(state)

                # call the module (not .forward) so registered hooks run
                action_probs = model(state_tensor)

                # float32 softmax output can miss NumPy's sum-to-1 tolerance,
                # so sample from a renormalised float64 copy
                probs = action_probs.detach().numpy().astype(np.float64)
                probs /= probs.sum()

                # choose an action; the count comes from the model output
                # rather than a hard-coded 4, and int() keeps the log readable
                action = int(np.random.choice(len(probs), p = probs))

                # add action to ep_actions list
                ep_actions.append(action)

                # take step in environment
                state, reward, terminated, truncated, _info = env.step(action)

                # add reward to ep_rewards list
                ep_rewards.append(reward)
                total_ep_rewards += reward
                if reward == 1:
                    total_successes += 1

            total_reward.append(total_ep_rewards)
    finally:
        # Close the environment (and its render window) even on error.
        env.close()

    print("EPISODE SUMMARY")
    print(f"States: {ep_states}")
    print(f"Actions: {ep_actions}")
    print(f"Rewards: {ep_rewards}")
    print(f"Total timesteps: {len(ep_rewards)}")

    return total_reward, total_successes
