In [2]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import gymnasium as gym
from tqdm import tqdm
import numpy as np

## Model

In [454]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(8, 30),
            nn.ReLU(),
            nn.Linear(30, 30),
            nn.ReLU(),
            nn.Linear(30, 4)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork().to(device)
print(model)

Using cpu device
NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=8, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=30, bias=True)
    (3): ReLU()
    (4): Linear(in_features=30, out_features=4, bias=True)
  )
)


### Define Loss Function and Optimizer

### Create Gymnasium Environment Wrapper Class

In [3]:
class LunarLander(gym.Wrapper):
   def __init__(self, env):
      super(LunarLander, self).__init__(env)
      self.observation, info = env.reset()

   def step_env(self,action:int):
      reset = False
      next_observation, reward, terminated, truncated, info = self.step(action)
      observation = self.observation
      self.observation = next_observation

      if terminated or truncated:
         self.observation, info = self.reset()
         reset = True

      return reward, observation, next_observation, reset
   
   @property
   def terminated(self):
      return self.terminated or self.truncated
   
env = gym.make("LunarLander-v2")
lander = LunarLander(env)

In [18]:
env.action_space.shape

()

In [457]:
def train(model, loss_fn, optimizer):
    model.train()
    size = 100_000
    gamma = 0.4
    epsillon = 0.4
    for step in tqdm(range(size)):
        # Compute prediction error
        pred = model(torch.from_numpy(lander.observation))
        pred = pred/pred.sum()
        
        #run environment
        action = torch.argmax(pred).item()
        action = action if np.random.rand() > epsillon else np.random.randint(0,4)
        reward, observation, next_observation, reset = lander.step_env(action)
        next_pred = model(torch.from_numpy(lander.observation))
        next_pred = next_pred/next_pred.sum()
        update_pred = pred.clone()
        update_pred[action] = reward + gamma*next_pred[action]
        update_pred = update_pred/update_pred.sum()

        loss = loss_fn(update_pred, pred)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % (size // 10) == 0:
            loss, current = loss.item(), step
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [458]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train( model, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------


  0%|          | 57/100000 [00:00<02:59, 557.40it/s]

loss: 80.933586  [    0/100000]


 10%|█         | 10111/100000 [00:16<02:24, 621.56it/s]

loss: 0.497794  [10000/100000]


 20%|██        | 20105/100000 [00:36<01:58, 672.78it/s]

loss: 0.005048  [20000/100000]


 30%|███       | 30088/100000 [00:51<01:42, 680.60it/s]

loss: 0.126986  [30000/100000]


 40%|████      | 40086/100000 [01:06<01:48, 554.54it/s]

loss: 0.349433  [40000/100000]


 50%|█████     | 50062/100000 [01:28<01:51, 447.35it/s]

loss: 0.009533  [50000/100000]


 60%|██████    | 60043/100000 [01:51<01:16, 519.06it/s]

loss: 0.211757  [60000/100000]


 70%|███████   | 70047/100000 [02:13<01:04, 461.79it/s]

loss: 0.008781  [70000/100000]


 80%|████████  | 80077/100000 [02:36<00:43, 453.57it/s]

loss: 0.181116  [80000/100000]


 90%|█████████ | 90062/100000 [02:54<00:13, 738.14it/s]

loss: 0.001065  [90000/100000]


100%|██████████| 100000/100000 [03:08<00:00, 529.46it/s]


Epoch 2
-------------------------------


  0%|          | 122/100000 [00:00<02:46, 599.07it/s]

loss: 0.006043  [    0/100000]


  2%|▏         | 2361/100000 [00:03<02:42, 602.67it/s]


KeyboardInterrupt: 

In [None]:
gamma = 0.55
epsillon = 0.1
pred = model(torch.from_numpy(lander.observation))

#run environment
action = torch.argmax(pred).item()
action = action if np.random.rand() > epsillon else np.random.randint(0,4)
reward, observation, next_observation, reset = lander.step_env(0)
next_pred = model(torch.from_numpy(lander.observation))
update_pred = pred.clone()
update_pred[action] = reward + gamma*next_pred[action]

loss = loss_fn(pred, update_pred)
print(action, reward)

2 -0.9421270524817942


In [459]:

env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset(seed=42)
for _ in range(1000):
   pred = model(torch.from_numpy(lander.observation))
   action = torch.argmax(pred).item()
   observation, reward, terminated, truncated, info = env.step(action)

   if terminated or truncated:
      observation, info = env.reset()

env.close()

In [None]:
torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Model State to model.pth")

In [None]:
model = NeuralNetwork().to(device)
model.load_state_dict(torch.load("model.pth"))

In [None]:
model.eval()
x, y = test_data[0][0], test_data[0][1]
with torch.no_grad():
    x = x.to(device)
    pred = model(x)
    predicted, actual = classes[pred[0].argmax(0)], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')