In [4]:
# a few packages we need to import
import IPython
import pendulum
import torch
from pathlib import Path
from tqdm import tqdm
import matplotlib.animation as animation
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
%matplotlib widget

# We define the neural network used for Q-learning:
# 3 hidden layers with 64 nodes each (ReLU activations),
# 2 inputs (the state),
# 3 outputs, one Q-value per possible control.
D_in, H, D_out = 2, 64, 3

q_function = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)

# We initialize the network parameters to 0.
# The parameters must be modified in place: rebinding the loop variable
# (`params = torch.zeros_like(params)`) leaves the module's weights untouched.
# torch.nn.init.zeros_ fills each tensor in place without recording autograd
# history.
for params in q_function.parameters():
    torch.nn.init.zeros_(params)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# possible controls (indexed by the position of the selected Q-value output)
possible_controls = np.array([-5., 0., 5.])

# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a CUDA model still loads on a CPU-only machine.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict needs, and avoids executing arbitrary pickled code.
checkpoint = torch.load(
    "models/lipm_q_function_working.pth",
    map_location=device,
    weights_only=True,
)
q_function.load_state_dict(checkpoint)
q_function.to(device)

print(f"Loaded model:\n{q_function}")
print(f"Model on device:\n{next(q_function.parameters()).device}")


# Initial state of pendulum for animation
x0 = np.zeros((2, 1))


def controller(x):
    """Return the control whose predicted Q-value is the lowest for state ``x``.

    The state is converted to a float tensor on ``device``, passed through
    ``q_function``, and the index of the smallest output selects an entry of
    ``possible_controls``.

    Args:
        x: Array-like state with ``D_in`` elements. Any layout holding exactly
            one state (e.g. shape ``(2,)`` or ``(2, 1)``) is accepted, since it
            is flattened into a single-row batch.
            NOTE(review): presumably [theta, theta_dot]; confirm against the
            ``pendulum`` module.

    Returns:
        The selected element of ``possible_controls``.
    """
    # Pure inference: skip autograd bookkeeping on every control step.
    with torch.no_grad():
        state = torch.as_tensor(
            x, dtype=torch.float, device=device).reshape(1, -1)
        u_pred = torch.argmin(q_function(state)).item()
    return possible_controls[u_pred]


# Run the pendulum animation from x0 under the learned controller,
# with push disabled and the movie saved.
pendulum.animate_robot(
    x0,
    controller,
    push=False,
    save_movie=True,
)

  checkpoint = torch.load("models/lipm_q_function_working.pth")


Loaded model:
Sequential(
  (0): Linear(in_features=2, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=64, bias=True)
  (5): ReLU()
  (6): Linear(in_features=64, out_features=3, bias=True)
)
Model on device:
cuda:0


In [3]:
# Same animation as before, this time with push enabled.
# NOTE(review): presumably `push` perturbs the pendulum during the run;
# confirm in the pendulum module.
pendulum.animate_robot(
    x0,
    controller,
    push=True,
    save_movie=True,
)

In [None]:
# Plot cost per episode
# NOTE(review): cost_per_episode is not defined in the cells shown here;
# it presumably comes from a training cell. Confirm before running.
cost_fig, cost_ax = plt.subplots()
cost_ax.plot(cost_per_episode)
cost_ax.set_xlabel('Episode')
cost_ax.set_ylabel('Cost')
cost_ax.set_title('Cost per Episode')
plt.show()

# Plot learned value function and policy
# Evaluate the Q-network on a 100 x 100 grid of (theta, theta_dot) states.
theta_vals = np.linspace(0, 2 * np.pi, 100)
theta_dot_vals = np.linspace(-10, 10, 100)
Theta, ThetaDot = np.meshgrid(theta_vals, theta_dot_vals)

# One row per grid point, in row-major order so the results can be reshaped
# straight back onto the grid.
grid_states = np.stack([Theta.ravel(), ThetaDot.ravel()], axis=1)

# A single batched forward pass replaces 10,000 per-point network calls.
with torch.no_grad():
    q_values = q_function(
        torch.as_tensor(grid_states, dtype=torch.float, device=device)
    ).cpu().numpy()

# Negative because it's a cost
ValueFunction = (-q_values.min(axis=1)).reshape(Theta.shape).astype(Theta.dtype)
# The control attaining the lowest Q-value at each grid point.
Policy = possible_controls[q_values.argmin(axis=1)].reshape(Theta.shape)

# Filled contour plot of the learned value function over the state grid
value_fig, value_ax = plt.subplots()
value_contours = value_ax.contourf(
    Theta, ThetaDot, ValueFunction, levels=50, cmap='viridis')
value_fig.colorbar(value_contours, ax=value_ax, label='Value Function')
value_ax.set_xlabel('Theta')
value_ax.set_ylabel('Theta Dot')
value_ax.set_title('Learned Value Function')
plt.show()

# Filled contour plot of the selected control over the state grid
policy_fig, policy_ax = plt.subplots()
policy_contours = policy_ax.contourf(
    Theta, ThetaDot, Policy, levels=50, cmap='coolwarm')
policy_fig.colorbar(policy_contours, ax=policy_ax, label='Policy (Control)')
policy_ax.set_xlabel('Theta')
policy_ax.set_ylabel('Theta Dot')
policy_ax.set_title('Learned Policy')
plt.show()