In [26]:
import numpy as np
import gym
import math

def random_shooting_mpc(start_state, model, reward_fn, horizon, n_samples, gamma):
    """
    Random shooting model predictive control for the Pendulum-v1 environment in OpenAI Gym.
    
    Args:
    - start_state (numpy array): the initial state of the environment
    - model (function): a function that takes a state and an action, and returns the next state
    - reward_fn (function): a function that takes a state and returns a reward
    - horizon (int): the number of timesteps in the MPC horizon
    - n_samples (int): the number of control sequences to sample
    - gamma (float): the discount factor
    
    Returns:
    - optimal_control (numpy array): the optimal control sequence for the MPC horizon
    """
    
    # Initialize the control sequences randomly
    control_sequences = np.random.uniform(low=-2.0, high=2.0, size=(n_samples, horizon))

    # tensorize control_sequences
    control_sequences = torch.tensor(control_sequences, dtype=torch.float32)
    
    # Evaluate the control sequences
    rewards = np.zeros(n_samples)
    for i in range(n_samples):
        state = start_state
        for t in range(horizon):
            action = control_sequences[i, t]
            next_state = model(state, action)
            rewards[i] += reward_fn(next_state, action) * gamma**t
            state = next_state
    
    # Find the optimal control sequence
    optimal_index = np.argmax(rewards)
    optimal_control = control_sequences[optimal_index]
    
    return optimal_control


def pendulum_model(state, perturbed_action):
    # true dynamics from gym
    # state: [cos(theta), sin(theta), theta_dot]
    # th is angle from vertical
    th = math.atan2(state[1], state[0])
    thdot = state[2]

    g = 10
    m = 1
    l = 1
    dt = 0.05

    u = perturbed_action
    u = torch.clamp(u, -2, 2)

    newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l ** 2) * u) * dt
    newth = th + newthdot * dt
    newthdot = torch.clamp(newthdot, -8, 8)

    state = torch.cat((newth, newthdot), dim=1)
    return state

def pendulum_reward(state, action):
    cos_theta, sin_theta, theta_dot = state[0], state[1], state[2]
    return -(theta_dot**2 + 0.1*cos_theta**2 + 0.001*(action**2))




In [27]:
import torch

env = gym.make('Pendulum-v1')
horizon = 10
n_samples = 1000
gamma = 0.98

state = env.reset()
rewards = []
print(state)
for i in range(10000):
    action = random_shooting_mpc(state, pendulum_model, pendulum_reward, horizon, n_samples, gamma)[0]
    next_state, reward, done, info = env.step([action])
    state = next_state
    rewards.append(reward)
    if i % 1000 == 0:
        print("at step " + str(i))

# calculate statistics
rewards_mean = np.mean(rewards)
rewards_std = np.std(rewards)
rewards_max = np.max(rewards)
rewards_min = np.min(rewards)
rewards_median = np.median(rewards)
rewards_25 = np.percentile(rewards, 25)
rewards_75 = np.percentile(rewards, 75)

print("mean: " + str(rewards_mean))
print("std: " + str(rewards_std))
print("max: " + str(rewards_max))
print("min: " + str(rewards_min))
print("median: " + str(rewards_median))
print("25: " + str(rewards_25))
print("75: " + str(rewards_75))

env.close()

[-0.9620051   0.27303156 -0.505954  ]


RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated