In [1]:
import os
import gym
import gymfc
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.models import Sequential
from keras import layers
%matplotlib inline

import nest_asyncio
nest_asyncio.apply()


class RewScale(gym.RewardWrapper):
    def __init__(self, env, scale):
        gym.RewardWrapper.__init__(self, env)
        self.scale = scale

    def reward(self, r):
        return r * self.scale

Using TensorFlow backend.


In [7]:
# env = gym.make("CartPole-v0")
current_dir = os.getcwd()
config_path = os.path.join(current_dir, "../configs/iris.config")
os.environ["GYMFC_CONFIG"] = config_path
env = gym.make('AttFC_GyroErr-MotorVel_M4_Con-v0')
env = RewScale(env, 0.1)

env.reset()
n_actions = env.action_space.shape[0]
state_dim = env.observation_space.shape

Starting gzserver with process ID= 3157


In [8]:
# create input variables. We only need <s,a,R> for REINFORCE
states = tf.placeholder('float32', (None,)+state_dim, name="states")
actions = tf.placeholder('int32', name="action_ids")
cumulative_rewards = tf.placeholder('float32', name="cumulative_returns")

model = Sequential()
model.add(layers.InputLayer(state_dim))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(n_actions))

logits = model(states)
policy = tf.nn.softmax(logits)
log_policy = tf.nn.log_softmax(logits)

indices = tf.stack([tf.range(tf.shape(log_policy)[0]), actions], axis=-1)
log_policy_for_actions = tf.gather_nd(log_policy, indices)

probabilities = log_policy_for_actions * cumulative_rewards
J = tf.reduce_mean(probabilities)

# actions_probability = tf.gather_nd(policy[0], actions)
entropy = -tf.reduce_sum(policy * tf.log(policy)) / tf.cast(tf.shape(policy)[0], tf.float32)
# entropy = -tf.reduce_mean(policy * tf.log(policy))


loss = -J - 0.1*entropy
update = tf.train.AdamOptimizer().minimize(loss, var_list=model.weights)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [9]:
def get_cumulative_rewards(rewards, gamma=0.99):
    cumulatived_rewards = np.zeros(len(rewards))
    local_sum = 0
    for i in range(len(rewards) - 1, -1, -1):
        local_sum *= gamma
        local_sum += rewards[i]
        cumulatived_rewards[i] = local_sum 

    return np.array(cumulatived_rewards)

def train_step(_states, _actions, _rewards):
    """given full session, trains agent with policy gradient"""
    _cumulative_rewards = get_cumulative_rewards(_rewards)
    update.run({states: _states, actions: _actions,
                cumulative_rewards: _cumulative_rewards})

def get_action_proba(s): 
    result = policy.eval({states: [s]})[0]
    return result

def generate_session(t_max=1000):
    """play env with REINFORCE agent and train at the session end"""

    # arrays to record session
    states, actions, rewards = [], [], []
    s = env.reset()
    for t in range(t_max):
        # action probabilities array aka pi(a|s)
        action_probas = get_action_proba(s)
        a = np.random.choice([0, 1], 1, p=action_probas)[0]
        new_s, r, done, info = env.step(a)
        # record session history to train later
        states.append(s)
        actions.append(a)
        rewards.append(r)

        s = new_s
        if done:
            break

    train_step(states, actions, rewards)
    return sum(rewards)

def test_policy():
    actuals = []
    desireds = []
    env = gym.make()
    ob = env.reset()
    while True:
        desired = env.omega_target
        actual = env.omega_actual
        actuals.append(actual)
        desireds.append(desired)
        print("sp=", desired, " rate=", actual)
        action_probas = get_action_proba(ob)
        action = np.random.choice([1, 1, 1, 1], 1, p=action_probas)[0]
        ob, _, done, _ = env.step(action)
        if done:
            break
    plot_step_response(np.array(desireds), np.array(actuals))

In [10]:
import time
s = tf.InteractiveSession()
s.run(tf.global_variables_initializer())

start = time.time()
for i in range(10000):
    
    rewards = [generate_session() for _ in range(100)]  # generate new sessions

    print('Epoch: {}'.format(i))
    print("Time: {:.2}s".format(time.time() - start))
    print("Mean reward: %.3f" % (np.mean(rewards)))
    print("=====================================")
    if np.mean(rewards) > 19:
        print("You Win!")
        break

ValueError: 'a' and 'p' must have same size

In [2]:
env.action_space.sample()

NameError: name '__file__' is not defined

In [46]:
def plot_step_response(desired, actual,
                       end=1., title=None,
                       step_size=0.001, threshold_percent=0.1):
    """
        Args:
            threshold (float): Percent of the start error
    """

    # actual = actual[:,:end,:]
    end_time = len(desired) * step_size
    t = np.arange(0, end_time, step_size)

    # desired = desired[:end]
    threshold = threshold_percent * desired

    plot_min = -math.radians(350)
    plot_max = math.radians(350)

    subplot_index = 3
    num_subplots = 3

    f, ax = plt.subplots(num_subplots, sharex=True, sharey=False)
    f.set_size_inches(10, 5)
    if title:
        plt.suptitle(title)
    ax[0].set_xlim([0, end_time])
    res_linewidth = 2
    linestyles = ["c", "m", "b", "g"]
    reflinestyle = "k--"
    error_linestyle = "r--"

    # Always
    ax[0].set_ylabel("Roll (rad/s)")
    ax[1].set_ylabel("Pitch (rad/s)")
    ax[2].set_ylabel("Yaw (rad/s)")

    ax[-1].set_xlabel("Time (s)")

    """ ROLL """
    # Highlight the starting x axis
    ax[0].axhline(0, color="#AAAAAA")
    ax[0].plot(t, desired[:, 0], reflinestyle)
    ax[0].plot(t, desired[:, 0] - threshold[:, 0], error_linestyle, alpha=0.5)
    ax[0].plot(t, desired[:, 0] + threshold[:, 0], error_linestyle, alpha=0.5)

    r = actual[:, 0]
    ax[0].plot(t[:len(r)], r, linewidth=res_linewidth)

    ax[0].grid(True)

    """ PITCH """

    ax[1].axhline(0, color="#AAAAAA")
    ax[1].plot(t, desired[:, 1], reflinestyle)
    ax[1].plot(t, desired[:, 1] - threshold[:, 1], error_linestyle, alpha=0.5)
    ax[1].plot(t, desired[:, 1] + threshold[:, 1], error_linestyle, alpha=0.5)
    p = actual[:, 1]
    ax[1].plot(t[:len(p)], p, linewidth=res_linewidth)
    ax[1].grid(True)

    """ YAW """
    ax[2].axhline(0, color="#AAAAAA")
    ax[2].plot(t, desired[:, 2], reflinestyle)
    ax[2].plot(t, desired[:, 2] - threshold[:, 2], error_linestyle, alpha=0.5)
    ax[2].plot(t, desired[:, 2] + threshold[:, 2], error_linestyle, alpha=0.5)
    y = actual[:, 2]
    ax[2].plot(t[:len(y)], y, linewidth=res_linewidth)
    ax[2].grid(True)

    plt.savefig("gymfc-ppo-step-response.pdf")


In [47]:
test_policy()

AttributeError: 'RewScale' object has no attribute 'omega_target'