### Ideas

* Standardize state
    * Use upper_bounds and lower_bounds where known ahead of time (defined by the environment, i.e. in the sim).
    * Where bounds in a state dimension are unknown, scale by mean and std of everything in memory.
    * Consider scaling some dimensions if they are not normally distributed.
* Standardize and scale rewards
    * Try automatically detecting if scaling (e.g. logarithmic) results in a more normal distribution.
* Modulate noise while training
    * Try implementing this using hyperopt.
        * After each episode report training score and ask hyperopt for new noise parameters.
        

In [6]:
%reload_ext autoreload
%autoreload 2

import warnings; warnings.simplefilter('ignore')

import numpy as np
from ddpg_agent.contrib.physics_sim import PhysicsSim

class Task():
    """Task (environment) that defines the goal and provides feedback to the agent.

    The state handed to the agent is a flat list made of the 6-dim sim pose
    recorded after each of the `action_repeat` sim steps, followed by the
    sim's current linear velocity and angular velocity.
    """
    def __init__(self, init_pose=None, init_velocities=None,
        init_angle_velocities=None, runtime=10., target_pos=None,
        vert_dist_thresh=1, horiz_dist_thresh=1,
        target_steps_within_goal=1,):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent; defaults to (0, 0, 10)
            vert_dist_thresh: max |z - target z| (inclusive) still counted as being at the goal
            horiz_dist_thresh: max x/y distance from the target (exclusive) still counted as being at the goal
            target_steps_within_goal: the episode ends with a bonus once
                steps_within_goal / action_repeat reaches this value
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        # TODO: Make action_repeat align with agent.action_repeat
        self.action_repeat = 3

        # One 6-dim pose per repeated sim step, plus 3 linear and 3 angular velocities.
        self.state_size = self.action_repeat * 6 + 6
        # NOTE(review): -1/+1 are used as bounds for the three angles of each pose
        # and for the six velocity entries; these look like placeholders rather
        # than true physical limits -- confirm against PhysicsSim.
        self.observation_space = Space(
            list(list(self.sim.lower_bounds) + [-1] * 3) * self.action_repeat + [-1] * 6,
            list(list(self.sim.upper_bounds) + [1] * 3) * self.action_repeat + [1] * 6)
        # Four rotor speeds, each bounded to [0, 900].
        self.action_space = Space([0, 0, 0, 0], [900, 900, 900, 900])
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])
        self.target_steps_within_goal = target_steps_within_goal
        self.steps_within_goal = 0
        self.horiz_dist_thresh = horiz_dist_thresh
        self.vert_dist_thresh = vert_dist_thresh

    def reached_goal(self):
        """Return whether the current sim pose lies within both goal thresholds."""
        horiz_distance_from_goal = np.sqrt((self.sim.pose[0] - self.target_pos[0])**2
                                           + (self.sim.pose[1] - self.target_pos[1])**2)
        vert_distance_from_goal = np.abs(self.sim.pose[2] - self.target_pos[2])
        # Horizontal check is strict (<), vertical check is inclusive (<=).
        return horiz_distance_from_goal < self.horiz_dist_thresh and \
                vert_distance_from_goal <= self.vert_dist_thresh

    def get_reward(self):
        """Uses current pose of sim to return reward.

        Side effect: increments `steps_within_goal` while the goal is reached
        and resets it to 0 as soon as it is not.
        """
        # Reward for staying at target altitude: 0.1 exactly at the target,
        # falling off linearly with the altitude error.
        # NOTE: divides by the target altitude, so a target z of 0 is not supported.
        target_alt = self.target_pos[2]
        reward = .1 * (target_alt - np.abs(self.sim.pose[2] - target_alt)) / target_alt

        # Reward for being within the goal region.
        if self.reached_goal():
            self.steps_within_goal += 1
            reward += 1
        else:
            self.steps_within_goal = 0
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done.

        Applies `rotor_speeds` for `action_repeat` sim steps, summing the
        per-step rewards and collecting the pose after every step.

        Returns a tuple (next_state, reward, done, None).
        """
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds) # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = list(np.concatenate(pose_all)) + list(self.sim.v) + list(self.sim.angular_v)
        # Punish and end episode for crashing
        if self.sim.pose[2] <= 0:
            reward -= 100
            done = True
        # end episode if at goal state
        if self.steps_within_goal / self.action_repeat >= self.target_steps_within_goal:
            reward += 100
            done = True
        # Scale reward with a sign-preserving log so that penalties stay
        # negative (log1p(-reward) alone would turn a -100 crash penalty into
        # a positive reward indistinguishable from reaching the goal).
        # TODO: How can the agent detect need for reward scaling automatically?
        reward = np.sign(reward) * np.log1p(np.abs(reward))
        return next_state, reward, done, None

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        # No pose history exists yet, so the initial pose is repeated to fill
        # every pose slot of the state.
        state = list(np.concatenate([self.sim.pose] * self.action_repeat)) + \
                list(self.sim.v) + list(self.sim.angular_v)
        self.steps_within_goal = 0
        return state

class Space():
    """Box described by element-wise lower and upper bounds.

    Attributes set by the constructor:
        low:   lower bounds as a numpy array
        high:  upper bounds as a numpy array
        shape: shape shared by both bound arrays
    """
    def __init__(self, low, high):
        lower_arr, upper_arr = np.array(low), np.array(high)
        # Both bound arrays must describe the same set of dimensions.
        assert lower_arr.shape == upper_arr.shape, "Expected bounds to be of same shape."
        self.low, self.high = lower_arr, upper_arr
        self.shape = lower_arr.shape

In [7]:
# extreme=max(np.abs(min(episode.rewards)),np.abs(max(episode.rewards)))


In [8]:
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot as plt
import matplotlib as mpl
import matplotlib.gridspec as gridspec
from sklearn.preprocessing import normalize
%matplotlib inline

def plot_episode(episode):
    """Draw a one-figure summary of a single episode.

    Panels: 3-D flight path and horizontal position (both coloured by
    reward), altitude, the four actions (raw vs. with noise), orientation,
    velocity and angular velocity.

    Reads from `episode`: episode_idx, score, epsilon, states, rewards,
    raw_actions and actions.
    """
    def state_column(idx):
        # One state component across every recorded step.
        return [s[idx] for s in episode.states]

    fig = plt.figure(figsize=(15,7))
    fig.suptitle("Episode %i, score: %.3f, epsilon: %.4g"%(episode.episode_idx, episode.score, episode.epsilon))

    # Left third: flight path. Right two thirds: a 2x3 grid of detail plots.
    main_cols = gridspec.GridSpec(1, 3, figure=fig)
    right_col_grid = main_cols[1:].subgridspec(2,3,wspace=.2,hspace=.3)

    # Colour scale symmetric around zero; fall back to [-1, 1] when every
    # reward in the episode is identical.
    rewards = episode.rewards
    lowest, highest = min(rewards), max(rewards)
    if lowest == highest:
        lowest, highest = -1, 1
    extreme = max(np.abs(lowest), np.abs(highest))
    reward_cmap = mpl.cm.ScalarMappable(
        norm=mpl.colors.SymLogNorm(linthresh=1, linscale=3, vmin=-extreme, vmax=extreme),
        cmap=mpl.cm.get_cmap('RdYlGn'))
    reward_cmap.set_array([])
    point_colors = [reward_cmap.to_rgba(r) for r in rewards]

    # 3-D flight path with a horizontal reward colourbar underneath.
    pos_ax = fig.add_subplot(main_cols[0], projection='3d', title="Flight Path")
    pos_ax.scatter(state_column(0), state_column(1), state_column(2),
                   c=point_colors, edgecolor='k', )
    fig.colorbar(reward_cmap, ax=pos_ax, shrink=.8, pad=.02, label="reward", orientation='horizontal')

    alt_ax = fig.add_subplot(right_col_grid[0,0], title="Altitude", xlabel='step')
    alt_ax.plot(state_column(2), color='magenta')

    # One small stacked axis per action dimension.
    actions_grid = right_col_grid[0,2].subgridspec(4,1)
    for rotor in range(4):
        action_ax = fig.add_subplot(actions_grid[rotor], ylim=(-100,1000), xlabel='step', yticks=[0,400,800])
        action_ax.plot([a[rotor] for a in episode.raw_actions], color='gray', label='raw action')
        action_ax.plot([a[rotor] for a in episode.actions], label='action + noise', color='darkorange')
        if rotor == 0:
            # Only the top axis carries the shared legend.
            action_ax.legend(loc='lower center', bbox_to_anchor=(.5,.9))

    # Orientation: state indices 3-5, each column normalised before plotting.
    s_colors = ['slateblue','royalblue','magenta','steelblue','skyblue','deepskyblue']
    s_labels = ['x pos','y pos','altitude','roll', 'pitch', 'yaw']
    rot_ax = fig.add_subplot(right_col_grid[1,0], title="Orientation", xlabel='step')
    for dim in range(3,6):
        series = np.array(state_column(dim))
        rot_ax.plot(normalize(series.reshape(-1,1),axis=0), color=s_colors[dim], label=s_labels[dim])
    rot_ax.legend(loc='upper right')

    # NOTE(review): indices 18-23 are hard-coded; they presumably match a state
    # of three stacked 6-dim poses followed by v and angular_v -- confirm if
    # the state layout changes.
    v_ax = fig.add_subplot(right_col_grid[1,1], title="Velocity", xlabel='step')
    for dim, axis_name in zip(range(18, 21), ("x", "y", "z")):
        v_ax.plot(state_column(dim), label=axis_name)
    v_ax.legend(loc='upper right')

    ang_v_ax = fig.add_subplot(right_col_grid[1,2], title="Angular Velocity", xlabel='step')
    for dim, axis_name in zip(range(21, 24), ("x", "y", "z")):
        ang_v_ax.plot(state_column(dim), label=axis_name)
    ang_v_ax.legend(loc='upper right')

    # Top-down view of the path, coloured by reward like the 3-D plot.
    horiz_pos_ax = fig.add_subplot(right_col_grid[0,1], title="Horizontal Position")
    horiz_pos_ax.scatter(state_column(0), state_column(1),
                         c=point_colors, edgecolor='k',)

    fig.show()
# for ep in agent.history.training_episodes[:6]:
#     plot_episode(ep)
# plot_episode(agent.history.test_episodes[71])
# plot_episode(agent.history.training_episodes[-1])

In [9]:
# import csv
from ddpg_agent.agent import DDPG, Q_a_frames_spec

# Environment: start at rest at z=8.5, directly below the target at z=10.
# With target_steps_within_goal=25 the episode ends with a bonus once
# steps_within_goal / action_repeat reaches 25 (see Task.step).
task = Task(init_pose=np.array([0., 0., 8.5, 0., 0., 0.]), 
            init_velocities=np.array([0., 0., 0.]), 
            init_angle_velocities=np.array([0., 0., 0.]), 
            runtime=10., 
            vert_dist_thresh=1, horiz_dist_thresh=1, 
            target_steps_within_goal=25,
            target_pos=np.array([0., 0., 10.]),
           )

# NOTE(review): presumably configures the Q/action visualisation frames
# (grid sizes and which state/action dimensions to sweep) -- confirm against
# ddpg_agent.agent.Q_a_frames_spec.
q_a_frames_spec = Q_a_frames_spec(task, nx=16, ny=16, na=11, x_dim=4, y_dim=2, a_dim=0)

# DDPG agent and its hyperparameters. The ou_* values presumably parameterise
# the exploration noise process -- TODO confirm against ddpg_agent.agent.DDPG.
agent = DDPG(task, ou_mu=0, ou_theta=.1, ou_sigma=.25, 
             discount_factor=.999, replay_buffer_size=100000, replay_batch_size=1024,
             tau_actor=.1, tau_critic=.2, 
             relu_alpha_actor=.01, relu_alpha_critic=.01,
             lr_actor=.0001, lr_critic=.0005, activation_fn_actor='tanh',
#              l2_reg_actor=.01, l2_reg_critic=.01, 
             bn_momentum_actor=0, bn_momentum_critic=.7, 
             q_a_frames_spec=q_a_frames_spec, do_preprocessing=False,
             input_bn_momentum_actor=.7,
             input_bn_momentum_critic=.7,
#              activity_l2_reg=.01,
             output_action_regularizer=10,
            )
# agent.print_summary()

# Column names for a per-timestep log row; the commented expression below
# shows the matching row layout. Not referenced by the other visible cells.
labels = ['time', 'x', 'y', 'z', 'phi', 'theta', 'psi', 'x_velocity',
          'y_velocity', 'z_velocity', 'phi_velocity', 'theta_velocity',
          'psi_velocity', 'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4']
# [task.sim.time] + list(task.sim.pose) + list(task.sim.v) + list(task.sim.angular_v) + list(rotor_speeds)

In [10]:
# Initial training run of 100 episodes. Per the log output below, epsilon
# drops by eps_decay (20) after every episode, starting from eps (2000).
# NOTE(review): action_repeat=1 here while Task.action_repeat is 3 -- see the
# TODO in Task.__init__ about aligning the two.
agent.train_n_episodes(100, eps=2000, eps_decay=20, action_repeat=1, run_tests=True, gen_q_a_frames_every_n_steps=0 )

Episode 1 - epsilon: 1980, memory size: 27, num steps: 27, training score: -95.15, test score: -95.90
Episode 2 - epsilon: 1960, memory size: 49, num steps: 22, training score: -96.25, test score: -95.90
Episode 3 - epsilon: 1940, memory size: 71, num steps: 22, training score: -96.48, test score: -95.90
Episode 4 - epsilon: 1920, memory size: 95, num steps: 24, training score: -96.10, test score: -95.90
Episode 5 - epsilon: 1900, memory size: 120, num steps: 25, training score: -95.88, test score: -95.90
Episode 6 - epsilon: 1880, memory size: 144, num steps: 24, training score: -96.16, test score: -95.90
Episode 7 - epsilon: 1860, memory size: 167, num steps: 23, training score: -96.12, test score: -95.90
Episode 8 - epsilon: 1840, memory size: 190, num steps: 23, training score: -96.18, test score: -95.90
Episode 9 - epsilon: 1820, memory size: 211, num steps: 21, training score: -96.60, test score: -95.90
Episode 10 - epsilon: 1800, memory size: 235, num steps: 24, training score: 

Episode 80 - epsilon: 400, memory size: 1865, num steps: 24, training score: -95.84, test score: -95.10
Episode 81 - epsilon: 380, memory size: 1890, num steps: 25, training score: -95.75, test score: -94.97
Episode 82 - epsilon: 360, memory size: 1913, num steps: 23, training score: -95.92, test score: -94.92
Episode 83 - epsilon: 340, memory size: 1938, num steps: 25, training score: -95.89, test score: -95.03
Episode 84 - epsilon: 320, memory size: 1963, num steps: 25, training score: -95.70, test score: -95.56
Episode 85 - epsilon: 300, memory size: 1986, num steps: 23, training score: -96.12, test score: -95.21
Episode 86 - epsilon: 280, memory size: 2014, num steps: 28, training score: -94.82, test score: -95.50
Episode 87 - epsilon: 260, memory size: 2035, num steps: 21, training score: -96.52, test score: -95.70
Episode 88 - epsilon: 240, memory size: 2058, num steps: 23, training score: -96.00, test score: -95.78
Episode 89 - epsilon: 220, memory size: 2085, num steps: 27, tra

In [None]:
# Continue training for another 100 episodes with a much lower starting
# epsilon (100) and slower decay (1). This same call is repeated in the
# following cells to extend training in 100-episode increments.
agent.train_n_episodes(100, eps=100, eps_decay=1, action_repeat=1, run_tests=True, gen_q_a_frames_every_n_steps=0 )

In [None]:
agent.train_n_episodes(100, eps=100, eps_decay=1, action_repeat=1, run_tests=True, gen_q_a_frames_every_n_steps=0 )

In [None]:
agent.train_n_episodes(100, eps=100, eps_decay=1, action_repeat=1, run_tests=True, gen_q_a_frames_every_n_steps=0 )

In [None]:
agent.train_n_episodes(100, eps=100, eps_decay=1, action_repeat=1, run_tests=True, gen_q_a_frames_every_n_steps=0 )

In [None]:
agent.train_n_episodes(100, eps=100, eps_decay=1, action_repeat=1, run_tests=True, gen_q_a_frames_every_n_steps=0 )

In [None]:
agent.train_n_episodes(100, eps=100, eps_decay=1, action_repeat=1, run_tests=True, gen_q_a_frames_every_n_steps=0 )

In [None]:
agent.train_n_episodes(100, eps=100, eps_decay=1, action_repeat=1, run_tests=True, gen_q_a_frames_every_n_steps=0 )

In [None]:
agent.train_n_episodes(100, eps=100, eps_decay=1, action_repeat=1, run_tests=True, gen_q_a_frames_every_n_steps=0 )

In [None]:
agent.train_n_episodes(100, eps=100, eps_decay=1, action_repeat=1, run_tests=True, gen_q_a_frames_every_n_steps=0 )

In [None]:
# agent.env.target_pos=np.array([0,0,11])

In [None]:
# agent.memory.memory.clear()
# agent.env.target_steps_within_goal=25

In [None]:
# Plot the ten most recent training episodes, one figure each.
for ep in agent.history.training_episodes[-10:]:
    plot_episode(ep)

In [None]:
# Plot one specific training episode.
# NOTE(review): index 403 is hard-coded; assuming training_episodes is a
# list, this fails unless at least 404 training episodes have been recorded.
plot_episode(agent.history.training_episodes[403])

In [None]:
plot_episode(agent.history.test_episodes[-1])

In [None]:
# import tensorflow as tf
# with tf.keras.backend.get_session() as sess:
#     tf.global_variables_initializer().run(session=sess)
#     summary, _ = sess.run(t)

In [None]:
# agent.actor_local.model.get_weights()

In [None]:
# import keras
# bn=[l for l in agent.actor_local.model.layers if type(l) is keras.layers.normalization.BatchNormalization][0]
# bn.get_weights()

In [None]:
# Mean episode score over all recorded training episodes and over all
# recorded test episodes, displayed as a (training, test) tuple.
np.mean([e.score for e in agent.history.training_episodes ]), \
    np.mean([e.score for e in agent.history.test_episodes])

In [None]:
# Smallest action recorded in the most recent test episode.
# NOTE(review): builtin min() compares whole action entries; if each action
# is a multi-element numpy array this comparison is ambiguous and will raise
# -- confirm the element type (np.min would give the smallest rotor value).
min(agent.history.test_episodes[-1].actions)