# Open Sim RL Training
#### Imports

In [10]:
# Dependencies
import numpy as np
import tensorflow as tf
# Environment
from osim.env import L2RunEnv as ENV # rename environment to be used for training

### Agent Class
#### Imports

In [11]:
import keras
import keras.backend as K
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate
from keras.optimizers import Adam

from rl.processors import WhiteningNormalizerProcessor
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

#### Class
Reference: https://github.com/keras-rl/keras-rl/blob/master/examples/ddpg_mujoco.py

In [12]:
class Agent:
    def __init__(self,env):
        nb_actions = env.action_space.shape[0]
        
        self.env = env
        self.actor = self.build_actor(env)
        self.critic, action_input = self.build_critic(env)
        self.loss = self.build_loss()

        self.memory = SequentialMemory(limit=100000, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)
        self.agent = DDPGAgent(   nb_actions=nb_actions, actor=self.actor, 
                                  critic=self.critic, critic_action_input=action_input,
                                  memory=self.memory, nb_steps_warmup_critic=1000, 
                                  nb_steps_warmup_actor=1000,
                                  random_process=self.random_process, 
                                  gamma=.99, target_model_update=1e-3,
                                  processor=WhiteningNormalizerProcessor()  )
        self.agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=self.loss)

    def build_loss(self):
        return ['mse']

    def build_actor(self,env):
        nb_actions = env.action_space.shape[0]
        actor = Sequential()
        actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
        actor.add(Dense(400))
        actor.add(Activation('relu'))
        actor.add(Dense(300))
        actor.add(Activation('relu'))
        actor.add(Dense(nb_actions,
                        activation='tanh',
                        kernel_constraint=  keras.constraints.min_max_norm(
                                            min_value=0,
                                            max_value=nb_actions,
                                            axis=1) ) )
        actor.summary()

        inD = Input(shape=(1,) + env.observation_space.shape)
        out = actor(inD)

        return Model(inD,out)

    def build_critic(self,env):
        nb_actions = env.action_space.shape[0]
        action_input = Input(shape=(nb_actions,), name='action_input')
        observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Dense(400)(flattened_observation)
        x = Activation('relu')(x)
        x = Concatenate()([x, action_input])
        x = Dense(300)(x)
        x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)

        critic = Model(inputs=[action_input, observation_input], outputs=x)
        critic.summary()

        return critic, action_input
    
    def fit(self, **kwargs):
        return self.agent.fit(self.env,**kwargs)
    
    def test(self, **kwargs):
        return self.agent.test(self.env,**kwargs)
    
    def save_weights(self,filename='ddpg_{}_weights.h5f'):
        self.agent.save_weights(filename.format("opensim"), overwrite=True)
        
    def load_weights(self,filename='ddpg_{}_weights.h5f'):
        self.agent.load_weights(filename.format("opensim"))

### Environment Class

In [13]:
class TrainEnv(ENV):
    pass
# TODO: define virtual assistant forces on agent
# TODO: define search through easier environments
# TODO: make environment harder once the agent has trained for challenge

# Run Simulation
#### Environment

In [14]:
env = TrainEnv(visualize=False)
observation = env.reset( )

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m




#### Agent

In [15]:
agent = Agent(env)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 41)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 400)               16800     
_________________________________________________________________
activation_3 (Activation)    (None, 400)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 300)               120300    
_________________________________________________________________
activation_4 (Activation)    (None, 300)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 18)                5418      
Total params: 142,518
Trainable params: 142,518
Non-trainable params: 0
_________________________________________________________________
____

### Load previously trained weights

In [16]:
agent.load_weights( )

### Train new weights

In [None]:
for i in range(200): # Train in smaller batches to allow for interuption
    agent.fit(nb_steps=2000, visualize=False, verbose=2)
    ## Always save new weights
    agent.save_weights( )

Training for 2000 steps ...
   82/2000: episode: 1, duration: 8.452s, episode steps: 82, steps per second: 10, episode reward: 0.345, mean reward: 0.004 [0.001, 0.006], mean action: -0.100 [-0.895, 0.780], mean observation: -0.013 [-6.145, 5.816], loss: --, mean_squared_error: --, mean_q: --
  220/2000: episode: 2, duration: 4.299s, episode steps: 138, steps per second: 32, episode reward: 0.528, mean reward: 0.004 [-0.000, 0.011], mean action: -0.056 [-0.888, 0.628], mean observation: 0.140 [-4.317, 6.370], loss: --, mean_squared_error: --, mean_q: --
  349/2000: episode: 3, duration: 26.163s, episode steps: 129, steps per second: 5, episode reward: -0.539, mean reward: -0.004 [-0.011, 0.001], mean action: -0.036 [-0.817, 0.844], mean observation: 0.071 [-4.258, 4.407], loss: --, mean_squared_error: --, mean_q: --
  444/2000: episode: 4, duration: 6.436s, episode steps: 95, steps per second: 15, episode reward: 0.330, mean reward: 0.003 [0.001, 0.007], mean action: -0.071 [-0.894, 0.6

done, took 100.188 seconds
Training for 2000 steps ...
  120/2000: episode: 1, duration: 4.426s, episode steps: 120, steps per second: 27, episode reward: -0.760, mean reward: -0.006 [-0.021, 0.009], mean action: 0.261 [-1.084, 1.228], mean observation: 0.012 [-7.950, 9.813], loss: --, mean_squared_error: --, mean_q: --
  240/2000: episode: 2, duration: 4.479s, episode steps: 120, steps per second: 27, episode reward: -0.753, mean reward: -0.006 [-0.021, 0.009], mean action: 0.230 [-1.149, 1.259], mean observation: 0.014 [-7.951, 9.848], loss: --, mean_squared_error: --, mean_q: --
  360/2000: episode: 3, duration: 4.609s, episode steps: 120, steps per second: 26, episode reward: -0.757, mean reward: -0.006 [-0.021, 0.009], mean action: 0.232 [-1.140, 1.183], mean observation: 0.013 [-7.946, 9.957], loss: --, mean_squared_error: --, mean_q: --
  480/2000: episode: 4, duration: 4.493s, episode steps: 120, steps per second: 27, episode reward: -0.759, mean reward: -0.006 [-0.021, 0.009],

 1600/2000: episode: 14, duration: 6.467s, episode steps: 114, steps per second: 18, episode reward: -0.620, mean reward: -0.005 [-0.018, 0.009], mean action: 0.424 [-1.147, 1.262], mean observation: 0.029 [-13.629, 12.752], loss: 0.000730, mean_squared_error: 0.001460, mean_q: 0.699925
 1690/2000: episode: 15, duration: 6.472s, episode steps: 90, steps per second: 14, episode reward: 0.004, mean reward: 0.000 [-0.005, 0.010], mean action: 0.378 [-1.259, 1.214], mean observation: 0.022 [-11.431, 12.595], loss: 0.002422, mean_squared_error: 0.004844, mean_q: 0.709731
 1814/2000: episode: 16, duration: 6.807s, episode steps: 124, steps per second: 18, episode reward: -0.745, mean reward: -0.006 [-0.020, 0.010], mean action: 0.405 [-1.121, 1.148], mean observation: 0.018 [-22.600, 13.032], loss: 0.000661, mean_squared_error: 0.001321, mean_q: 0.708557
 1886/2000: episode: 17, duration: 4.795s, episode steps: 72, steps per second: 15, episode reward: 0.126, mean reward: 0.002 [-0.002, 0.01

  771/2000: episode: 6, duration: 6.045s, episode steps: 129, steps per second: 21, episode reward: -0.786, mean reward: -0.006 [-0.021, 0.011], mean action: 0.278 [-1.067, 1.236], mean observation: 0.015 [-9.515, 12.662], loss: --, mean_squared_error: --, mean_q: --
  900/2000: episode: 7, duration: 6.055s, episode steps: 129, steps per second: 21, episode reward: -0.781, mean reward: -0.006 [-0.021, 0.011], mean action: 0.303 [-1.114, 1.215], mean observation: 0.015 [-9.537, 12.662], loss: --, mean_squared_error: --, mean_q: --
 1028/2000: episode: 8, duration: 8.070s, episode steps: 128, steps per second: 16, episode reward: -0.760, mean reward: -0.006 [-0.019, 0.011], mean action: 0.247 [-1.107, 1.166], mean observation: 0.051 [-9.601, 12.587], loss: 0.001785, mean_squared_error: 0.003570, mean_q: 0.708173
 1252/2000: episode: 9, duration: 13.561s, episode steps: 224, steps per second: 17, episode reward: -0.877, mean reward: -0.004 [-0.021, 0.011], mean action: 0.275 [-1.346, 1.21

### Test Agent

In [None]:
# Finally, evaluate our algorithm for 5 episodes.
agent.test(nb_episodes=5, visualize=True, nb_max_episode_steps=1000)