In [None]:
# # import comet_ml in the top of your file
# from comet_ml import Experiment

# # Add the following code anywhere in your machine learning file
# experiment = Experiment(api_key="[REDACTED]")

# # Run your code and go to https://www.comet.ml



In [None]:
import tensorflow as tf
from keras import backend as K

In [None]:
# Build a TensorFlow session from a ConfigProto whose device_count entry for
# 'GPU' is 1, then register that session as the one the Keras backend uses.
config = tf.ConfigProto(device_count = {'GPU': 1})
sess = tf.Session(config=config)
K.set_session(sess)

In [None]:
import numpy as np
import gym

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate
from keras.optimizers import Adam

from rl.agents import NAFAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess
from rl.core import Processor

In [None]:
import gym
from gym import Env
from gym.spaces import *

In [None]:
class PendulumProcessor(Processor):
    """Processor that scales environment rewards down before they are used."""

    def process_reward(self, reward):
        """Return ``reward`` divided by 100.

        The magnitude of the reward can be important. Each step yields a
        relatively large reward, so it is reduced by two orders of magnitude.
        """
        scaled_reward = reward / 100.
        return scaled_reward


ENV_NAME = 'Pendulum-v0'

# gym.undo_logger_setup() is only present in older gym releases. Calling it
# unconditionally raises AttributeError where it has been removed, so only
# invoke it when the installed gym still provides it.
if hasattr(gym, 'undo_logger_setup'):
    gym.undo_logger_setup()


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
# Seed NumPy and the environment with a fixed value to make runs more repeatable.
np.random.seed(123)
env.seed(123)
# The action space must be a flat vector; its length is the action dimension.
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

# Build all necessary models: V, mu, and L networks.
# V network: flattened observation -> three 16-unit ReLU layers -> one linear output.
# The leading 1 in the input shape presumably matches window_length=1 of the
# replay memory -- TODO confirm if the window length is ever changed.
V_model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(1),
    Activation('linear'),
])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
V_model.summary()

# mu network: flattened observation -> three 16-unit ReLU layers -> nb_actions
# linear outputs (one value per action dimension).
mu_model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
mu_model.summary()

# L network: consumes the action together with the flattened observation and emits
# (nb_actions^2 + nb_actions) / 2 linear values, which is the number of entries in
# the lower triangle (diagonal included) of an nb_actions x nb_actions matrix.
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
x = Concatenate()([action_input, Flatten()(observation_input)])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x)
x = Activation('linear')(x)
L_model = Model(inputs=[action_input, observation_input], outputs=x)
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
L_model.summary()

# Configure and compile the agent. Any built-in Keras optimizer can be passed to
# compile(), and Keras metrics can be supplied as well.
processor = PendulumProcessor()
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(
    theta=.15,
    mu=0.,
    sigma=.3,
    size=nb_actions,
)
agent = NAFAgent(
    nb_actions=nb_actions,
    V_model=V_model,
    L_model=L_model,
    mu_model=mu_model,
    memory=memory,
    nb_steps_warmup=100,
    random_process=random_process,
    gamma=.99,
    target_model_update=1e-3,
    processor=processor,
)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

# Okay, now it's time to learn something! Rendering is switched off here
# (visualize=False) because visualizing slows down training quite a lot. You can
# always safely abort the training prematurely using Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=False, verbose=1, nb_max_episode_steps=200)

# After training is done, we save the final weights.
agent.save_weights('cdqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 10 episodes.
agent.test(env, nb_episodes=10, visualize=False, nb_max_episode_steps=200)

In [None]:
class Test(Env):
    """Toy environment with a constant observation and a ratio-based reward.

    The action is a 2-element vector. Each step returns the unchanged state and a
    reward of ``action[1] / action[0] / 10``, and the episode ends after
    ``episode_length`` steps.

    Args:
        episode_length: Number of steps after which ``step`` reports a terminal
            transition. Defaults to 100.

    NOTE(review): ``state`` holds two arrays of shape (1,), while
    ``observation_space`` declares a single Box of shape (1,). An earlier draft
    used a Tuple space here -- confirm which layout the models should consume.
    """

    def __init__(self, episode_length=100):
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(2,))
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,))
        self.episode_length = episode_length
        # Initialise the state up front: reset() and step() both read self.state,
        # which previously did not exist and raised AttributeError.
        self.state = self._initial_state()
        self.idx = 0

    @staticmethod
    def _initial_state():
        """Return a fresh two-component state, each component an array of ones."""
        return [np.ones((1,)), np.ones((1,))]

    def close(self):
        """Nothing to release."""
        pass

    def reset(self):
        """Restore the initial state and step counter; return the state."""
        self.state = self._initial_state()
        self.idx = 0
        return self.state

    def step(self, action):
        """Advance one step and return ``(state, reward, terminal, info)``.

        NOTE(review): ``action[0] == 0`` divides by zero (inf/nan for NumPy
        values, ZeroDivisionError for plain Python numbers) -- confirm whether
        callers can produce such actions.
        """
        reward = action[1] / action[0]
        self.idx += 1
        terminal = self.idx == self.episode_length
        if terminal:
            # Wrap the counter so a further step() without reset() starts a new count.
            self.idx = 0
        return self.state, reward / 10, terminal, {}

In [None]:
# Rebind `env` to the custom Test environment; the model-building code below
# reads its observation and action spaces.
env = Test()

In [None]:
# Action dimension of the current `env`: the first entry of its action space shape.
nb_actions = env.action_space.shape[0]

In [None]:
# Build all necessary models: V, mu, and L networks.
# V_model = Sequential()
# i1 = Input((1,))
# i2 = Input((1,))
# c = Concatenate()
# V_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# V_model.add(Dense(16))
# V_model.add(Activation('relu'))
# V_model.add(Dense(16))
# V_model.add(Activation('relu'))
# V_model.add(Dense(16))
# V_model.add(Activation('relu'))
# V_model.add(Dense(1))
# V_model.add(Activation('linear'))
# V network over two separate 1-element inputs that are concatenated and passed
# through three 16-unit ReLU layers to a single linear output.
# NOTE(review): this model has two inputs, whereas mu_model and L_model take a
# single observation input -- confirm the agent accepts this combination.
i1 = Input((1,))
i2 = Input((1,))
c = Concatenate()([i1, i2])
h = Dense(16, activation='relu')(c)
h = Dense(16, activation='relu')(h)
h = Dense(16, activation='relu')(h)
# The output layer must consume the hidden stack `h`. It was previously wired to
# `c`, which left the three hidden layers disconnected from the model.
o = Dense(1, activation='linear')(h)
V_model = Model([i1, i2], o)
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
V_model.summary()

# mu network: flattened observation -> three 16-unit ReLU layers -> nb_actions
# linear outputs (one value per action dimension).
mu_model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
mu_model.summary()

# L network: consumes the action together with the flattened observation and emits
# (nb_actions^2 + nb_actions) / 2 linear values, which is the number of entries in
# the lower triangle (diagonal included) of an nb_actions x nb_actions matrix.
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
x = Concatenate()([action_input, Flatten()(observation_input)])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x)
x = Activation('linear')(x)
L_model = Model(inputs=[action_input, observation_input], outputs=x)
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
L_model.summary()

In [None]:
# Configure and compile the agent for the custom environment. Any built-in Keras
# optimizer can be passed to compile(), and Keras metrics can be supplied as well.
# No reward processor is attached here (processor=None).
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(
    theta=.15,
    mu=0.,
    sigma=.3,
    size=nb_actions,
)
agent = NAFAgent(
    nb_actions=nb_actions,
    V_model=V_model,
    L_model=L_model,
    mu_model=mu_model,
    memory=memory,
    nb_steps_warmup=100,
    random_process=random_process,
    gamma=.99,
    target_model_update=1e-3,
    processor=None,
)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

In [None]:
# Okay, now it's time to learn something! Rendering is switched off here
# (visualize=False) because visualizing slows down training quite a lot. You can
# always safely abort the training prematurely using Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=False, verbose=1, nb_max_episode_steps=200)