In [1]:
import gym
# Build the environment registered under the id 'Marvin-v0'; every later cell reuses this `env`.
env = gym.make('Marvin-v0')

In [2]:
# Baseline: play one full episode with uniformly sampled actions and report its length and return.
init_observation = env.reset()
done, i, r_sum = False, 0, 0
while True:
    # `step` is unpacked as (observation, reward, done, info); the info entry is discarded.
    observation, reward, done, _ = env.step(env.action_space.sample())
    i, r_sum = i + 1, r_sum + reward
    if done:
        break
print(f"Episode end after {i} iterations with reward = {r_sum}")

Episode end after 203 iterations with reward = -125.17475404388978


In [3]:
# Display the per-dimension lower and upper bounds of the action space.
env.action_space.low, env.action_space.high

(array([-1., -1., -1., -1.], dtype=float32),
 array([1., 1., 1., 1.], dtype=float32))

In [4]:
# Display the per-dimension lower and upper bounds of the observation space.
env.observation_space.low, env.observation_space.high

(array([-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf], dtype=float32),
 array([inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
        inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf],
       dtype=float32))

In [5]:
# NOTE(review): wildcard import. `NeuralNetwork`, `Dense` and `np` used in the cells below are
# presumably provided by this module (no other import of them is visible here) — confirm, and
# prefer explicit imports so the notebook's dependencies are obvious.
from multilayer_perceptron.mlp.nn import *

In [6]:
# Policy network: its prediction for an observation is used as the action in `evaluate_model`.
# 'mse' and 'adam' are the first two constructor arguments (presumably loss and optimizer —
# confirm against NeuralNetwork's signature).
nn = NeuralNetwork('mse', 'adam', random_state=42)
nn.add(Dense(24, 'relu'))
# Four tanh outputs: tanh is bounded to [-1, 1], one value per action dimension.
nn.add(Dense(4, 'tanh'))
# The input width is read from the environment's observation space.
nn.compile(env.observation_space.shape[0])

In [7]:
# Print the layer-by-layer parameter counts of the network.
nn.summary()

|Dense(24, 24, relu)              	|	600
|Dense(24, 4, tanh)               	|	100
Total number of parameters:		 700
Total number of trainable params:	 700


In [8]:
def deep_copy(params):
    """
    Return an independent copy of a nested parameter structure.

    `params` is iterated as a sequence of layers, each layer being a sequence
    of arrays. The result is a new list of lists in which every array has been
    duplicated with `np.copy`, so modifying the result never touches `params`.
    """
    copied = []
    for layer in params:
        copied.append([np.copy(p) for p in layer])
    return copied

In [9]:
def params_perturbation(params, sigma=0.1, seed=None):
    """
    Obtain weights perturbation for the whole network architecture.

    Parameters
    ----------
    params : sequence of sequences of array-like
        Per-layer groups of weight arrays. Only their shapes are read.
    sigma : float
        Scale applied to the standard-normal samples.
    seed : int or None
        When not None, NumPy's global random generator is re-seeded with this
        value before sampling, so equal seeds give equal perturbations and
        different seeds give different ones. When None the global generator
        state is used as-is.

    Returns
    -------
    list of list
        Noise with the same nesting and array shapes as `params`.
    """
    if seed is not None:
        # Seed with the value supplied by the caller, not a fixed constant.
        np.random.seed(seed)
    return [[np.random.randn(*np.shape(weights)) * sigma for weights in layer] for layer in params]

In [10]:
def combine_weights(params, delta_params):
    """
    Add a perturbation to a nested parameter structure, element by element.

    Layers of `params` and `delta_params` are paired with `zip`, and so are
    the entries inside each pair of layers, so the result is as long as the
    shorter of the two at every level. Returns a new list of lists holding
    `weights + delta_weights`; neither input is modified.
    """
    combined = []
    for layer, pert_layer in zip(params, delta_params):
        summed_layer = []
        for weights, delta_weights in zip(layer, pert_layer):
            summed_layer.append(weights + delta_weights)
        combined.append(summed_layer)
    return combined

In [11]:
def evaluate_model(nn, env, max_iter=100, verbose=False):
    """
    Roll out one episode with `nn` as the policy and return the summed reward.

    The environment is reset, then at every step the first row of
    `nn.predict(observation)` is passed to `env.step`. The rollout stops as
    soon as the environment reports `done` or `max_iter` steps have been
    taken, whichever comes first. With `verbose=True` a one-line summary is
    printed at the end.
    """
    state = env.reset()
    i, r_sum, done = 0, 0, False
    while True:
        if done or not (i < max_iter):
            break
        action = nn.predict(state)[0]
        # `step` is unpacked as (observation, reward, done, info); info is ignored.
        state, reward, done, _ = env.step(action)
        i += 1
        r_sum += reward
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")
    return r_sum

In [12]:
# Population size; passed as `population_size` to `generation_update` in the cells below.
n = 10
# NOTE(review): `lr` and `sigma` are not referenced by any visible call below —
# the single-generation cell relies on the function defaults and the training loop passes literals.
lr = 0.01
sigma = 0.1

In [13]:
def update_params(params, population, rewards, learning_rate=0.01, sigma=0.1):
    """
    Inplace update of parameters.

    For every (candidate, reward) pair, each entry of `params` is replaced by
    `W + learning_rate / len(population) * reward * dW`, where `dW` is the
    matching entry of the candidate. The inner lists of `params` are
    reassigned in place; nothing is returned.

    NOTE(review): `sigma` is accepted but never used here. The usual
    evolution-strategies estimator also divides by the noise scale; confirm
    whether omitting that factor is intentional before changing it, since it
    rescales the effective learning rate.
    """
    pop_size = len(population)
    for noise, score in zip(population, rewards):
        for layer_idx, layer_params in enumerate(params):
            for w_idx, current in enumerate(layer_params):
                delta = noise[layer_idx][w_idx]
                # Rebinding the list slot (not the array) is what makes the update visible to the caller.
                layer_params[w_idx] = current + learning_rate / pop_size * score * delta

In [14]:
def generation_update(model, environment, sigma=0.1, lr=0.01, population_size=10, seed=None, normalize_rewards=True):
    """
    Run one evolution-strategy generation and write the updated weights into `model`.

    A population of `population_size` random perturbations of the model's
    current parameters is evaluated with `evaluate_model`; the parameters are
    then moved along the reward-weighted sum of those perturbations by
    `update_params` and stored back with `model.set_params`.

    Parameters
    ----------
    model : object exposing `get_params()` / `set_params()` and usable by `evaluate_model`.
    environment : environment handed to `evaluate_model`.
    sigma : float
        Scale of the parameter perturbations.
    lr : float
        Learning rate forwarded to `update_params`.
    population_size : int
        Number of perturbed candidates evaluated in this generation.
    seed : int or None
        When not None, NumPy's global random generator is seeded once before
        the population is sampled.
    normalize_rewards : bool
        When True the rewards are centred and divided by their standard
        deviation before the update. If the standard deviation is zero (all
        candidates scored the same) they are only centred, which yields a
        zero update instead of NaN weights.

    Returns
    -------
    (r_mean, r_std) : mean and standard deviation of the raw (un-normalized) rewards.
    """
    original_params = deep_copy(model.get_params())

    if seed is not None:
        np.random.seed(seed)

    population = []
    rewards = []
    try:
        for _ in range(population_size):
            candidate = params_perturbation(original_params, sigma=sigma)
            model.set_params(combine_weights(original_params, candidate))
            rewards.append(evaluate_model(model, environment))
            population.append(candidate)
    except BaseException:
        # A failed or interrupted rollout (e.g. KeyboardInterrupt) must not leave
        # the model holding a perturbed candidate's weights.
        model.set_params(deep_copy(original_params))
        raise

    rewards = np.array(rewards)
    r_mean, r_std = rewards.mean(), rewards.std()
    if normalize_rewards:
        centered = rewards - r_mean
        # Dividing by a zero spread would turn every reward (and then every weight) into NaN.
        rewards = centered / r_std if r_std > 0 else centered

    update_params(original_params, population, rewards, learning_rate=lr, sigma=sigma)
    model.set_params(deep_copy(original_params))
    return r_mean, r_std

In [15]:
# Run a single generation with the function's default sigma and lr; the displayed tuple is
# the (mean, std) of the raw rewards of that generation's population.
generation_update(nn, env, population_size=n)

(-7.806491252029072, 0.2825624180664404)

In [16]:
# NOTE(review): `observation` is the global left over from the random-rollout cell above
# (the last observation of that episode), so this cell only works after that cell has run.
nn.predict(observation)

array([[ 0.0001021 , -0.00063826, -0.00019837, -0.00017913]])

In [17]:
# Train for up to 10000 generations, logging the population's raw reward mean/std every 10th one.
for i in range(10000):
    mean_rewards, std_rewards = generation_update(
        nn,
        env,
        population_size=n,
        sigma=0.01,
        lr=0.05,
        normalize_rewards=True,
    )
    if not i % 10:
        print(mean_rewards, std_rewards)

-7.994329110085475 0.38637578416445134
-7.967279082265989 0.30632692556318414
-7.912985546097204 0.25077913909731325
-8.018081763983984 0.377919475982578
-7.99051507156887 0.32112670490725886
-7.943812158954161 0.4116352565459375
-7.824128123867252 0.21912486283767485
-7.850346427103102 0.4321528470737482
-8.068224562604328 0.5139387564552522
-8.044000850397058 0.5659902237310457
-8.038551978709515 0.40276603027681057
-7.820542311839256 0.3625146808148527
-8.103341639458682 0.3826753228167356
-8.159191380193672 0.38416412017559093
-7.811043444908807 0.25035003990580934
-7.841327672270053 0.11404862208321428
-7.858800949855244 0.32999353295940814
-7.779326828841749 0.24047054186663364
-8.006294411638248 0.18320004107608234
-8.031956773834704 0.4074108209066684
-7.933311545000551 0.36522809858690203
-7.7834911115297345 0.17092689537200742
-7.893810433809207 0.2584035682512281
-7.796965307548132 0.1510744333429039
-8.057600080395735 0.42567010002983435
-7.89957133439585 0.4298769925547389

KeyboardInterrupt: 