In [1]:
import gym
env = gym.make('Marvin-v0')

In [2]:
init_observation = env.reset()
done = False
i = 0
r_sum = 0
while not done:
    observation, reward, done, _ = env.step(env.action_space.sample())
    i += 1
    r_sum += reward
print(f"Episode end after {i} iterations with reward = {r_sum}")

Episode end after 142 iterations with reward = -124.05923001822892


In [3]:
env.action_space.low, env.action_space.high

(array([-1., -1., -1., -1.], dtype=float32),
 array([1., 1., 1., 1.], dtype=float32))

In [4]:
env.observation_space.low, env.observation_space.high

(array([-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf,
        -inf, -inf], dtype=float32),
 array([inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf,
        inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf],
       dtype=float32))

In [5]:
from multilayer_perceptron.mlp.nn import *

In [8]:
x = np.arange(-100, 100, .1)
y = 1 / (1 + np.exp(-.5 * x))

In [43]:
nn = NeuralNetwork('mse', 'adam', random_state=42)
nn.add(Dense(1, 'identity'))
nn.add(Dense(1, 'sigmoid'))
nn.compile(1)

In [44]:
nn.summary()

|Dense(1, 1, identity)                	|	2
|Dense(1, 1, sigmoid)                	|	2
Total number of parameters:		 4
Total number of trainable params:	 4


In [13]:
def deep_copy(params):
    return [[np.copy(p) for p in layer] for layer in params]

In [14]:
def params_perturbation(params, sigma=0.1, seed=None):
    """
    Obtain weights perturbation for the whole network architecture
    """
    if seed is not None:
        np.random.seed(42)
    return [[np.random.randn(*weights.shape) * sigma for weights in layer] for layer in params]

In [15]:
def combine_weights(params, delta_params):
    return [[weights + delta_weights for weights, delta_weights in zip(layer, pert_layer)] for layer, pert_layer in zip(params, delta_params)]

In [34]:
def evaluate_model(nn, env, max_iter=100, verbose=False):
    y_pred = nn.predict(x.reshape(-1, 1))
    return ((y - y_pred) ** 2).sum()
    observation = env.reset()
    done = False
    i = 0
    r_sum = 0
    while not done and i < max_iter:
        observation, reward, done, _ = env.step(nn.predict(observation)[0])
        i += 1
        r_sum += reward
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")
    return r_sum

In [35]:
n = 10
lr = 0.01
sigma = 0.1

In [46]:
def update_params(params, population, rewards, learning_rate=0.01, sigma=0.1):
    """
    Inplace update of parameters
    """
    n = len(population)
    for candidate, reward in zip(population, rewards):
        for i in range(len(params)):
            layer_params = params[i]
            for j in range(len(layer_params)):
                W = layer_params[j]
                dW = candidate[i][j]
                layer_params[j] = W + learning_rate / n * reward * dW  # In-place modification

In [47]:
def generation_update(model, environment, sigma=0.1, lr=0.01, population_size=10, seed=None, normalize_rewards=True):
    original_params = deep_copy(model.get_params())
    print("Params input:", original_params)
    if seed is not None:
        np.random.seed(seed)

    population = []
    rewards = []
    for i in range(population_size):
        candidate = params_perturbation(original_params, sigma=sigma)
        modified_params = combine_weights(original_params, candidate)
        model.set_params(modified_params)
        reward = evaluate_model(model, environment)
        population.append(deep_copy(candidate))
        rewards.append(reward)

    rewards = np.array(rewards)
    r_mean, r_std = rewards.mean(), rewards.std()
    if normalize_rewards:
        rewards = (rewards - r_mean) / r_std
        
    update_params(original_params, population, rewards, learning_rate=lr, sigma=sigma)
    model.set_params(deep_copy(original_params))
    print("Params output:", original_params)
    return r_mean, r_std

In [48]:
generation_update(nn, env, population_size=n)

Params input: [[array([[0.01082697]]), array([[0.]])], [array([[-0.01221528]]), array([[0.]])]]
Params output: [[array([[0.01070276]]), array([[-3.44329639e-05]])], [array([[-0.01235455]]), array([[2.36700042e-05]])]]


(960014.5790880125, 1.1641532182693481e-10)

In [49]:
nn.predict(x.reshape(-1, 1))

array([[0.50330631],
       [0.50330301],
       [0.5032997 ],
       ...,
       [0.49670361],
       [0.4967003 ],
       [0.49669699]])

In [50]:
for i in range(10000):
    mean_rewards, std_rewards = generation_update(nn, env, population_size=n, sigma=0.01, lr=0.05, normalize_rewards=True)
    if i % 10 == 0:
        print(mean_rewards, std_rewards)

Params input: [[array([[0.01070276]]), array([[-3.44329639e-05]])], [array([[-0.01235455]]), array([[2.36700042e-05]])]]
Params output: [[array([[0.01070855]]), array([[-0.00010487]])], [array([[-0.01253504]]), array([[-0.00012239]])]]
960014.5790880125 1.1641532182693481e-10
Params input: [[array([[0.01070855]]), array([[-0.00010487]])], [array([[-0.01253504]]), array([[-0.00012239]])]]
Params output: [[array([[0.01073419]]), array([[-0.00025466]])], [array([[-0.01263478]]), array([[-0.00037037]])]]
Params input: [[array([[0.01073419]]), array([[-0.00025466]])], [array([[-0.01263478]]), array([[-0.00037037]])]]
Params output: [[array([[0.01066]]), array([[8.50171843e-05]])], [array([[-0.01246617]]), array([[-0.00059468]])]]
Params input: [[array([[0.01066]]), array([[8.50171843e-05]])], [array([[-0.01246617]]), array([[-0.00059468]])]]
Params output: [[array([[0.01061535]]), array([[0.00016699]])], [array([[-0.01220184]]), array([[-0.00064989]])]]
Params input: [[array([[0.01061535]])

KeyboardInterrupt: 