In [5]:
import gym
# Shared CartPole-v1 environment; every later cell that takes `env` uses this instance.
env = gym.make('CartPole-v1')

In [740]:
# Policy network: two 100-unit ReLU hidden layers followed by one sigmoid output unit.
# NOTE(review): NeuralNetwork and Dense are not defined or imported in the visible
# cells — presumably they come from a project module; confirm the import exists.
# NOTE(review): the 'mse' loss and 'adam' optimizer are not referenced by the
# evolution-strategy helpers in this notebook — presumably only required by the constructor.
nn = NeuralNetwork('mse', 'adam', random_state=42)
nn.add(Dense(100, 'relu'))
nn.add(Dense(100, 'relu'))
# Single sigmoid output; evaluate_model and render_env threshold it at 0.5 to choose the action.
nn.add(Dense(1, 'sigmoid'))
# Presumably the input size: the length of the environment's observation vector
# (the recorded summary reports the first layer as Dense(4, 100, relu)).
nn.compile(env.observation_space.shape[0])

In [741]:
# Print the per-layer parameter counts and totals of the compiled network.
nn.summary()

|Dense(4, 100, relu)              	|	500
|Dense(100, 100, relu)            	|	10100
|Dense(100, 1, sigmoid)              	|	101
Total number of parameters:		 10701
Total number of trainable params:	 10701


In [742]:
# NOTE(review): EvolutionStrategy is not defined in the visible cells; presumably a
# project class taking the initial parameters and a fitness function — confirm.
ev = EvolutionStrategy(nn.get_params(), evaluate_model)

In [743]:
# NOTE(review): the recorded output of this call is
# "AttributeError: 'list' object has no attribute 'shape'"; no other visible cell uses `ev`,
# so this cell looks like a leftover experiment — fix or remove before sharing.
ev.run(10)

AttributeError: 'list' object has no attribute 'shape'

In [720]:
def deep_copy(params):
    """
    Return an independent copy of a nested ``[layer][array]`` parameter list.

    Every inner element is duplicated with ``np.copy``, and both list levels are
    rebuilt, so modifying the result never affects ``params``.
    """
    duplicate = []
    for layer in params:
        duplicate.append(list(map(np.copy, layer)))
    return duplicate

In [721]:
def params_perturbation(params, sigma=0.1, seed=None):
    """
    Obtain weights perturbation for the whole network architecture.

    Parameters
    ----------
    params : nested list ``[layer][array]``; each innermost element must expose ``.shape``.
    sigma : scale applied to the standard-normal noise.
    seed : optional seed for NumPy's global random generator. When given, the
        global generator is re-seeded with this value before sampling, so two calls
        with the same seed return identical perturbations.

    Returns
    -------
    Nested list with the same layout as ``params``, holding ``randn(*shape) * sigma``
    for every parameter array.
    """
    if seed is not None:
        # Use the caller's seed (previously a hard-coded 42 was used regardless of `seed`).
        np.random.seed(seed)
    return [[np.random.randn(*weights.shape) * sigma for weights in layer] for layer in params]

In [722]:
def combine_weights(params, delta_params):
    """
    Add a perturbation to a set of parameters, element by element.

    Both arguments are nested ``[layer][array]`` lists; the result has the same
    layout and holds ``weights + delta_weights`` for each paired entry. Pairing uses
    ``zip``, so any surplus entries on either side are dropped.
    """
    combined = []
    for layer, pert_layer in zip(params, delta_params):
        summed_layer = []
        for weights, delta_weights in zip(layer, pert_layer):
            summed_layer.append(weights + delta_weights)
        combined.append(summed_layer)
    return combined

In [723]:
def evaluate_model(nn, env, max_iter=1000000, verbose=False):
    """
    Play one episode in ``env`` with ``nn`` as the policy and return the summed reward.

    The action at each step is 1 when the first element of ``nn.predict(observation)``
    exceeds 0.5 and 0 otherwise. ``env.step`` must return a 4-tuple
    ``(observation, reward, done, info)``. The episode stops when ``done`` becomes
    truthy or after ``max_iter`` steps, whichever comes first.
    """
    state = env.reset()
    i, r_sum, done = 0, 0, False
    while not done and i < max_iter:
        # Binary action from the thresholded network output.
        action = int(nn.predict(state)[0] > .5)
        state, reward, done, _ = env.step(action)
        r_sum += reward
        i += 1
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")
    return r_sum

In [724]:
# Population size; passed as `population_size=n` by the training loop.
n = 10
# NOTE(review): these module-level `lr` and `sigma` values are not referenced by the
# visible cells — the training loop passes lr=0.05 and sigma=0.01 as literals. Confirm
# which values are intended and wire them through or remove them.
lr = 0.01
sigma = 0.1

In [725]:
def update_params(params, population, rewards, learning_rate=0.05, sigma=0.1):
    """
    Inplace update of parameters.

    For every (candidate, reward) pair, each entry ``params[i][j]`` is replaced by
    ``params[i][j] + learning_rate / (n * sigma) * reward * candidate[i][j]``, where
    ``n = len(population)``. The nested lists in ``params`` are modified in place
    (each slot is rebound to the new value); nothing is returned.

    NOTE(review): the candidates built in generation_update are already scaled by
    sigma, so this step differs from the textbook ES estimator by a factor of
    sigma — presumably absorbed into the learning rate; confirm.
    """
    pop_size = len(population)
    for member, score in zip(population, rewards):
        for layer_idx, layer in enumerate(params):
            for slot, current in enumerate(layer):
                direction = member[layer_idx][slot]
                # Rebind the slot; the previous value object itself is not mutated.
                layer[slot] = current + learning_rate / (pop_size * sigma) * score * direction

In [726]:
def generation_update(model, environment, sigma=0.1, lr=0.01, population_size=10, seed=None, normalize_rewards=True):
    """
    Run one evolution-strategy generation and write the updated parameters into ``model``.

    Steps:
      1. Snapshot the model parameters.
      2. For each of ``population_size`` candidates: sample a perturbation, load
         snapshot + perturbation into the model, and score it with ``evaluate_model``.
      3. Optionally standardize the rewards (zero mean, unit std, 1e-9 added to the std).
      4. Apply ``update_params`` to the snapshot and load the result into the model.

    ``seed``, when not None, re-seeds NumPy's global generator once before sampling.

    Returns
    -------
    (r_mean, r_std) : mean and standard deviation of the raw (un-normalized) rewards.
    """
    base_params = deep_copy(model.get_params())

    if seed is not None:
        np.random.seed(seed)

    noises, scores = [], []
    for _ in range(population_size):
        noise = params_perturbation(base_params, sigma=sigma)
        # Evaluate the model with the perturbed weights loaded.
        model.set_params(combine_weights(base_params, noise))
        scores.append(evaluate_model(model, environment))
        noises.append(deep_copy(noise))

    scores = np.array(scores)
    r_mean = scores.mean()
    r_std = scores.std()
    if normalize_rewards:
        scores = (scores - r_mean) / (r_std + 1e-9)

    update_params(base_params, noises, scores, learning_rate=lr, sigma=sigma)
    model.set_params(deep_copy(base_params))
    return r_mean, r_std

In [727]:
# One generation with 100 candidates and raw (un-normalized) rewards; the cell output
# is the (mean, std) of the candidates' episode rewards.
generation_update(nn, env, population_size=100, normalize_rewards=False)

(9.39, 0.661740130262628)

In [728]:
def render_env(model, env, max_iter=None, verbose=True):
    """
    Render one episode in ``env`` using ``model`` as the policy.

    Parameters
    ----------
    model : object with ``predict(observation)``; the action is 1 when the first
        element of the prediction exceeds 0.5, otherwise 0.
    env : environment exposing ``reset()``, ``render()`` and a ``step(action)`` that
        returns a 4-tuple ``(observation, reward, done, info)``.
    max_iter : optional cap on the number of steps; ``None`` means run until the
        environment reports ``done``.
    verbose : when true, print a one-line episode summary at the end.

    Returns
    -------
    None
    """
    observation = env.reset()
    done = False
    i = 0
    r_sum = 0
    # Stop as soon as the episode is done OR the optional step cap is reached.
    # (The previous `not done or ...` condition kept stepping a finished episode
    # whenever max_iter was set, and never enforced the cap on a running one.)
    while not done and (max_iter is None or i < max_iter):
        env.render()
        # Use the `model` argument rather than a notebook-global network.
        action = int(model.predict(observation)[0] > .5)
        observation, reward, done, _ = env.step(action)
        i += 1
        r_sum += reward
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")

In [731]:
# Train for up to 1000 generations of `n` candidates each (sigma=0.01, lr=0.05,
# normalized rewards). Every 100th generation, print the mean reward, the reward std,
# and the mean of the first parameter array of the first layer — presumably a quick
# check that the weights are changing.
for i in range(1000):
    mean_rewards, std_rewards = generation_update(nn, env, population_size=n, sigma=0.01, lr=0.05, normalize_rewards=True)
    if i % 100 == 0:
        print(mean_rewards, std_rewards, nn.get_params()[0][0].mean())

9.2 0.7483314773547883 0.01710031522654289
9.7 1.1 0.011105729718565533
9.4 0.9165151389911681 0.011814390567226713
9.6 0.8 0.007581753304346686
9.1 0.5385164807134504 -0.00017162083453708732
9.9 0.8306623862918074 0.0006792549025048533


KeyboardInterrupt: 

In [737]:
# Render one episode with the current network and print the episode summary.
render_env(nn, env)

Episode end after 9 iterations with reward = 9.0 and done status True
