In [13]:
import numpy as np
from evostra import EvolutionStrategy
from evostra.models import FeedForwardNetwork
import gym
env = gym.make('BipedalWalker-v2')
# A feed-forward neural network with 24 inputs, one hidden layer of 24 units and 4 outputs.
# NOTE(review): presumably sized to BipedalWalker's observation and action vectors —
# confirm against env.observation_space / env.action_space.
model = FeedForwardNetwork(layer_sizes=[24, 24, 4])

In [14]:
def evaluate_model(weights, max_iter=1000, verbose=False):
    """Score a weight vector by running one episode and summing the rewards.

    The module-level ``model`` is overwritten with ``weights`` and then used to
    drive the module-level ``env`` until the environment reports ``done`` or
    ``max_iter`` steps have been taken, whichever happens first.

    Args:
        weights: Weights handed to ``model.set_weights``.
        max_iter: Upper bound on the number of environment steps.
        verbose: When true, print a one-line summary of the episode.

    Returns:
        The sum of the per-step rewards collected during the episode.
    """
    global model, env
    # Side effect: the shared model keeps these weights after the call returns.
    model.set_weights(weights)
    obs = env.reset()
    i, r_sum, done = 0, 0, False
    while i < max_iter and not done:
        action = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        r_sum += reward
        i += 1
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")
    return r_sum

In [22]:
# num_threads controls how many processes evaluate the population: values > 1 use that many
# processes, and -1 uses the number of cores available on the machine. Every evaluation here
# runs a whole episode (up to max_iter steps), so we pass num_threads=-1; for a computationally
# cheap task prefer num_threads=1, because the IPC overhead of extra processes would hurt.
es = EvolutionStrategy(
    model.get_weights(),
    evaluate_model,
    population_size=20,
    sigma=0.1,
    learning_rate=0.03,
    decay=0.995,
    num_threads=-1,
)
es.run(100, print_step=10)

iter 10. reward: 39.451833
iter 20. reward: 11.499112
iter 30. reward: 81.827902
iter 40. reward: 87.108316
iter 50. reward: 61.945428
iter 60. reward: 103.783760
iter 70. reward: 95.749895
iter 80. reward: 91.983586
iter 90. reward: 74.450997
iter 100. reward: 82.053281


In [23]:
def render_env(model, env, max_iter=None, verbose=True):
    """Run one rendered episode of ``env`` driven by ``model``.

    Each step renders the environment, asks ``model.predict`` for an action
    given the current observation, and applies it with ``env.step``.

    Args:
        model: Object exposing ``predict(observation)``.
        env: Environment exposing ``reset()``, ``render()`` and ``step(action)``,
            where ``step`` returns ``(observation, reward, done, info)``.
        max_iter: Optional cap on the number of steps. ``None`` (the default)
            means the episode runs until the environment reports ``done``.
        verbose: When true, print a one-line summary of the episode.

    Returns:
        None. The episode is only displayed (and optionally summarised on stdout).
    """
    observation = env.reset()
    done = False
    i = 0
    r_sum = 0
    # Stop as soon as the episode ends OR the optional step budget is used up.
    # (The previous `not done or (...)` kept stepping a finished environment
    # and never enforced max_iter on an unfinished one.)
    while not done and (max_iter is None or i < max_iter):
        env.render()
        observation, reward, done, _ = env.step(model.predict(observation))
        i += 1
        r_sum += reward
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")

In [24]:
# evaluate_model overwrites the shared model's weights on every call, so its current state is
# just whichever candidate was evaluated last in this process. Load the optimizer's result
# explicitly so the rendered episode always shows the learned policy.
model.set_weights(es.get_weights())
render_env(model, env)

Episode end after 2121 iterations with reward = 178.83645594217964 and done status True
