In [1]:
import numpy as np
from evostra import EvolutionStrategy
from evostra.models import FeedForwardNetwork
import gym
env = gym.make('CartPole-v1')
# A feed-forward neural network with layer_sizes=[4, 4, 4, 1]: input size 4, two hidden layers of size 4, and output size 1
model = FeedForwardNetwork(layer_sizes=[4, 4, 4, 1])

In [10]:
def evaluate_model(weights, max_iter=1000, verbose=False):
    """Score one set of weights by playing a single episode.

    Loads ``weights`` into the module-level ``model``, resets the module-level
    ``env`` and steps it until the episode reports ``done`` or ``max_iter``
    steps have been taken.

    Args:
        weights: Weights handed to ``model.set_weights``.
        max_iter: Upper bound on the number of environment steps.
        verbose: When true, print a one-line summary of the episode.

    Returns:
        The sum of the rewards collected during the episode.
    """
    model.set_weights(weights)
    state = env.reset()
    steps, total_reward, done = 0, 0, False
    while steps < max_iter and not done:
        # The first network output is thresholded at 0.5 to get a 0/1 action.
        action = int(model.predict(state)[0] > .5)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        steps += 1
    if verbose:
        print(f"Episode end after {steps} iterations with reward = {total_reward} and done status {done}")
    return total_reward

In [11]:
# evostra can spread the fitness evaluations across several processes via num_threads: values > 1 help when the
# task is computationally expensive, and num_threads=-1 uses the number of cores available on the machine.
# NOTE(review): num_threads=-1 is used below, but for a cheap task like this one a single process (num_threads=1)
#  may be faster because of the IPC overhead -- confirm by timing both settings.
es = EvolutionStrategy(model.get_weights(), evaluate_model, population_size=20, sigma=0.1, learning_rate=0.03, decay=0.995, num_threads=-1)
es.run(500, print_step=10)

iter 10. reward: 1000.000000
iter 20. reward: 58.000000
iter 30. reward: 1000.000000
iter 40. reward: 1000.000000
iter 50. reward: 1000.000000
iter 60. reward: 1000.000000
iter 70. reward: 96.000000
iter 80. reward: 1000.000000
iter 90. reward: 1000.000000
iter 100. reward: 1000.000000
iter 110. reward: 1000.000000
iter 120. reward: 1000.000000
iter 130. reward: 1000.000000
iter 140. reward: 1000.000000
iter 150. reward: 75.000000
iter 160. reward: 1000.000000
iter 170. reward: 162.000000
iter 180. reward: 160.000000
iter 190. reward: 1000.000000
iter 200. reward: 106.000000
iter 210. reward: 1000.000000
iter 220. reward: 186.000000
iter 230. reward: 1000.000000
iter 240. reward: 1000.000000
iter 250. reward: 1000.000000
iter 260. reward: 1000.000000
iter 270. reward: 816.000000
iter 280. reward: 1000.000000
iter 290. reward: 1000.000000
iter 300. reward: 1000.000000
iter 310. reward: 1000.000000
iter 320. reward: 1000.000000
iter 330. reward: 1000.000000
iter 340. reward: 1000.000000


In [7]:
def render_env(model, env, max_iter=None, verbose=True):
    """Play one episode of ``env`` with ``model`` while rendering every step.

    Args:
        model: Object exposing ``predict(observation)``; the first element of
            its output is thresholded at 0.5 to obtain a 0/1 action.
        env: Environment exposing ``reset()``, ``render()`` and ``step(action)``,
            where ``step`` returns ``(observation, reward, done, info)``.
        max_iter: Optional cap on the number of steps. ``None`` (the default)
            means the episode runs until the environment reports ``done``.
        verbose: When true, print a one-line summary of the episode.

    Returns:
        None.
    """
    observation = env.reset()
    done = False
    i = 0
    r_sum = 0
    # Stop as soon as the episode ends OR the optional step cap is reached.
    # The previous condition (`not done or ...`) kept stepping a finished
    # episode until max_iter and never enforced the cap on a running one.
    while not done and (max_iter is None or i < max_iter):
        env.render()
        action = int(model.predict(observation)[0] > .5)
        observation, reward, done, _ = env.step(action)
        i += 1
        r_sum += reward
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")

In [13]:
render_env(model, env)

KeyboardInterrupt: 