In [1]:
import numpy as np
from evostra import EvolutionStrategy
from evostra.models import FeedForwardNetwork
import gym
env = gym.make('CartPole-v1')
# A feed-forward neural network with an input of size 4, two hidden layers of size 4 and an output of size 1.
# The single output is thresholded at 0.5 by evaluate_model()/render_env() to pick the action passed to env.step().
model = FeedForwardNetwork(layer_sizes=[4, 4, 4, 1])

In [2]:
# NOTE(review): `solution` and `inp` are not referenced by any other cell visible in this notebook,
# and `inp` has 5 elements while the network above is built with an input layer of size 4.
# They look like leftovers from an earlier example -- confirm nothing else uses them before removing.
solution = np.array([0.1, -0.4, 0.5])
inp = np.asarray([1, 2, 3, 4, 5])

def evaluate_model(weights, max_iter=100, verbose=False):
    """Score one candidate weight set by running a single episode.

    The module-level ``model`` receives ``weights`` and then drives the
    module-level ``env`` until the environment reports ``done`` or
    ``max_iter`` steps have been taken, whichever happens first.

    Args:
        weights: Weights handed to ``model.set_weights``.
        max_iter: Upper bound on the number of environment steps.
        verbose: When true, print a one-line summary of the episode.

    Returns:
        The sum of the rewards returned by ``env.step`` during the episode.
    """
    # `model` and `env` are only read here (never rebound), so no `global`
    # declaration is required to reach the module-level objects.
    model.set_weights(weights)
    obs = env.reset()
    finished = False
    steps = 0
    total_reward = 0
    while not finished and steps < max_iter:
        # The first network output is thresholded at 0.5 to get a 0/1 action.
        action = int(model.predict(obs)[0] > .5)
        obs, step_reward, finished, _ = env.step(action)
        steps += 1
        total_reward += step_reward
    if verbose:
        print(f"Episode end after {steps} iterations with reward = {total_reward} and done status {finished}")
    return total_reward

In [3]:
# Shape of every weight array currently held by the model, in the order get_weights() returns them.
[layer_weights.shape for layer_weights in model.get_weights()]

[(4, 4), (4, 4), (4, 1)]

In [4]:
# num_threads > 1 evaluates candidates in multiple processes, and num_threads=-1 uses as many
# processes as the machine has cores; that only pays off when the task is computationally expensive.
# This task is cheap, so a single process is used: extra processes would only add IPC overhead
# and make the run slower.
es = EvolutionStrategy(
    model.get_weights(),
    evaluate_model,
    population_size=20,
    sigma=0.1,
    learning_rate=0.03,
    decay=0.995,
    num_threads=1,
)
es.run(1000, print_step=10)

iter 10. reward: 10.000000
iter 20. reward: 10.000000
iter 30. reward: 9.000000
iter 40. reward: 8.000000
iter 50. reward: 36.000000
iter 60. reward: 30.000000
iter 70. reward: 100.000000
iter 80. reward: 41.000000
iter 90. reward: 41.000000
iter 100. reward: 80.000000
iter 110. reward: 65.000000
iter 120. reward: 100.000000
iter 130. reward: 100.000000
iter 140. reward: 100.000000
iter 150. reward: 100.000000
iter 160. reward: 100.000000
iter 170. reward: 100.000000
iter 180. reward: 100.000000
iter 190. reward: 100.000000
iter 200. reward: 100.000000
iter 210. reward: 100.000000
iter 220. reward: 100.000000
iter 230. reward: 100.000000
iter 240. reward: 100.000000
iter 250. reward: 100.000000
iter 260. reward: 100.000000
iter 270. reward: 100.000000
iter 280. reward: 100.000000
iter 290. reward: 100.000000
iter 300. reward: 100.000000
iter 310. reward: 100.000000
iter 320. reward: 100.000000
iter 330. reward: 100.000000
iter 340. reward: 100.000000
iter 350. reward: 100.000000
iter 3

KeyboardInterrupt: 

In [None]:
def render_env(model, env, max_iter=None, verbose=True):
    """Run one episode of ``env`` driven by ``model``, rendering every step.

    Args:
        model: Object exposing ``predict(observation)``; the first element of
            its output is thresholded at 0.5 to obtain a 0/1 action.
        env: Environment exposing ``reset()``, ``render()`` and ``step(action)``,
            where ``step`` returns ``(observation, reward, done, info)``.
        max_iter: Optional cap on the number of steps. ``None`` (the default)
            means the episode runs until the environment reports ``done``.
        verbose: When true, print a one-line summary after the episode.

    Returns:
        None. The episode is only rendered and, optionally, summarised.
    """
    observation = env.reset()
    done = False
    i = 0
    r_sum = 0
    # Stop as soon as the episode is over OR the optional step budget is spent.
    # The earlier condition `not done or (max_iter is not None and i < max_iter)`
    # ignored max_iter while the episode was still running and, once it had
    # finished, kept stepping the finished environment until max_iter was hit.
    while not done and (max_iter is None or i < max_iter):
        env.render()
        action = int(model.predict(observation)[0] > .5)
        observation, reward, done, _ = env.step(action)
        i += 1
        r_sum += reward
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")

In [None]:
# evaluate_model() overwrites the shared model's weights on every call, so after training
# (especially an interrupted run) `model` may still hold whichever candidate was evaluated last.
# Load the evolution strategy's current solution explicitly before rendering it.
model.set_weights(es.get_weights())
render_env(model, env)