<a href="https://colab.research.google.com/github/skozh/RL/blob/master/ES_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Simple example: Minimize a quadratic around some solution point
# Code from https://gist.github.com/karpathy/77fbb6a8dac5395f1b73e7a89300318d
# Explanation at https://openai.com/blog/evolution-strategies/
import numpy as np

In [2]:
solution = np.array([0.5, 0.1, -0.3])

In [3]:
def fun(w):
  return -np.sum((w - solution)**2)

In [4]:
npop = 50                                # population size
sigma = 0.1                             # noise standard deviation
alpha = 0.001                         # learning rate
w = np.random.randn(3)    # Initial guess (random)

In [5]:
for i in range(300):
  N = np.random.randn(npop, 3)                                  # Random noise initialization
  R = np.zeros(npop)                                                        # Reward initialization
  for j in range(npop):
    w_try = w + sigma* N[j]                                              # Add noise to the guessed parameter
    R[j] = fun(w_try)
  if i % 50 == 0:
    print ('iter %d. w: %s, solution: %s, reward: %s' % 
           (i, str(w), str(solution), str(fun(w))))
  A = (R - np.mean(R))/np.std(R)                                    # Standardize Rewards to Gaussian Distribution
  w = w + alpha/(npop * sigma) * np.dot(N.T, A)      # Update Parameter. 

iter 0. w: [-0.09706066 -0.66642031  0.56226398], solution: [ 0.5  0.1 -0.3], reward: -1.6873806861048783
iter 50. w: [ 0.13016351 -0.36686387  0.23463083], solution: [ 0.5  0.1 -0.3], reward: -0.6405710198785806
iter 100. w: [ 0.34605869 -0.09086411 -0.08780975], solution: [ 0.5  0.1 -0.3], reward: -0.10515173535606477
iter 150. w: [ 0.4822386   0.08484504 -0.29531559], solution: [ 0.5  0.1 -0.3], reward: -0.0005670837761888932
iter 200. w: [ 0.49438263  0.09942668 -0.30377657], solution: [ 0.5  0.1 -0.3], reward: -4.6146030478698655e-05
iter 250. w: [ 0.50635328  0.0968545  -0.29983505], solution: [ 0.5  0.1 -0.3], reward: -5.0285538221981195e-05


In [6]:
w

array([ 0.50112319,  0.09715664, -0.30218905])