In [1]:
"""
Taken from:
    https://gist.github.com/karpathy/77fbb6a8dac5395f1b73e7a89300318d
    
A bare-bones example of optimizing a black-box function (f) using
Natural Evolution Strategies (NES), where the parameter distribution is a
Gaussian of fixed standard deviation.
"""

import numpy as np
np.random.seed(0)

# the function we want to optimize
def f(w, target=None):
    """Return the reward for parameter vector ``w`` (higher is better).

    In a real setting this would:
      1) create a neural network with weights ``w``
      2) run the neural network on the environment for some time
      3) sum up and return the total reward

    For the purposes of an example, the reward is the negative squared L2
    distance between ``w`` and a specific solution vector, so the highest
    achievable reward is 0, reached when ``w`` equals that vector exactly.

    Args:
        w: candidate parameter vector (anything that broadcasts against
            the target under NumPy subtraction).
        target: the vector to match. When omitted, the module-level
            ``solution`` is used; it is looked up at call time, so it only
            has to exist before the first call, not before this definition.

    Returns:
        ``-sum((target - w) ** 2)`` as a NumPy scalar.
    """
    if target is None:
        # Keep the original behavior: read the global at call time.
        target = solution

    reward = -np.sum(np.square(target - w))

    return reward

# hyperparameters
npop = 50 # population size
sigma = 0.1 # noise standard deviation
alpha = 0.001 # learning rate

# start the optimization
solution = np.array([0.5, 0.5])
n = len(solution)

w = np.random.randn(n) # our initial guess is random
for i in range(300):

    # print current fitness of the most likely parameter setting
    if i % 20 == 0:
        print('iter %d. w: %s, solution: %s, reward: %f' % 
              (i, str(w), str(solution), f(w)))

    # draw one noise vector per population member, and allocate their rewards
    N = np.random.randn(npop, n) # samples from a normal distribution N(0,1)
    R = np.zeros(npop)
    for j in range(npop):
        w_try = w + sigma*N[j] # jitter w using gaussian of std `sigma`
        R[j] = f(w_try) # evaluate the jittered version

    # standardize the rewards to zero mean and unit variance. If every
    # sample scored the same, the std is 0 and dividing would give 0/0 = NaN,
    # which would poison w for the rest of the run; in that case keep the
    # (all-zero) centered rewards so the update below is a no-op.
    A = R - np.mean(R)
    reward_std = np.std(R)
    if reward_std > 0:
        A = A / reward_std
    # perform the parameter update. The matrix multiply below
    # is just an efficient way to sum up all the rows of the noise matrix N,
    # where each row N[j] is weighted by A[j]
    w = w + alpha/(npop*sigma) * np.dot(N.T, A)

iter 0. w: [1.76405235 0.40015721], solution: [0.5 0.5], reward: -1.607797
iter 20. w: [1.7289955  0.40343095], solution: [0.5 0.5], reward: -1.519756
iter 40. w: [1.69344637 0.4057992 ], solution: [0.5 0.5], reward: -1.433188
iter 60. w: [1.65839631 0.40983034], solution: [0.5 0.5], reward: -1.350013
iter 80. w: [1.62200231 0.41376429], solution: [0.5 0.5], reward: -1.266326
iter 100. w: [1.58727951 0.4169361 ], solution: [0.5 0.5], reward: -1.189076
iter 120. w: [1.55179607 0.41833756], solution: [0.5 0.5], reward: -1.112944
iter 140. w: [1.51746132 0.42234937], solution: [0.5 0.5], reward: -1.041257
iter 160. w: [1.48304957 0.42263013], solution: [0.5 0.5], reward: -0.972373
iter 180. w: [1.44922526 0.42291134], solution: [0.5 0.5], reward: -0.906971
iter 200. w: [1.41569923 0.42379508], solution: [0.5 0.5], reward: -0.844312
iter 220. w: [1.38357115 0.42603484], solution: [0.5 0.5], reward: -0.786169
iter 240. w: [1.34899709 0.42768843], solution: [0.5 0.5], reward: -0.726025
iter 

iter 2240. w: [0.50339248 0.4950839 ], solution: [0.5 0.5], reward: -0.000036
iter 2260. w: [0.50062164 0.4972776 ], solution: [0.5 0.5], reward: -0.000008
iter 2280. w: [0.50094594 0.49630434], solution: [0.5 0.5], reward: -0.000015
iter 2300. w: [0.50118251 0.49772366], solution: [0.5 0.5], reward: -0.000007
iter 2320. w: [0.50052764 0.50073047], solution: [0.5 0.5], reward: -0.000001
iter 2340. w: [0.49865966 0.50227656], solution: [0.5 0.5], reward: -0.000007
iter 2360. w: [0.4961962  0.50087029], solution: [0.5 0.5], reward: -0.000015
iter 2380. w: [0.49598951 0.5015833 ], solution: [0.5 0.5], reward: -0.000019
iter 2400. w: [0.49608119 0.4971371 ], solution: [0.5 0.5], reward: -0.000024
iter 2420. w: [0.50109269 0.49601071], solution: [0.5 0.5], reward: -0.000017
iter 2440. w: [0.50400689 0.50188273], solution: [0.5 0.5], reward: -0.000020
iter 2460. w: [0.50423582 0.50477281], solution: [0.5 0.5], reward: -0.000041
iter 2480. w: [0.50567831 0.50634921], solution: [0.5 0.5], rewa

iter 4380. w: [0.49882256 0.496995  ], solution: [0.5 0.5], reward: -0.000010
iter 4400. w: [0.50188828 0.49601757], solution: [0.5 0.5], reward: -0.000019
iter 4420. w: [0.50128717 0.49336946], solution: [0.5 0.5], reward: -0.000046
iter 4440. w: [0.49960108 0.49229749], solution: [0.5 0.5], reward: -0.000059
iter 4460. w: [0.49957301 0.49234579], solution: [0.5 0.5], reward: -0.000059
iter 4480. w: [0.4979338  0.49186496], solution: [0.5 0.5], reward: -0.000070
iter 4500. w: [0.50316008 0.49096624], solution: [0.5 0.5], reward: -0.000092
iter 4520. w: [0.50249911 0.48972065], solution: [0.5 0.5], reward: -0.000112
iter 4540. w: [0.50352527 0.48901128], solution: [0.5 0.5], reward: -0.000133
iter 4560. w: [0.50446469 0.48854047], solution: [0.5 0.5], reward: -0.000151
iter 4580. w: [0.50338383 0.48940264], solution: [0.5 0.5], reward: -0.000124
iter 4600. w: [0.50676438 0.48506655], solution: [0.5 0.5], reward: -0.000269
iter 4620. w: [0.50438528 0.48809476], solution: [0.5 0.5], rewa

iter 6660. w: [0.50522998 0.50533526], solution: [0.5 0.5], reward: -0.000056
iter 6680. w: [0.50596875 0.50510156], solution: [0.5 0.5], reward: -0.000062
iter 6700. w: [0.50656468 0.50326078], solution: [0.5 0.5], reward: -0.000054
iter 6720. w: [0.50641408 0.50413801], solution: [0.5 0.5], reward: -0.000058
iter 6740. w: [0.50701378 0.50576744], solution: [0.5 0.5], reward: -0.000082
iter 6760. w: [0.51003208 0.5033579 ], solution: [0.5 0.5], reward: -0.000112
iter 6780. w: [0.51269972 0.50240138], solution: [0.5 0.5], reward: -0.000167
iter 6800. w: [0.51413198 0.50319608], solution: [0.5 0.5], reward: -0.000210
iter 6820. w: [0.51503213 0.50584787], solution: [0.5 0.5], reward: -0.000260
iter 6840. w: [0.51647259 0.50251412], solution: [0.5 0.5], reward: -0.000278
iter 6860. w: [0.515416   0.50119913], solution: [0.5 0.5], reward: -0.000239
iter 6880. w: [0.51447834 0.50046173], solution: [0.5 0.5], reward: -0.000210
iter 6900. w: [0.51126982 0.50278093], solution: [0.5 0.5], rewa

iter 8900. w: [0.49855715 0.50314509], solution: [0.5 0.5], reward: -0.000012
iter 8920. w: [0.50129387 0.50276435], solution: [0.5 0.5], reward: -0.000009
iter 8940. w: [0.50089296 0.5017936 ], solution: [0.5 0.5], reward: -0.000004
iter 8960. w: [0.49902326 0.4996294 ], solution: [0.5 0.5], reward: -0.000001
iter 8980. w: [0.50031257 0.49697484], solution: [0.5 0.5], reward: -0.000009
iter 9000. w: [0.49596123 0.49891624], solution: [0.5 0.5], reward: -0.000017
iter 9020. w: [0.49650445 0.49752405], solution: [0.5 0.5], reward: -0.000018
iter 9040. w: [0.50050822 0.49883149], solution: [0.5 0.5], reward: -0.000002
iter 9060. w: [0.50039383 0.49962014], solution: [0.5 0.5], reward: -0.000000
iter 9080. w: [0.50146715 0.50012784], solution: [0.5 0.5], reward: -0.000002
iter 9100. w: [0.5009667  0.50084735], solution: [0.5 0.5], reward: -0.000002
iter 9120. w: [0.50018527 0.50095262], solution: [0.5 0.5], reward: -0.000001
iter 9140. w: [0.49493269 0.49939422], solution: [0.5 0.5], rewa

iter 11040. w: [0.50391128 0.49602221], solution: [0.5 0.5], reward: -0.000031
iter 11060. w: [0.5033943  0.49483607], solution: [0.5 0.5], reward: -0.000038
iter 11080. w: [0.50365784 0.49321147], solution: [0.5 0.5], reward: -0.000059
iter 11100. w: [0.50210577 0.49552661], solution: [0.5 0.5], reward: -0.000024
iter 11120. w: [0.49848878 0.49619339], solution: [0.5 0.5], reward: -0.000017
iter 11140. w: [0.499239   0.49389792], solution: [0.5 0.5], reward: -0.000038
iter 11160. w: [0.49916624 0.49440452], solution: [0.5 0.5], reward: -0.000032
iter 11180. w: [0.50165366 0.49160861], solution: [0.5 0.5], reward: -0.000073
iter 11200. w: [0.50324938 0.49270271], solution: [0.5 0.5], reward: -0.000064
iter 11220. w: [0.50285699 0.49371991], solution: [0.5 0.5], reward: -0.000048
iter 11240. w: [0.50200143 0.49779022], solution: [0.5 0.5], reward: -0.000009
iter 11260. w: [0.50106369 0.49789945], solution: [0.5 0.5], reward: -0.000006
iter 11280. w: [0.50283214 0.50124034], solution: [0

iter 13120. w: [0.49851049 0.50000749], solution: [0.5 0.5], reward: -0.000002
iter 13140. w: [0.49930313 0.5018699 ], solution: [0.5 0.5], reward: -0.000004
iter 13160. w: [0.5012264  0.50093807], solution: [0.5 0.5], reward: -0.000002
iter 13180. w: [0.49966754 0.50069465], solution: [0.5 0.5], reward: -0.000001
iter 13200. w: [0.50379996 0.50110019], solution: [0.5 0.5], reward: -0.000016
iter 13220. w: [0.50384458 0.50168101], solution: [0.5 0.5], reward: -0.000018
iter 13240. w: [0.50462969 0.50202854], solution: [0.5 0.5], reward: -0.000026
iter 13260. w: [0.5053341  0.50202601], solution: [0.5 0.5], reward: -0.000033
iter 13280. w: [0.50356168 0.50164807], solution: [0.5 0.5], reward: -0.000015
iter 13300. w: [0.50088455 0.50022261], solution: [0.5 0.5], reward: -0.000001
iter 13320. w: [0.50016995 0.49969517], solution: [0.5 0.5], reward: -0.000000
iter 13340. w: [0.50210819 0.50248166], solution: [0.5 0.5], reward: -0.000011
iter 13360. w: [0.50085518 0.50496581], solution: [0

iter 15300. w: [0.50019225 0.49956229], solution: [0.5 0.5], reward: -0.000000
iter 15320. w: [0.50148803 0.4969271 ], solution: [0.5 0.5], reward: -0.000012
iter 15340. w: [0.5019649  0.49732127], solution: [0.5 0.5], reward: -0.000011
iter 15360. w: [0.49852931 0.49385172], solution: [0.5 0.5], reward: -0.000040
iter 15380. w: [0.49865611 0.49885935], solution: [0.5 0.5], reward: -0.000003
iter 15400. w: [0.50100456 0.50051236], solution: [0.5 0.5], reward: -0.000001
iter 15420. w: [0.49777914 0.49925165], solution: [0.5 0.5], reward: -0.000005
iter 15440. w: [0.49795523 0.49844083], solution: [0.5 0.5], reward: -0.000007
iter 15460. w: [0.4980801  0.49713594], solution: [0.5 0.5], reward: -0.000012
iter 15480. w: [0.49907681 0.49804971], solution: [0.5 0.5], reward: -0.000005
iter 15500. w: [0.49903424 0.4949386 ], solution: [0.5 0.5], reward: -0.000027
iter 15520. w: [0.49663828 0.49687188], solution: [0.5 0.5], reward: -0.000021
iter 15540. w: [0.49611969 0.49907985], solution: [0

iter 17420. w: [0.50188659 0.50039723], solution: [0.5 0.5], reward: -0.000004
iter 17440. w: [0.50123298 0.49945467], solution: [0.5 0.5], reward: -0.000002
iter 17460. w: [0.50149757 0.50099915], solution: [0.5 0.5], reward: -0.000003
iter 17480. w: [0.49994723 0.50255582], solution: [0.5 0.5], reward: -0.000007
iter 17500. w: [0.50284811 0.50306844], solution: [0.5 0.5], reward: -0.000018
iter 17520. w: [0.5045662 0.5038537], solution: [0.5 0.5], reward: -0.000036
iter 17540. w: [0.50378805 0.50557918], solution: [0.5 0.5], reward: -0.000045
iter 17560. w: [0.49997885 0.50678348], solution: [0.5 0.5], reward: -0.000046
iter 17580. w: [0.50188872 0.50648233], solution: [0.5 0.5], reward: -0.000046
iter 17600. w: [0.49856268 0.50499435], solution: [0.5 0.5], reward: -0.000027
iter 17620. w: [0.49979477 0.50192352], solution: [0.5 0.5], reward: -0.000004
iter 17640. w: [0.50033641 0.50101068], solution: [0.5 0.5], reward: -0.000001
iter 17660. w: [0.49903535 0.49982662], solution: [0.5

iter 19600. w: [0.49703823 0.49501037], solution: [0.5 0.5], reward: -0.000034
iter 19620. w: [0.49321257 0.49048963], solution: [0.5 0.5], reward: -0.000137
iter 19640. w: [0.49047259 0.49145041], solution: [0.5 0.5], reward: -0.000164
iter 19660. w: [0.4922944  0.49280048], solution: [0.5 0.5], reward: -0.000111
iter 19680. w: [0.48937624 0.49501977], solution: [0.5 0.5], reward: -0.000138
iter 19700. w: [0.48762325 0.49332724], solution: [0.5 0.5], reward: -0.000198
iter 19720. w: [0.48914549 0.49480769], solution: [0.5 0.5], reward: -0.000145
iter 19740. w: [0.48811388 0.49516143], solution: [0.5 0.5], reward: -0.000165
iter 19760. w: [0.49112745 0.49761336], solution: [0.5 0.5], reward: -0.000084
iter 19780. w: [0.49432605 0.49671146], solution: [0.5 0.5], reward: -0.000043
iter 19800. w: [0.49338238 0.49638841], solution: [0.5 0.5], reward: -0.000057
iter 19820. w: [0.49301415 0.49889034], solution: [0.5 0.5], reward: -0.000050
iter 19840. w: [0.49669295 0.49995656], solution: [0

iter 21900. w: [0.48991809 0.51065864], solution: [0.5 0.5], reward: -0.000215
iter 21920. w: [0.48679182 0.50658047], solution: [0.5 0.5], reward: -0.000218
iter 21940. w: [0.48899057 0.50740251], solution: [0.5 0.5], reward: -0.000176
iter 21960. w: [0.48723648 0.50736407], solution: [0.5 0.5], reward: -0.000217
iter 21980. w: [0.48871195 0.50532044], solution: [0.5 0.5], reward: -0.000156
iter 22000. w: [0.48504752 0.50286445], solution: [0.5 0.5], reward: -0.000232
iter 22020. w: [0.48452935 0.50052536], solution: [0.5 0.5], reward: -0.000240
iter 22040. w: [0.48470661 0.50295525], solution: [0.5 0.5], reward: -0.000243
iter 22060. w: [0.4857088  0.50514243], solution: [0.5 0.5], reward: -0.000231
iter 22080. w: [0.48692541 0.5027602 ], solution: [0.5 0.5], reward: -0.000179
iter 22100. w: [0.48563171 0.50412392], solution: [0.5 0.5], reward: -0.000223
iter 22120. w: [0.48449833 0.50385456], solution: [0.5 0.5], reward: -0.000255
iter 22140. w: [0.48561274 0.50477886], solution: [0

iter 23980. w: [0.50354333 0.49596529], solution: [0.5 0.5], reward: -0.000029
iter 24000. w: [0.49933252 0.49439403], solution: [0.5 0.5], reward: -0.000032
iter 24020. w: [0.49878786 0.49154037], solution: [0.5 0.5], reward: -0.000073
iter 24040. w: [0.49954326 0.49076694], solution: [0.5 0.5], reward: -0.000085
iter 24060. w: [0.5019125  0.49208044], solution: [0.5 0.5], reward: -0.000066
iter 24080. w: [0.50199224 0.49348999], solution: [0.5 0.5], reward: -0.000046
iter 24100. w: [0.50294343 0.4961859 ], solution: [0.5 0.5], reward: -0.000023
iter 24120. w: [0.50018392 0.49889129], solution: [0.5 0.5], reward: -0.000001
iter 24140. w: [0.4979554  0.50046852], solution: [0.5 0.5], reward: -0.000004
iter 24160. w: [0.49915461 0.50444918], solution: [0.5 0.5], reward: -0.000021
iter 24180. w: [0.49760502 0.50180684], solution: [0.5 0.5], reward: -0.000009
iter 24200. w: [0.49745424 0.49666287], solution: [0.5 0.5], reward: -0.000018
iter 24220. w: [0.49788675 0.49881939], solution: [0

KeyboardInterrupt: 