In [1]:
# NumPy provides the array math used by the network; Gym provides the simulated environment.
import numpy as np
import gym
# One shared environment instance; evaluate_model reads it through the notebook-level name `env`.
# NOTE(review): nothing visible in this notebook registers the 'Marvin-v0' id — presumably a
# project-local package does so; confirm before running on a fresh kernel.
env = gym.make('Marvin-v0')

In [2]:
class NN:
    """Bias-free feed-forward network that applies tanh after every layer.

    The parameters live in ``self.weights``: a list with one 2-D array per
    layer, each shaped ``(units_out, units_in)``.
    """

    def __init__(self, layer_sizes, seed=None):
        """Build zero-valued weight matrices for each consecutive pair of sizes.

        Args:
            layer_sizes: sequence of layer widths, input first.
            seed: when not None, reseeds NumPy's global RNG before building.
        """
        if seed is not None:
            np.random.seed(seed)
        self.weights = []
        for units_in, units_out in zip(layer_sizes, layer_sizes[1:]):
            # Scaling a zero matrix leaves it at zero; the factor is kept as written.
            self.weights.append(np.zeros((units_out, units_in)) * 1e-3)

    def predict(self, X):
        """Propagate ``X`` through every layer.

        Returns a Python scalar when the result is a 1-D array holding a
        single element, otherwise the resulting array.
        """
        activation = X
        for layer in self.weights:
            activation = np.tanh(activation @ layer.T)
        is_single_value = activation.shape[0] == 1 and len(activation.shape) == 1
        return activation.item() if is_single_value else activation

    def set_weights(self, weights, copy=False):
        """Replace the layer list; with ``copy=True`` each array is duplicated first."""
        self.weights = [np.copy(layer) for layer in weights] if copy else weights

    def get_weights(self, copy=False):
        """Return the layer list itself, or per-array copies when ``copy=True``."""
        if not copy:
            return self.weights
        return [np.copy(layer) for layer in self.weights]

    def sample_like(self, sigma=1):
        """Draw Gaussian noise shaped like each layer, scaled by ``sigma``.

        Uses NumPy's global RNG, one draw per layer in layer order.
        """
        noise = []
        for layer in self.weights:
            noise.append(np.random.randn(*layer.shape) * sigma)
        return noise

In [3]:
# class NN:
#     def __init__(self, layer_sizes):
#         self.weights = []
#         for index in range(len(layer_sizes)-1):
#             self.weights.append(np.zeros(shape=(layer_sizes[index], layer_sizes[index+1])))

#     def predict(self, inp):
#         out = np.expand_dims(inp.flatten(), 0)
#         for i, layer in enumerate(self.weights):
#             out = np.dot(out, layer)
#             out = np.tanh(out)
#         return out[0]

#     def get_weights(self, copy=False):
#         if copy:
#             return [np.copy(l) for l in self.weights]
#         return self.weights

#     def set_weights(self, weights):
#         self.weights = weights

#     def save(self, filename='weights.pkl'):
#         with open(filename, 'wb') as fp:
#             pickle.dump(self.weights, fp)

#     def load(self, filename='weights.pkl'):
#         with open(filename, 'rb') as fp:
#             self.weights = pickle.load(fp)
            
#     def sample_like(self, sigma=1):
#         return [np.random.randn(*l.shape) * sigma for l in self.weights]

In [4]:
# Seed NumPy's global RNG so the noise drawn by later cells is repeatable.
np.random.seed(42)
# Policy network with layer widths 24 -> 24 -> 4; evaluate_model reads it through the global name `nn`.
nn = NN([24, 24, 4])

In [5]:
def combine_weights(params, delta_params, sigma):
    """Return a new list where each entry is ``param + delta * sigma``.

    ``params`` and ``delta_params`` are walked in parallel; the inputs are
    not modified.
    """
    perturbed = []
    for base, noise in zip(params, delta_params):
        perturbed.append(base + noise * sigma)
    return perturbed

In [6]:
def evaluate_model(weights, max_iter=1500, verbose=False):
    """Play one episode with ``weights`` and return the accumulated reward.

    The weights are installed into the notebook-level network ``nn`` (which
    keeps them afterwards) and the episode runs on the notebook-level ``env``.
    The episode stops when the environment reports done or after ``max_iter``
    steps, whichever comes first. With ``verbose`` each observation and
    prediction is printed, followed by a summary line.
    """
    global nn, env

    nn.set_weights(weights)
    state = env.reset()
    done = False
    i = 0
    r_sum = 0
    while True:
        if done or not i < max_iter:
            break
        chosen = nn.predict(state)
        if verbose:
            print("Observation:", state)
            print("prediction:", chosen)
        state, step_reward, done, _ = env.step(chosen)
        i += 1
        r_sum += step_reward
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")
    return r_sum

In [7]:
# Hyperparameters consumed by the training-loop cell further down.
# Passed as population_size: number of noise samples evaluated per generation.
n = 30
# Initial learning rate; the training loop rebinds this name as it decays.
lr = 0.03
# Scale applied to the sampled noise when building candidate weights.
sigma = 0.1

In [8]:
def update_params(params, population, rewards, lr=0.05, sigma=0.1):
    """Apply the reward-weighted noise of every candidate to ``params``.

    Candidates are processed one at a time; for each, every layer ``k``
    receives ``lr / (n * sigma) * reward * candidate[k]`` where ``n`` is the
    population size. Entries of ``params`` are replaced in the list that was
    passed in, and that same list is returned. No bias terms are handled.

    Note: a function with the same name is defined again in the following
    cell and replaces this one once that cell runs.
    """
    pop_size = len(population)
    for noise, score in zip(population, rewards):
        for layer_idx, current in enumerate(params):
            step = lr / (pop_size * sigma) * score * noise[layer_idx]
            params[layer_idx] = current + step
    return params

In [9]:
def update_params(params, population, rewards, lr=0.05, sigma=0.1):
    """Move each layer along the reward-weighted sum of the sampled noise.

    For every layer ``k`` the candidates' noise arrays are accumulated as
    ``sum(reward * candidate[k])`` and the layer becomes
    ``layer + lr / (n * sigma) * accumulated`` with ``n`` the population size.
    Entries of ``params`` are replaced in the list that was passed in, and
    that same list is returned.
    """
    pop_size = len(population)
    for layer_idx, layer in enumerate(params):
        # Accumulator takes the layer's shape and dtype.
        weighted_noise = np.zeros_like(layer)
        for member, score in zip(population, rewards):
            weighted_noise += score * member[layer_idx]
        params[layer_idx] = layer + lr / (pop_size * sigma) * weighted_noise
    return params

In [10]:
def generation_update(model, environment, sigma=0.1, lr=0.01, population_size=10, seed=None, normalize_rewards=True):
    """Run one evolution-strategy generation and update ``model``.

    A population of noise samples is drawn, each perturbed weight set is
    scored with ``evaluate_model``, and the model's weights are moved along
    the reward-weighted noise via ``update_params``.

    Args:
        model: network exposing ``get_weights``, ``set_weights`` and
            ``sample_like``.
        environment: accepted for interface compatibility but not used here;
            ``evaluate_model`` runs on the notebook-level ``env``.
        sigma: scale applied to the noise when building candidates.
        lr: learning rate forwarded to ``update_params``.
        population_size: number of noise samples to evaluate.
        seed: when not None, reseeds NumPy's global RNG before sampling.
        normalize_rewards: standardise rewards (zero mean, unit std) before
            the update.

    Returns:
        ``(mean, std)`` of the raw, un-normalised rewards of this generation.
    """
    original_params = model.get_weights(copy=True)

    if seed is not None:
        np.random.seed(seed)

    population = [model.sample_like() for _ in range(population_size)]
    try:
        rewards = np.array(
            [evaluate_model(combine_weights(original_params, noise, sigma)) for noise in population]
        )
        r_mean, r_std = rewards.mean(), rewards.std()
        if r_std > 0:
            if normalize_rewards:
                rewards = (rewards - r_mean) / r_std
            update_params(original_params, population, rewards, lr=lr, sigma=sigma)
        else:
            print("Not Updating")
    finally:
        # evaluate_model installs each candidate into the shared network, so
        # without this the model would keep the last perturbed candidate when
        # no update happens or when an evaluation is interrupted.
        model.set_weights(original_params, copy=True)
    return r_mean, r_std

In [11]:
# Reseed so the whole training run draws the same noise on every execution.
np.random.seed(42)
# 100 evolution-strategy generations applied to the notebook-level network `nn`.
for i in range(100):
    mean_rewards, std_rewards = generation_update(nn, env, population_size=n, sigma=sigma, lr=lr, normalize_rewards=True)
    # Multiplicative decay; this rebinds the notebook-level `lr`, so re-running
    # the cell continues from the already-decayed value rather than 0.03.
    lr = lr * 0.999
    # `i % 1` is always 0, so every generation is logged.
    if i % 1 == 0:
        print(f'[{i}]: E[R]={mean_rewards:.4f}, std(R)={std_rewards:.4f} | nn.mean = {nn.weights[0].mean():.4f}')

[0]: E[R]=-77.4647, std(R)=40.9387 | nn.mean = -0.0013
[1]: E[R]=-76.4491, std(R)=42.4877 | nn.mean = -0.0035
[2]: E[R]=-83.4081, std(R)=38.1972 | nn.mean = -0.0055
[3]: E[R]=-64.4276, std(R)=42.2237 | nn.mean = -0.0083
[4]: E[R]=-54.4064, std(R)=34.4916 | nn.mean = -0.0077
[5]: E[R]=-62.0084, std(R)=38.7135 | nn.mean = -0.0067
[6]: E[R]=-47.0158, std(R)=28.3628 | nn.mean = -0.0087
[7]: E[R]=-47.8659, std(R)=17.7355 | nn.mean = -0.0091
[8]: E[R]=-54.3048, std(R)=40.4261 | nn.mean = -0.0054
[9]: E[R]=-59.1145, std(R)=33.8909 | nn.mean = -0.0048
[10]: E[R]=-52.0577, std(R)=29.2137 | nn.mean = -0.0050
[11]: E[R]=-44.5964, std(R)=22.0785 | nn.mean = -0.0075
[12]: E[R]=-57.9151, std(R)=43.6207 | nn.mean = -0.0077
[13]: E[R]=-39.4650, std(R)=23.4733 | nn.mean = -0.0095
[14]: E[R]=-36.7073, std(R)=59.3307 | nn.mean = -0.0091
[15]: E[R]=-46.0172, std(R)=56.7130 | nn.mean = -0.0083
[16]: E[R]=-30.9761, std(R)=72.3562 | nn.mean = -0.0085
[17]: E[R]=-41.9257, std(R)=62.7632 | nn.mean = -0.0073
[1

In [12]:
# NOTE(review): import sits mid-notebook; consider moving it to the first cell with the other imports.
import pickle as pcl

# Persist the network's current weight list to disk in binary pickle format.
with open("trained_marvin.pcl", "wb") as f:
    pcl.dump(nn.get_weights(), f)


In [13]:
def render_env(model, env, max_iter=None, verbose=True):
    """Play one rendered episode of ``env`` using ``model`` as the policy.

    Each step renders the environment, asks ``model.predict`` for an action,
    applies it, and prints the step reward and action. The episode ends when
    the environment reports done or, if ``max_iter`` is not None, after
    ``max_iter`` steps.

    Args:
        model: object exposing ``predict(observation)``.
        env: environment exposing ``reset``, ``render`` and ``step``; ``step``
            must return a 4-tuple ``(observation, reward, done, info)``.
        max_iter: optional cap on the number of steps.
        verbose: when true, print a summary line after the episode. The
            per-step line is printed regardless.

    Returns:
        None.
    """
    observation = env.reset()
    done = False
    i = 0
    r_sum = 0
    while not done:
        if max_iter is not None and i >= max_iter:
            break
        env.render()
        # Use the policy that was passed in; previously the notebook-level
        # `nn` was called here and the `model` argument was ignored.
        action = model.predict(observation)
        observation, reward, done, _ = env.step(action)
        print(f"R={reward} | A={action}")
        i += 1
        r_sum += reward
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")

In [14]:
# Replay the current policy with on-screen rendering, capped at 1500 steps.
render_env(nn, env, max_iter=1500)

R=-0.14659494812713675 | A=[ 0.92374363 -0.88109288 -0.99896318 -0.95914623]
R=-0.021673806491376995 | A=[ 0.92818869 -0.14118075 -0.99927086 -0.81769398]
R=-0.004064192255238495 | A=[ 0.80740114 -0.64758942 -0.99936812 -0.8727064 ]
R=-0.10541219538900876 | A=[ 0.74491007 -0.78401092 -0.99923419 -0.93123478]
R=-0.10365271432955092 | A=[ 0.77571844 -0.63877908 -0.99870447 -0.93562308]
R=-0.09808388953959789 | A=[ 0.84414542 -0.30811105 -0.99762428 -0.93693412]
R=-0.09710032583647574 | A=[ 0.89727747  0.14558007 -0.99521817 -0.9389239 ]
R=-0.11291162788090958 | A=[ 0.94461149  0.55672833 -0.99252692 -0.95465262]
R=-0.12589119784931474 | A=[ 0.98008821  0.93148604 -0.99017513 -0.95453965]
R=-0.12687128142157086 | A=[ 0.9935636   0.99230784 -0.98230109 -0.91098498]
R=-0.1261040093421948 | A=[ 0.99613799  0.99642842 -0.98081138 -0.88153401]
R=-0.12431864894036501 | A=[ 0.9975665   0.99828101 -0.97822455 -0.83899019]
R=-0.10513111347046838 | A=[ 0.99834548  0.99913469 -0.97353001 -0.77311617

In [15]:
# NOTE(review): numpy is already imported in the first cell; these imports would
# normally live there with the others.
import numpy as np
import multiprocessing as mp

# Reseed NumPy's global RNG for the cells that follow.
np.random.seed(0)


def worker_process(arg):
    """Evaluate one candidate.

    ``arg`` is a two-item pair ``(reward_function, weights)``; the function
    is called with the weights and its result is returned. Taking a single
    argument lets this be used directly with ``pool.map``.
    """
    reward_fn, candidate = arg
    return reward_fn(candidate)


class EvolutionStrategy(object):
    """Evolution-strategy optimiser over a list of weight arrays.

    Each iteration samples Gaussian noise per layer, scores the perturbed
    weights with a user-supplied reward function, and moves the weights along
    the noise weighted by the standardised rewards.
    """

    def __init__(self, weights, get_reward_func, population_size=50, sigma=0.1, learning_rate=0.03, decay=0.999,
                 num_threads=1):
        """Store the optimiser configuration.

        Args:
            weights: list of arrays to optimise. The list is stored as given
                (not copied) and its entries are replaced during updates.
            get_reward_func: callable taking a list of weight arrays and
                returning a scalar reward.
            population_size: number of noise samples per iteration.
            sigma: scale applied to the noise when building candidates.
            learning_rate: initial step size.
            decay: factor applied to the learning rate after each update.
            num_threads: worker process count; ``-1`` means one per CPU.
                A pool is only used when the resulting count exceeds 1.
        """
        self.weights = weights
        self.get_reward = get_reward_func
        self.POPULATION_SIZE = population_size
        self.SIGMA = sigma
        self.learning_rate = learning_rate
        self.decay = decay
        self.num_threads = mp.cpu_count() if num_threads == -1 else num_threads

    def _get_weights_try(self, w, p):
        """Return candidate weights ``w[k] + SIGMA * p[k]`` for every entry of ``p``."""
        return [w[index] + self.SIGMA * noise for index, noise in enumerate(p)]

    def get_weights(self):
        """Return the current weight list (the stored object, not a copy)."""
        return self.weights

    def _get_population(self):
        """Sample ``POPULATION_SIZE`` noise sets, one standard-normal array per layer."""
        return [
            [np.random.randn(*w.shape) for w in self.weights]
            for _ in range(self.POPULATION_SIZE)
        ]

    def _get_rewards(self, pool, population):
        """Score every member of ``population`` and return the rewards as an array.

        With a pool the evaluations are dispatched through ``pool.map``;
        otherwise they run sequentially in this process.
        """
        if pool is not None:
            worker_args = ((self.get_reward, self._get_weights_try(self.weights, p)) for p in population)
            rewards = pool.map(worker_process, worker_args)
        else:
            rewards = [self.get_reward(self._get_weights_try(self.weights, p)) for p in population]
        return np.array(rewards)

    def _update_weights(self, rewards, population):
        """Apply one update from ``rewards`` and ``population``.

        Nothing changes (including the learning rate) when all rewards are
        equal, because standardising them would divide by zero.
        """
        std = rewards.std()
        if std == 0:
            return
        rewards = (rewards - rewards.mean()) / std
        update_factor = self.learning_rate / (self.POPULATION_SIZE * self.SIGMA)
        for index, w in enumerate(self.weights):
            layer_population = np.array([p[index] for p in population])
            # Contract the population axis against the rewards; the two
            # transposes bring the result back to the layer's own shape.
            self.weights[index] = w + update_factor * np.dot(layer_population.T, rewards).T
        self.learning_rate *= self.decay

    def run(self, iterations, print_step=10):
        """Run ``iterations`` update steps, logging the mean reward every ``print_step`` steps.

        When a worker pool is in use it is closed and joined after a normal
        finish. If anything raises mid-run (including KeyboardInterrupt) the
        pool is terminated and joined before the exception propagates, so no
        worker processes are left behind.
        """
        pool = mp.Pool(self.num_threads) if self.num_threads > 1 else None
        try:
            for iteration in range(iterations):
                population = self._get_population()
                rewards = self._get_rewards(pool, population)

                self._update_weights(rewards, population)

                if (iteration + 1) % print_step == 0:
                    print('iter %d. reward: %f' % (iteration + 1, rewards.mean()))
        except BaseException:
            if pool is not None:
                pool.terminate()
                pool.join()
            raise
        if pool is not None:
            pool.close()
            pool.join()


In [16]:
def evaluate_model(weights, max_iter=1500, verbose=False):
    """Play one episode with ``weights`` and return the accumulated reward.

    This repeats the definition given in an earlier cell of the notebook.
    The weights are installed into the notebook-level network ``nn`` (which
    keeps them afterwards) and the episode runs on the notebook-level ``env``.
    The episode stops when the environment reports done or after ``max_iter``
    steps, whichever comes first. With ``verbose`` each observation and
    prediction is printed, followed by a summary line.
    """
    global nn, env

    nn.set_weights(weights)
    state = env.reset()
    done = False
    i = 0
    r_sum = 0
    while True:
        if done or not i < max_iter:
            break
        chosen = nn.predict(state)
        if verbose:
            print("Observation:", state)
            print("prediction:", chosen)
        state, step_reward, done, _ = env.step(chosen)
        i += 1
        r_sum += step_reward
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")
    return r_sum

In [17]:
# Start over with a reseeded RNG and a fresh zero-weight network.
np.random.seed(42)
nn = NN([24, 24, 4])
# get_weights() without copy returns the network's own list, so the optimiser and `nn`
# initially share the same list object. num_threads=-1 selects one worker per CPU.
es = EvolutionStrategy(nn.get_weights(), evaluate_model, population_size=30, num_threads=-1)

In [18]:
# Run 10 optimiser iterations, logging the mean reward after each one.
# NOTE(review): the recorded output below ends in a KeyboardInterrupt, so this run did not complete.
es.run(10, print_step=1)

iter 1. reward: -76.243441
iter 2. reward: -71.933924


Process ForkPoolWorker-4:
Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-3:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/pytho

KeyboardInterrupt: 