In [1]:
import numpy as np
import gym
# Create the environment instance that the solver and rendering cells below operate on.
# NOTE(review): nothing in this notebook registers the 'Marvin-v0' id; presumably a
# project-specific registration makes it available to gym.make -- confirm.
env = gym.make('Marvin-v0')

In [2]:
class NN:
    """
    Bias-free multilayer perceptron whose every layer is followed by tanh.

    The network is stored as a plain list of weight matrices, one per layer,
    each of shape (layer_output_size, layer_input_size). Because tanh is also
    applied after the last layer, every output lies in [-1, 1].
    """

    def __init__(self, layer_sizes, seed=None, init_scale=0.0):
        """
        @param layer_sizes: sequence of layer widths, input width first
        @param seed: if not None, seeds NumPy's *global* random generator
        @param init_scale: standard deviation of the Gaussian weight
            initialisation. The default 0.0 keeps the historical all-zero
            start (for which `seed` cannot influence the weights); pass a
            positive value, e.g. 1e-3, to get seeded random weights.
        """
        if seed is not None:
            np.random.seed(seed)
        # Pair every layer width with the width of the layer feeding it.
        shapes = list(zip(layer_sizes[1:], layer_sizes))
        if init_scale:
            self.weights = [np.random.randn(m, n) * init_scale for m, n in shapes]
        else:
            self.weights = [np.zeros((m, n)) for m, n in shapes]

    def predict(self, X):
        """
        Forward pass.

        @param X: a single input vector or a batch of row vectors
        @return: a Python scalar when the result is a 1-D array holding one
            value, otherwise the output array
        """
        out = X
        for W in self.weights:
            out = np.tanh(out @ W.T)
        # Check the rank before indexing shape[0] so a 0-d result cannot raise.
        if out.ndim == 1 and out.shape[0] == 1:
            return out.item()
        return out

    def set_weights(self, weights, copy=False):
        """
        Replace the weight list.

        @param weights: list of np.arrays
        @param copy: when False the network keeps a reference to the very
            list that was passed in; when True every array is copied
        """
        if copy:
            self.weights = [np.copy(l) for l in weights]
        else:
            self.weights = weights

    def get_weights(self, copy=False):
        """
        @param copy: when True return copies of the arrays, otherwise the
            internal list itself
        @return: list of np.arrays
        """
        if copy:
            return [np.copy(l) for l in self.weights]
        return self.weights

    def sample_like(self, sigma=1.0):
        """
        Draw Gaussian noise with the same shapes as the current weights.

        @param sigma: scale applied to the standard-normal draws
        @return: list of np.arrays, one per weight matrix
        """
        return [np.random.randn(*l.shape) * sigma for l in self.weights]

In [3]:
def sample_like(weights, sigma=1):
    """
    Draw one Gaussian noise array for every array in `weights`.

    Only the shapes of the inputs are used. Each result holds standard-normal
    draws from NumPy's global generator, multiplied by `sigma`.

    @param weights: list of np.arrays
    @param sigma: scale applied to every draw
    @return: list of np.arrays in the same order and shapes as `weights`
    """
    noise = []
    for layer in weights:
        draw = np.random.randn(*layer.shape)
        noise.append(draw * sigma)
    return noise

def combine_weights(params, delta_params, sigma):
    """
    Offset every parameter array by its perturbation scaled with `sigma`.

    Arrays are paired positionally; pairing stops at the shorter of the two
    lists. The inputs are not modified.

    @param params: list of np.arrays
    @param delta_params: list of np.arrays, matched one-to-one with `params`
    @param sigma: factor applied to each perturbation before it is added
    @return: new list holding `param + delta * sigma` for every pair
    """
    shifted = []
    for base, delta in zip(params, delta_params):
        shifted.append(base + delta * sigma)
    return shifted


    
def update_params(params, population, rewards, lr=0.05, sigma=0.1):
    """
    Move every parameter array along the reward-weighted sum of perturbations.

    For layer i the new value is
        params[i] + lr / (n * sigma) * sum_k rewards[k] * population[k][i]
    with n = len(population). The entries of the `params` list are replaced
    in place and the same list is returned.

    @param params: list of np.arrays, overwritten entry by entry
    @param population: list of candidates, each a list of np.arrays shaped
        like `params`
    @param rewards: one scalar per candidate
    @param lr: learning rate
    @param sigma: perturbation scale
    @return: `params`
    """
    n = len(population)
    for layer_idx, current in enumerate(params):
        weighted_noise = np.zeros_like(current)
        for member, score in zip(population, rewards):
            weighted_noise += score * member[layer_idx]
        params[layer_idx] = current + lr / (n * sigma) * weighted_noise
    return params

class ESSolver:
    """
    Evolution-strategies optimiser for a list of weight matrices.

    Every generation draws `population_size` Gaussian perturbations, scores
    `weights + sigma * perturbation` with a fitness function, standardises the
    scores and shifts the weights along the score-weighted perturbations.
    """

    # Reward spreads at or below this are treated as "no signal": dividing by
    # them would yield NaN (all rewards identical) or amplify rounding noise.
    _MIN_REWARD_STD = 1e-8

    def __init__(self, model, environment, population_size=30, max_episode_len=1500,
                 lr=0.05, lr_decay=0.999, sigma=0.1, verbose=False):
        """
        @param model: object exposing get_weights(copy=...), set_weights(w)
            and predict(observation)
        @param environment: object exposing reset() and step(action)
        @param population_size: candidates evaluated per generation
        @param max_episode_len: upper bound on steps per evaluation episode
        @param lr: initial learning rate
        @param lr_decay: factor the learning rate is multiplied by after
            every generation
        @param sigma: scale of the perturbations applied to the weights
        @param verbose: falsy for silence; otherwise int(verbose) is the
            number of generations between progress lines
        """
        self.model = model
        self.env = environment
        self.population_size = population_size
        self.max_episode_len = max_episode_len
        self.lr = lr
        self.lr_decay = lr_decay
        self.sigma = sigma
        self.verbose = verbose

    @staticmethod
    def _standardize(rewards, r_mean, r_std):
        """Centre the rewards and scale them to unit spread when the spread is usable."""
        centered = rewards - r_mean
        if r_std > ESSolver._MIN_REWARD_STD:
            return centered / r_std
        # Degenerate generation (e.g. every candidate scored the same):
        # keep the centred values instead of dividing by ~0.
        return centered

    def solve(self, weights=None, fitness_fn=None, n_generations=100, seed=None):
        """
        Run the optimisation loop.

        @param weights: list of weight matrices to optimise; the list is
            updated in place. If None, a copy of the model's weights is used.
        @param fitness_fn: callable mapping a list of weight matrices to a
            scalar reward; defaults to `self.evaluate_model`
        @param n_generations: number of generations to run
        @param seed: if not None, seeds NumPy's global random generator
        @return: the optimised list of weight matrices
        """
        if weights is None:
            weights = self.model.get_weights(copy=True)
        if fitness_fn is None:
            fitness_fn = self.evaluate_model

        if seed is not None:
            np.random.seed(seed)

        lr = self.lr
        for generation in range(n_generations):
            population = []
            rewards = []

            for _ in range(self.population_size):
                # Unit-variance noise; combine_weights scales it by sigma.
                candidate = sample_like(weights)
                weights_combined = combine_weights(weights, candidate, sigma=self.sigma)
                reward = fitness_fn(weights_combined)

                population.append(candidate)
                rewards.append(reward)

            rewards = np.array(rewards)
            r_mean, r_std = rewards.mean(), rewards.std()
            rewards = self._standardize(rewards, r_mean, r_std)

            update_params(weights, population, rewards, lr=lr, sigma=self.sigma)

            lr = lr * self.lr_decay
            # The lr shown is the already-decayed value, i.e. the one the
            # next generation will use.
            if self.verbose and (generation % int(self.verbose) == 0):
                print(f'[{generation}]: E[R]={r_mean:.4f}, std(R)={r_std:.4f} | lr={lr:.4f}')
        return weights

    def evaluate_model(self, weights):
        """
        Default fitness function: total reward of one episode.

        Loads `weights` into the model (the model is left holding them),
        resets the environment and steps it with the model's predictions
        until the environment reports done or `max_episode_len` steps have
        been taken.

        @param weights: list of weight matrices to evaluate
        @return: sum of the rewards collected during the episode
        """
        self.model.set_weights(weights)

        observation = self.env.reset()
        done = False
        i = 0
        r_sum = 0
        while not done and i < self.max_episode_len:
            action = self.model.predict(observation)
            observation, reward, done, _ = self.env.step(action)
            i += 1
            r_sum += reward
        return r_sum
    


In [4]:
# Seed NumPy's global generator, then build the policy network with layer sizes 24 -> 24 -> 4.
# With NN's default arguments the weights start as all zeros, so the seed does not
# influence this construction.
np.random.seed(42)
nn = NN([24, 24, 4])

In [5]:
# Train `nn` in `env` with ESSolver's default hyperparameters; verbose=1 prints a
# progress line after every generation.
# solve() reseeds NumPy's global generator with 42 and returns the optimised list of
# weight matrices (it works on a copy of the model's weights).
es = ESSolver(nn, env, verbose=1)
weights = es.solve(seed=42)

[0]: E[R]=-74.2073, std(R)=43.1700 | lr=0.0500
[1]: E[R]=-80.1933, std(R)=35.8936 | lr=0.0499
[2]: E[R]=-93.6548, std(R)=35.8769 | lr=0.0499
[3]: E[R]=-81.3174, std(R)=30.9690 | lr=0.0498
[4]: E[R]=-85.9987, std(R)=35.1726 | lr=0.0498
[5]: E[R]=-89.1679, std(R)=22.2086 | lr=0.0497
[6]: E[R]=-92.6968, std(R)=26.8650 | lr=0.0497
[7]: E[R]=-90.7875, std(R)=19.7223 | lr=0.0496
[8]: E[R]=-104.0238, std(R)=19.3702 | lr=0.0496
[9]: E[R]=-97.6063, std(R)=14.8156 | lr=0.0495
[10]: E[R]=-87.3863, std(R)=19.1236 | lr=0.0495
[11]: E[R]=-95.6501, std(R)=26.6276 | lr=0.0494
[12]: E[R]=-89.3087, std(R)=22.7270 | lr=0.0494
[13]: E[R]=-86.8189, std(R)=27.6159 | lr=0.0493
[14]: E[R]=-92.9829, std(R)=22.6269 | lr=0.0493
[15]: E[R]=-84.1191, std(R)=19.5775 | lr=0.0492
[16]: E[R]=-87.1310, std(R)=32.4409 | lr=0.0492
[17]: E[R]=-96.0725, std(R)=15.5007 | lr=0.0491
[18]: E[R]=-99.8766, std(R)=26.8499 | lr=0.0491
[19]: E[R]=-81.8875, std(R)=26.7100 | lr=0.0490
[20]: E[R]=-80.7249, std(R)=26.3051 | lr=0.0490
[

In [7]:
# During training the model is left holding the last evaluated candidate, so load the
# optimised weights explicitly; copy=True keeps `nn` and `weights` from sharing arrays.
nn.set_weights(weights, copy=True)

In [9]:
from viz import render_env
# Replay the trained network in the environment through the project-local viz helper.
# NOTE(review): render_env is not defined in this notebook; max_iter=2000 presumably caps
# the number of environment steps -- confirm against viz.py.
render_env(nn, env, max_iter=2000)

Episode end after 1694 iterations with reward = 191.30642198846988 and done status True


In [10]:
import pickle as pcl

# Persist the trained weight list to disk as a binary pickle.
# NOTE(review): only unpickle this file from trusted sources -- pickle.load can execute
# arbitrary code.
with open("more_trained.pcl", 'wb') as fp:
    pcl.dump(weights, fp)