In [1]:
import numpy as np
import gym
# Driver-side instance of the 'Marvin-v0' environment; it is later handed to
# ESRaySolver as its `environment` argument.
env = gym.make('Marvin-v0')

In [2]:
import psutil
import ray

In [3]:
# Start Ray locally, requesting 4 GiB (4 * 1024**3 bytes) for worker memory
# and another 4 GiB for the shared object store.
ray.init(memory=4 * 1024 * 1024 * 1024, object_store_memory=4 * 1024 * 1024 * 1024)

2019-10-05 12:12:57,363	INFO resource_spec.py:205 -- Starting Ray with 3.96 GiB memory available for workers and up to 4.0 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


{'node_ip_address': '10.111.6.11',
 'redis_address': '10.111.6.11:19862',
 'object_store_address': '/tmp/ray/session_2019-10-05_12-12-57_360461_33223/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-10-05_12-12-57_360461_33223/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2019-10-05_12-12-57_360461_33223'}

In [4]:
class NN:
    """Bias-free multilayer perceptron with a tanh activation on every layer.

    ``self.weights`` is a list with one matrix per layer; the matrix for the
    transition ``layer_sizes[i] -> layer_sizes[i + 1]`` has shape
    ``(layer_sizes[i + 1], layer_sizes[i])``.
    """

    def __init__(self, layer_sizes, seed=None):
        """Allocate all-zero weight matrices for the given layer widths.

        @param layer_sizes: sequence of layer widths, input layer first
        @param seed: when not None, the *global* numpy RNG is seeded with it
                     (the zero initialisation itself draws no random numbers)
        """
        if seed is not None:
            np.random.seed(seed)
        fan_pairs = zip(layer_sizes[:-1], layer_sizes[1:])
        self.weights = [np.zeros((n_out, n_in)) for n_in, n_out in fan_pairs]

    def predict(self, X):
        """Run a forward pass.

        @param X: input vector, or a batch of row vectors
        @return: a Python scalar when the result is a 1-D array holding a
                 single value, otherwise the output array itself
        """
        activation = X
        for layer in self.weights:
            activation = np.tanh(activation @ layer.T)
        is_single_value = activation.shape[0] == 1 and activation.ndim == 1
        return activation.item() if is_single_value else activation

    def set_weights(self, weights, copy=False):
        """Install ``weights`` as the model parameters.

        With ``copy=True`` every matrix is duplicated first; otherwise the
        given list object is stored as-is and stays shared with the caller.
        """
        self.weights = [np.copy(layer) for layer in weights] if copy else weights

    def get_weights(self, copy=False):
        """Return the parameters: fresh copies if ``copy`` is set, else the live list."""
        if not copy:
            return self.weights
        return [np.copy(layer) for layer in self.weights]

In [5]:
def sample_like(weights, sigma=1, rs=None):
    """
    Draw one standard-normal array per entry of ``weights``, scaled by ``sigma``.

    @param weights: list of np.arrays; only their shapes are used
    @param sigma: factor applied to every sampled array
    @param rs: object exposing ``randn`` (e.g. np.random.RandomState);
               the global ``np.random`` module is used when None
    @return: list of arrays with the same shapes as ``weights``
    """
    generator = np.random if rs is None else rs

    noise = []
    for layer in weights:
        noise.append(generator.randn(*layer.shape) * sigma)
    return noise


def combine_weights(params, delta_params, sigma):
    """
    Build perturbed parameters ``W + dW * sigma`` layer by layer.

    @param params: list of base weight arrays
    @param delta_params: list of perturbation arrays, paired with ``params`` by position
    @param sigma: scale applied to each perturbation
    @return: new list of arrays; the inputs are left untouched
    """
    perturbed = []
    for base, delta in zip(params, delta_params):
        perturbed.append(base + delta * sigma)
    return perturbed

In [6]:
def evaluate_model(model, env, max_episode_len=1500):
    """
    Roll out a single episode and return the total (undiscounted) reward.

    @param model: object with ``predict(observation)`` returning the action
    @param env: environment with ``reset()`` returning the first observation and
                ``step(action)`` returning ``(observation, reward, done, info)``
    @param max_episode_len: maximum number of steps before the episode is cut
                            off even if ``done`` was never reported (default
                            1500, the limit that used to be hard-coded)
    @return: sum of the rewards collected during the episode
    """
    observation = env.reset()
    done = False
    steps = 0
    r_sum = 0
    while not done and steps < max_episode_len:
        action = model.predict(observation)
        observation, reward, done, _ = env.step(action)
        steps += 1
        r_sum += reward
    return r_sum

def update_params(params, population, rewards, lr=0.05, sigma=0.1):
    """
    Apply one evolution-strategies step to ``params``.

    For every layer the reward-weighted sum of the candidates' perturbations
    is scaled by ``lr / (len(population) * sigma)`` and added to the current
    weights. The slots of the ``params`` list are overwritten with the new
    arrays (the old arrays themselves are not modified) and the same list is
    returned.

    @param params: list of weight arrays, updated slot by slot
    @param population: list of candidates, each a list of per-layer perturbations
    @param rewards: one reward per candidate, paired with ``population`` by position
    @param lr: learning rate
    @param sigma: perturbation scale used when the candidates were evaluated
    @return: ``params``
    """
    pop_size = len(population)
    for layer_idx, current in enumerate(params):
        # Accumulate candidate by candidate, in population order.
        weighted_sum = np.zeros_like(current)
        for member, score in zip(population, rewards):
            weighted_sum += score * member[layer_idx]
        params[layer_idx] = current + lr / (pop_size * sigma) * weighted_sum
    return params

In [7]:
@ray.remote(num_cpus=0.12)
class ESWorker:
    """Ray actor that evaluates one randomly perturbed copy of the model per call.

    Each worker owns its own model, its own environment instance and its own
    ``RandomState``. The perturbation is never sent back to the caller: whoever
    needs it must replay the same seed, drawing once per ``evaluate`` call.
    """

    def __init__(self, layer_sizes, init_seed, env_name, seed, sigma=.1):
        """
        @param layer_sizes: layer widths forwarded to ``NN``
        @param init_seed: seed forwarded to ``NN``
        @param env_name: id passed to ``gym.make`` to build the private environment
        @param seed: seed of the RandomState used to sample perturbations
        @param sigma: scale applied to the sampled perturbation
        """
        self.model = NN(layer_sizes, init_seed)
        self.env = gym.make(env_name)
        self.rs = np.random.RandomState(seed=seed)
        self.sigma = sigma

    def evaluate(self):
        """Sample a perturbation, run one episode with it and return the episode reward.

        The unperturbed weights are put back afterwards, so calling
        ``evaluate`` several times without an intervening ``update`` always
        perturbs the same base parameters instead of compounding the noise.
        """
        base_weights = self.model.get_weights()
        # Exactly one draw per layer per call, so a replica RandomState with
        # the same seed can regenerate this candidate.
        candidate = sample_like(base_weights, rs=self.rs)
        # combine_weights builds new arrays, so base_weights stays intact.
        self.model.set_weights(combine_weights(base_weights, candidate, self.sigma))
        try:
            return evaluate_model(self.model, self.env)
        finally:
            self.model.set_weights(base_weights)

    def update(self, weights):
        """Replace the base parameters with a private copy of ``weights``."""
        self.model.set_weights(weights, copy=True)

In [8]:
# w_seeds = [seed for seed in range(4)]
# w_rss = [np.random.RandomState(seed=seed) for seed in w_seeds]
# workers = [ESWorker.remote([24, 24, 4], 0, 'Marvin-v0', seed) for seed in range(4)]  # actor handles

In [9]:
# workers

In [10]:
# rewards = [ray.get(w.evaluate.remote()) for w in workers]

In [11]:
# population = [sample_like(nn.get_weights(), rs=rs) for rs in w_rss]

In [12]:
# update_params(nn.get_weights(), population, rewards)

In [13]:

class ESRaySolver:
    """Evolution-strategies optimiser whose rollouts run on ``ESWorker`` Ray actors.

    One actor is created per population member. Worker ``k`` samples its
    perturbation from ``RandomState(k)``; the solver keeps a replica of each of
    those generators in ``self.w_rss`` and regenerates the perturbations
    locally, so only scalar rewards travel back from the workers.
    """

    def __init__(self, model, environment, population_size=30, max_episode_len=1500,
                 lr=0.05, lr_decay=0.999, sigma=0.1, verbose=False):
        """
        @param model: object exposing ``get_weights(copy=...)``; supplies the
                      starting weights when ``solve`` is called without any
        @param environment: stored on the instance; rollouts use the workers'
                            own environments instead
        @param population_size: number of workers / candidates per generation
        @param max_episode_len: stored on the instance; not forwarded to the workers
        @param lr: initial learning rate
        @param lr_decay: factor applied to the learning rate after every generation
        @param sigma: perturbation scale, used by the workers and by the update step
        @param verbose: falsy for silence, otherwise print every ``int(verbose)`` generations
        """
        self.model = model
        self.env = environment
        self.population_size = population_size
        self.max_episode_len = max_episode_len
        self.lr = lr
        self.lr_decay = lr_decay
        self.sigma = sigma
        self.verbose = verbose
        self.w_seeds = list(range(population_size))
        # Local replicas of the workers' generators (same seeds, same draw order).
        self.w_rss = [np.random.RandomState(seed=seed) for seed in self.w_seeds]
        # sigma is passed explicitly so the workers perturb with the same scale
        # that update_params later divides by.
        # NOTE(review): layer sizes and environment id are still hard-coded
        # here rather than derived from `model` / `environment`.
        self.workers = [
            ESWorker.remote([24, 24, 4], 0, 'Marvin-v0', seed, self.sigma)
            for seed in self.w_seeds
        ]  # actor handles

    def solve(self, weights=None, fitness_fn=None, n_generations=100, seed=None):
        """
        Run ``n_generations`` ES updates and return the resulting weights.

        @param weights: list of weight matrices to start from (updated in
                        place); when None a copy of ``self.model``'s weights is used
        @param fitness_fn: accepted for interface compatibility but not used;
                           the workers always score candidates with ``evaluate_model``
        @param n_generations: number of generations to run
        @param seed: when not None, seeds the global numpy RNG (the
                     perturbations come from the per-worker generators)
        @return: the list of optimised weight matrices
        """
        if weights is None:
            weights = self.model.get_weights(copy=True)

        if seed is not None:
            np.random.seed(seed)

        lr = self.lr

        # Make sure every worker perturbs the weights being optimised, even on
        # the very first generation or when solve() is called again.
        ray.get([w.update.remote(weights) for w in self.workers])

        for generation in range(n_generations):
            # Submit every rollout before blocking; calling ray.get per worker
            # would wait for each actor in turn and serialise the evaluation.
            reward_refs = [w.evaluate.remote() for w in self.workers]
            # Replay the workers' draws locally while the rollouts are running.
            population = [sample_like(weights, rs=rs) for rs in self.w_rss]

            rewards = np.array(ray.get(reward_refs))
            r_mean, r_std = rewards.mean(), rewards.std()
            # With identical rewards the centred values are all zero; divide by
            # one instead of zero so the update is a no-op rather than NaN.
            rewards = (rewards - r_mean) / (r_std if r_std > 0 else 1.0)

            update_params(weights, population, rewards, lr=lr, sigma=self.sigma)
            ray.get([w.update.remote(weights) for w in self.workers])

            lr = lr * self.lr_decay
            if self.verbose and (generation % int(self.verbose) == 0):
                print(f'[{generation}]: E[R]={r_mean:.4f}, std(R)={r_std:.4f} | lr={lr:.4f}')
        return weights


In [14]:
# Driver-side model: layer sizes [24, 24, 4], seed 0 (the same arguments the
# solver passes when it creates its workers).
nn = NN([24, 24, 4], 0)
# 30 workers, initial learning rate 0.03, progress line every 5 generations.
es = ESRaySolver(nn, env, population_size=30, lr=0.03, verbose=5)
# Run 300 generations and keep the optimised weight matrices.
weights = es.solve(n_generations=300)



[0]: E[R]=-60.7821, std(R)=43.1073 | lr=0.0300
[5]: E[R]=-73.4601, std(R)=32.7864 | lr=0.0298
[10]: E[R]=-89.5098, std(R)=27.2669 | lr=0.0297
[15]: E[R]=-55.4909, std(R)=30.7682 | lr=0.0295
[20]: E[R]=-62.0867, std(R)=40.0604 | lr=0.0294
[25]: E[R]=-63.9547, std(R)=29.4058 | lr=0.0292
[30]: E[R]=-55.8368, std(R)=39.7465 | lr=0.0291
[35]: E[R]=-60.8313, std(R)=28.1254 | lr=0.0289
[40]: E[R]=-66.6107, std(R)=32.6568 | lr=0.0288
[45]: E[R]=-50.5356, std(R)=35.3773 | lr=0.0287
[50]: E[R]=-45.5580, std(R)=33.8322 | lr=0.0285
[55]: E[R]=5.1548, std(R)=34.8148 | lr=0.0284
[60]: E[R]=17.0382, std(R)=43.2496 | lr=0.0282
[65]: E[R]=36.0285, std(R)=86.4609 | lr=0.0281
[70]: E[R]=46.2152, std(R)=78.4589 | lr=0.0279
[75]: E[R]=38.1711, std(R)=56.1707 | lr=0.0278
[80]: E[R]=33.0489, std(R)=77.1984 | lr=0.0277
[85]: E[R]=69.8223, std(R)=59.1796 | lr=0.0275
[90]: E[R]=73.9432, std(R)=72.0845 | lr=0.0274
[95]: E[R]=93.9002, std(R)=65.1340 | lr=0.0273
[100]: E[R]=80.3982, std(R)=49.4834 | lr=0.0271
[105

In [15]:
# Persist the trained weight matrices to ray_weights.pcl in the working directory.
import pickle as pcl
with open("ray_weights.pcl", 'wb') as f:
    pcl.dump(weights, f)