In [1]:
import numpy as np
import gym
# Environment instance shared by every later cell (training and rendering).
# NOTE(review): 'Marvin-v0' is presumably a project-registered environment id
# rather than one bundled with gym -- confirm it is registered before this cell runs.
env = gym.make('Marvin-v0')

In [2]:
class NN:
    """Small fully connected network with a tanh activation after every layer.

    Parameters live in ``self.l`` as a list of ``(W, b)`` tuples, one per layer.
    As created by ``__init__``, ``W`` has shape ``(n_out, n_in)`` and ``b`` has
    shape ``(n_out, 1)``.
    """

    def __init__(self, layer_sizes, seed=None):
        """Draw standard-normal weights and zero biases for each layer.

        Args:
            layer_sizes: sequence of layer widths, input width first.
            seed: optional seed; when given, NumPy's global RNG is re-seeded
                before the weights are drawn.
        """
        if seed is not None:
            np.random.seed(seed)
        self.l = [(np.random.randn(m, n), np.zeros((m, 1))) for m, n in zip(layer_sizes[1:], layer_sizes)]

    def predict(self, X):
        """Run a forward pass and return the activations for the first sample.

        Args:
            X: input array whose last dimension matches the first layer's input width.

        Returns:
            The first row of the final tanh activations, or a Python scalar when
            the final activations form a 1-D array of length one.
        """
        out = X
        for W, b in self.l:
            # Each layer must consume the previous layer's activations; feeding
            # the raw input X here would make every layer but the last a no-op.
            Z = out @ W.T + b.T
            out = np.tanh(Z)
        if out.shape[0] == 1 and len(out.shape) == 1:
            return out.item()
        return out[0]

    def set_params(self, params):
        """Replace the layer parameters with ``params`` (stored by reference, not copied)."""
        self.l = params

In [3]:
# Policy network: 24 inputs, one hidden layer of 24 units, 4 outputs
# (the action space shown in the next cell has 4 dimensions).
nn = NN([24, 24, 4])

In [4]:
# Inspect the action bounds; the network's tanh outputs fall in the same [-1, 1] range.
env.action_space.low, env.action_space.high

(array([-1., -1., -1., -1.], dtype=float32),
 array([1., 1., 1., 1.], dtype=float32))

In [5]:
# Smoke test: run the untrained policy on a randomly sampled observation.
nn.predict(env.observation_space.sample())

array([ 0.02477412, -0.99980411,  0.99999679, -0.65595123])

In [6]:
def deep_copy(params):
    """Return a new list of ``(W, b)`` tuples whose arrays are independent copies.

    Args:
        params: iterable of ``(W, b)`` pairs.

    Returns:
        A list with the same layout; modifying its arrays leaves ``params`` untouched.
    """
    cloned = []
    for weights, bias in params:
        cloned.append((np.copy(weights), np.copy(bias)))
    return cloned

In [7]:
def params_perturbation(params, sigma=0.1, seed=None):
    """
    Obtain weights perturbation for the whole network architecture

    Args:
        params: list of ``(W, b)`` array pairs; only their shapes are used.
        sigma: scale applied to the standard-normal noise.
        seed: optional seed; when given, NumPy's global RNG is re-seeded with
            this value before sampling so the perturbation is reproducible.

    Returns:
        A list of ``(dW, db)`` pairs with the same shapes as ``params``.
    """
    if seed is not None:
        # Honour the caller's seed (previously a hard-coded 42 was used).
        np.random.seed(seed)
    return [(np.random.randn(*W.shape) * sigma, np.random.randn(*b.shape) * sigma) for W, b in params]

In [8]:
def combine_weights(params, delta_params):
    """Add a perturbation to the parameters layer by layer.

    Args:
        params: list of ``(W, b)`` pairs.
        delta_params: list of ``(dW, db)`` pairs aligned with ``params``.

    Returns:
        A new list of ``(W + dW, b + db)`` tuples; the inputs are not modified.
    """
    merged = []
    for layer, delta in zip(params, delta_params):
        (W, b), (dW, db) = layer, delta
        merged.append((W + dW, b + db))
    return merged

In [9]:
def evaluate_model(nn, env, max_iter=1000, verbose=False):
    """Play one episode with ``nn`` as the policy and return the summed reward.

    Args:
        nn: object exposing ``predict(observation)`` that yields an action.
        env: environment exposing ``reset()`` and a ``step(action)`` that
            returns a 4-tuple ``(observation, reward, done, info)``.
        max_iter: upper bound on the number of steps taken.
        verbose: when true, print a one-line summary of the episode.

    Returns:
        The sum of the rewards collected during the episode.
    """
    i, r_sum, done = 0, 0, False
    state = env.reset()
    while i < max_iter and not done:
        action = nn.predict(state)
        state, reward, done, _ = env.step(action)
        r_sum += reward
        i += 1
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")
    return r_sum

In [10]:
n = 40  # population size: perturbed candidates evaluated per generation
lr = 0.03  # learning rate handed to generation_update
sigma = 0.1  # scale of the Gaussian noise added to the parameters

In [11]:
def update_params(params, population, rewards, learning_rate=0.05, sigma=0.1):
    """
    Move the weights along the reward-weighted sum of the perturbations.

    The list ``params`` is modified in place: each ``(W, b)`` entry is replaced
    by a new tuple. Only the weight matrices are updated; biases are carried
    over unchanged and the bias perturbations are not used.

    Args:
        params: list of ``(W, b)`` pairs to update.
        population: list of candidates, each a list of ``(dW, db)`` pairs
            aligned with ``params``.
        rewards: one reward per candidate.
        learning_rate: step size of the update.
        sigma: noise scale used in the step normalisation.
    """
    pop_size = len(population)
    for noise, score in zip(population, rewards):
        for idx in range(len(params)):
            weights, bias = params[idx]
            d_weights, _d_bias = noise[idx]
            stepped = weights + learning_rate / (pop_size * sigma) * score * d_weights
            params[idx] = (stepped, bias)

In [12]:
def generation_update(model, environment, sigma=0.1, lr=0.01, population_size=10, seed=None, normalize_rewards=True):
    """Run one generation: perturb, evaluate, then update the model's parameters.

    Args:
        model: object exposing its parameters as ``model.l`` plus
            ``set_params`` and ``predict``.
        environment: environment passed through to ``evaluate_model``.
        sigma: noise scale for the perturbations and the update.
        lr: learning rate for the update.
        population_size: number of perturbed candidates to evaluate.
        seed: optional seed; when given, NumPy's global RNG is re-seeded first.
        normalize_rewards: when true, rewards are standardised (zero mean,
            unit variance) before being used as update weights.

    Returns:
        ``(r_mean, r_std)``: mean and standard deviation of the raw
        (un-normalised) candidate rewards.
    """
    original_params = deep_copy(model.l)

    if seed is not None:
        np.random.seed(seed)

    population = []
    rewards = []
    try:
        for _ in range(population_size):
            candidate = params_perturbation(original_params, sigma=sigma)
            modified_params = combine_weights(original_params, candidate)
            model.set_params(modified_params)
            reward = evaluate_model(model, environment)
            population.append(candidate)
            rewards.append(reward)
    except BaseException:
        # An interrupted evaluation (e.g. KeyboardInterrupt) must not leave the
        # model holding a random candidate's parameters; restore and re-raise.
        model.set_params(original_params)
        raise

    rewards = np.array(rewards)
    r_mean, r_std = rewards.mean(), rewards.std()
    if normalize_rewards:
        # The small epsilon guards against division by zero when all rewards are equal.
        rewards = (rewards - r_mean) / (r_std + 1e-9)

    update_params(original_params, population, rewards, learning_rate=lr, sigma=sigma)
    model.set_params(deep_copy(original_params))
    return r_mean, r_std

In [13]:
# Train for 500 generations with the n, sigma and lr values set in the
# hyper-parameter cell, logging the population's reward statistics every 10th generation.
for i in range(500):
    mean_rewards, std_rewards = generation_update(nn, env, population_size=n, sigma=sigma, lr=lr, normalize_rewards=True)
    if i % 10 == 0:
        print(f'[{i}]: E[R]={mean_rewards}, std(R)={std_rewards}')

-121.59460429173505 3.9956095393830346
-120.74023066040108 3.00362842549627
-119.28674295822404 6.894817246220737
-117.24950311993834 22.631681055272935
-114.56554944356704 16.042170111334112
-110.8876084766274 10.024210246388598
-104.39003396742525 12.034757160428969
-101.60752402795453 12.430552349098939
-107.41799902081546 19.51595570399351
-99.35575896431709 9.642189645631863
-103.35122996983725 17.995861395778284


KeyboardInterrupt: 

In [None]:
def render_env(model, env, max_iter=None, verbose=True):
    """Render one episode played by ``model``.

    Args:
        model: object exposing ``predict(observation)`` that yields an action.
        env: environment exposing ``reset()``, ``render()`` and a
            ``step(action)`` that returns ``(observation, reward, done, info)``.
        max_iter: optional cap on the number of steps; ``None`` means the
            episode runs until the environment reports ``done``.
        verbose: when true, print a one-line summary of the episode.
    """
    observation = env.reset()
    done = False
    i = 0
    r_sum = 0
    # Stop as soon as the episode is done; the step cap only applies when set.
    while not done and (max_iter is None or i < max_iter):
        env.render()
        # Use the policy passed in, not whatever global happens to be named `nn`.
        observation, reward, done, _ = env.step(model.predict(observation))
        i += 1
        r_sum += reward
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")

In [None]:
# Visualise one episode with the current policy network.
render_env(nn, env)