In [12]:
import numpy as np
import os
import pybullet_envs
import gym
from gym import wrappers

In [13]:
# Hyperparameters for the random-search training run.
num_steps = 1000         # number of training iterations (outer-loop passes)
num_direction = 60       # N: perturbation directions sampled per iteration
num_top_direction = 20   # b: best-scoring directions kept for the update, b <= N
noise = 0.025            # scale applied to each perturbation before it is added to the weights
assert num_top_direction <= num_direction




In [57]:
class Normalizer():
    """Running per-feature mean/variance tracker used to standardize inputs.

    Statistics are accumulated one sample at a time in ``observe`` and applied
    in ``normalize``. Every attribute is an array of length ``nb_inputs``:

    * ``n``         -- number of samples observed so far (same count in every slot)
    * ``mean``      -- running mean of the observed samples
    * ``mean_diff`` -- running sum of ``(x - old_mean) * (x - new_mean)``
    * ``var``       -- ``mean_diff / n`` clipped from below at ``MIN_VARIANCE``
    """

    # Lower bound on the variance so the standard deviation never reaches zero.
    MIN_VARIANCE = 1e-2

    def __init__(self, nb_inputs):
        """Create zeroed statistics for inputs with ``nb_inputs`` features."""
        self.n = np.zeros(nb_inputs)
        self.mean = np.zeros(nb_inputs)
        self.mean_diff = np.zeros(nb_inputs)
        self.var = np.zeros(nb_inputs)

    def observe(self, x):
        """Fold one sample ``x`` into the running mean and variance."""
        self.n += 1.
        # The incremental variance update needs both the old and the new mean.
        last_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        self.mean_diff += (x - last_mean) * (x - self.mean)
        self.var = (self.mean_diff / self.n).clip(min=self.MIN_VARIANCE)

    def normalize(self, inputs):
        """Return ``(inputs - mean) / std`` using the statistics seen so far.

        ``var`` is all zeros until the first ``observe`` call, so the variance
        floor is re-applied here; otherwise normalizing before any observation
        would divide by zero and return inf/nan. Once ``observe`` has run,
        ``var`` already respects the floor and the result is unchanged.
        """
        obs_std = np.sqrt(np.maximum(self.var, self.MIN_VARIANCE))
        return (inputs - self.mean) / obs_std

In [68]:
# initialization
class Model(object):
    """Linear policy ``action = M @ state`` trained by random search.

    The training loop follows the structure of Augmented Random Search: each
    iteration samples ``n_directions`` random perturbations of ``M``, rolls out
    the policy with each perturbation added and subtracted, keeps the
    ``n_top_directions`` best-scoring perturbations and moves ``M`` along the
    reward-weighted sum of those perturbations.

    Parameters
    ----------
    n_steps : int
        Number of training iterations run by ``train``.
    n_directions : int
        Perturbations sampled per iteration (N).
    n_top_directions : int
        Perturbations kept for the update (b); must not exceed ``n_directions``.
    noise : float
        Scale applied to a perturbation before it is added to / subtracted from ``M``.
    episode_length : int
        Maximum number of environment steps per rollout.
    learning_rate : float, optional
        Step size of the weight update. Defaults to 0.02.
    normalize_states : bool, optional
        When True, each state is standardized with the running statistics
        before being fed to the policy. Defaults to False (raw states), though
        the statistics are still accumulated either way.
    """

    def __init__(self, n_steps, n_directions, n_top_directions, noise, episode_length,
                 learning_rate=0.02, normalize_states=False):
        self.n_steps = n_steps
        self.n_directions = n_directions
        self.n_top_directions = n_top_directions
        assert self.n_directions >= self.n_top_directions, \
            "n_top_directions must not exceed n_directions"
        self.noise = noise
        self.episode_length = episode_length
        self.learning_rate = learning_rate
        self.normalize_states = normalize_states

    def forward_pass(self, x, delta, direction=None):
        """Return the action for state ``x``.

        ``direction`` selects the weights used: ``None`` -> ``M``,
        ``"positive"`` -> ``M + noise * delta``, anything else ->
        ``M - noise * delta``. ``delta`` is ignored when ``direction`` is None.
        """
        if direction is None:
            return self.M.dot(x)
        if direction == "positive":
            return (self.M + self.noise * delta).dot(x)
        return (self.M - self.noise * delta).dot(x)

    def update(self, sorted_deltas, std):
        """Apply one weight update to ``M`` in place.

        ``sorted_deltas`` is an iterable of ``(positive_reward,
        negative_reward, delta)`` triples; ``std`` is the reward standard
        deviation used to scale the step.

        When ``std`` is not strictly positive (or is NaN) the update is
        skipped: dividing by it would raise or fill ``M`` with inf/nan. In
        ``train`` a zero ``std`` means every rollout returned the same reward,
        so every ``positive - negative`` difference is zero anyway.
        """
        if not std > 0:
            return
        second_part = np.zeros(self.M.shape)
        for p_reward, n_reward, delta in sorted_deltas:
            second_part += (p_reward - n_reward) * delta
        self.M += self.learning_rate / (self.n_top_directions * std) * second_part

    def explore(self, env, direction=None, delta=None):
        """Run one rollout in ``env`` and return the sum of clipped rewards.

        The rollout stops when the environment reports ``done`` or after
        ``episode_length`` steps. Every per-step reward is clipped to
        ``[-1, 1]`` before being accumulated.
        """
        state = env.reset()
        done = False
        num_plays = 0
        sum_rewards = 0
        while not done and num_plays < self.episode_length:
            # Statistics are updated even when normalization is switched off.
            self.normalizer.observe(state)
            if self.normalize_states:
                state = self.normalizer.normalize(state)
            action = self.forward_pass(state, delta, direction)
            state, reward, done, _ = env.step(action)
            sum_rewards += max(min(reward, 1), -1)
            num_plays += 1
        return sum_rewards

    def train(self, env, input_size, output_size):
        """Train the policy on ``env`` for ``n_steps`` iterations.

        Resets ``self.normalizer`` and ``self.M`` (zeros of shape
        ``(output_size, input_size)``), then for each iteration runs all
        positive rollouts, all negative rollouts, updates ``M`` and prints the
        reward of one rollout with the unperturbed weights.
        """
        self.normalizer = Normalizer(input_size)
        self.M = np.zeros((output_size, input_size))
        for i in range(self.n_steps):
            deltas = [np.random.randn(*self.M.shape) for _ in range(self.n_directions)]

            # All positive rollouts run before any negative rollout.
            positive_rewards = [self.explore(env, "positive", delta) for delta in deltas]
            negative_rewards = [self.explore(env, "negative", delta) for delta in deltas]

            # Rank directions by the better of their two rollouts and keep the top b.
            # sorted() is stable, so ties keep their sampling order.
            top_indices = sorted(
                range(self.n_directions),
                key=lambda k: max(positive_rewards[k], negative_rewards[k]),
                reverse=True,
            )[:self.n_top_directions]
            sorted_deltas = [(positive_rewards[k], negative_rewards[k], deltas[k])
                             for k in top_indices]

            # Standard deviation over every rollout reward of this iteration.
            sigma_r = np.array(positive_rewards + negative_rewards).std()

            self.update(sorted_deltas, sigma_r)

            result = self.explore(env)
            print("Step: ", i, "Reward: ", result)
        
            
            
            
                
            
            
        

In [None]:
def mkdir(base, name):
    """Ensure the directory ``base/name`` exists and return its path.

    Missing parent directories are created as well. An already existing
    directory is left untouched; ``exist_ok=True`` makes that a single call,
    avoiding the window between a separate existence check and the creation.
    """
    path = os.path.join(base, name)
    os.makedirs(path, exist_ok=True)
    return path
# Experiment setup: output directory, environment id and NumPy seed.
env_name = 'HalfCheetahBulletEnv-v0'
monitor_dir = mkdir('exp4_(no_normalizer)', 'monitor')
np.random.seed(1)  # seeds NumPy only; it is the source of the sampled perturbations

# Build the environment and wrap it in gym's Monitor, pointed at monitor_dir.
env = wrappers.Monitor(gym.make(env_name), monitor_dir, force=True)

# The policy matrix is sized from the first dimension of each space.
input_size = env.observation_space.shape[0]
output_size = env.action_space.shape[0]

model = Model(
    n_steps=num_steps,
    n_directions=num_direction,
    n_top_directions=num_top_direction,
    noise=noise,
    episode_length=1000,
)
model.train(env, input_size, output_size)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
WalkerBase::__init__ start
Step:  0 Reward:  -957.4178099521027
Step:  1 Reward:  -947.4225000665564
Step:  2 Reward:  -932.2300342840119
Step:  3 Reward:  -954.6621817692194
Step:  4 Reward:  -926.1987192026969
Step:  5 Reward:  -840.8696437183598
Step:  6 Reward:  -931.1651768714132
Step:  7 Reward:  -872.69450076052
Step:  8 Reward:  -961.5361613723699
Step:  9 Reward:  -592.9377438422657
Step:  10 Reward:  -946.5269995017346
Step:  11 Reward:  -536.3409299108653
Step:  12 Reward:  -898.1993233094701
Step:  13 Reward:  -537.6817069479624
Step:  14 Reward:  -481.7321468539703
Step:  15 Reward:  -441.0414019968791
Step:  16 Reward:  -331.5919798321567
Step:  17 Reward:  -216.690924469937
Step:  18 Reward:  -113.44450981747447
Step:  19 Reward:  -178.8414925940782
Step:  20

Step:  212 Reward:  787.9898567472259
Step:  213 Reward:  798.245424417593
Step:  214 Reward:  812.5406922540724
Step:  215 Reward:  816.8370313005204
Step:  216 Reward:  806.0943904107709
Step:  217 Reward:  801.6658245913067
Step:  218 Reward:  810.0815254150627
Step:  219 Reward:  788.0677980137154
Step:  220 Reward:  805.8545865670209
Step:  221 Reward:  766.6891779902322
Step:  222 Reward:  784.813321711613
Step:  223 Reward:  779.7223742604356
Step:  224 Reward:  786.5115053775412
Step:  225 Reward:  806.6866515097743
Step:  226 Reward:  796.5401889666128
Step:  227 Reward:  769.4703062781543
Step:  228 Reward:  797.5644370058382
Step:  229 Reward:  806.4096261204887
Step:  230 Reward:  802.3983622215785
Step:  231 Reward:  807.27067766437
Step:  232 Reward:  770.6829078800986
Step:  233 Reward:  804.5397485365306
Step:  234 Reward:  793.4934169361179
Step:  235 Reward:  790.2583338915848
Step:  236 Reward:  808.7887776667814
Step:  237 Reward:  812.018443404976
Step:  238 Reward