<a href="https://colab.research.google.com/github/saathvikMD/ars/blob/main/ars_cheetah_run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install PyBullet into the runtime; the `pybullet_envs` import further down needs it.
# The version is not pinned here — the recorded run below resolved to pybullet 3.0.7.
!pip install pybullet

Collecting pybullet
[?25l  Downloading https://files.pythonhosted.org/packages/72/d9/756b8fe29c574b34e3a60fd777688f8aaacb7eae37fcd1b5983ec415646d/pybullet-3.0.7-cp36-cp36m-manylinux1_x86_64.whl (87.5MB)
[K     |████████████████████████████████| 87.5MB 34kB/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-3.0.7


In [None]:
import shutil

from google.colab import files

# Run this cell only AFTER the training cell further down has created and
# filled the monitor directory; on a fresh runtime the directory does not exist yet.
# files.download() transfers a single file, so the monitor directory is bundled
# into '/content/exp/brs/monitor.zip' first and that archive is downloaded.
monitor_archive = shutil.make_archive('/content/exp/brs/monitor', 'zip', '/content/exp/brs/monitor')
files.download(monitor_archive)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
import gym
import numpy as np
from gym import wrappers
import pybullet_envs

In [None]:
class Hp():
    """Hyperparameters for the random-search training run.

    Every value can be overridden by keyword; calling ``Hp()`` with no
    arguments yields the same configuration as before.

    Attributes:
        nb_steps: number of training iterations (policy updates).
        episode_length: maximum number of environment steps per rollout.
        learning_rate: scale applied to each policy update.
        nb_directions: number of random perturbations sampled per iteration.
        nb_best_directions: number of top-scoring perturbations used in the
            update; must not exceed ``nb_directions``.
        noise: magnitude of the perturbation added to / subtracted from the
            policy weights during exploration.
        seed: seed handed to NumPy's global random generator.
        env_name: Gym environment id to train on.

    Raises:
        ValueError: if ``nb_best_directions`` is larger than ``nb_directions``.
    """
    def __init__(self, nb_steps = 1000, episode_length = 1000, learning_rate = 0.02,
                 nb_directions = 16, nb_best_directions = 16, noise = 0.03,
                 seed = 1, env_name = 'HalfCheetahBulletEnv-v0'):
        # Explicit exception instead of `assert`, so the check is not
        # stripped when Python runs with -O.
        if nb_best_directions > nb_directions:
            raise ValueError(
                "nb_best_directions (%r) must be <= nb_directions (%r)"
                % (nb_best_directions, nb_directions))
        self.nb_steps = nb_steps
        self.episode_length = episode_length
        self.learning_rate = learning_rate
        self.nb_directions = nb_directions
        self.nb_best_directions = nb_best_directions
        self.noise = noise
        self.seed = seed
        self.env_name = env_name

class Normalizer():
    """Running per-feature mean/variance tracker used to standardise observations.

    Statistics are updated incrementally, one observation at a time, so no
    history of past observations is stored.
    """

    # Lower bound on the per-feature variance; keeps the standard deviation
    # used in `normalize` away from zero.
    _MIN_VAR = 1e-2

    def __init__(self, nb_inputs):
        """Create zero-initialised statistics for `nb_inputs` features."""
        self.n = np.zeros(nb_inputs)          # observation count (kept per feature)
        self.mean = np.zeros(nb_inputs)       # running mean
        self.mean_diff = np.zeros(nb_inputs)  # running sum of squared deviations
        self.var = np.zeros(nb_inputs)        # floored variance, refreshed by observe()

    def observe(self, x):
        """Fold one observation `x` into the running mean and variance."""
        self.n += 1.
        last_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        # Uses both the old and the new mean (incremental variance update).
        self.mean_diff += (x - last_mean) * (x - self.mean)
        self.var = (self.mean_diff / self.n).clip(min = self._MIN_VAR)

    def normalize(self, inputs):
        """Return `inputs` shifted by the running mean and scaled by the running std.

        The variance floor is applied here as well: before the first
        `observe` call `self.var` is all zeros, and dividing by a zero
        standard deviation would yield inf/NaN. After any `observe` call the
        floor is already in effect, so results are unchanged.
        """
        obs_std = np.sqrt(self.var.clip(min = self._MIN_VAR))
        return (inputs - self.mean) / obs_std

class Policy():
    """Linear policy: the action is ``theta.dot(state)``.

    NOTE: `evaluate`, `sample_deltas` and `update` read the module-level
    ``hp`` object (noise, nb_directions, learning_rate, nb_best_directions),
    so ``hp`` must be defined before those methods are called.
    """

    def __init__(self, input_size, output_size):
        """Start from an all-zero weight matrix of shape (output_size, input_size)."""
        self.theta = np.zeros((output_size, input_size))

    def evaluate(self, input, delta = None, direction = None):
        """Compute the action for `input`.

        With ``direction=None`` the unperturbed weights are used. With
        ``direction="positive"`` the weights are shifted by ``+hp.noise*delta``;
        any other value of `direction` shifts them by ``-hp.noise*delta``.
        """
        if direction is None:
            return self.theta.dot(input)
        elif direction == "positive":
            return (self.theta + hp.noise*delta).dot(input)
        else:
            return (self.theta - hp.noise*delta).dot(input)

    def sample_deltas(self):
        """Draw `hp.nb_directions` Gaussian perturbations shaped like `theta`."""
        return [np.random.randn(*self.theta.shape) for _ in range(hp.nb_directions)]

    def update(self, rollouts, sigma_r):
        """Move `theta` along the reward-weighted sum of the given perturbations.

        Args:
            rollouts: iterable of ``(r_pos, r_neg, delta)`` triples.
            sigma_r: standard deviation of the collected rewards, used to
                scale the step.

        When `sigma_r` is zero or not finite (for example, every rollout
        returned the same reward) or there are no rollouts, the update is
        skipped. Dividing by such a `sigma_r` would otherwise write NaN/inf
        into `theta` and poison every later action.
        """
        if len(rollouts) == 0 or not np.isfinite(sigma_r) or sigma_r == 0:
            return
        step = np.zeros(self.theta.shape)
        for r_pos, r_neg, d in rollouts:
            step += (r_pos - r_neg) * d
        self.theta += hp.learning_rate / (hp.nb_best_directions * sigma_r) * step

def explore(env, normalizer, policy, direction = None, delta = None):
    """Play one episode and return the sum of its clipped rewards.

    Each raw observation is first fed to `normalizer.observe`, then
    normalised, and the policy is evaluated on the normalised observation
    (optionally perturbed by `delta` in the given `direction`). Per-step
    rewards are limited to the range [-1, 1] before being summed. The
    episode stops when the environment reports `done` or after
    `hp.episode_length` steps (`hp` is the module-level hyperparameter
    object).
    """
    observation = env.reset()
    total_reward = 0
    steps_taken = 0
    finished = False
    while True:
        if finished or steps_taken >= hp.episode_length:
            break
        normalizer.observe(observation)
        scaled_observation = normalizer.normalize(observation)
        action = policy.evaluate(scaled_observation, delta, direction)
        observation, step_reward, finished, _ = env.step(action)
        # Clamp the step reward into [-1, 1].
        if step_reward > 1:
            step_reward = 1
        elif step_reward < -1:
            step_reward = -1
        total_reward += step_reward
        steps_taken += 1
    return total_reward

def train(env, policy, normalizer, hp):
    """Run `hp.nb_steps` iterations of perturb / evaluate / update.

    Per iteration:
      1. sample perturbations from the policy,
      2. roll out every perturbation in the positive direction, then every
         perturbation in the negative direction,
      3. rank perturbations by the better of their two rewards and keep the
         top `hp.nb_best_directions`,
      4. update the policy with those rollouts, scaled by the standard
         deviation of all collected rewards,
      5. roll out the unperturbed policy once and print its reward.
    """
    for step in range(hp.nb_steps):
        deltas = policy.sample_deltas()
        direction_ids = range(hp.nb_directions)

        # All positive rollouts are played before any negative one.
        positive_rewards = [
            explore(env, normalizer, policy, direction = "positive", delta = deltas[k])
            for k in direction_ids
        ]
        negative_rewards = [
            explore(env, normalizer, policy, direction = "negative", delta = deltas[k])
            for k in direction_ids
        ]

        # Spread of every reward gathered in this iteration.
        sigma_r = np.array(positive_rewards + negative_rewards).std()

        # Best-first ranking by max(r_pos, r_neg); ties keep index order.
        ranked = sorted(
            direction_ids,
            key = lambda k: max(positive_rewards[k], negative_rewards[k]),
            reverse = True,
        )
        rollouts = [
            (positive_rewards[k], negative_rewards[k], deltas[k])
            for k in ranked[:hp.nb_best_directions]
        ]

        policy.update(rollouts, sigma_r)

        # Evaluate the freshly updated, unperturbed policy.
        reward_evaluation = explore(env, normalizer, policy)
        print('Step:', step, 'Reward:', reward_evaluation)

def mkdir(base, name):
    """Ensure the directory ``base/name`` exists and return its path.

    Missing parent directories are created as well. Calling this for a
    directory that already exists is a no-op.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    path = os.path.join(base, name)
    # exist_ok avoids the check-then-create race of testing os.path.exists()
    # first, and refuses to silently hand back a path that is a regular file.
    os.makedirs(path, exist_ok = True)
    return path
# Output locations: exp/brs/ for the run, exp/brs/monitor/ for the Monitor files.
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')

hp = Hp()
np.random.seed(hp.seed)
env = gym.make(hp.env_name)
# Seed the environment as well as NumPy; otherwise the environment's own
# randomness makes runs differ even with a fixed hp.seed.
env.seed(hp.seed)
# force=True lets Monitor overwrite results left in monitor_dir by an earlier run.
env = wrappers.Monitor(env, monitor_dir, force = True)
nb_inputs = env.observation_space.shape[0]
nb_outputs = env.action_space.shape[0]
policy = Policy(nb_inputs, nb_outputs)
normalizer = Normalizer(nb_inputs)
try:
    train(env, policy, normalizer, hp)
finally:
    # Always close the monitored environment — also when training is
    # interrupted by hand — so the Monitor can finalise its output files.
    env.close()



Step: 0 Reward: -958.0028225510683
Step: 1 Reward: -964.2504199466127
Step: 2 Reward: -952.9964652563986
Step: 3 Reward: -961.675630312527
Step: 4 Reward: -932.6042998808095
Step: 5 Reward: -920.0881146083314
Step: 6 Reward: -958.8676212505227
Step: 7 Reward: -935.2474240675112
Step: 8 Reward: -919.3373037285539
Step: 9 Reward: -878.8943219216015
Step: 10 Reward: -782.3713307096431
Step: 11 Reward: -840.8359492494601
Step: 12 Reward: -778.1047157112719
Step: 13 Reward: -943.5428086647771
Step: 14 Reward: -461.35869527977695
Step: 15 Reward: -852.6516633811475
Step: 16 Reward: -537.9394085529985
Step: 17 Reward: -863.5507289365997
Step: 18 Reward: -445.88008340731864
Step: 19 Reward: -494.14533114580587
Step: 20 Reward: -510.67672895184876
Step: 21 Reward: -484.79060238237435
Step: 22 Reward: -435.80329302577326
Step: 23 Reward: -397.13629285377334
Step: 24 Reward: -357.33588542955005
Step: 25 Reward: -259.2812842617334
Step: 26 Reward: -246.1946186402447
Step: 27 Reward: -182.535842655

KeyboardInterrupt: ignored