In [None]:

!apt-get update
!apt-get -qq install python-opengl -y
!apt-get -qq -y install xvfb ffmpeg
!pip -q install pyvirtualdisplay
!pip -q install piglet


0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.ubuntu.com (91.189.91                                                                                                    Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
                                                                                                    0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Waiting for headers] [Waiting for headers]                                                                                                  Hit:3 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
                                                                                                  0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Waiting for headers]                                           

In [None]:

import gym
import numpy as np
from gym import wrappers

In [None]:

class Hp():
    """Plain container for the hyperparameters of a training run.

    Every constructor argument is stored unchanged as an attribute of the
    same name.

    Args:
        episode_length: Maximum number of environment steps per episode.
        learning_rate: Base step size of the weight update.
        num_deltas: Number of random perturbations sampled per iteration.
        num_best_deltas: Number of best-scoring perturbations used in the
            update; must not exceed ``num_deltas``.
        noise: Scale applied to a perturbation before it is added to the
            weights.
        seed: Stored only; not read by the code visible in this notebook.
        env_name: Stored only; not read by the code visible in this notebook.
        record_every: Interval (in training iterations) between recorded
            evaluation episodes.

    Raises:
        AssertionError: If ``num_best_deltas`` is greater than ``num_deltas``.
    """
    def __init__(self,
                 episode_length=500,
                 learning_rate=0.1,
                 num_deltas=16,
                 num_best_deltas=16,
                 noise=0.03,
                 seed=1,
                 env_name='BipedalWalker-v2',
                 record_every=10):
        # Cannot keep more directions than were sampled.
        assert num_best_deltas <= num_deltas
        self.episode_length, self.learning_rate = episode_length, learning_rate
        self.num_deltas, self.num_best_deltas = num_deltas, num_best_deltas
        self.noise, self.seed = noise, seed
        self.env_name, self.record_every = env_name, record_every

In [None]:

class Normalizer():
    """Running per-feature standardiser for observation vectors.

    Maintains an incremental count, mean and sum of squared deviations, and
    uses them to shift and scale each incoming observation.
    """
    def __init__(self, nb_inputs):
        # Four independent zero vectors of length nb_inputs.
        self.n, self.mean, self.mean_diff, self.var = (
            np.zeros(nb_inputs) for _ in range(4)
        )

    def observe(self, x):
        """Fold one observation ``x`` into the running statistics."""
        self.n += 1.0
        previous_mean = np.array(self.mean)  # snapshot before the in-place update
        self.mean += (x - previous_mean) / self.n
        self.mean_diff += (x - previous_mean) * (x - self.mean)
        # Floor the variance at 1e-2 so normalize() never divides by ~0.
        self.var = np.clip(self.mean_diff / self.n, 1e-2, None)

    def normalize(self, inputs):
        """Update the statistics with ``inputs`` and return it standardised.

        Note that every call also records ``inputs`` via ``observe``.
        """
        self.observe(inputs)
        return (inputs - self.mean) / np.sqrt(self.var)

In [None]:
class ArsTrainer():
    """Random-search trainer for a linear policy on a gym environment.

    The policy is a weight matrix of shape ``(output_size, input_size)``.
    Each training iteration samples random perturbations, evaluates the
    policy with every perturbation added and subtracted, and moves the
    weights along the best-scoring perturbations.
    """
    def __init__(self, env, input_size=None, output_size=None, hp=None, normalizer=None, monitor_dir=None):
        """Set up the policy, hyperparameters and optional video recording.

        Args:
            env: Environment exposing ``reset()``, ``step(action)`` and the
                ``observation_space`` / ``action_space`` attributes.
            input_size: Observation length; defaults to the first dimension
                of ``env.observation_space.shape``.
            output_size: Action length; defaults to the first dimension of
                ``env.action_space.shape``.
            hp: Hyperparameters; a default ``Hp()`` when omitted.
            normalizer: Observation normalizer; a fresh ``Normalizer`` when
                omitted.
            monitor_dir: When given, episodes are recorded into this folder
                (see ``set_monitor``).
        """
        self.env = env
        self.input_size = input_size or self.env.observation_space.shape[0]
        self.output_size = output_size or self.env.action_space.shape[0]
        self.weights = np.zeros((self.output_size, self.input_size))
        self.hp = hp or Hp()
        self.normalizer = normalizer or Normalizer(self.input_size)
        self.cur_step = 0
        # Must exist before set_monitor(): the recording predicate reads it.
        self.record_video = False
        self.set_monitor(monitor_dir)

    def set_monitor(self, monitor_dir=None):
        """Wrap the environment so that flagged episodes are saved as video.

        Does nothing when ``monitor_dir`` is None. Recording only happens
        while ``self.record_video`` is True.

        Args:
            monitor_dir: Folder where the recorded videos are stored.
        """
        if monitor_dir is None:
            return
        should_record = lambda i: self.record_video
        legacy_monitor = getattr(wrappers, 'Monitor', None)
        if legacy_monitor is not None:
            self.env = legacy_monitor(self.env, monitor_dir, video_callable=should_record, force=True)
        else:
            # Newer gym releases dropped `Monitor`; `RecordVideo` is the
            # replacement and takes the predicate as `episode_trigger`.
            self.env = wrappers.RecordVideo(self.env, monitor_dir, episode_trigger=should_record)
        # NOTE(review): presumably raised so recorded episodes run until the
        # environment itself reports `done` - confirm.
        self.hp.episode_length = 2000

    def learning_rate(self, decay=0.01):
        """Return the base learning rate decayed by the iterations done so far."""
        return self.hp.learning_rate / (1 + decay * self.cur_step)

    def train(self, n_steps):
        """Run ``n_steps`` training iterations.

        After each weight update one evaluation episode is played with the
        current weights and its reward is printed.
        """
        for step in range(n_steps):
            self.cur_step += 1
            # Sample the perturbations and evaluate each one in both directions.
            deltas = self.generate_deltas()
            positive_rewards = np.zeros(self.hp.num_deltas)
            negative_rewards = np.zeros(self.hp.num_deltas)
            for i in range(self.hp.num_deltas):
                positive_rewards[i] = self.play_episode(self.weights + self.hp.noise * deltas[i])
                negative_rewards[i] = self.play_episode(self.weights - self.hp.noise * deltas[i])

            # Standard deviation over ALL collected rewards. The arrays must be
            # concatenated: `+` on ndarrays would add them element-wise.
            sigma_rewards = np.concatenate((positive_rewards, negative_rewards)).std()

            # Rank directions by max(r_pos, r_neg) and keep the best ones.
            scores = [max(r_pos, r_neg) for r_pos, r_neg in zip(positive_rewards, negative_rewards)]
            order = sorted(range(len(scores)), key=lambda k: scores[k], reverse=True)[:self.hp.num_best_deltas]
            rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
            self.update_weights(rollouts, sigma_rewards)

            # Only the evaluation episode is recorded, every `record_every` iterations.
            if step % self.hp.record_every == 0:
                self.record_video = True
            reward_evaluation = self.play_episode(self.weights, train=False)
            print('Step: ', step, 'Reward: ', reward_evaluation)
            self.record_video = False

    def update_weights(self, rollouts, sigma_rewards):
        """Move the weights along the reward-weighted sum of the perturbations.

        Args:
            rollouts: Iterable of ``(r_pos, r_neg, delta)`` tuples.
            sigma_rewards: Standard deviation of the collected rewards, used
                to scale the step.
        """
        # With no spread in the rewards there is no usable signal, and
        # dividing by zero would turn the weights into NaN: skip the update.
        if not sigma_rewards > 0:
            return
        step = np.zeros(self.weights.shape)
        for r_pos, r_neg, delta in rollouts:
            step += (r_pos - r_neg) * delta
        self.weights += self.learning_rate() / (self.hp.num_best_deltas * sigma_rewards) * step

    def play_episode(self, theta=None, train=True):
        """Play one episode with policy ``theta`` and return the summed reward.

        Args:
            theta: Weight matrix to act with; the trainer's own weights when None.
            train: Accepted for API compatibility; not used by this method.
        """
        if theta is None:
            theta = self.weights
        obs = self.env.reset()
        sum_reward = 0
        steps_taken = 0
        while True:
            steps_taken += 1
            action = self.predict(obs, theta)
            obs, reward, done, _ = self.env.step(action)
            sum_reward += reward
            # Stop when the environment finishes or the step budget is used up.
            if done or steps_taken >= self.hp.episode_length:
                break

        return sum_reward

    def predict(self, inp, theta):
        """Return the action ``theta @ normalized(inp)``.

        Normalizing also updates the normalizer's running statistics.
        """
        inp = self.normalizer.normalize(inp)
        return theta @ inp

    def generate_deltas(self):
        """Sample ``num_deltas`` standard-normal perturbations shaped like the weights."""
        return np.random.randn(self.hp.num_deltas, *self.weights.shape)


In [None]:
!pip install swig
!pip install gym[box2d]

Collecting box2d-py==2.3.5 (from gym[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pygame==2.1.0 (from gym[box2d])
  Using cached pygame-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp310-cp310-linux_x86_64.whl size=2376134 sha256=4a78bd8529cb307a04252eaecbeac153ae42c20f2356550cd41a7375c77b98a5
  Stored in directory: /root/.cache/pip/wheels/db/8f/6a/eaaadf056fba10a98d986f6dce954e6201ba3126926fc5ad9e
Successfully built box2d-py
Installing collected packages: box2d-py, pygame
  Attempting uninstall: pygame
    Found existing installation: pygame 2.5.2
    Uninstalling pygame-2.5.2:
      Successfully uninstalled pygame-2.5.2
Successfully installed box2d-py-2.3.5 pygame-2.1.0


In [None]:
# Build the BipedalWalker-v3 environment and run 100 training iterations with
# the trainer's default hyperparameters; train() prints one evaluation reward
# per iteration.
env = gym.make('BipedalWalker-v3')
trainer = ArsTrainer(env)
trainer.train(100)

  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Step:  0 Reward:  -121.97701323504731
Step:  1 Reward:  -15.583307119312188
Step:  2 Reward:  -9.855134650859407
Step:  3 Reward:  -2.2529806423482177
Step:  4 Reward:  -21.14698464896552
Step:  5 Reward:  -15.549281211480464
Step:  6 Reward:  -2.1038549855274864
Step:  7 Reward:  -11.919960764286046
Step:  8 Reward:  -3.560869641966173
Step:  9 Reward:  -9.979922836640032
Step:  10 Reward:  -4.771576702069035
Step:  11 Reward:  -7.39026327543413
Step:  12 Reward:  -8.516660428660332
Step:  13 Reward:  -3.8087913446375268
Step:  14 Reward:  -5.717333625567242
Step:  15 Reward:  -3.0273777369185915
Step:  16 Reward:  -6.9932600975951225
Step:  17 Reward:  -5.850128786782016
Step:  18 Reward:  -5.764709460988472
Step:  19 Reward:  -3.3343986124737977
Step:  20 Reward:  -3.77886123191379
Step:  21 Reward:  -6.966957774605357
Step:  22 Reward:  -2.8720948613111283
Step:  23 Reward:  -3.9166996251580746
Step:  24 Reward:  -97.7530494970671
Step:  25 Reward:  -1.3025401946140018
Step:  26 Re

In [None]:

# Root folder for the recorded videos.
# NOTE(review): the path lives under /content/drive, so it presumably requires
# Google Drive to be mounted first - no mount call is visible in this notebook.
DIR_PATH = '/content/drive/MyDrive/Colab Notebooks/BipedalWalkervids'

In [None]:

import os
def mkdir(base, name):
    """Ensure the directory ``base/name`` exists and return its path.

    Missing parent directories are created as well. Calling this for a
    directory that already exists is a no-op.

    Args:
        base: Parent directory.
        name: Name (or relative sub-path) of the directory to create.

    Returns:
        The joined path ``os.path.join(base, name)``.
    """
    path = os.path.join(base, name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)
    return path

In [None]:
# Create <DIR_PATH>/videos and <DIR_PATH>/videos/bi (if missing); the latter is
# the folder handed to the trainer for recorded episodes.
videos_dir = mkdir(DIR_PATH, 'videos')
monitor_dir = mkdir(videos_dir, 'bi')

In [None]:
# Start a non-visible 1024x768 virtual display via pyvirtualdisplay
# (presumably so the environment can render frames for video capture on a
# host without a screen).
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1024, 768))
display.start()
import os
# Left disabled: would export DISPLAY manually from the started display.
# os.environ["DISPLAY"] = ":" + str(display.display) + "." + str(display.screen)


In [None]:
# Wrap the trainer's environment with a video recorder writing to monitor_dir;
# set_monitor also sets hp.episode_length to 2000.
trainer.set_monitor(monitor_dir)

  and should_run_async(code)


AttributeError: module 'gym.wrappers' has no attribute 'Monitor'

In [None]:

# Train for another 100 iterations, continuing from the trainer's current
# weights and iteration counter.
trainer.train(100)