# Running OpenAI Baselines With Spell (Ex. Training PPO on OpenAI Gym)

In [1]:
# Display the kernel's current working directory (the last expression is rendered as the cell output)
import os
os.getcwd() 

'/spell/baselines'

In [2]:
import sys
import multiprocessing
import os.path as osp
import gym
from collections import defaultdict
import tensorflow as tf
import numpy as np
import os
import time
import spell.metrics as metrics

from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_vec_env, make_env
from baselines.common.tf_util import get_session
from importlib import import_module
from baselines.logger import KVWriter, HumanOutputFormat, CSVOutputFormat, JSONOutputFormat, TensorBoardOutputFormat
from baselines.ppo2.runner import Runner
from baselines.common import explained_variance, set_global_seeds
from baselines.common.policies import build_policy
from baselines.common.vec_env.vec_normalize import VecNormalize
from collections import deque

try:
    from mpi4py import MPI
except ImportError:
    MPI = None
    
try:
    import pybullet_envs
except ImportError:
    pybullet_envs = None

try:
    import roboschool
except ImportError:
    roboschool = None
    
# Load all OpenAI gym environments to dictionary, including Classic Control, Mujoco, Atari, etc.
# ----------------------------------------
# _game_envs maps an environment type (str) to the set of registered environment ids of that type.
_game_envs = defaultdict(set)
for env in gym.envs.registry.all():
    # The type is the last dotted component of the entry point's module path,
    # i.e. of the text that precedes ':' in the entry point string.
    env_type = env._entry_point.split(':')[0].split('.')[-1]
    _game_envs[env_type].add(env.id)

# The 'retro' entry is assigned by hand here instead of being derived from the gym registry.
_game_envs['retro'] = {
    'BubbleBobble-Nes',
    'SuperMarioBros-Nes',
    'TwinBee3PokoPokoDaimaou-Nes',
    'SpaceHarrier-Nes',
    'SonicTheHedgehog-Genesis',
    'Vectorman-Genesis',
    'FinalFight-Snes',
    'SpaceInvaders-Snes',
}

Logging to /tmp/openai-2019-02-01-22-58-25-575079


### Load OpenAI Baselines Logger Class (modified to send metrics live to Spell)

In [3]:
# Numeric log levels. Logger.log() emits a message only when the logger's own
# level is less than or equal to the level passed with the message.
DEBUG = 10
INFO = 20
WARN = 30
ERROR = 40

# When a Logger's level is set to DISABLED, Logger.dumpkvs() returns without writing anything.
DISABLED = 50

class Logger(object):
    """Key/value and message logger that also streams every metric to Spell.

    Values reported through ``logkv`` / ``logkv_mean`` are kept for the current
    iteration and written to every ``KVWriter`` in ``output_formats`` when
    ``dumpkvs`` is called; each reported value is additionally forwarded
    immediately with ``metrics.send_metric``.
    """
    DEFAULT = None  # A logger with no output files.
    CURRENT = None  # Current logger being used

    def __init__(self, dir, output_formats):
        """Store the output directory and the list of output format writers."""
        self.name2val = defaultdict(float)  # values this iteration
        self.name2cnt = defaultdict(int)    # number of samples behind each running mean
        self.level = INFO
        self.dir = dir
        self.output_formats = output_formats

    @staticmethod
    def _to_python_scalar(val):
        """Unwrap a NumPy scalar (float32, float64, int64, ...) into the matching Python scalar."""
        if isinstance(val, np.generic):
            return val.item()
        return val

    # Logging API,
    def logkv(self, key, val):
        """Record ``val`` under ``key`` for this iteration, print it, and send it to Spell."""
        self.name2val[key] = val
        val = self._to_python_scalar(val)
        print(key, val)
        metrics.send_metric(key, val) # Send (key, val) to Spell as metric

    def logkv_mean(self, key, val):
        """Fold ``val`` into the running mean stored under ``key`` and send the new mean to Spell.

        A ``val`` of None stores None for the key and sends nothing.
        """
        if val is None:
            self.name2val[key] = None
            return
        oldval, cnt = self.name2val[key], self.name2cnt[key]
        if oldval is None:
            # A previous None report replaced the running mean; restart it instead of
            # failing with a TypeError on None * int.
            oldval, cnt = 0.0, 0
        newval = oldval*cnt/(cnt+1) + val/(cnt+1)
        self.name2val[key] = newval
        metrics.send_metric(key, self._to_python_scalar(newval)) # Send mean (key, val) to Spell as metric
        self.name2cnt[key] = cnt + 1

    def dumpkvs(self):
        """Write this iteration's values to every KVWriter output, then reset them.

        Does nothing (and keeps the values) when the level is DISABLED.
        """
        if self.level == DISABLED: return
        for fmt in self.output_formats:
            if isinstance(fmt, KVWriter):
                fmt.writekvs(self.name2val)
        self.name2val.clear()
        self.name2cnt.clear()

    def log(self, *args, level=INFO):
        """Emit ``args`` as a message if the logger's level is <= ``level``."""
        if self.level <= level:
            self._do_log(args)

    # Configuration
    # ----------------------------------------
    def set_level(self, level):
        """Set the threshold used by ``log`` and ``dumpkvs``."""
        self.level = level

    def get_dir(self):
        """Return the directory given at construction time."""
        return self.dir

    def close(self):
        """Close every output format."""
        for fmt in self.output_formats:
            fmt.close()

    # Misc
    # ----------------------------------------
    def _do_log(self, args):
        """Write ``args`` (converted to strings) to every SeqWriter output."""
        # SeqWriter is not among the names the notebook imports from baselines.logger,
        # so it is imported here; without it this method raised NameError.
        from baselines.logger import SeqWriter
        for fmt in self.output_formats:
            if isinstance(fmt, SeqWriter):
                fmt.writeseq(map(str, args))

In [4]:
# Configure output log generator with specified format
# ----------------------------------------
def make_output_format(format, ev_dir, log_suffix=''):
    """Create the output writer named by ``format``.

    ``ev_dir`` is created if it does not exist yet. File-based writers are
    placed inside ``ev_dir`` and ``log_suffix`` is inserted into their file
    (or directory) name. Raises ValueError for an unrecognised ``format``.
    """
    os.makedirs(ev_dir, exist_ok=True)

    def inside_ev_dir(name_template):
        # Fill the suffix into the name and anchor it in the log directory.
        return osp.join(ev_dir, name_template % log_suffix)

    # Builders are lazy so that only the requested writer is ever constructed.
    builders = (
        ('stdout', lambda: HumanOutputFormat(sys.stdout)),
        ('log', lambda: HumanOutputFormat(inside_ev_dir('log%s.txt'))),
        ('json', lambda: JSONOutputFormat(inside_ev_dir('progress%s.json'))),
        ('csv', lambda: CSVOutputFormat(inside_ev_dir('progress%s.csv'))),
        ('tensorboard', lambda: TensorBoardOutputFormat(inside_ev_dir('tb%s'))),
    )
    for known_format, build in builders:
        if format == known_format:
            return build()
    raise ValueError('Unknown format specified: %s' % (format,))

### Load OpenAI PPO2.py

In [5]:
import spell

def constfn(val):
    """Return a one-argument function that ignores its argument and always yields ``val``."""
    return lambda _: val

def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None, model_fn=None, **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Metrics are reported through Logger.CURRENT, which must be set before calling.

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment). Cast to int;
                                      the number of updates is total_timesteps // (nsteps * nenv).

    eval_env: VecEnv or None          optional evaluation environment. When given, a second Runner collects rollouts from it
                                      on every update and their mean episode reward / length are logged as
                                      eval_eprewmean / eval_eplenmean.

    seed: int or None                 value passed to set_global_seeds

    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training. Anything that is not a float must be callable.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events (the first update is always logged)

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training. Anything that is not a float must be callable.

    save_interval: int                number of updates between checkpoint saves (the first update is also saved);
                                      0 disables saving

    load_path: str                    path to load the model from

    model_fn: callable or None        factory used to build the model; it is called with the same keyword arguments as
                                      baselines.ppo2.model.Model, which is used when model_fn is None

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    Returns:
    -------
    The trained model object.
    '''

    set_global_seeds(seed)

    # Constant hyperparameters are wrapped so that they can be called like schedules
    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)
    # Get the nb of env
    nenvs = env.num_envs
    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space
    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model
    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object(s) and the rolling buffers of recent episode infos
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam)
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.time()
    nupdates = total_timesteps//nbatch

    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.time()
        # Fraction of training remaining: 1.0 on the first update, approaching 0 at the end
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632
        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None: # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indices
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else: # recurrent version
            # Minibatches are made of whole environments so that each one keeps its full nsteps rollout
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.time()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            logger = Logger.CURRENT
            # Calculates if value function is a good predictor of the returns (ev close to 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            # Log (key, value) tuples
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                # Logged on the Logger instance itself; Logger has no `logger` attribute
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            # Only the rank-0 process (or a non-MPI run) writes the log files
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and Logger.CURRENT.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(Logger.CURRENT.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    return model

# Avoid a division error when calculating the mean: if xs is empty (e.g. no episode info has been collected yet), return np.nan instead of raising an error
def safemean(xs):
    """Return the mean of ``xs``, or ``np.nan`` when ``xs`` has no elements."""
    if len(xs) == 0:
        return np.nan
    return np.mean(xs)


## Set Arguments and Hyperparameters

In [6]:
# Total number of training timesteps (written as a float here; learn() casts it to int)
total_timesteps = 2e7

# OpenAI Gym training environment ID (navigate to https://gym.openai.com/envs/ to see the full list of environments)
env_id = 'PongNoFrameskip-v4'

# Any of {'stdout', 'log', 'json', 'csv', 'tensorboard'} (the formats accepted by make_output_format above)
log_formats = ['csv']

# Name of folder to which run logs and the trained model will be saved, 
# accessible via run 'outputs' through the Spell web console
save_path = 'pong-ppo' 

# Dictionary with values of training hyperparameters, expanded as keyword arguments
# into the 'learn' call below (see 'learn' fn above for more details)
alg_kwargs = {'nsteps': 128} 

# Policy Network Architecture (for more details, see baselines.common/models.py)
network = 'mlp'

## Initialize the Training Environment

In [7]:
# Retrieve env_type, env_id
# Re-scan the gym registry so that _game_envs also contains any environment
# registered since the import cell ran.
for env in gym.envs.registry.all():
    env_type = env._entry_point.split(':')[0].split('.')[-1]
    _game_envs[env_type].add(env.id)  # This is a set so add is idempotent

if env_id in _game_envs.keys():
    # env_id is actually the name of an environment type: use it as the type and
    # take one member of that type's set as the id (sets are unordered, so which
    # member is picked is not fixed).
    env_type = env_id
    env_id = [g for g in _game_envs[env_type]][0]
else:
    # Otherwise look up the type whose id set contains env_id.
    # NOTE(review): env_type stays None when no type contains env_id and is then
    # passed to make_vec_env as-is -- confirm that this is the intended behaviour.
    env_type = None
    for g, e in _game_envs.items():
        if env_id in e:
            env_type = g
            break
            
# Build environment
# One environment copy per CPU reported by multiprocessing (half of that on macOS).
ncpu = multiprocessing.cpu_count()
if sys.platform == 'darwin': ncpu //= 2
nenv = ncpu

# Number of frames passed to VecFrameStack
frame_stack_size = 4
# NOTE(review): the fourth positional argument (None) is presumably the seed -- confirm
# against make_vec_env's signature.
env = make_vec_env(env_id, env_type, nenv, None, gamestate=None, reward_scale=1.0)
env = VecFrameStack(env, frame_stack_size)

## Configure Logger

In [8]:
# Create folder where logs and trained model will be saved
os.makedirs(save_path, exist_ok=True)

# Initialize one output formatter per entry of log_formats, all writing into save_path
output_formats = [make_output_format(f, save_path, '') for f in log_formats]

# Create instance of Logger, with appropriate path and output formatter, and install it
# as Logger.CURRENT, which is the logger that learn() reports to
Logger.CURRENT = Logger(dir=save_path, output_formats=output_formats) 

## Train the Agent

In [6]:
# Run PPO learning function, with parameters set above
# This cell depends on names defined by earlier cells -- network, total_timesteps and
# alg_kwargs (hyperparameter cell), env (environment cell) and Logger.CURRENT (logger
# cell) -- so on a fresh kernel those cells must be run first.
model = learn(network=network, env=env, seed=None, total_timesteps=total_timesteps, **alg_kwargs)

NameError: name 'network' is not defined

In [None]:
# After learning ends, save trained agent to the file 'model' inside the save_path folder
model.save(save_path +'/model')

#### To see your trained agent in action, navigate to the Spell Web Console and download the saved model from run outputs. Then, run the command below from your local machine (with the env_id set above and the model file's local path passed in as arguments):

In [None]:
"""
python -m baselines.run --alg=ppo2 --env=ENV_ID --num_timesteps=0 --load_path=LOCAL_MODEL_PATH --play
"""