# Minigame 10: Choose One Element To Refine

This is essentially the global refinement environment in which each action chooses a single element to refine. However, the observation is the grid function's degrees of freedom (DOFs) themselves, not sampled function values.

Some things to explore:

* PPO vs DQN vs ?
* CNN vs MLP vs ?
* order=1 vs order=2 vs ?
* H1 space vs DG space vs ?

Set up PyMFEM:

In [1]:
import math
from math import cos,sin
import random

In [2]:
import sys
import gym
from gym import spaces, utils
import numpy as np
import ray
import ray.rllib.agents.ppo as ppo
import ray.rllib.agents.dqn as dqn
from os.path import expanduser, join
import os

Instructions for updating:
non-resource variables are not supported in the long term


In [3]:
from glvis import glvis, to_stream

In [4]:
from mfem import path
import mfem.ser as mfem

Start up RLlib:

In [5]:
# Restart Ray so that re-running this cell starts from a clean state.
ray.shutdown()
ray.init(ignore_reinit_error=True)
# Start from RLlib's default PPO configuration (the DQN alternative is kept for reference).
#config = dqn.DEFAULT_CONFIG.copy()
config = ppo.DEFAULT_CONFIG.copy()
config['train_batch_size'] = int(1e4)
config['num_workers'] = 3
# 'tfe' selects RLlib's TensorFlow-eager framework.
config['framework'] = 'tfe'
# Display the resulting configuration as the cell output.
config

2021-02-18 14:59:40,404	INFO services.py:1173 -- View the Ray dashboard at http://127.0.0.1:8265


{'num_workers': 3,
 'num_envs_per_worker': 1,
 'create_env_on_driver': False,
 'rollout_fragment_length': 200,
 'batch_mode': 'truncate_episodes',
 'num_gpus': 0,
 'train_batch_size': 10000,
 'model': {'fcnet_hiddens': [256, 256],
  'fcnet_activation': 'tanh',
  'conv_filters': None,
  'conv_activation': 'relu',
  'free_log_std': False,
  'no_final_linear': False,
  'vf_share_layers': True,
  'use_lstm': False,
  'max_seq_len': 20,
  'lstm_cell_size': 256,
  'lstm_use_prev_action': False,
  'lstm_use_prev_reward': False,
  '_time_major': False,
  'framestack': True,
  'dim': 84,
  'grayscale': False,
  'zero_mean': True,
  'custom_model': None,
  'custom_model_config': {},
  'custom_action_dist': None,
  'custom_preprocessor': None,
  'lstm_use_prev_action_reward': -1},
 'optimizer': {},
 'gamma': 0.99,
 'horizon': None,
 'soft_horizon': False,
 'no_done_at_end': False,
 'env_config': {},
 'env': None,
 'normalize_actions': False,
 'clip_rewards': None,
 'clip_actions': True,
 'preproc

Define some synthetic test functions: steps and bumps.

In [6]:
def rotate(x,theta):
    """Rotate the 2D point ``x`` about the origin by ``theta`` radians.

    Only ``x[0]`` and ``x[1]`` are read.  The rotation is counter-clockwise
    for positive ``theta`` and the result is returned as a new two-element
    list ``[x', y']``.
    """
    c, s = cos(theta), sin(theta)
    px, py = x[0], x[1]
    return [px*c-py*s, px*s+py*c]

In [7]:
def step(x):
    """Sharp step in the first coordinate: 1.0 where ``x[0] < 0``, else 0.0."""
    return 1.0 if x[0] < 0.0 else 0.0

In [8]:
def rotated_step(x, theta):
    """Evaluate the sharp step at the point ``x`` rotated by ``theta`` radians."""
    return step(rotate(x,theta))

In [9]:
def bump(x):
    """Unit-height Gaussian bump centred at the origin: ``exp(-(x0^2 + x1^2))``."""
    return math.exp(-(x[0]**2 +x[1]**2))

In [10]:
def smooth_step(x):
    """Smooth 0-to-1 transition in the first coordinate, built from ``tanh``."""
    t = math.tanh(x[0])
    return 0.5*(1.0 +t)

In [11]:
def rotated_smooth_step(x,theta):
    """Evaluate the smooth step at the point ``x`` rotated by ``theta`` radians."""
    return smooth_step(rotate(x,theta))

Create classes where we can set the parameters and then eval a bunch of points.

In [12]:
class Step(mfem.PyCoefficient):
    """Sharp step coefficient with a random orientation and displacement."""

    def SetParams(self):
        """Draw a rotation angle in [0, 2*pi] and a displacement in [-1, 1]^2."""
        self.theta = random.uniform(0.0, 2.0*math.pi)
        self.dx = [random.uniform(-1.0, 1.0) for _ in range(2)]

    def EvalValue(self, x):
        """Return the rotated step evaluated at ``x`` shifted by ``dx``."""
        # assumes x is a NumPy array so that adding the list dx is element-wise -- TODO confirm
        shifted = x+self.dx
        return rotated_step(shifted, self.theta)

In [13]:
class Bump(mfem.PyCoefficient):
    """Gaussian bump with random width, displacement, floor and ceiling."""

    def SetParams(self):
        """Draw the bump width, its displacement, and the value range it spans."""
        self.width = random.uniform(0.1,1.0)
        self.xc = [0.5,0.5]
        self.dx = [random.uniform(-0.5, 0.5) for _ in range(2)]
        # Two independent draws; the smaller becomes the floor, the larger the ceiling.
        low, high = sorted((random.uniform(0.0,1.0), random.uniform(0.0,1.0)))
        self.floor = low
        self.ceiling = high
        self.height = self.ceiling -self.floor

    def EvalValue(self, x):
        """Return ``floor + height * bump`` at ``x``, recentred and scaled by the width."""
        # assumes x is a NumPy array so the list arithmetic is element-wise -- TODO confirm
        scaled = (x-self.xc+self.dx)/self.width
        return self.floor +self.height*bump(scaled)

In [14]:
class TwoBump(mfem.PyCoefficient):
    """Average of two Gaussian bumps with independent random widths and displacements."""

    def SetParams(self):
        """Draw both widths first, then both displacements."""
        self.width1 = random.uniform(0.1,0.5)
        self.width2 = random.uniform(0.1,0.5)
        self.xc1 = [0.5,0.5]
        self.xc2 = [0.5,0.5]
        self.dx1 = [random.uniform(-0.5, 0.5) for _ in range(2)]
        self.dx2 = [random.uniform(-0.5, 0.5) for _ in range(2)]

    def EvalValue(self, x):
        """Return the mean of the two bumps evaluated at ``x``."""
        # assumes x is a NumPy array so the list arithmetic is element-wise -- TODO confirm
        first = bump((x-self.xc1+self.dx1)/self.width1)
        second = bump((x-self.xc2+self.dx2)/self.width2)
        return 0.5*(first+second)

In [15]:
class SmoothStep(mfem.PyCoefficient):
    """Smooth (tanh) step with random steepness, orientation, offset and value range."""

    def SetParams(self):
        """Draw the random parameters of the step.

        ``width`` multiplies the coordinates before the tanh is applied,
        ``dx`` is a single scalar offset added to every coordinate, ``theta``
        is the rotation angle, and ``floor``/``ceiling`` bound the values.
        """
        self.width = random.uniform(5.0, 10.0)
        self.xc = [0.5,0.5]
        self.dx = random.uniform(-0.5,0.5)
        self.theta = random.uniform(0.0, 2.0*math.pi)
        # The original code assigned this draw to self.height and then
        # overwrote it below.  The draw itself is kept (value discarded) so
        # that the random stream is consumed exactly as before.
        random.uniform(0.0, 1.0)
        y1 = random.uniform(0.0,1.0)
        y2 = random.uniform(0.0,1.0)
        self.floor = min(y1,y2)
        self.ceiling = max(y1,y2)
        self.height = self.ceiling -self.floor

    def EvalValue(self, x):
        """Return ``floor + height * smooth_step`` at the recentred, scaled, rotated point.

        The shifted point is built as a new object instead of updating ``x``
        in place, so the caller's coordinate array is left untouched and can
        safely be passed on to other coefficients.
        """
        # assumes x is a NumPy array so the list/scalar arithmetic is element-wise -- TODO confirm
        shifted = (x - self.xc) + self.dx
        return self.floor +self.height*rotated_smooth_step(shifted*self.width, self.theta)

In [16]:
class BumpAndSmoothStep(mfem.PyCoefficient):
    """Convex combination of a random Bump and a random SmoothStep."""

    def SetParams(self):
        """Randomise the bump, then the smooth step, then the blend weight."""
        bump_part = Bump()
        bump_part.SetParams()
        step_part = SmoothStep()
        step_part.SetParams()
        self.bump = bump_part
        self.smooth_step = step_part
        self.alpha = random.uniform(0.0, 1.0)

    def EvalValue(self, x):
        """Return ``alpha * bump(x) + (1 - alpha) * smooth_step(x)``."""
        # The bump is evaluated before the smooth step, as in the original expression.
        bump_value = self.bump.EvalValue(x)
        step_value = self.smooth_step.EvalValue(x)
        return self.alpha*bump_value+ (1-self.alpha)*step_value

In [17]:
class BumpNarrowWide(mfem.PyCoefficient):
    """Gaussian bump that is either narrow and tall or wide and short."""

    def SetParams(self):
        """Choose one of the two shapes with equal probability, then a displacement."""
        coin = random.uniform(0.0,1.0)
        # (width, height): narrow/tall when the draw is below 0.5, wide/short otherwise.
        self.width, self.height = (0.2, 1.0) if coin < 0.5 else (0.4, 0.1)
        self.xc = [0.5,0.5]
        self.dx = [random.uniform(-0.5, 0.5) for _ in range(2)]

    def EvalValue(self, x):
        """Return the scaled bump at ``x``, recentred and divided by the width."""
        # assumes x is a NumPy array so the list arithmetic is element-wise -- TODO confirm
        scaled = (x-self.xc+self.dx)/self.width
        return self.height*bump(scaled)

Visualize an instance of the test function. Note that each instance has randomly chosen parameters.  For the steps, it's a rotation angle and a displacement.  For the bumps, it's a width and a displacement.

In [18]:
# Load the mesh file and uniformly refine it twice.
mesh = mfem.Mesh('inline-quad.mesh')
mesh.UniformRefinement()
mesh.UniformRefinement()
# First-order discontinuous (L2) finite element space in 2D.
fec = mfem.L2_FECollection(p=1, dim=2)
fes = mfem.FiniteElementSpace(mesh, fec)
u = mfem.GridFunction(fes)
# Draw one random instance of the test function and project it onto the space.
c = BumpAndSmoothStep()
c.SetParams()
u.ProjectCoefficient(c)

In [19]:
# Display the mesh and projected function; the 'keys ...' suffix is appended to the
# stream sent to GLVis (presumably as key commands; 600, 600 presumably the widget size).
glvis(to_stream(mesh,u) + 'keys Rjlmc',600,600)

glvis()

Create the gym environment.

In [20]:
class AMRGame(gym.Env):
    """Gym environment in which one action refines one mesh element.

    An episode is a single step: the agent picks an element index, that
    element is refined, and the reward is the resulting drop in the L2 error
    of the projected function.  ``reset`` draws a new random target function;
    ``reinit`` restores the unrefined mesh for the current function.
    """

    class u0_coeff(mfem.PyCoefficient):
        """Coefficient wrapper around a randomly parameterised BumpAndSmoothStep."""

        def SetParams(self):
            """Create the wrapped function and draw its random parameters."""
            self.fn = BumpAndSmoothStep()
            self.fn.SetParams()

        def EvalValue(self, x):
            """Evaluate the wrapped function at the point ``x``."""
            return self.fn.EvalValue(x)

    # In RLlib, you need the config arg
    def __init__(self,config):
        """Load the mesh, size the action/observation spaces, and reset.

        ``config`` is accepted because RLlib passes it; it is not read here.
        """
        self.meshfile = 'inline-quad-7.mesh'

        # keep a copy of the unrefined mesh so we can restore it
        self.mesh0 = mfem.Mesh(self.meshfile)
        self.mesh = mfem.Mesh(self.meshfile)

        # The only reason we need to create a fespace and gf here
        # is to find the sizes needed for the action and observation spaces
        dim = self.mesh.Dimension()
        self.order = 1
        self.fec = mfem.L2_FECollection(self.order, dim)
        self.fes = mfem.FiniteElementSpace(self.mesh, self.fec)
        self.u = mfem.GridFunction(self.fes)

        # one action per element of the unrefined mesh: refine that element
        # (there is no "do nothing" action)
        self.action_space = spaces.Discrete(self.mesh.GetNE())
        # one observation entry per DOF of the unrefined grid function
        self.observation_space = spaces.Box(-1.0, 1.0, shape=(self.u.Size(),), dtype=np.float32)
        self.state = None

        # call reset to create the first synthetic function
        self.reset()

        #self.gl = GlvisWidget(get_solnstream(self.mesh,self.u))

    def get_ne(self):
        """Return the number of elements in the current mesh."""
        return self.mesh.GetNE()

    def get_size(self):
        """Return the number of entries in the current grid function."""
        return self.u.Size()

    # Compute L2 error with respect to the analytic function definition
    def get_error(self):
        """Return the L2 error of ``u`` against the analytic coefficient ``u0``."""
        err = self.u.ComputeL2Error(self.u0)
        return err

    # Manually refine the elements in the array elems
    def refine_elems(self, elems):
        """Refine the listed elements, then update the space and re-project ``u0``."""
        self.mesh.GeneralRefinement(mfem.intArray(elems))
        self.fes.Update()
        self.u.Update()
        self.u.ProjectCoefficient(self.u0)

    # action is the number of the element to refine
    def step(self, action):
        """Refine element ``action`` and end the episode.

        Returns ``(observation, reward, done, info)`` where the reward is the
        error before refinement minus the error after it, ``done`` is always
        True, and ``info`` is an empty dict.
        """
        err1 = self.get_error()
        self.refine_elems([action])
        err2 = self.get_error()
        reward = err1-err2
        done = True
        self.state = self.u.GetDataArray()
        # NOTE(review): this observation is taken after refinement, so its
        # length presumably exceeds observation_space.shape -- confirm whether
        # RLlib tolerates that for terminal observations.
        return np.array(self.state), reward, done, {}

    # similar to reset, but do not choose a new function
    def reinit(self):
        """Restore the unrefined mesh and re-project the current function.

        Returns a copy of the grid function data as the observation.
        """
        del self.mesh
        self.mesh = mfem.Mesh(self.mesh0)

        del self.fes
        self.fes = mfem.FiniteElementSpace(self.mesh, self.fec)

        del self.u
        self.u = mfem.GridFunction(self.fes)
        self.u.ProjectCoefficient(self.u0)

        self.state = self.u.GetDataArray()
        return np.array(self.state)

    # every reset of the env chooses a new synthetic function
    def reset(self):
        """Draw a new random target function and return the initial observation."""
        self.u0 = self.u0_coeff()
        self.u0.SetParams()
        return self.reinit()

    def render(self, mode='human'):
        """Return a GLVis widget showing the current mesh and grid function.

        ``mode`` is accepted so the signature matches ``gym.Env.render``; it
        is ignored because only the inline widget is available.
        """
        return glvis(to_stream(self.mesh,self.u) + 'keys Rjlmc',600,600)

Instantiate the environment and sanity check it.

In [21]:
# Build one environment on the driver (AMRGame does not read its config, so None is fine)
# and display its initial, unrefined state.
env = AMRGame(None)
env.render()

glvis()

Ok, try training a policy:

In [22]:
# Use a small batch for this first smoke-test run, then build the PPO trainer
# (the DQN alternative is kept for reference).
config['train_batch_size'] = int(1e3)
agent = ppo.PPOTrainer(config, env=AMRGame)
#agent = dqn.DQNTrainer(config, env=AMRGame)

2021-02-18 14:59:42,118	INFO trainer.py:588 -- Executing eagerly, with eager_tracing=False
2021-02-18 14:59:42,119	INFO trainer.py:618 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=22342)[0m Instructions for updating:
[2m[36m(pid=22342)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22340)[0m Instructions for updating:
[2m[36m(pid=22340)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22338)[0m Instructions for updating:
[2m[36m(pid=22338)[0m non-resource variables are not supported in the long term


In [23]:
%%time
# Run a single training iteration and report the mean episode reward it produced.
for n in range(1):
    result = agent.train()
    print("episode reward mean: %f " % result["episode_reward_mean"])


[2m[36m(pid=22342)[0m   arr = np.array(v)
[2m[36m(pid=22340)[0m   arr = np.array(v)
[2m[36m(pid=22338)[0m   arr = np.array(v)


Instructions for updating:
Prefer Variable.assign which has equivalent behavior in 2.X.
episode reward mean: 0.000017 
CPU times: user 7.17 s, sys: 191 ms, total: 7.37 s
Wall time: 16.7 s


[2m[36m(pid=22342)[0m Instructions for updating:
[2m[36m(pid=22342)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=22340)[0m Instructions for updating:
[2m[36m(pid=22340)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=22338)[0m Instructions for updating:
[2m[36m(pid=22338)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


In [24]:
# Fetch the trainer's policy and print a summary of its underlying network.
policy = agent.get_policy()
model = policy.model
print(model.base_model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 196)]        0                                            
__________________________________________________________________________________________________
fc_1 (Dense)                    (None, 256)          50432       observations[0][0]               
__________________________________________________________________________________________________
fc_value_1 (Dense)              (None, 256)          50432       observations[0][0]               
__________________________________________________________________________________________________
fc_2 (Dense)                    (None, 256)          65792       fc_1[0][0]                       
______________________________________________________________________________________________

Create a convenience function for applying a policy to a given observation:

In [25]:
def apply_policy(model, obs):
    """Let the trained agent act on ``obs`` and apply that action to ``env``.

    Uses the module-level ``agent`` and ``env``; the ``model`` argument is
    not read.  Returns ``(action, reward)``.
    """
    # explore=False requests the deterministic action
    chosen = agent.compute_action(obs, explore=False)
    _, reward, _, _ = env.step(chosen)
    return chosen, reward

In [26]:
# Draw a new random function and see which element the policy refines.
obs = env.reset()
action, reward = apply_policy(model, obs)
action, reward

(6, 3.6708462993265134e-07)

In [27]:
# Same function, mesh restored to its unrefined state: the policy should repeat its choice.
obs = env.reinit()
action, reward = apply_policy(model, obs)
action, reward

(6, 3.6708462993265134e-07)

Brute force search for the best choice by trying each one, remembering to reset the environment after each action and after we're done.

In [28]:
def find_optimal(obs):
    """Brute-force the best single refinement for the current function.

    Every element of the unrefined mesh is refined in turn on the
    module-level ``env`` and the element with the largest reward is kept
    (the first one wins ties).  ``env`` is restored to the unrefined mesh
    before returning.  ``obs`` is accepted for interface compatibility and is
    not read.

    Returns ``(element, reward)``.  A valid element index is returned even
    when no refinement gives a strictly positive reward, so callers can pass
    it straight to ``env.step`` or ``env.refine_elems``.  Only when the mesh
    has no elements is ``(-1, 0.0)`` returned.
    """
    best_reward = float("-inf")
    best_elem = -1
    # Restore the unrefined mesh first so get_ne() counts the original elements.
    env.reinit()
    ne = env.get_ne()
    for elem in range(ne):
        env.reinit()
        _, reward, _, _ = env.step(elem)
        if reward > best_reward:
            best_reward = reward
            best_elem = elem
    env.reinit()
    if best_elem < 0:
        # nothing was evaluated; keep the original "no candidate" result
        best_reward = 0.0
    return best_elem, best_reward

In [29]:
# Draw a new function, find the best element by brute force, refine it and show the result.
obs = env.reset()
maxel, maxr = find_optimal(obs)
env.refine_elems([maxel])
env.render()

glvis()

Compare with what the policy does:

In [30]:
# Restore the unrefined mesh for the same function and show the policy's refinement instead.
env.reinit()
apply_policy(model,obs)
env.render()

glvis()

Define an error estimator based on the difference between the discontinuous and continuous representations. This is only valid for L2 FE spaces.

In [31]:
def find_dgjumps(env):
    """Choose the element where the DG field differs most from its H1 average.

    The discontinuous field ``env.u`` is averaged into a first-order H1
    space, the element-wise L2 difference between the two is computed, and
    the element with the largest difference is refined once to measure its
    reward.  ``env`` is restored to the unrefined mesh before returning.

    Returns ``(best_action, reward)``.
    """
    amr_mesh = env.mesh
    dg_field = env.u

    # Wrap the L2 grid function in a coefficient and average it into H1.
    # Every mfem object stays bound to a local name so it outlives its users.
    dg_coeff = mfem.GridFunctionCoefficient(dg_field)
    cont_fec = mfem.H1_FECollection(p=1, dim=2)
    cont_fes = mfem.FiniteElementSpace(amr_mesh, cont_fec)
    cont_field = mfem.GridFunction(cont_fes)
    cont_field.ProjectDiscCoefficient(dg_coeff, mfem.GridFunction.ARITHMETIC)

    # Coefficient view of the smoothed (continuous) field.
    cont_coeff = mfem.GridFunctionCoefficient(cont_field)

    # Zeroth-order L2 field: one value per element to hold the "errors".
    pw_const_fec = mfem.L2_FECollection(p=0,dim=2)
    pw_const_fes = mfem.FiniteElementSpace(amr_mesh,pw_const_fec)
    elem_err = mfem.GridFunction(pw_const_fes)

    # Element-wise difference between the discontinuous and continuous fields.
    dg_field.ComputeElementL2Errors(cont_coeff, elem_err)

    best_action = np.argmax(elem_err.GetDataArray())

    # Measure the reward of that choice, then undo the refinement.
    _, reward, _, _ = env.step(best_action)
    env.reinit()

    return best_action, reward

In [32]:
# Draw a new function, pick an element with the DG-jump estimator
# (find_dgjumps restores the mesh itself), then refine that element and show the result.
env.reset()
action, reward = find_dgjumps(env)
env.step(action)
env.render()

glvis()

Run a more systematic evaluation using an ensemble of samples:

In [33]:
def eval_ensemble(model, ntrials):
    """Compare the policy and the DG-jump estimator against the brute-force optimum.

    For each of ``ntrials`` random functions (drawn through the module-level
    ``env``) the reward shortfall relative to the optimum is accumulated for
    both methods, together with how often each picks the optimal element.
    The statistics are printed and returned as
    ``(rms, max_err, pct_correct, dg_rms, dg_max_err, dg_pct_correct)``.
    The printed maxima are squared errors; the returned maxima are their
    square roots.
    """
    pol_hits = pol_sumsq = pol_worst_sq = 0.0
    dg_hits = dg_sumsq = dg_worst_sq = 0.0

    for _ in range(ntrials):
        obs = env.reset()
        bestaction, bestreward = find_optimal(obs)
        dgaction, dgreward = find_dgjumps(env)
        action, reward = apply_policy(model,obs)

        # shortfall of the policy relative to the optimum
        pol_gap = bestreward-reward
        pol_worst_sq = max(pol_gap*pol_gap,pol_worst_sq)
        pol_sumsq += pol_gap*pol_gap

        # shortfall of the DG-jump estimator relative to the optimum
        dg_gap = bestreward-dgreward
        dg_worst_sq = max(dg_gap*dg_gap,dg_worst_sq)
        dg_sumsq += dg_gap*dg_gap

        if (bestaction == action):
            pol_hits += 1
        if (bestaction == dgaction):
            dg_hits += 1

    rms = math.sqrt(pol_sumsq/ntrials)
    corr = 100.*pol_hits/ntrials
    dg_rms = math.sqrt(dg_sumsq/ntrials)
    dg_corr = 100.*dg_hits/ntrials

    print("policy rms error: ",rms,flush=True)
    print("policy max sq error: ",pol_worst_sq,flush=True)
    print("policy % correct: ",corr,flush=True)
    print("dg rms error: ",dg_rms,flush=True)
    print("dg max sq error: ",dg_worst_sq,flush=True)
    print("dg % correct: ",dg_corr,flush=True)

    return rms, math.sqrt(pol_worst_sq), corr, dg_rms, math.sqrt(dg_worst_sq), dg_corr

# Baseline evaluation over 100 random functions.
eval_ensemble(model, 100)

Run a few evaluation sample sizes to get a sense of how many samples are needed to estimate the policy's metrics:

# Repeat with 200 and 400 samples to see how stable the metrics are.
eval_ensemble(model, 200)

eval_ensemble(model, 400)

Let's see if the training process is making progress:

In [None]:
# Training budget: total_episodes is split evenly over nbatches training iterations,
# and the policy is evaluated on neval random functions after each one.
# Every episode is a single step (AMRGame.step always returns done=True).
total_episodes = 2.e6
nbatches = 40
batch_size = total_episodes/nbatches  # float; converted with int() below
neval = 400

# Discard the smoke-test trainer and build a fresh one with the larger batch size.
del agent
config['train_batch_size'] = int(batch_size)
agent = ppo.PPOTrainer(config, env=AMRGame)
policy = agent.get_policy()
# NOTE(review): the model is fetched once, before training; presumably agent.train()
# updates this same object in place -- confirm.
model = policy.model

# Per-iteration metrics for the RL policy ...
rms = [0.0] * nbatches
cor = [0.0] * nbatches
maxerr = [0.0] * nbatches

# ... and for the DG-jump estimator on the same samples.
dg_rms = [0.0] * nbatches
dg_cor = [0.0] * nbatches
dg_maxerr = [0.0] * nbatches

eval_episode = 0  # not used anywhere in this cell
for n in range(nbatches):
    print("training batch %d of size %d" % (n,batch_size))
    agent.train()
    # Save a checkpoint after every iteration and report where it was written.
    checkpoint_path = agent.save()
    print(checkpoint_path)
    rms[n], maxerr[n], cor[n], dg_rms[n], dg_maxerr[n], dg_cor[n] = eval_ensemble(model, neval)

[2m[36m(pid=22339)[0m Instructions for updating:
[2m[36m(pid=22339)[0m non-resource variables are not supported in the long term


training batch 0 of size 50000


[2m[36m(pid=22653)[0m Instructions for updating:
[2m[36m(pid=22653)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22651)[0m Instructions for updating:
[2m[36m(pid=22651)[0m non-resource variables are not supported in the long term
[2m[36m(pid=22339)[0m   arr = np.array(v)
[2m[36m(pid=22653)[0m   arr = np.array(v)
[2m[36m(pid=22651)[0m   arr = np.array(v)
[2m[36m(pid=22339)[0m Instructions for updating:
[2m[36m(pid=22339)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=22653)[0m Instructions for updating:
[2m[36m(pid=22653)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=22651)[0m Instructions for updating:
[2m[36m(pid=22651)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


/home/rwa/ray_results/PPO_AMRGame_2021-02-18_15-00-0280281bn6/checkpoint_1/checkpoint-1
policy rms error:  0.0004619635763398737
policy max sq error:  2.9324919722085938e-05
policy % correct:  2.5
dg rms error:  2.471158083591221e-05
dg max sq error:  5.2523281338418574e-08
dg % correct:  36.75
training batch 1 of size 50000
/home/rwa/ray_results/PPO_AMRGame_2021-02-18_15-00-0280281bn6/checkpoint_2/checkpoint-2
policy rms error:  0.00022937949888932277
policy max sq error:  6.076806675342031e-06
policy % correct:  4.5
dg rms error:  3.902945522089444e-05
dg max sq error:  3.7991718126875357e-07
dg % correct:  37.5
training batch 2 of size 50000




/home/rwa/ray_results/PPO_AMRGame_2021-02-18_15-00-0280281bn6/checkpoint_3/checkpoint-3
policy rms error:  0.00019005764316295688
policy max sq error:  3.1373412862317568e-06
policy % correct:  8.5
dg rms error:  2.230281231387222e-05
dg max sq error:  2.8410094944932805e-08
dg % correct:  38.75
training batch 3 of size 50000


In [None]:
%matplotlib inline
# x axis: index of each training iteration times the training batch size.
isteps = list(range(nbatches))
asteps = [i*config['train_batch_size'] for i in isteps]
import matplotlib.pyplot as plt
# Top panel: rms and max reward shortfall (log scale) for the policy and the DG estimator.
# NOTE(review): entries still at their initial 0.0 (iterations that never ran)
# cannot be shown on a log axis -- confirm the loop completed before plotting.
ax = plt.subplot(211)
ax.set_ylim(0.00001,0.01)
ax.set_ylabel('Error')
line1, = plt.semilogy(asteps,rms[:nbatches], marker='o')
line2, = plt.semilogy(asteps,dg_rms[:nbatches], marker='x')
line3, = plt.semilogy(asteps,maxerr[:nbatches], marker='.')
line4, = plt.semilogy(asteps,dg_maxerr[:nbatches], marker='+')

line1.set_label('RL rms')
line2.set_label('DG rms')
line3.set_label('RL max')
line4.set_label('DG max')
ax.legend()
plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))

# Bottom panel: percentage of samples on which each method picked the optimal element.
ax = plt.subplot(212)
ax.set_ylim(0,100)
ax.set_ylabel('% correct')
ax.set_xlabel('training episodes')
line1, = plt.plot(asteps,cor[:nbatches], marker='o')
line2, = plt.plot(asteps,dg_cor[:nbatches], marker='x')
line1.set_label('RL policy')
line2.set_label('DG')
ax.legend()
plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))

In [None]:
# Show the raw per-iteration policy rms values.
rms

Let's look for cases where the policy gets it right and the DG method gets it wrong.

In [None]:
# Search up to 500 random functions for one where the policy picks the optimal
# element and the DG-jump estimator does not.
for n in range(500):
    obs = env.reset()
    opt_action, opt_reward = find_optimal(obs)
    dg_action, dg_reward = find_dgjumps(env)
    pol_action, pol_reward = apply_policy(model, obs)
    if ((pol_action == opt_action) and (dg_action != opt_action)):
        break
else:
    # No break: the render below shows the last trial, which is NOT a
    # "policy right, DG wrong" example.
    print("no case found where the policy is optimal and the DG estimator is not")
# Show the policy's refinement for the function found above.
env.reinit()
env.step(pol_action)
env.render()

In [None]:
# For comparison, show the DG estimator's refinement of the same function.
env.reinit()
env.step(dg_action)
env.render()