# CARLE's Game Evaluation Notebook

## Agent/Policy Description

## Voting Instructions

Voting instructions will be added here in a later update

In [None]:
from IPython.core.display import display, HTML
# Widen the notebook's `.container` element to 75% of the page width.
display(HTML("<style>.container { width:75% !important; }</style>"))

import numpy as np
import torch
import time

# CARLE environment, its reward wrappers, and the agent classes used in the cells below.
from carle.env import CARLE
from carle.mcl import CornerBonus, SpeedDetector, PufferDetector, AE2D, RND2D
from game_of_carle.agents.harli import HARLI
from game_of_carle.agents.carla import CARLA
from game_of_carle.agents.grnn import ConvGRNN
from game_of_carle.agents.toggle import Toggle

# Bokeh pieces used to build the interactive viewer.
import bokeh
import bokeh.io as bio
from bokeh.io import output_notebook, show
from bokeh.plotting import figure

from bokeh.layouts import column, row
from bokeh.models import TextInput, Button, Paragraph
from bokeh.models import ColumnDataSource

from bokeh.events import DoubleTap, Tap

import matplotlib.pyplot as plt
# NOTE(review): my_cmap is not referenced in the cells visible here; the image
# glyph below uses Bokeh's "Magma256" palette instead -- confirm it is still needed.
my_cmap = plt.get_cmap("magma")

# Configure Bokeh to render its output inline in the notebook.
output_notebook()

In [None]:
"""
Trained with the SpeedDetector and RND2D bonus wrappers with the B368/S245 Morley rules,
some of these agents learned that they could game that reward system by exploiting a chaotic boundary with 
essentially random actions. This is something of a specification gaming/reward hacking strategy, as it is 
unlikely for agents to learn more interesting strategies when they already get a high reward from cells near 
the border transiently becoming active. One way to improve this would be to restrict agent activity to the center 
of the action space (like with the Toggle agent), which yields a buffer between what the agent modifies and
what the speed reward wrapper uses to calculate center of mass. Likewise the reward wrapper could be modified to 
include a 'frontier zone' itself.

Occasionally agents exhibit a 'wave' strategy, where toggling all the cells at the action space boundary 
creates a diminishing line of active cells that propagates toward the CA grid edges. For CARLA agents, this 
strategy is used mostly, if not only, immediately after resetting the environment/agent.
"""

# Policy under evaluation: a CARLA agent.
agent = CARLA()

# Saved parameter files that can be loaded into the agent.
params_list = [
    "../policies/CARLA_42_glider_rnd2d_experiment1622466808best_params_gen31.npy",
    "../policies/CARLA_43110_glider_rnd2d_experiment1622503099best_params_gen31.npy",
]

# Index into params_list selecting which parameters to load.
params_index = 0
agent.set_params(np.load(params_list[params_index]))

# 128x128 CA universe wrapped with the SpeedDetector reward wrapper,
# running under the rule string below.
my_rules = "B368/S245"
env = SpeedDetector(CARLE(height=128, width=128))
env.rules_from_string(my_rules)
    

In [None]:
"""
The Toggle agent parameters become the actions at step 0, after which the agent does nothing until `agent.reset` 
is called. In other words you can use the Toggle agent to optimize an initial pattern directly. 

Using CMA-ES with this strategy and SpeedDetector + RND2D reward wrappers pretty reliably finds patterns that 
coalesce into a moving machine, although so far the pattern always turns into either a jellyfish glider or
the common puffer. 
"""

# Policy under evaluation: a Toggle agent (its parameters are the initial pattern).
agent = Toggle()

# Saved parameter files; the trailing note records the pattern each run settled into.
params_list = [
    "../policies/Toggle_13_glider_rnd2d_experiment1622420340best_params_gen31.npy",  # glider
    "../policies/Toggle_1337_glider_rnd2d_experiment1622437453best_params_gen31.npy",  # glider
    "../policies/Toggle_42_glider_rnd2d_experiment1622455453best_params_gen31.npy",  # glider
    "../policies/Toggle_12345_glider_rnd2d_experiment1622474002best_params_gen31.npy",  # puffer
    "../policies/Toggle_43110_glider_rnd2d_experiment1622491637best_params_gen31.npy",  # puffer
]

# Index into params_list selecting which parameters to load.
params_index = 0
agent.set_params(np.load(params_list[params_index]))

# 128x128 CA universe wrapped with the SpeedDetector reward wrapper,
# running under the rule string below.
my_rules = "B368/S245"
env = SpeedDetector(CARLE(height=128, width=128))
env.rules_from_string(my_rules)

In [None]:
"""
Trained with the SpeedDetector and RND2D bonus wrappers with the B368/S245 Morley rules,
some of these agents learned that they could game that reward system by exploiting a chaotic boundary with 
essentially random actions. This is something of a specification gaming/reward hacking strategy, as it is 
unlikely for agents to learn more interesting strategies when they already get a high reward from cells near 
the border transiently becoming active. One way to improve this would be to restrict agent activity to the center 
of the action space (like with the Toggle agent), which yields a buffer between what the agent modifies and
what the speed reward wrapper uses to calculate center of mass. Likewise the reward wrapper could be modified to 
include a 'frontier zone' itself.

There is an interesting reward hack that is sometimes exhibited by HARLI agents trained with SpeedDetector. 
CARLE instances can be reset if the agent toggles every cell in the action space simultaneously, and this can generate
high rewards in one of two ways. The first is that if there are many live cells outside the action space these will
be zeroed out when the environment is reset, making for a large change in the center of mass of all live cells as
is used by SpeedDetector to calculate rewards. The second is that this sets up the environment perfectly for a 
highly rewarding "wave" strategy, where a line of active cells at the action space boundary sets up conditions for 
a fast moving line of live cells to propagate toward the grid edge. When both those mechanisms are combined, 
the rewards can be quite high, although we probably would have preferred agents that come up with or rediscover
interesting glider and spaceship patterns. 
"""

# Policy under evaluation: a HARLI agent.
agent = HARLI()

# Saved parameter files that can be loaded into the agent.
params_list = [
    "../policies/HARLI_13_glider_rnd2d_experiment1622423202best_params_gen31.npy",
    "../policies/HARLI_42_glider_rnd2d_experiment1622458272best_params_gen31.npy",
    "../policies/HARLI_1337_glider_rnd2d_experiment1622440720best_params_gen31.npy",
    "../policies/HARLI_12345_glider_rnd2d_experiment1622477110best_params_gen31.npy",
    "../policies/HARLI_43110_glider_rnd2d_experiment1622494528best_params_gen31.npy",
]

# Index into params_list selecting which parameters to load.
params_index = 0
agent.set_params(np.load(params_list[params_index]))

# 128x128 CA universe wrapped with the SpeedDetector reward wrapper,
# running under the rule string below.
# Optional extra wrapper, left disabled: env = AE2D(env)
my_rules = "B368/S245"
env = SpeedDetector(CARLE(height=128, width=128))
env.rules_from_string(my_rules)

In [None]:
def modify_doc(doc):
    """Build the interactive Bokeh viewer for the current `env` and `agent`.

    Adds to `doc` an image of the CA universe (with the pending action drawn
    on top), a streaming reward plot, and controls to run/pause, change speed,
    reset, cycle through `params_list`, edit the birth/survive rules, and
    switch the agent on or off. Tapping the image toggles a cell in the action
    space; double-tapping pauses and clears the pending action (or resumes).

    Reads the notebook-level names `env`, `agent` and `params_list` (and
    `params_index` when it exists). Simulation state (`obs`, `action`,
    `my_step`, `rewards`, `stretch_pixel`, `agent_on`, `agent_number`,
    `number_agents`, `my_period`) is kept in module globals so it can be
    inspected from other cells.

    Args:
        doc: the Bokeh document to populate; intended to be passed to
            `show(modify_doc)`.
    """
    global obs
    global my_period
    global number_agents
    global agent_number
    global agent_on
    global action

    # Side length of the image glyph in plot units; taps are mapped back to
    # grid cells using the same number.
    image_extent = 256

    obs = env.reset()

    number_agents = len(params_list)
    # Start from the parameter set the setup cell actually loaded, so the
    # status line and the prev/next buttons match the agent's parameters.
    agent_number = globals().get("params_index", 0)
    if number_agents:
        agent_number %= number_agents
    my_period = 512

    p = figure(plot_width=3*256, plot_height=3*256, title="CA Universe")
    p_plot = figure(plot_width=int(2.5*256), plot_height=int(2.5*256), title="'Reward'")

    source = ColumnDataSource(data=dict(my_image=[obs.squeeze().cpu().numpy()]))
    source_plot = ColumnDataSource(data=dict(x=np.arange(1), y=np.arange(1)*0))

    p.image(image='my_image', x=0, y=0, dw=image_extent, dh=image_extent,
            palette="Magma256", source=source)
    p_plot.line(line_width=3, color="firebrick", source=source_plot)

    button_go = Button(sizing_mode="stretch_width", label="Run >")
    button_slower = Button(sizing_mode="stretch_width", label="<< Slower")
    button_faster = Button(sizing_mode="stretch_width", label="Faster >>")
    button_reset = Button(sizing_mode="stretch_width", label="Reset")

    button_reset_prev_agent = Button(sizing_mode="stretch_width", label="Reset w/ Prev. Agent")
    button_reset_this_agent = Button(sizing_mode="stretch_width", label="Reset w/ This Agent")
    button_reset_next_agent = Button(sizing_mode="stretch_width", label="Reset w/ Next Agent")

    input_birth = TextInput(value="B")
    input_survive = TextInput(value="S")
    button_birth = Button(sizing_mode="stretch_width", label="Update Birth Rules")
    button_survive = Button(sizing_mode="stretch_width", label="Update Survive Rules")
    button_agent_switch = Button(sizing_mode="stretch_width", label="Turn Agent Off")

    message = Paragraph()

    # Handle of the periodic callback currently driving `update`, or None
    # while paused. Removing by handle avoids relying on the callback's
    # position in `doc.session_callbacks`.
    periodic_callback = None

    def stream_overlay(rollover):
        """Stream the observation with the pending action drawn on top."""
        padded_action = stretch_pixel/2 + env.inner_env.action_padding(action).squeeze()

        my_img = (padded_action*2 + obs.squeeze()).cpu().numpy()
        # Cap the image at 3.0, the value given to the corner "stretch" pixel.
        my_img[my_img > 3.0] = 3.0

        source.stream(dict(my_image=[my_img]), rollover=rollover)

    def update():
        """Advance the environment one step and refresh image, plot and status."""
        global obs
        global action
        global my_step
        global rewards

        obs, r, _, _ = env.step(action)
        reward_value = r.cpu().numpy().item()
        rewards = np.append(rewards, reward_value)

        if agent_on:
            action = agent(obs)
        else:
            action = torch.zeros_like(action)

        stream_overlay(rollover=1)
        source_plot.stream(dict(x=[my_step], y=[reward_value]), rollover=2000)

        my_step += 1
        message.text = (f"agent {agent_number}, step {my_step}, reward: {r.item()} \n"
                        f"{params_list[agent_number]}")

    def start_timer():
        """Schedule `update` every `my_period` milliseconds and show 'Pause'."""
        nonlocal periodic_callback
        periodic_callback = doc.add_periodic_callback(update, my_period)
        button_go.label = "Pause"

    def stop_timer():
        """Cancel the scheduled `update` (if any) and show 'Run >'."""
        nonlocal periodic_callback
        if periodic_callback is not None:
            doc.remove_periodic_callback(periodic_callback)
            periodic_callback = None
        button_go.label = "Run >"

    def reschedule_if_running():
        """Re-register the timer so a changed `my_period` takes effect."""
        if button_go.label != "Run >":
            stop_timer()
            start_timer()

    def go():
        """Toggle between running and paused."""
        if button_go.label == "Run >":
            start_timer()
        else:
            stop_timer()

    def faster():
        """Halve the update period (not below 1 ms)."""
        global my_period
        my_period = max(my_period * 0.5, 1)
        reschedule_if_running()

    def slower():
        """Double the update period (not above 8192 ms)."""
        global my_period
        my_period = min(my_period * 2, 8192)
        reschedule_if_running()

    def restart_episode():
        """Reset the environment, step counter, rewards and both plots."""
        global obs
        global stretch_pixel
        global my_step
        global rewards
        global action

        my_step = 0
        obs = env.reset()

        # Drop any pending action so nothing from the previous episode is
        # applied on the first step; the reset frame below shows no overlay.
        action = torch.zeros_like(action)

        # A single pixel of value 3 in the corner keeps the top of the image's
        # value range at 3 even when the grid is empty.
        stretch_pixel = torch.zeros_like(obs).squeeze()
        stretch_pixel[0, 0] = 3
        rewards = np.array([0])

        source_plot.stream(dict(x=[my_step], y=[0]), rollover=1)
        source.stream(dict(my_image=[(stretch_pixel + obs.squeeze()).cpu().numpy()]),
                      rollover=8)

    def reset():
        """Restart the episode with the currently loaded agent parameters."""
        restart_episode()
        agent.reset()

    def reset_this_agent():
        """Same as `reset`; bound to the 'Reset w/ This Agent' button."""
        reset()

    def switch_agent(offset):
        """Restart the episode and load the parameter set `offset` entries away."""
        global agent_number

        restart_episode()

        agent_number = (agent_number + offset) % number_agents
        agent.set_params(np.load(params_list[agent_number]))
        agent.reset()

        message.text = f"reset with agent {agent_number}"

    def reset_next_agent():
        """Restart with the next entry of `params_list` (wrapping around)."""
        switch_agent(1)

    def reset_prev_agent():
        """Restart with the previous entry of `params_list` (wrapping around)."""
        switch_agent(-1)

    def report_rules():
        """Show the environment's current birth/survive rules in the status line."""
        birth = "".join(str(elem) for elem in env.birth)
        survive = "".join(str(elem) for elem in env.survive)
        message.text = f"Rules updated to B{birth}/S{survive}"

        # Short pause kept from the original callbacks; its purpose is not
        # evident from this cell -- TODO confirm it is still needed.
        time.sleep(0.1)

    def set_birth_rules():
        """Apply the birth-rule text box to the environment."""
        env.birth_rule_from_string(input_birth.value)
        report_rules()

    def set_survive_rules():
        """Apply the survive-rule text box to the environment."""
        env.survive_rule_from_string(input_survive.value)
        report_rules()

    def human_toggle(event):
        """Flip the action-space cell under a tap and redraw the overlay."""
        # Map plot coordinates to grid cell indices (y -> row, x -> column).
        cell_row = np.round(env.height*event.y/image_extent - 0.5)
        cell_col = np.round(env.width*event.x/image_extent - 0.5)

        # Shift from grid indices to indices within the centred action space.
        cell_row -= (env.height - env.action_height) / 2
        cell_col -= (env.width - env.action_width) / 2

        # Clamp each index against its own dimension; plain ints avoid the
        # wrap-around a uint8 would give for action spaces wider than 256.
        cell_row = int(np.clip(cell_row, 0, env.action_height-1))
        cell_col = int(np.clip(cell_col, 0, env.action_width-1))

        action[:, :, cell_row, cell_col] = 1.0 * (not(action[:, :, cell_row, cell_col]))

        stream_overlay(rollover=8)

    def clear_toggles():
        """On double-tap: pause and clear the pending action, or resume if paused."""
        global action

        if button_go.label == "Pause":
            action *= 0
            stop_timer()
            stream_overlay(rollover=8)
        else:
            start_timer()

    def agent_on_off():
        """Toggle whether the agent chooses actions during `update`."""
        global agent_on

        if button_agent_switch.label == "Turn Agent Off":
            agent_on = False
            button_agent_switch.label = "Turn Agent On"
        else:
            agent_on = True
            button_agent_switch.label = "Turn Agent Off"

    agent_on = True
    action = torch.zeros(1, 1, env.action_height, env.action_width)

    reset()

    p.on_event(Tap, human_toggle)
    p.on_event(DoubleTap, clear_toggles)

    button_reset_prev_agent.on_click(reset_prev_agent)
    button_reset_this_agent.on_click(reset_this_agent)
    button_reset_next_agent.on_click(reset_next_agent)

    button_birth.on_click(set_birth_rules)
    button_survive.on_click(set_survive_rules)
    button_go.on_click(go)
    button_faster.on_click(faster)
    button_slower.on_click(slower)
    button_reset.on_click(reset)
    button_agent_switch.on_click(agent_on_off)

    control_layout = row(button_slower, button_go, button_faster, button_reset)
    policy_change_layout = row(button_reset_prev_agent, button_reset_this_agent, button_reset_next_agent)
    rule_layout = row(input_birth, button_birth, input_survive, button_survive)
    agent_toggle_layout = row(button_agent_switch)

    display_layout = row(p, p_plot)
    message_layout = row(message)

    doc.add_root(display_layout)
    doc.add_root(control_layout)
    doc.add_root(policy_change_layout)
    doc.add_root(rule_layout)
    doc.add_root(message_layout)
    doc.add_root(agent_toggle_layout)

# Hand the app-building function to Bokeh's `show`, which calls it with a document.
show(modify_doc)    

In [None]:
# NOTE(review): looks like a leftover scratch/debug cell -- confirm it can be deleted.
print("adf")