In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, copy, os, shutil
from tqdm.notebook import tqdm
import seaborn as sns
from importlib import reload
import copy

# our helper functions for the gridworlds
import GridWorldHelpers as gwh

In [2]:
# Build the three gridworld reward grids and their matching color maps.
d = 8 # grid dimension handed to build_grids; also used later for Q init and the start state
gw0, gw1, gw2 = gwh.build_grids(d=d, baseline_penalty = -1, water_penalty = -10, end_reward = 10)
gw0_colors = gwh.make_gw_colors(gw0)
gw1_colors = gwh.make_gw_colors(gw1)
gw2_colors = gwh.make_gw_colors(gw2)

# Quick-access lookup: environment index -> [reward grid, color map].
# Each grid must be paired with the colors derived from that same grid.
environments = {
                0: [gw0, gw0_colors], # baseline
                1: [gw1, gw1_colors], # non-flooding
                2: [gw2, gw2_colors] # flooding
               }

# global environment parameters
p_switch = 0.5 # flooding Markov chain parameter
p_wind_i = 0.5 # up-down wind frequency
p_wind_j = 0.5 # left-right wind frequency

# index (key into `environments`) of the environment we start in
ce = 1

# which environments are we flipping through?
# The two indices to flip between, if any; if only one is used, put it first.
indices = np.array([1, 2])

In [202]:
# Re-import the helper module so edits to GridWorldHelpers are picked up
# without restarting the kernel.
reload(gwh)

# SIMULATION SETTINGS
max_iters = 1000
# Strategies for picking an action when the observed state has missing values:
#   "last_fobs"     - act from the last fully-observed state
#   "random_action" - pick an action at random, ignoring Q
#   "missing_state" - pass the literal state "missing" to the action selector
#                     (presumably Q carries an entry for it; verify in gwh.init_Q)
baseline_imputer_settings = ["last_fobs", "random_action", "missing_state"]
baseline_imputer = baseline_imputer_settings[0]
epsilon = 0.05

# initialize our Q matrix: {((i, j, color), (a1, a2))}
Q = gwh.init_Q(d, init_value=0.0)
action_descs = gwh.load_actions()
actions = list(action_descs.keys())

# counter of how many episodes we've hit (not yet updated; loop is WIP)
finished_episodes = 0

# initialize our starting environment + corresponding colors
env, env_colors = environments[ce][0], environments[ce][1]

# Start in the bottom-left corner (row d-1, column 0) and assume the initial
# state is fully observed, so the true and partially-observed states coincide.
true_state, pobs_state = (d-1, 0, env_colors[d-1, 0]), (d-1, 0, env_colors[d-1, 0])

# things we want to store: rewards, time per iteration, last fully-observed state, Qmatrices every X steps.
# (rewards / times are placeholders that the WIP loop does not fill yet)
rewards, times, last_fobs_state = [], [], copy.deepcopy(true_state)

# for each timestep ...
for t_step in tqdm(range(max_iters)):

    ##### "choose action A from S using policy-derived from Q (e.g., \epsilon-greedy)"

    # Is any component of the observed state missing (NaN)?
    if np.any(np.isnan(pobs_state)):

        # fall back to the configured baseline imputation strategy
        if baseline_imputer == "last_fobs":
            action = gwh.select_action(last_fobs_state, Q, epsilon)
        elif baseline_imputer == "random_action":
            action = actions[np.random.choice(a=len(actions))]
        elif baseline_imputer == "missing_state":
            action = gwh.select_action("missing", Q, epsilon)
        else:
            raise ValueError("baseline_imputer choice is not currently supported.")

    # if not, select an action from the observed state + remember it
    else:

        # select our action using standard epsilon-greedy on Q
        action = gwh.select_action(pobs_state, Q, epsilon)

        # update our last fully-observed state
        last_fobs_state = copy.deepcopy(pobs_state)

    ##### "Take action A, observe R, S'" - BASED ON TRUE STATE, OF COURSE!

    # toggle our environment potentially! (not implemented yet)

    # The move is applied to the TRUE state; the reward is the grid value at
    # the new position. The state comes back as floats (see the displayed
    # array below), hence the int() casts before indexing.
    new_true_state = gwh.true_move(true_state, action, env, env_colors, p_wind_i, p_wind_j)
    reward = env[int(new_true_state[0]), int(new_true_state[1])]

    # simulate our partially-observed mechanism.
    # WIP!

    # WIP: stop after a single step until the rest of the loop body is written.
    break

  0%|          | 0/1000 [00:00<?, ?it/s]

In [208]:
# Scratch check: grid value at the new true position (same expression that is assigned to `reward` in the loop).
env[int(new_true_state[0]), int(new_true_state[1])]

-1

In [207]:
# Scratch check: first component of the new true state (the `i` index of the (i, j, color) state); it is a float.
new_true_state[0]

6.0

In [203]:
# Scratch check: the full state returned by gwh.true_move after the single simulated step.
new_true_state

array([6., 0., 0.])