In [1]:
import numpy as np
import pandas as pd
import sys, copy, os, shutil
import copy, time
from IPython.display import clear_output
import pickle

import GridWorldEnvironments as gwe # added 7/16/2024
import GridWorldHelpers as gwh
import GridWorldImputers as gwi

In [21]:
# ---------------------------------------------------------------
# Experiment configuration: one parameter per line so that each
# value can be located and tuned on its own.
# ---------------------------------------------------------------

# environment dynamics: environment-switch and wind probabilities
p_switch = 0.1
p_wind_i = 0.1
p_wind_j = 0.1

# whether the "stay in place" action is part of the action set
allow_stay_action = False

# missingness mechanism and its parameters
env_missing = "MCAR"
thetas = np.array([0.5, 0.5, 0.5])
thetas_in = None
thetas_out = None
thetas_dict = None

# imputation method and learning hyper-parameters
impute_method = "joint"
K = 10
p_shuffle = 0.1
num_cycles = 1
epsilon = 0.01
alpha = 0.1
gamma = 0.25

# computational settings
max_iters = 50000
seed = 0
verbose = True
river_restart = False

# temporary toggle for having multiple Q matrices
mult_Q = True

In [3]:
# Simulation setup: builds the gridworld environments, seeds the RNG and
# initialises every data structure the main loop reads or mutates
# (Q table, transition tables, state variables, log frames, counters).

# reward constants, named for better referencing later (7/16/2024).
baseline_penalty = -1
water_penalty = -10
end_reward = 100

# For convenience: the impute_method values that use multiple imputation
MImethods = ["joint", "mice", "joint-conservative"]

###############################################################
##### CREATING ENVIRONMENT + SETTING THE SEED #################
###############################################################
#assert K >= 1 or K is None
#assert num_cycles >= 1 or num_cycles is None

# initializing our environments + corresponding colors
d = 8 # dimension of our gridworld
colors = [0, 1, 2] # colors encoded with 0,1,2

# pass d through (it was hard-coded as 8 here) so the grids always agree
# with the Q / T tables and the start state, which are all sized from d.
gw0, gw1, gw2 = gwe.build_grids(d = d,
                                baseline_penalty = baseline_penalty,
                                water_penalty = water_penalty,
                                end_reward = end_reward)
gw0_colors = gwe.make_gw_colors(gw0)
gw1_colors = gwe.make_gw_colors(gw1)
gw2_colors = gwe.make_gw_colors(gw2)

# store quick-access indices for the environment: index -> [grid, colors]
environments = {
                0: [gw0, gw0_colors], # baseline
                1: [gw1, gw1_colors], # non-flooding CORRECTED 4/16/2024
                2: [gw2, gw2_colors] # flooding
               }

# fog range - fixed.
i_range, j_range = (0, 2), (5, 7)

# what is our starting "current environment"
ce = 1

# which environments are we flipping through?
indices = np.array([1, 2]) # the two to be flipping between, if any. If just one, make first element

# set our seed for use in multiple trials
np.random.seed(seed)

###############################################################
##### INITIALIZING START OF SIMULATIONS + DATA STRUCTURES #####
###############################################################

# load the possible actions list, specifying whether stay in place allowed
action_descs = gwh.load_actions(allow_stay_action = allow_stay_action)
ACTIONS = list(action_descs.keys())

# initialize our Q matrix: {((i, j, color), (a1, a2))}
# only the "missing_state" method treats "missing" as a state value of its own
Q = gwi.init_Q(d, ACTIONS,
               include_missing_as_state = (impute_method == "missing_state"),
               colors = colors)

# initialize Transition matrices
Tstandard = gwi.init_Tstandard(d = d, action_list = ACTIONS, colors = colors, init_value = 0)
Tmice = gwi.init_Tmice(d = d, action_list = ACTIONS, colors = colors, init_value = 0)

# initialize our starting environment + corresponding colors
env, env_colors = environments[ce]

# initialize our true initial state to be the bottom left corner.
true_state = (d-1, 0, env_colors[d-1, 0])

# Assume fully-observed initial state and initialize first obs
# state and first imp state
pobs_state, impu_state = true_state, true_state

# if doing multiple imputation method, initialize state list
# (K copies of the fully-observed start state)
if impute_method in MImethods:
    imp_state_list = [true_state] * int(K)

# initialize variable for our last fully-obs-state
last_fobs_state = copy.deepcopy(true_state)

# DataFrame to log our results for this simulation:
# 1. Total reward per episode, # of times we landed in the river per episode, # of steps per episode.
# 2. Counts of fully-observed, 1-missing, 2-missing, and 3-missing states per episode.
# 3. Wall clock time per episode.
# ALL METRICS ARE PER EPISODE!
logs = pd.DataFrame(data=None, columns=["total_reward", "steps_river", "path_length",
                                        "counts_0miss", "counts_1miss", "counts_2miss", "counts_3miss",
                                        "wall_clock_time"])

# 4/25/2024: PER-TIMESTEP LOGGING TOO!
t_step_logs = pd.DataFrame(data=None, columns=["t_step", "action_i", "action_j",
                                               "true_i", "true_j", "true_c",
                                               "obs_i", "obs_j", "obs_c",
                                               "reward", "wall_clock_time"])

# things we want to store PER EPISODE
total_reward, steps_river, path_length = 0, 0, 0
counts_0miss, counts_1miss, counts_2miss, counts_3miss = 0, 0, 0, 0
wall_clock_time = None

In [4]:
# Main simulation loop. Each timestep: pick an action from the current
# (possibly partially observed) state, move the true state, apply the
# missingness mechanism, impute, update Q (and T for the MI methods),
# roll the state variables forward and log. Episode-level metrics are
# appended to `logs` whenever the terminal cell is reached; per-timestep
# metrics are appended to `t_step_logs` on every iteration.

# start our timer FOR THIS EPISODE
start_time = time.time()

###############################################################
##### RUNNING SIMULATIONS FOR EACH TIMESTEP ###################
###############################################################

# for each timestep ...
for t_step in range(max_iters):

    # start a timer for this TIMESTEP!
    start_time_t_step = time.time()

    # create a row to store our desired TIMESTEP-SPECIFIC METRICS TOO!
    t_step_row = [t_step]

    #############################################################
    # Action selection based on last state(s) or random selection
    #############################################################
    # "choose action A from S using policy-derived from Q (e.g., \epsilon-greedy)"

    # do we have any missing state values?
    if np.any(np.isnan(pobs_state)):
        # deal with it accordingly to get imputed actions
        if impute_method in ("last_fobs1", "last_fobs2"):
            # both variants act from the last fully-observed state
            action = gwi.select_action(last_fobs_state, ACTIONS, Q, epsilon)
        elif impute_method == "random_action":
            action = ACTIONS[np.random.choice(a=len(ACTIONS))]
        elif impute_method == "missing_state":
            # for this method only, we need to convert np.nan to -1
            pobs_state_temp = tuple([val if not np.isnan(val) else -1 for val in pobs_state])
            action = gwi.select_action(pobs_state_temp, ACTIONS, Q, epsilon)
        elif impute_method in MImethods:

            # vote on action. note: not taking most-selected action because suspect not enough exploration
            action_options = [gwi.select_action(s, ACTIONS, Q, epsilon) for s in imp_state_list]
            action = action_options[np.random.choice(len(action_options))]
        else:
            raise Exception("impute_method choice is not currently supported.")

    # if no missingness, select an action by standard epsilon greedy
    else:
        action = gwi.select_action(pobs_state, ACTIONS, Q, epsilon)

    # add to our logs FOR THIS TIMESTEP!
    t_step_row += [action[0], action[1]]

    ###############################################
    # Take action A, observe R, S'
    # Taking action affects underlying TRUE state, even if
    # we won't observe it!!!
    ###############################################

    # toggle our environment potentially!
    # NOTE(review): `ce` is never reassigned in this loop, so every call is
    # made with the starting environment index -- confirm whether the result
    # of get_environment was meant to be stored back into `ce`.
    env, env_colors = environments[gwe.get_environment(ce, p_switch, indices)]

    # figure out what our new state is, which tells us our reward
    new_true_state = gwh.true_move(true_state, action, env, env_colors, p_wind_i, p_wind_j)
    reward = env[int(new_true_state[0]), int(new_true_state[1])]

    # have the option of moving back to start if fall into the river
    if (reward == water_penalty) and river_restart:
        new_true_state = (d-1, 0, env_colors[d-1, 0])

    # record our NEW TRUE STATE
    t_step_row += [new_true_state[0], new_true_state[1], new_true_state[2]]

    # update our reward counter + river counters
    total_reward += reward
    if reward == water_penalty:
        steps_river += 1

    ###############################################
    # Apply missingness mechanism to generate our new partially observed state
    ###############################################

    # simulate our partially-observed mechanism.
    if env_missing == "MCAR":
        new_pobs_state = gwh.MCAR(new_true_state, thetas)
    elif env_missing == "Mcolor":
        # uses `thetas_dict`, the name the configuration cell defines
        # (this line previously read the undefined name `theta_dict`)
        new_pobs_state = gwh.Mcolor(new_true_state, thetas_dict)
    elif env_missing == "Mfog":
        new_pobs_state = gwh.Mfog(new_true_state, i_range, j_range, thetas_in, thetas_out)
    else:
        raise Exception("The given env_missing mode is not supported.")

    # record our NEW POBS STATE + the reward
    t_step_row += [new_pobs_state[0], new_pobs_state[1], new_pobs_state[2], reward]

    ###############################################
    # IMPUTATION
    # make our imputation for the new_pobs_state, if not everything is observed.
    ###############################################

    if np.any(np.isnan(np.array(new_pobs_state))):

        if impute_method == "last_fobs1":
            # replace the whole state with the last fully-observed one
            new_impu_state = copy.deepcopy(last_fobs_state)

        elif impute_method == "last_fobs2":
            # keep observed components, fill only the missing ones
            new_impu_state = tuple([i if not np.isnan(i) else j for (i,j) in zip(new_pobs_state, last_fobs_state)])

        elif impute_method == "random_action":
            new_impu_state = None # we're not imputing any states!

        elif impute_method == "missing_state":

            # swapping np.nan to -1 to play nicer with dictionary indexing.
            new_impu_state = tuple([val if not np.isnan(val) else -1 for val in new_pobs_state])

        elif impute_method in MImethods:

            # decide if we will shuffle (affects Q and T updates below)
            shuffle = gwi.shuffle(p_shuffle)

            # generate list of imputed values
            # note: because first state already observed, will only
            # get here when already have defined action variable
            new_imp_state_list = gwi.MI(
                   method = impute_method,
                   Slist = imp_state_list,
                   A = action, #previous action?
                   pobs_state = new_pobs_state,
                   shuffle = shuffle,
                   Tmice = Tmice,
                   Tstandard = Tstandard,
                   num_cycles = num_cycles)

            new_impu_state = None # don't need this
        else:
            raise Exception("impute_method choice is not currently supported.")

    # if nothing is missing, just set new_impu_state equal to the new_pobs_state
    else:
        # just make a deepcopy!
        new_impu_state = copy.deepcopy(new_pobs_state)
        if impute_method in MImethods:
            new_imp_state_list = [new_pobs_state] * int(K)

    ######################################
    # Q update (if permitted)
    ######################################
    # multiple imputation way of updating Q with fractional allocation
    if impute_method in MImethods:
        Q = gwi.updateQ_MI(Q,
                           Slist = imp_state_list,
                           new_Slist = new_imp_state_list,
                           A = action, action_list = ACTIONS,
                           reward = reward, alpha = alpha, gamma = gamma)

    # every single-imputation method other than random_action updates
    # from the imputed states
    elif impute_method != "random_action":
        Q = gwi.update_Q(Q, impu_state, action, ACTIONS, reward, new_impu_state, alpha, gamma)

    # random_action: we can only update Q if nothing is missing in
    # either the last or the current state
    elif not np.any(np.isnan(new_pobs_state)):
        if not np.any(np.isnan(pobs_state)):
            Q = gwi.update_Q(Q, pobs_state, action, ACTIONS, reward, new_pobs_state, alpha, gamma)

    ######################################
    # T update (if needed)
    ######################################
    if impute_method == "mice":
        gwi.Tmice_update(Tmice,
                         Slist = imp_state_list,
                         A = action,
                         newSlist = new_imp_state_list)
    elif impute_method == "joint":
        gwi.Tstandard_update(Tstandard,
                             Slist = imp_state_list,
                             A = action,
                             new_Slist = new_imp_state_list)
    elif impute_method == "joint-conservative":
        # only update if previous and current state are fully observed
        if not np.any(np.isnan(new_pobs_state)) and not np.any(np.isnan(pobs_state)):
            gwi.Tstandard_update(Tstandard,
                                 Slist = imp_state_list,
                                 A = action,
                                 new_Slist = new_imp_state_list)

    # check whether our last_fobs_state can be updated
    # NOTE(review): this inspects the state held at the START of this step
    # (pobs_state is only rolled forward below), so last_fobs_state is not
    # refreshed from new_pobs_state until the next iteration -- confirm
    # that this one-step lag is intended.
    if not np.any(np.isnan(pobs_state)):
        last_fobs_state = copy.deepcopy(pobs_state)

    # now that we have updated Q and T functions
    # update true_state, pobs_state, impu_state, imp_state_list
    # as 'current state' for the next round
    true_state = copy.deepcopy(new_true_state)
    pobs_state = copy.deepcopy(new_pobs_state)
    impu_state = copy.deepcopy(new_impu_state)
    if impute_method in MImethods:
        imp_state_list = copy.deepcopy(new_imp_state_list)

    # update our missing data counters (number of NaN components in the
    # newly observed state)
    n_missing = np.isnan(pobs_state).sum()
    if n_missing == 0:
        counts_0miss += 1
    elif n_missing == 1:
        counts_1miss += 1
    elif n_missing == 2:
        counts_2miss += 1
    elif n_missing == 3:
        counts_3miss += 1

    # update our path-length counter
    path_length += 1

    # also see if we hit the terminal state
    # NOTE(review): the terminal cell (6, 7) is hard-coded, and the state
    # variables are not reset to the start cell in this branch -- confirm
    # the restart is handled elsewhere (e.g. inside gwh.true_move).
    if (true_state[0] == 6) and (true_state[1] == 7):

        # end our timer + record time elapsed FOR THIS EPISODE!
        end_time = time.time()
        wall_clock_time = end_time - start_time

        # update our dataframe
        row = [total_reward, steps_river, path_length,
               counts_0miss, counts_1miss, counts_2miss, counts_3miss,
               wall_clock_time]
        logs.loc[len(logs.index)] = row

        # reset our counter variables per EPISODE
        total_reward, steps_river, path_length = 0, 0, 0
        counts_0miss, counts_1miss, counts_2miss, counts_3miss = 0, 0, 0, 0
        wall_clock_time = None

        # reset our timer, too
        start_time = time.time()

    # end the timer for this TIMESTEP!
    end_time_t_step = time.time()

    # wrap up row of TIMESTEP-SPECIFIC METRICS, add to our dataframe
    # NOTE(review): enlarging the frame one row at a time gets slow over
    # max_iters rows; collecting rows in a list and building the frame once
    # would be faster, but the frame would then be empty if the loop is
    # interrupted part-way.
    t_step_row += [end_time_t_step - start_time_t_step]
    t_step_logs.loc[len(t_step_logs.index)] = t_step_row

    # status update every 5 timesteps
    if verbose and (t_step+1) % 5 == 0:
        clear_output(wait=True)
        if len(logs.index) >= 20:
            # positional slice: the last 20 finished episodes. A label slice
            # (.loc[-20:]) on the 0..n-1 index would select every row.
            recent = logs.iloc[-20:]
            print(f"Timestep: {t_step+1}, Past 20 Mean Epi. Sum Reward: {np.round(recent.total_reward.mean(), 3)}, Fin. Episodes: {len(logs.index)}, Past 20 Mean Path Length: {np.round(recent.path_length.mean(), 3)}")
        else:
            print(f"Timestep: {t_step+1}")
            print(f"Reward This Episode: {total_reward}")

Timestep: 630
Reward This Episode: -71


KeyboardInterrupt: 

In [20]:
# Debug display: the four arguments used by the gwi.select_action calls
# in the following cell.
# NOTE: this renders the entire Q dict; index or slice it to keep the
# saved output small.
imp_state_list, ACTIONS, Q, epsilon

([(5, 7, 0),
  (3, 0, 0),
  (3, 0, 0),
  (3, 0, 0),
  (6, 0, 0),
  (6, 7, 0),
  (2, 7, 0),
  (0, 4, 0),
  (2, 4, 0),
  (3, 7, 0)],
 [(0, 1), (1, 1), (1, 0), (1, -1), (0, -1), (-1, -1), (-1, 0), (-1, 1)],
 {((0, 0, 0), (0, 1)): -0.029701,
  ((0, 0, 0), (1, 1)): -0.09571544986644552,
  ((0, 0, 0), (1, 0)): -0.029701,
  ((0, 0, 0), (1, -1)): -0.04913247497525,
  ((0, 0, 0), (0, -1)): -0.12763799,
  ((0, 0, 0), (-1, -1)): -0.10902500000000001,
  ((0, 0, 0), (-1, 0)): -0.0490099501,
  ((0, 0, 0), (-1, 1)): -0.07754644182880992,
  ((0, 0, 1), (0, 1)): -0.029701,
  ((0, 0, 1), (1, 1)): -0.010025,
  ((0, 0, 1), (1, 0)): -0.01,
  ((0, 0, 1), (1, -1)): 0.0,
  ((0, 0, 1), (0, -1)): -0.01,
  ((0, 0, 1), (-1, -1)): 0.0,
  ((0, 0, 1), (-1, 0)): -0.019925,
  ((0, 0, 1), (-1, 1)): -0.022666359290120626,
  ((0, 0, 2), (0, 1)): -0.01,
  ((0, 0, 2), (1, 1)): -0.01,
  ((0, 0, 2), (1, 0)): -0.01004975,
  ((0, 0, 2), (1, -1)): -0.10006867490876309,
  ((0, 0, 2), (0, -1)): 0.0,
  ((0, 0, 2), (-1, -1)): 0.0,


In [5]:
# one proposed action per imputed state, in imp_state_list order
action_options = [
    gwi.select_action(imputed_state, ACTIONS, Q, epsilon)
    for imputed_state in imp_state_list
]

In [15]:
# Q-values of the first imputed state, one entry per action in ACTIONS order
vals = np.array([Q[(imp_state_list[0], act)] for act in ACTIONS])

In [19]:
# Softmax of vals: exponentiate each entry and normalise so they sum to 1.
np.exp(vals) / np.exp(vals).sum()

array([0.1249708 , 0.12148342, 0.12923883, 0.12033929, 0.12844864,
       0.12497024, 0.13090792, 0.11964086])

In [6]:
# Display the list of (a1, a2) actions stored in action_options.
action_options

[(-1, 0),
 (0, 1),
 (0, 1),
 (0, 1),
 (-1, -1),
 (1, -1),
 (-1, 0),
 (1, 1),
 (-1, -1),
 (1, 1)]