# Human Informed RL

In [1]:
import numpy as np
import gymnasium as gym
# Create the FrozenLake environment with slipping disabled (is_slippery = False).
env = gym.make('FrozenLake-v1', is_slippery = False)
# The dimensions below are hard-coded to a small toy problem (3 states, 2 actions).
# The commented-out expressions would read them from the environment instead
# (noted by the author as 16 states and 4 actions).
stateDim = 3#env.observation_space.n #size of state space, 16
actionDim = 2#env.action_space.n #size of action space, 4

<div>
<img src="Steps.png" width="500"/>
</div>

# Functions

In [2]:
def pi(state, theta, actionDim,):
    """Softmax policy: action probabilities for one state.

    Args:
        state: Row index into ``theta``.
        theta: Table of numerical preferences, indexed as ``theta[state, action]``.
        actionDim: Number of actions; entries ``0 .. actionDim - 1`` of the row are used.

    Returns:
        1-D float array of length ``actionDim`` holding the softmax of the
        selected preferences (sums to 1 for finite inputs).
    """
    prefs = np.array([theta[state, action] for action in range(actionDim)], dtype=float)
    if prefs.size == 0:
        # Nothing to normalise; return the empty array unchanged.
        return prefs
    # Softmax is invariant to adding a constant to every preference, so shift by
    # the maximum first: the largest exponent becomes exp(0) = 1 and np.exp can
    # no longer overflow to inf (which previously produced NaN probabilities).
    weights = np.exp(prefs - prefs.max())
    return weights / np.sum(weights)

In [3]:
def piInverse(matrix, stateDim, actionDim, eps=1e-12):
    """Convert a matrix of probabilities to numerical preferences (inverse softmax).

    For every state the preferences are the log-probabilities shifted so that
    they sum to zero across actions; applying a softmax to a returned row
    recovers the original probabilities.

    Args:
        matrix: Probabilities indexed as ``matrix[state][action]``.
        stateDim: Number of states (rows) to convert.
        actionDim: Number of actions (columns) to convert.
        eps: Lower bound applied to each probability before taking the log.
            Without it a probability of exactly 0 gives ``log(0) = -inf`` and
            the whole row turns into inf/NaN.

    Returns:
        Float array of shape ``(stateDim, actionDim)`` with the preferences.
    """
    numPrefMatrix = np.zeros((stateDim, actionDim))
    if actionDim == 0:
        # No actions: nothing to convert (also avoids dividing by zero below).
        return numPrefMatrix

    for state in range(stateDim):
        mu = matrix[state]
        probs = np.array([mu[action] for action in range(actionDim)], dtype=float)
        # Clip from below so zero probabilities map to a large negative, finite preference.
        logProbs = np.log(np.clip(probs, eps, None))
        # Subtracting the mean log-probability centres the row at zero; this is
        # the same constant c = -(1 / actionDim) * sum(log mu) as before.
        numPrefMatrix[state] = logProbs - logProbs.sum() / actionDim

    return numPrefMatrix

In [4]:
def numToProb(matrix, stateDim, actionDim):
    """Turn a table of numerical preferences into a table of action probabilities.

    Row ``s`` of the result is ``pi(s, matrix, actionDim)``.

    Args:
        matrix: Numerical preferences, passed through to ``pi`` as its ``theta``.
        stateDim: Number of states (rows) to convert.
        actionDim: Number of actions (columns) per state.

    Returns:
        Float array of shape ``(stateDim, actionDim)``.
    """
    probMatrix = np.zeros((stateDim, actionDim))
    for rowIndex in range(stateDim):
        # Each state's distribution is computed independently by the policy function.
        probMatrix[rowIndex, :] = pi(rowIndex, matrix, actionDim)
    return probMatrix

In [5]:
def probToOpinion(matrix, stateDim, actionDim):
    """Map each probability to an opinion tuple ``(p, 1 - p, 0, p)``.

    Args:
        matrix: Probabilities indexed as ``matrix[state, action]``.
        stateDim: Number of states (rows).
        actionDim: Number of actions (columns).

    Returns:
        Structured array of shape ``(stateDim, actionDim)`` with four float32
        fields per cell, filled as (p, 1 - p, 0, p). Elsewhere in this notebook
        the four fields are read as belief, disbelief, uncertainty and base rate.
    """
    opinions = np.zeros((stateDim, actionDim), dtype = "f, f, f, f")
    for cell in np.ndindex(stateDim, actionDim):
        prob = matrix[cell]
        # Third field is fixed at 0; the probability is stored in fields one and four.
        opinions[cell] = (prob, 1 - prob, 0, prob)
    return opinions



In [6]:
def opinionToProb(matrix, stateDim, actionDim):
    """Collapse a matrix of opinion tuples into a matrix of probabilities.

    For every cell the result is ``b + a * u``, where ``b``, ``u`` and ``a``
    are entries 0, 2 and 3 of the opinion (entry 1 is not used).

    Args:
        matrix: Opinions indexed as ``matrix[state, action]``, each with at
            least four entries.
        stateDim: Number of states (rows).
        actionDim: Number of actions (columns).

    Returns:
        Float array of shape ``(stateDim, actionDim)``.
    """
    probs = np.zeros((stateDim, actionDim))
    for cell in np.ndindex(stateDim, actionDim):
        opinion = matrix[cell]
        belief, uncertainty, baseRate = opinion[0], opinion[2], opinion[3]
        probs[cell] = belief + baseRate * uncertainty
    return probs


In [7]:
def beliefConstraintFusion(matrix1, matrix2, stateDim, actionDim):
    """Fuse two opinion matrices cell by cell with belief constraint fusion.

    Every cell of both inputs is an opinion ``(b, d, u, a)``: belief,
    disbelief, uncertainty and base rate, read from entries 0..3.

    Args:
        matrix1: First opinion matrix, indexed as ``matrix1[state, action]``.
        matrix2: Second opinion matrix, same layout.
        stateDim: Number of states (rows).
        actionDim: Number of actions (columns).

    Returns:
        Structured array of shape ``(stateDim, actionDim)`` with four float32
        fields per cell holding the fused ``(b, d, u, a)``.

    Note:
        Total conflict (``b1*d2 + b2*d1 == 1``) makes the normalisation divide
        by zero; that case is not handled here and yields non-finite values.
    """
    iota = np.zeros((stateDim, actionDim), dtype = "f, f, f, f")
    for cell in np.ndindex(stateDim, actionDim):
        first = matrix1[cell]
        second = matrix2[cell]
        b1, d1, u1, a1 = first[0], first[1], first[2], first[3]
        b2, d2, u2, a2 = second[0], second[1], second[2], second[3]

        harmony = b1*b2 + b1*u2 + b2*u1
        conflict = b1*d2 + b2*d1
        b = harmony / (1 - conflict)
        u = u1 * u2 / (1 - conflict)

        certainty = 2 - u1 - u2
        if certainty != 0:
            # Base rates are weighted by how certain each source is.
            a = (a1 * (1 - u1) + a2 * (1 - u2)) / certainty
        else:
            # Both opinions are vacuous (u1 = u2 = 1): the weighted form is 0/0,
            # so fall back to the plain average of the two base rates.
            a = (a1 + a2) / 2

        # The paper says d = 1 - b - u - a, but that looks like an error
        # because b + d + u = 1 must hold.
        d = 1 - (b + u)

        iota[cell] = (b, d, u, a)
    return iota

## To Do

In [None]:
def consensusAndCompromiseFusion(matrix1, matrix2, stateDim, actionDim): #TODO
    """Placeholder for a consensus-and-compromise fusion operator (not implemented yet).

    Currently ignores all of its arguments and returns the constant 0.
    """
    return 0 

In [None]:
def averagingBeliefFusion(matrix1, matrix2, stateDim, actionDim): #TODO
    """Placeholder for an averaging belief fusion operator (not implemented yet).

    Currently ignores all of its arguments and returns the constant 0.
    """
    return 0

In [None]:
def weightedBeliefFusion(matrix1, matrix2, stateDim, actionDim): #TODO
    """Placeholder for a weighted belief fusion operator (not implemented yet).

    Currently ignores all of its arguments and returns the constant 0.
    """
    return 0

In [None]:
def aleatoryCumulativeBeliefFusion(matrix1, matrix2, stateDim, actionDim): #TODO
    """Placeholder for an aleatory cumulative belief fusion operator (not implemented yet).

    Currently ignores all of its arguments and returns the constant 0.
    """
    return 0

In [None]:
def epistemicCumulativeBeliefFusion(matrix1, matrix2, stateDim, actionDim): #TODO
    """Placeholder for an epistemic cumulative belief fusion operator (not implemented yet).

    Currently ignores all of its arguments and returns the constant 0.
    """
    return 0

# Main Code

In [18]:
#step 1 gather human opinion #TODO
# For now the values are hard-coded; a Likert-scale elicitation will replace them later.
kappa = np.zeros((stateDim, actionDim), dtype = "f, f, f, f")

# Where the human has no opinion, the paper says to use the dogmatic truth (1, 0, 0, 1).
# This notebook instead uses the vacuous opinion (0, 0, 1, 1/actionDim): the human is
# completely uncertain, contributes no belief/disbelief, and the opinion acts as an
# identity for (at least) belief constraint fusion.
# Every (state, action) pair is filled, so the matrix stays valid when stateDim or
# actionDim change; hard-coded indices would leave all-zero (invalid) opinions behind.
vacuousOpinion = (0, 0, 1, 1 / actionDim)
for cell in np.ndindex(stateDim, actionDim):
    kappa[cell] = vacuousOpinion

# Example of an actual human opinion on state 1, actions 0 and 1 (currently disabled):
#kappa[1, 0] = (0.1, 0.7, 0.2, 0.5)
#kappa[1, 1] = (0.8, 0.0, 0.2, 0.5)

print(f"Kappa Matrix of Opinions is \n{kappa}")

Kappa Matrix of Opinions is 
[[(0., 0., 1., 0.5) (0., 0., 1., 0.5)]
 [(0., 0., 1., 0.5) (0., 0., 1., 0.5)]
 [(0., 0., 1., 0.5) (0., 0., 1., 0.5)]]


In [9]:
#step 2 convert matrix of numerical preferences to matrix of probs
# theta starts as all zeros, i.e. no action is preferred in any state.
theta = np.zeros((stateDim, actionDim))
print(f"Theta Matrix of Numerical Preferences is \n{theta}")

# numToProb applies the softmax policy pi to each state's row of preferences.
thetaProb = numToProb(theta, stateDim, actionDim)
print(f"\nPi Theta Matrix of Probabilities is \n{thetaProb}")

Theta Matrix of Numerical Preferences is 
[[0. 0.]
 [0. 0.]
 [0. 0.]]

Pi Theta Matrix of Probabilities is 
[[0.5 0.5]
 [0.5 0.5]
 [0.5 0.5]]


In [10]:
#step 3 convert matrix of probs to matrix of opinions
# Each probability p becomes the opinion tuple (p, 1 - p, 0, p).
thetaOpinion = probToOpinion(thetaProb, stateDim, actionDim)
print(f"Pi Theta Matrix of Opinions is \n{thetaOpinion}")

Pi Theta Matrix of Opinions is 
[[(0.5, 0.5, 0., 0.5) (0.5, 0.5, 0., 0.5)]
 [(0.5, 0.5, 0., 0.5) (0.5, 0.5, 0., 0.5)]
 [(0.5, 0.5, 0., 0.5) (0.5, 0.5, 0., 0.5)]]


In [15]:
#step 4 fuse matrix of human and agent opinions into iota matrix #TODO
# Only belief constraint fusion is implemented so far; the other fusion operators are stubs.
iota = beliefConstraintFusion(kappa, thetaOpinion, stateDim, actionDim) 
print(f"Iota Matrix of Fused Opinions is \n{iota}")

Iota Matrix of Fused Opinions is 
[[(0.5, 0.5, 0., 0.5) (0.5, 0.5, 0., 0.5)]
 [(0.5, 0.5, 0., 0.5) (0.5, 0.5, 0., 0.5)]
 [(0.5, 0.5, 0., 0.5) (0.5, 0.5, 0., 0.5)]]


In [16]:
#step 5 convert iota matrix of opinions into a matrix of probs
# opinionToProb computes b + a * u for every fused opinion.
thetaProb2 = opinionToProb(iota, stateDim, actionDim)
print(f"Pi Theta Matrix of Probabilities is \n{thetaProb2}")

Pi Theta Matrix of Probabilities is 
[[0.5 0.5]
 [0.5 0.5]
 [0.5 0.5]]


In [17]:
#step 6 convert matrix of probs into matrix of numerical preferences
# piInverse is the inverse softmax; theta2 holds the preferences after fusing in the human opinion.
theta2 = piInverse(thetaProb2, stateDim, actionDim)
print(f"Theta Matrix of Numerical Preferences is \n{theta2}")

Theta Matrix of Numerical Preferences is 
[[0. 0.]
 [0. 0.]
 [0. 0.]]
