In [212]:
import numpy as np

In [213]:
START_STATE = (0,0)
FOOD_STATE = (4,4)
ACTIONS = [(0,-1), (0,1), (-1,0), (1,0)]
p={}
p['specified'] = 0.7
p['right'] = 0.12
p['left'] = 0.12
p['sleepy'] = 0.06
FORBIDDEN_FURNITURES = [(2,1), (2,2), (2,3), (3,2)]
MONSTERS = [(0,3), (4,1)]
DELTA = 0.0001

In [214]:
def getTransitionProbabilities(action, p):
    if action == (0, -1):
        specified = (0, -1)
        left = (1, 0)
        right = (-1, 0)
    elif action == (0, 1):
        specified = (0, 1)
        left = (-1, 0)
        right = (1, 0)
    elif action == (-1, 0):
        specified = (-1, 0)
        left = (0, -1)
        right = (0, 1)
    else:
        specified = (1, 0)
        left = (0, 1)
        right = (0, -1)

    sleepy = (0, 0)
    return [(specified, p['specified']), (left, p['left']), (right, p['right']), (sleepy, p['sleepy'])]


In [215]:
def getReward(to_state, isCatnip=False, isCatnipTerminal = False, checkDiffRewards = (False, None)):
    if to_state == FOOD_STATE:
        return 10
    elif to_state in MONSTERS:
        return -8
    elif (isCatnip or isCatnipTerminal) and to_state == (0, 1):
        return 5
    elif (checkDiffRewards[0] and to_state == (0,1)):
        return checkDiffRewards[1]
    else:
        return -0.05


In [216]:
def isValidState(i, j):
    if i < 0 or j < 0 or i >= 5 or j >= 5:
        return False
    if (i, j) in FORBIDDEN_FURNITURES:
        return False
    return True


In [217]:
def runValueIterationAlgo(gamma, theta=DELTA, isCatnip=False, isCatnipTerminal=False, checkDiffRewards = (False, None)):
    value_mat = np.zeros((5, 5))
    policy_mat = np.full((5, 5), None)
    count = 0
    while True:
        count +=1
        delta = 0
        new_value_mat = np.copy(value_mat)

        for i in range(5):
            for j in range(5):
                if (i, j) in FORBIDDEN_FURNITURES or (i, j) == FOOD_STATE or (isCatnipTerminal and (i, j) == (0, 1)):
                    continue

                best_action = None
                best_value = float('-inf')

                for action in ACTIONS:
                    transitions = getTransitionProbabilities(action, p) #[(), ()]
                    value = 0

                    for next_state, prob in transitions:
                        next_i, next_j = i + next_state[0], j + next_state[1]

                        if not isValidState(next_i, next_j):
                            next_i = i
                            next_j = j

                        reward = getReward((next_i, next_j), isCatnip, isCatnipTerminal, checkDiffRewards)  # Adjust for Catnip if needed
                        value += prob * (reward + gamma * value_mat[next_i, next_j])

                    if value > best_value:
                        best_value = value
                        best_action = action

                new_value_mat[i, j] = best_value
                policy_mat[i, j] = best_action

                delta = max(delta, abs(value_mat[i, j] - new_value_mat[i, j]))

        value_mat = new_value_mat  # Update value matrix
        if delta < theta:  # Check for convergence
            break

    return value_mat, policy_mat, count


In [218]:
def runValueIterationInPlace(gamma, theta=DELTA, isCatnip=False, isCatnipTerminal=False):
    value_mat = np.zeros((5, 5))
    policy_mat = np.full((5, 5), None)
    count = 0
    while True:
        count += 1
        delta = 0

        for i in range(5):
            for j in range(5):
                if (i, j) in FORBIDDEN_FURNITURES or (i, j) == FOOD_STATE or (isCatnipTerminal and (i, j) == (0, 1)):
                    continue

                best_action = None
                best_value = float('-inf')

                for action in ACTIONS:
                    transitions = getTransitionProbabilities(action, p)
                    value = 0

                    for next_state, prob in transitions:
                        next_i, next_j = i + next_state[0], j + next_state[1]

                        if not isValidState(next_i, next_j):
                            next_i = i
                            next_j = j

                        temp_reward = getReward((next_i, next_j), isCatnip, isCatnipTerminal)
                        value += prob * (temp_reward + gamma * value_mat[next_i, next_j])

                    if value > best_value:
                        best_value = value
                        best_action = action

                # Update the value_mat directly
                delta = max(delta, abs(value_mat[i, j] - best_value))
                value_mat[i, j] = best_value
                policy_mat[i, j] = best_action

        if delta < theta:
            break

    return value_mat, policy_mat, count


In [219]:
def printPolicyMat(policy_mat, isCatnipTerminal=False):
    action_to_arrow = {
        (0, -1): '←',  # Left
        (0, 1): '→',   # Right
        (-1, 0): '↑',   # Up
        (1, 0): '↓'     # Down
    }
    result = []
    
    for i in range(5):
        row = []
        for j in range(5):
            state = (i, j)
            if state in FORBIDDEN_FURNITURES:
                symbol = "X"
            elif state in MONSTERS:
                action = policy_mat[i][j]
                symbol = f"{action_to_arrow.get(action, ' ')}ᴹ"
            elif state == FOOD_STATE or (isCatnipTerminal and state == (0,1)):
                symbol = "G"
            else:
                action = policy_mat[i][j]
                symbol = action_to_arrow.get(action, " ")
            row.append(symbol)
        result.append(" ".join(row))
    
    print("\n".join(result))


In [220]:
def printResults(value_mat, policy_mat, count, gamma, isCatnipTerminal=False, algoType = 'std'):
    if(algoType == 'std'):
        print("######################### USING STANDARD VALUE ITERATION ALGO #########################")
    else:
        print("######################### USING IN-PLACE VALUE ITERATION ALGO #########################")
    print("******** GAMMA ********")
    print(gamma)
    np.set_printoptions(precision=4, suppress=True)
    print("\n******** FINAL VALUE FUNCTION ********")
    print(value_mat)
    print("\n******** FINAL POLICY ********")
    printPolicyMat(policy_mat, isCatnipTerminal)
    print("\n******** TOTAL ITERATIONS ********")
    print(count)

In [221]:
gamma = 0.925
value_mat, policy_mat, count = runValueIterationAlgo(gamma)
printResults(value_mat, policy_mat, count, gamma)

######################### USING STANDARD VALUE ITERATION ALGO #########################
******** GAMMA ********
0.925

******** FINAL VALUE FUNCTION ********
[[2.6638 2.9969 2.8117 3.6671 4.8497]
 [2.9713 3.5101 4.0819 4.8497 7.1648]
 [2.5936 0.     0.     0.     8.4687]
 [2.0992 1.0849 0.     8.6097 9.5269]
 [1.0849 4.9465 8.4687 9.5269 0.    ]]

******** FINAL POLICY ********
→ ↓ ← ↓ᴹ ↓
→ → → → ↓
↑ X X X ↓
↑ ← X → ↓
↑ →ᴹ → → G

******** TOTAL ITERATIONS ********
41


In [222]:
gamma = 0.925
value_mat, policy_mat, count = runValueIterationInPlace(gamma)
printResults(value_mat, policy_mat, count, gamma, algoType='inplace')

######################### USING IN-PLACE VALUE ITERATION ALGO #########################
******** GAMMA ********
0.925

******** FINAL VALUE FUNCTION ********
[[2.6638 2.9969 2.8117 3.6671 4.8497]
 [2.9713 3.5101 4.0819 4.8497 7.1648]
 [2.5936 0.     0.     0.     8.4687]
 [2.0993 1.085  0.     8.6097 9.5269]
 [1.085  4.9466 8.4687 9.5269 0.    ]]

******** FINAL POLICY ********
→ ↓ ← ↓ᴹ ↓
→ → → → ↓
↑ X X X ↓
↑ ← X → ↓
↑ →ᴹ → → G

******** TOTAL ITERATIONS ********
29


In [223]:
gamma = 0.2
value_mat, policy_mat, count = runValueIterationAlgo(gamma)
printResults(value_mat, policy_mat, count, gamma)

######################### USING STANDARD VALUE ITERATION ALGO #########################
******** GAMMA ********
0.2

******** FINAL VALUE FUNCTION ********
[[-0.0625 -0.0625 -0.0625 -0.5445 -0.0579]
 [-0.0625 -0.0624 -0.0619 -0.058   0.0963]
 [-0.0625  0.      0.      0.      1.0304]
 [-0.0625 -0.0625  0.      1.1859  7.2754]
 [-0.0625 -0.5187  1.0304  7.2754  0.    ]]

******** FINAL POLICY ********
← ↓ ← ↓ᴹ →
→ → → ↓ ↓
← X X X ↓
← ↑ X → ↓
← ↑ᴹ → → G

******** TOTAL ITERATIONS ********
7


In [224]:
gamma = 0.2
value_mat, policy_mat, count = runValueIterationInPlace(gamma)
printResults(value_mat, policy_mat, count, gamma, algoType='inplace')

######################### USING IN-PLACE VALUE ITERATION ALGO #########################
******** GAMMA ********
0.2

******** FINAL VALUE FUNCTION ********
[[-0.0625 -0.0625 -0.0625 -0.5445 -0.0579]
 [-0.0625 -0.0624 -0.0619 -0.058   0.0963]
 [-0.0625  0.      0.      0.      1.0304]
 [-0.0625 -0.0625  0.      1.1859  7.2754]
 [-0.0625 -0.5187  1.0304  7.2754  0.    ]]

******** FINAL POLICY ********
← ↓ ← ↓ᴹ →
→ → → ↓ ↓
↑ X X X ↓
← ↑ X → ↓
← ↑ᴹ → → G

******** TOTAL ITERATIONS ********
7


In [225]:
gamma = 0.925
isCatnip = True
value_mat, policy_mat, count = runValueIterationAlgo(gamma, isCatnip=True, isCatnipTerminal=False, checkDiffRewards=(None, False))
printResults(value_mat, policy_mat, count, gamma)

######################### USING STANDARD VALUE ITERATION ALGO #########################
******** GAMMA ********
0.925

******** FINAL VALUE FUNCTION ********
[[47.1488 47.9783 47.1002 39.7442 29.0421]
 [42.772  46.5915 42.4068 37.0317 32.1221]
 [38.2626  0.      0.      0.     28.7183]
 [33.3777 27.8861  0.     21.9072 25.167 ]
 [27.8861 23.7394 16.1924 18.1453  0.    ]]

******** FINAL POLICY ********
→ ↑ ← ←ᴹ ↓
↑ ↑ ← ← ←
↑ X X X ↑
↑ ← X → ↑
↑ ↑ᴹ → ↑ G

******** TOTAL ITERATIONS ********
136


In [226]:
gammas = [0.925, 0.93, 0.94, 0.95]
for gamma in gammas:
    value_mat, policy_mat, count = runValueIterationAlgo(gamma, isCatnip=False, isCatnipTerminal=True, checkDiffRewards=(None, False))
    printResults(value_mat, policy_mat, count, gamma, isCatnipTerminal=True)

######################### USING STANDARD VALUE ITERATION ALGO #########################
******** GAMMA ********
0.925

******** FINAL VALUE FUNCTION ********
[[4.7478 0.     4.7634 3.9275 4.8883]
 [4.2547 4.7036 4.3718 4.8883 7.1699]
 [3.7438 0.     0.     0.     8.4687]
 [3.098  1.8748 0.     8.6097 9.5269]
 [1.8748 5.0517 8.4687 9.5269 0.    ]]

******** FINAL POLICY ********
→ G ← ↓ᴹ ↓
↑ ↑ → → ↓
↑ X X X ↓
↑ ← X → ↓
↑ →ᴹ → → G

******** TOTAL ITERATIONS ********
24
######################### USING STANDARD VALUE ITERATION ALGO #########################
******** GAMMA ********
0.93

******** FINAL VALUE FUNCTION ********
[[4.7617 0.     4.7938 4.093  5.0573]
 [4.2978 4.7351 4.5367 5.0573 7.3087]
 [3.8112 0.     0.     0.     8.5574]
 [3.185  1.9783 0.     8.691  9.5543]
 [1.9783 5.1773 8.5574 9.5543 0.    ]]

******** FINAL POLICY ********
→ G ← ↓ᴹ ↓
↑ ↑ → → ↓
↑ X X X ↓
↑ ← X ↓ ↓
↑ →ᴹ → → G

******** TOTAL ITERATIONS ********
24
######################### USING STANDARD VALUE ITERATION 

In [227]:
gamma = 0.925
rewards = [5, 3, 2.9, 2.8, 2.7, 2.6, 2.5]
for reward in rewards:
    value_mat, policy_mat, count = runValueIterationAlgo(gamma, checkDiffRewards = (True, reward))
    printResults(value_mat, policy_mat, count, gamma, isCatnipTerminal=True)
    print("\n******** REWARD ********")
    print(reward)

######################### USING STANDARD VALUE ITERATION ALGO #########################
******** GAMMA ********
0.925

******** FINAL VALUE FUNCTION ********
[[47.1488 47.9783 47.1002 39.7442 29.0421]
 [42.772  46.5915 42.4068 37.0317 32.1221]
 [38.2626  0.      0.      0.     28.7183]
 [33.3777 27.8861  0.     21.9072 25.167 ]
 [27.8861 23.7394 16.1924 18.1453  0.    ]]

******** FINAL POLICY ********
→ G ← ←ᴹ ↓
↑ ↑ ← ← ←
↑ X X X ↑
↑ ← X → ↑
↑ ↑ᴹ → ↑ G

******** TOTAL ITERATIONS ********
136

******** REWARD ********
5
######################### USING STANDARD VALUE ITERATION ALGO #########################
******** GAMMA ********
0.925

******** FINAL VALUE FUNCTION ********
[[28.2115 28.7124 28.1821 22.9857 16.2921]
 [25.568  27.8749 25.3475 21.5475 18.5825]
 [22.8446  0.      0.      0.     16.5842]
 [19.8143 15.9766  0.     12.6904 14.5133]
 [15.9766 13.4314  9.8631 11.0829  0.    ]]

******** FINAL POLICY ********
→ G ← ←ᴹ ↓
↑ ↑ ← ← ←
↑ X X X ↑
↑ ← X → ↑
↑ ↑ᴹ → ↑ G

******** TOTAL 