#Planning

Planning does not necessarily involve simulating all the way to the end goal; it only has to work out the best action for a specific state, and that's it.

Below is an example of one-step Q-planning.

In [None]:
# prompt: create a 3x3 grid on which I can perform value iteration

import numpy as np

class GridWorld:
    """Square grid whose cells store the reward for entering them.

    States are ``(row, col)`` tuples. Row 0 is the top row, so "up" lowers the
    row index and "left" lowers the column index.
    """

    # (name, (row delta, column delta)) for every recognised move
    _MOVES = (
        ("up", (-1, 0)),
        ("down", (1, 0)),
        ("left", (0, -1)),
        ("right", (0, 1)),
    )

    def __init__(self, size=3):
        self.size = size
        rewards = np.zeros((size, size))
        rewards[0, 2] = 1    # goal cell
        rewards[1, 1] = -1   # penalty cell
        self.grid = rewards

    def get_possible_actions(self, state):
        """Return the moves that stay on the grid, in up/down/left/right order."""
        row, col = state
        last = self.size - 1
        in_bounds = (
            ("up", row > 0),
            ("down", row < last),
            ("left", col > 0),
            ("right", col < last),
        )
        return [name for name, allowed in in_bounds if allowed]

    def get_next_state(self, state, action):
        """Return the cell reached from ``state`` by ``action``.

        No bounds check is done; an unrecognised action leaves the position
        unchanged.
        """
        row, col = state
        for name, (d_row, d_col) in self._MOVES:
            if action == name:
                return row + d_row, col + d_col
        return row, col

    def get_reward(self, state):
        """Return the reward stored at the ``(row, col)`` cell ``state``."""
        return self.grid[state]


# Smoke test: reward layout, moves available in the top-left corner,
# one transition, and the reward of the goal cell.
grid = GridWorld()
print(grid.grid)
print(grid.get_possible_actions((0, 0)))
print(grid.get_next_state((0, 0), "right"))
print(grid.get_reward((0, 2)))


[[ 0.  0.  1.]
 [ 0. -1.  0.]
 [ 0.  0.  0.]]
['down', 'right']
(0, 1)
1.0


In [None]:


import random

# Problem size for the 3x3 grid world (states are numbered row-major: 3*row + col)
num_states = 9
num_actions = 4  # up, down, left, right

# Tabular action values Q(s, a) and visit counters N(s, a), all starting at zero
Q = [dict.fromkeys(('up', 'down', 'right', 'left'), 0.0) for _ in range(num_states)]
N = [dict.fromkeys(('up', 'down', 'right', 'left'), 0) for _ in range(num_states)]

# Behaviour policy: one probability per action for every state.  Moves that
# would leave the grid have probability 0; the remaining moves are (almost)
# equally likely.
target_policy = {
    0: dict(up=0, down=0.5, left=0, right=0.5),           # (0,0)
    1: dict(up=0, down=0.33, left=0.33, right=0.34),      # (0,1)
    2: dict(up=0, down=0.5, left=0.5, right=0),           # (0,2) reward +1 cell
    3: dict(up=0.34, down=0.33, left=0, right=0.33),      # (1,0)
    4: dict(up=0.25, down=0.25, left=0.25, right=0.25),   # (1,1) reward -1 cell
    5: dict(up=0.33, down=0.34, left=0.33, right=0),      # (1,2)
    6: dict(up=0.5, down=0, left=0, right=0.5),           # (2,0)
    7: dict(up=0.33, down=0, left=0.33, right=0.34),      # (2,1)
    8: dict(up=0.5, down=0, left=0.5, right=0),           # (2,2)
}
pi = target_policy

print("Q(s,a):", Q)
print("pi(s):", pi)
print(N)


Q(s,a): [{'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}]
pi(s): {0: {'up': 0, 'down': 0.5, 'left': 0, 'right': 0.5}, 1: {'up': 0, 'down': 0.33, 'left': 0.33, 'right': 0.34}, 2: {'up': 0, 'down': 0.5, 'left': 0.5, 'right': 0}, 3: {'up': 0.34, 'down': 0.33, 'left': 0, 'right': 0.33}, 4: {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25}, 5: {'up': 0.33, 'down': 0.34, 'left': 0.33, 'right': 0}, 6: {'up': 0.5, 'down': 0, 'left': 0, 'right': 0.5}, 7: {'up': 0.33, 'down': 0, 'left': 0.33, 'right': 0.34}, 8: {'up': 0.5, 'down': 0, 'left': 0.5, 'right': 0}}
[{'up': 0,

In [None]:
episodes = 5e6
gamma = 0.9
for _ in range(int(episodes)):
    # Sample one simulated transition: a uniformly random state and an action
    # drawn from the behaviour policy for that state.
    state = random.randint(0, 8)
    actions, probabilities = zip(*pi[state].items())
    action = random.choices(actions, weights=probabilities)[0]

    # Ask the grid for the outcome; cells are (row, col), tables use 3*row + col
    next_state = grid.get_next_state(divmod(state, 3), action)
    reward = grid.get_reward(next_state)
    nstate = 3 * next_state[0] + next_state[1]

    # Greedy bootstrap value: the largest Q among the moves available next
    value = max(
        (Q[nstate][move] for move in grid.get_possible_actions(next_state)),
        default=None,
    )

    # Sample-average update with step size 1 / N(s, a)
    N[state][action] += 1
    step = 1 / N[state][action]
    Q[state][action] += step * (reward + gamma * value - Q[state][action])

In [None]:
# Read off the greedy policy: in every state take the in-grid action with the
# largest Q value (max() keeps the first action on ties).
pi_mc = [
    max(grid.get_possible_actions(divmod(s, 3)), key=Q[s].__getitem__, default=None)
    for s in range(num_states)
]

print("Optimal policy (Monte Carlo):", pi_mc)

Optimal policy (Monte Carlo): ['right', 'right', 'down', 'up', 'up', 'up', 'right', 'right', 'up']


In [None]:
Q

[{'up': 0.0,
  'down': np.float64(2.8449885984042345),
  'right': np.float64(3.7482476305608814),
  'left': 0.0},
 {'up': 0.0,
  'down': np.float64(2.2751158001799805),
  'right': np.float64(4.274880558939775),
  'left': np.float64(3.2735082183825375)},
 {'up': 0.0,
  'down': np.float64(3.7486537879118593),
  'right': 0.0,
  'left': np.float64(3.7483413507711068)},
 {'up': np.float64(3.273452406758655),
  'down': np.float64(2.463450863875035),
  'right': np.float64(2.274840309082044),
  'left': 0.0},
 {'up': np.float64(3.748575261726096),
  'down': np.float64(2.84761751750847),
  'right': np.float64(3.748149049949298),
  'left': np.float64(2.8449468886457203)},
 {'up': np.float64(4.274606454481191),
  'down': np.float64(3.2742210558975926),
  'right': 0.0,
  'left': np.float64(2.2752349704862675)},
 {'up': np.float64(2.8452890133044537),
  'down': 0.0,
  'right': np.float64(2.847761961214706),
  'left': 0.0},
 {'up': np.float64(2.2744256118980553),
  'down': 0.0,
  'right': np.float64(

#Dyna Q

In [None]:


import random

# Problem size for the 3x3 grid world (states are numbered row-major: 3*row + col)
num_states = 9
num_actions = 4  # up, down, left, right

# Tabular action values Q(s, a) and visit counters N(s, a), all starting at zero
Q = [dict.fromkeys(('up', 'down', 'right', 'left'), 0.0) for _ in range(num_states)]
N = [dict.fromkeys(('up', 'down', 'right', 'left'), 0) for _ in range(num_states)]
# Learned model: Model[s][a] holds a pair of values per (state, action);
# (-2000, -2000) is the placeholder for "nothing recorded yet".
Model = [
    dict.fromkeys(('up', 'down', 'right', 'left'), (-2000, -2000))
    for _ in range(num_states)
]

# Behaviour policy: one probability per action for every state.  Moves that
# would leave the grid have probability 0; the remaining moves are (almost)
# equally likely.
target_policy = {
    0: dict(up=0, down=0.5, left=0, right=0.5),           # (0,0)
    1: dict(up=0, down=0.33, left=0.33, right=0.34),      # (0,1)
    2: dict(up=0, down=0.5, left=0.5, right=0),           # (0,2) reward +1 cell
    3: dict(up=0.34, down=0.33, left=0, right=0.33),      # (1,0)
    4: dict(up=0.25, down=0.25, left=0.25, right=0.25),   # (1,1) reward -1 cell
    5: dict(up=0.33, down=0.34, left=0.33, right=0),      # (1,2)
    6: dict(up=0.5, down=0, left=0, right=0.5),           # (2,0)
    7: dict(up=0.33, down=0, left=0.33, right=0.34),      # (2,1)
    8: dict(up=0.5, down=0, left=0.5, right=0),           # (2,2)
}
pi = target_policy

print("Q(s,a):", Q)
print("pi(s):", pi)
print(N)


Q(s,a): [{'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}]
pi(s): {0: {'up': 0, 'down': 0.5, 'left': 0, 'right': 0.5}, 1: {'up': 0, 'down': 0.33, 'left': 0.33, 'right': 0.34}, 2: {'up': 0, 'down': 0.5, 'left': 0.5, 'right': 0}, 3: {'up': 0.34, 'down': 0.33, 'left': 0, 'right': 0.33}, 4: {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25}, 5: {'up': 0.33, 'down': 0.34, 'left': 0.33, 'right': 0}, 6: {'up': 0.5, 'down': 0, 'left': 0, 'right': 0.5}, 7: {'up': 0.33, 'down': 0, 'left': 0.33, 'right': 0.34}, 8: {'up': 0.5, 'down': 0, 'left': 0.5, 'right': 0}}
[{'up': 0,

In [None]:
episodes = 1e6
n = 5          # planning (model-replay) updates per real step
gamma = 0.9
# DYNA-Q
for _ in range(int(episodes)):
    # ---- DIRECT REINFORCEMENT LEARNING ----
    # Pick a random state, draw an action from the behaviour policy, take the
    # step in the real grid, update Q(s, a) from it, and remember the outcome
    # (next state, reward) in the model.
    state = random.randint(0, 8)
    actions, probabilities = zip(*pi[state].items())
    action = random.choices(actions, probabilities)[0]

    next_state = grid.get_next_state((state // 3, state % 3), action)
    reward = grid.get_reward(next_state)
    nstate = next_state[0] * 3 + next_state[1]
    N[state][action] += 1
    value = max(Q[nstate][a] for a in grid.get_possible_actions(next_state))
    Q[state][action] += (1 / N[state][action]) * (reward + gamma * value - Q[state][action])
    Model[state][action] = (nstate, reward)

    # ---- INDIRECT REINFORCEMENT LEARNING (planning) ----
    # Replay n remembered (state, action) pairs through the model and apply
    # the same Q-learning update to the *replayed* pair.
    for i in range(n):
        iterate_state = random.randint(0, 8)
        actions, probabilities = zip(*pi[iterate_state].items())
        iterate_action = random.choices(actions, probabilities)[0]
        iterate_next_state, iterate_reward = Model[iterate_state][iterate_action]

        # -2000 is the "never experienced" placeholder: nothing to replay
        if iterate_next_state == -2000:
            continue

        # Every index below refers to the replayed pair and its successor;
        # the original mixed in `state`/`action` from the real step above.
        N[iterate_state][iterate_action] += 1
        successor_cell = (iterate_next_state // 3, iterate_next_state % 3)
        value = max(
            Q[iterate_next_state][a] for a in grid.get_possible_actions(successor_cell)
        )
        Q[iterate_state][iterate_action] += (1 / N[iterate_state][iterate_action]) * (
            iterate_reward + gamma * value - Q[iterate_state][iterate_action]
        )

In [None]:
# Greedy policy from the learned table: per state, the in-grid action whose
# Q value is largest (the first such action on ties).
pi_mc = []
for s in range(num_states):
    moves = grid.get_possible_actions(divmod(s, 3))
    pi_mc.append(max(moves, key=lambda move: Q[s][move], default=None))

print("Optimal policy (Monte Carlo):", pi_mc)

Optimal policy (Monte Carlo): ['right', 'right', 'down', 'up', 'right', 'up', 'right', 'right', 'up']


#Prioritized Sweeping

We perform planning updates from our fixed model only for state-action pairs whose priority (the size of the pending change in value) exceeds a given threshold.

Goal: minimize the amount of exploration needed to reach an optimal solution,
and make the computation efficient enough to scale to a larger model.

In [None]:


import random

# Problem size for the 3x3 grid world (states are numbered row-major: 3*row + col)
num_states = 9
num_actions = 4  # up, down, left, right

# Tabular action values Q(s, a) and visit counters N(s, a), all starting at zero
Q = [dict.fromkeys(('up', 'down', 'right', 'left'), 0.0) for _ in range(num_states)]
N = [dict.fromkeys(('up', 'down', 'right', 'left'), 0) for _ in range(num_states)]
# Learned model: Model[s][a] holds a pair of values per (state, action);
# (-2000, -2000) is the placeholder for "nothing recorded yet".
Model = [
    dict.fromkeys(('up', 'down', 'right', 'left'), (-2000, -2000))
    for _ in range(num_states)
]

# Behaviour policy: one probability per action for every state.  Moves that
# would leave the grid have probability 0; the remaining moves are (almost)
# equally likely.
target_policy = {
    0: dict(up=0, down=0.5, left=0, right=0.5),           # (0,0)
    1: dict(up=0, down=0.33, left=0.33, right=0.34),      # (0,1)
    2: dict(up=0, down=0.5, left=0.5, right=0),           # (0,2) reward +1 cell
    3: dict(up=0.34, down=0.33, left=0, right=0.33),      # (1,0)
    4: dict(up=0.25, down=0.25, left=0.25, right=0.25),   # (1,1) reward -1 cell
    5: dict(up=0.33, down=0.34, left=0.33, right=0),      # (1,2)
    6: dict(up=0.5, down=0, left=0, right=0.5),           # (2,0)
    7: dict(up=0.33, down=0, left=0.33, right=0.34),      # (2,1)
    8: dict(up=0.5, down=0, left=0.5, right=0),           # (2,2)
}
pi = target_policy

print("Q(s,a):", Q)
print("pi(s):", pi)
print(N)




Q(s,a): [{'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}, {'up': 0.0, 'down': 0.0, 'right': 0.0, 'left': 0.0}]
pi(s): {0: {'up': 0, 'down': 0.5, 'left': 0, 'right': 0.5}, 1: {'up': 0, 'down': 0.33, 'left': 0.33, 'right': 0.34}, 2: {'up': 0, 'down': 0.5, 'left': 0.5, 'right': 0}, 3: {'up': 0.34, 'down': 0.33, 'left': 0, 'right': 0.33}, 4: {'up': 0.25, 'down': 0.25, 'left': 0.25, 'right': 0.25}, 5: {'up': 0.33, 'down': 0.34, 'left': 0.33, 'right': 0}, 6: {'up': 0.5, 'down': 0, 'left': 0, 'right': 0.5}, 7: {'up': 0.33, 'down': 0, 'left': 0.33, 'right': 0.34}, 8: {'up': 0.5, 'down': 0, 'left': 0.5, 'right': 0}}
[{'up': 0,

In [None]:
from queue import PriorityQueue

# Queue of pending (priority, (state, action)) entries for the sweeping loop.
# Note: PriorityQueue.get() returns the *smallest* entry first.
pqchange = PriorityQueue()

In [None]:
episodes = 1e6
n = 5             # maximum number of queued updates processed per real step
gamma = 0.9
threshold = 0.05  # minimum |TD error| for a pair to be queued
# Prioritized sweeping (predecessor sweep still missing, see TODO below)
for _ in range(int(episodes)):
    # ---- DIRECT REINFORCEMENT LEARNING ----
    # Pick a random state, draw an action from the behaviour policy, take the
    # step in the real grid and store the outcome (next state, reward) in the
    # model.  Q itself is only updated from the queue below.
    state = random.randint(0, 8)
    actions, probabilities = zip(*pi[state].items())
    action = random.choices(actions, probabilities)[0]

    next_state = grid.get_next_state((state // 3, state % 3), action)
    reward = grid.get_reward(next_state)
    nstate = next_state[0] * 3 + next_state[1]

    Model[state][action] = (nstate, reward)

    # Priority = magnitude of the one-step TD error of this pair
    value = max(Q[nstate][a] for a in grid.get_possible_actions(next_state))
    P = abs(reward + gamma * value - Q[state][action])
    if P > threshold:
        # PriorityQueue pops the smallest entry first, so store -P to make
        # the pair with the largest error come out first.
        pqchange.put((-P, (state, action)))

    # ---- INDIRECT REINFORCEMENT LEARNING (planning) ----
    # Process up to n queued pairs, most urgent first, replaying each one
    # through the model.
    for i in range(n):
        if pqchange.empty():
            break
        _, (iterate_state, iterate_action) = pqchange.get()

        iterate_next_state, iterate_reward = Model[iterate_state][iterate_action]

        # Every index below refers to the popped pair and its successor;
        # the original mixed in `state`/`action` from the real step above.
        N[iterate_state][iterate_action] += 1
        successor_cell = (iterate_next_state // 3, iterate_next_state % 3)
        value = max(
            Q[iterate_next_state][a] for a in grid.get_possible_actions(successor_cell)
        )
        Q[iterate_state][iterate_action] += (1 / N[iterate_state][iterate_action]) * (
            iterate_reward + gamma * value - Q[iterate_state][iterate_action]
        )

        # TODO: predecessor sweep - for every (s, a) the model predicts leads
        # to iterate_state, recompute its priority and queue it when it
        # exceeds the threshold.

      ##Another step to map from next_state to state
      #samething.


##Chapter 9 : Gradient descent stuff

- gradient monte carlo algorithm


In [None]:
# prompt: create a 3x3 grid on which I can perform value iteration

import numpy as np

class GridWorld:
    """Square grid whose cells store the reward for entering them.

    States are ``(row, col)`` tuples. Row 0 is the top row, so "up" lowers the
    row index and "left" lowers the column index.
    """

    # (name, (row delta, column delta)) for every recognised move
    _MOVES = (
        ("up", (-1, 0)),
        ("down", (1, 0)),
        ("left", (0, -1)),
        ("right", (0, 1)),
    )

    def __init__(self, size=3):
        self.size = size
        rewards = np.zeros((size, size))
        rewards[0, 2] = 1    # goal cell
        rewards[1, 1] = -1   # penalty cell
        rewards[2, 2] = -1   # second penalty cell
        self.grid = rewards

    def get_possible_actions(self, state):
        """Return the moves that stay on the grid, in up/down/left/right order."""
        row, col = state
        last = self.size - 1
        in_bounds = (
            ("up", row > 0),
            ("down", row < last),
            ("left", col > 0),
            ("right", col < last),
        )
        return [name for name, allowed in in_bounds if allowed]

    def get_next_state(self, state, action):
        """Return the cell reached from ``state`` by ``action``.

        No bounds check is done; an unrecognised action leaves the position
        unchanged.
        """
        row, col = state
        for name, (d_row, d_col) in self._MOVES:
            if action == name:
                return row + d_row, col + d_col
        return row, col

    def get_reward(self, state):
        """Return the reward stored at the ``(row, col)`` cell ``state``."""
        return self.grid[state]


# Smoke test: reward layout, moves available in the top-left corner,
# one transition, and the reward of the goal cell.
grid = GridWorld()
print(grid.grid)
print(grid.get_possible_actions((0, 0)))
print(grid.get_next_state((0, 0), "right"))
print(grid.get_reward((0, 2)))


[[ 0.  0.  1.]
 [ 0. -1.  0.]
 [ 0.  0. -1.]]
['down', 'right']
(0, 1)
1.0


In [None]:
import random
import numpy as np

# Problem size for the 3x3 grid world (states are numbered row-major: 3*row + col)
num_states = 9
num_actions = 4  # up, down, left, right

# Weight vector of the linear value approximation, starting at zero
W = np.zeros(3)

# Policy to evaluate: one probability per action for every state.  For all
# states except 2, moves that would leave the grid have probability 0.
target_policy = {
    0: dict(up=0, down=0.5, left=0, right=0.5),           # (0,0)
    1: dict(up=0, down=0.33, left=0.33, right=0.34),      # (0,1)
    2: dict(up=0, down=0, left=0, right=1.00),            # (0,2) goal cell
    3: dict(up=0.34, down=0.33, left=0, right=0.33),      # (1,0)
    4: dict(up=0.25, down=0.25, left=0.25, right=0.25),   # (1,1) reward -1 cell
    5: dict(up=0.33, down=0.34, left=0.33, right=0),      # (1,2)
    6: dict(up=0.5, down=0, left=0, right=0.5),           # (2,0)
    7: dict(up=0.33, down=0, left=0.33, right=0.34),      # (2,1)
    8: dict(up=0.5, down=0, left=0.5, right=0),           # (2,2) reward -1 cell
}
pi = target_policy


W

array([0., 0., 0.])

In [None]:

gamma = 0.9
i = 0
weightsum = 0  # not used by this cell
goal = 2       # index of the terminal state, cell (0, 2)
alpha = 0.05

while i < 100000:
    # Start every episode in a uniformly random non-goal state
    state = goal
    while state == goal:
        state = random.randint(0, 8)
    episodes = []    # states visited, in order
    actionlist = []  # action taken in each of those states
    goallist = []    # reward received after each of those actions
    i += 1

    # Roll out the policy until the goal is reached
    while state != goal:
        x = state // 3
        y = state % 3

        # One draw per step (the original drew a second, discarded sample)
        actions, probabilities = zip(*pi[state].items())
        action = random.choices(actions, probabilities)[0]

        newx, newy = grid.get_next_state((x, y), action)
        reward = grid.get_reward((newx, newy))
        newstate = newx * 3 + newy

        # append() instead of rebuilding the lists on every step
        episodes.append(state)
        actionlist.append(action)
        goallist.append(reward)
        state = newstate

    # Walk the episode backwards, accumulating the discounted return G and
    # moving the weights towards it at every visited state (features: row,
    # column and a constant bias term).
    G = 0
    for t in reversed(range(len(episodes))):
        state = episodes[t]
        reward = goallist[t]

        G = gamma * G + reward
        x = state // 3
        y = state % 3
        features = np.array([x, y, 1])
        V = W.dot(features)
        W += alpha * (G - V) * features


In [None]:
# Evaluate the fitted linear value function at every cell
# (features: row, column, constant 1).
V = [W.dot(np.array([*divmod(state, 3), 1])) for state in range(9)]

V


[np.float64(-0.5439965777805723),
 np.float64(-0.4855123284781801),
 np.float64(-0.4270280791757879),
 np.float64(-0.6862892970697911),
 np.float64(-0.627805047767399),
 np.float64(-0.5693207984650067),
 np.float64(-0.8285820163590101),
 np.float64(-0.7700977670566178),
 np.float64(-0.7116135177542257)]

In [None]:
# Greedy one-step lookahead on V: score every in-grid move by
# reward(next cell) + gamma * V(next cell) and keep the best one
# (the first one on ties).
pi_mc = []
for s in range(num_states):
    cell = divmod(s, 3)
    scored = []
    for a in grid.get_possible_actions(cell):
        row, col = grid.get_next_state(cell, a)
        scored.append((grid.get_reward((row, col)) + gamma * V[row * 3 + col], a))
    pi_mc.append(max(scored, key=lambda pair: pair[0])[1] if scored else None)

print("Optimal policy (Monte Carlo):", pi_mc)
print("Weight:",W)

Optimal policy (Monte Carlo): ['right', 'right', 'left', 'up', 'up', 'up', 'up', 'left', 'up']
Weight: [-0.14229272  0.05848425 -0.54399658]


TD(0) approach using semi gradient

Same approach, but instead of the full Monte Carlo return $G$ we use the TD(0) target $R + \gamma \hat{v}(S', \mathbf{w})$, computed with the same approximating function (our "neural network" function).

In [None]:
import random
import numpy as np

# Problem size for the 3x3 grid world (states are numbered row-major: 3*row + col)
num_states = 9
num_actions = 4  # up, down, left, right

# Weight vector of the linear value approximation, starting at zero
W = np.zeros(3)

# Policy to evaluate: one probability per action for every state.  For all
# states except 2, moves that would leave the grid have probability 0.
target_policy = {
    0: dict(up=0, down=0.5, left=0, right=0.5),           # (0,0)
    1: dict(up=0, down=0.33, left=0.33, right=0.34),      # (0,1)
    2: dict(up=0, down=0, left=0, right=1.00),            # (0,2) goal cell
    3: dict(up=0.34, down=0.33, left=0, right=0.33),      # (1,0)
    4: dict(up=0.25, down=0.25, left=0.25, right=0.25),   # (1,1) reward -1 cell
    5: dict(up=0.33, down=0.34, left=0.33, right=0),      # (1,2)
    6: dict(up=0.5, down=0, left=0, right=0.5),           # (2,0)
    7: dict(up=0.33, down=0, left=0.33, right=0.34),      # (2,1)
    8: dict(up=0.5, down=0, left=0.5, right=0),           # (2,2) reward -1 cell
}
pi = target_policy


W

array([0., 0., 0.])

In [None]:

gamma = 0.9
i = 0
weightsum = 0  # not used by this cell
goal = 2       # index of the terminal state, cell (0, 2)
alpha = 0.05

while i < 100000:
    # Start every episode in a uniformly random non-goal state
    state = goal
    while state == goal:
        state = random.randint(0, 8)
    i += 1
    while state != goal:
        x = state // 3
        y = state % 3

        # One draw per step (the original drew a second, discarded sample)
        actions, probabilities = zip(*pi[state].items())
        action = random.choices(actions, probabilities)[0]

        newx, newy = grid.get_next_state((x, y), action)
        reward = grid.get_reward((newx, newy))
        newstate = newx * 3 + newy

        # The value of the terminal state is 0 by definition, so do not
        # bootstrap from the approximation once the goal is reached.
        if newstate == goal:
            V_next = 0.0
        else:
            V_next = W.dot(np.array([newx, newy, 1]))

        # Semi-gradient TD(0) step on the linear features (row, column, bias)
        features = np.array([x, y, 1])
        V = W.dot(features)
        W += alpha * (reward + gamma * V_next - V) * features

        state = newstate


In [None]:
# Evaluate the fitted linear value function at every cell
# (features: row, column, constant 1).
V = [W.dot(np.array([*divmod(state, 3), 1])) for state in range(9)]

print("Value Function (TD(0)):", V)

# Greedy one-step lookahead on V: score every in-grid move by
# reward(next cell) + gamma * V(next cell) and keep the best one
# (the first one on ties).
pi_mc = []
for s in range(num_states):
    cell = divmod(s, 3)
    scored = []
    for a in grid.get_possible_actions(cell):
        row, col = grid.get_next_state(cell, a)
        scored.append((grid.get_reward((row, col)) + gamma * V[row * 3 + col], a))
    pi_mc.append(max(scored, key=lambda pair: pair[0])[1] if scored else None)

print("Optimal policy (Monte Carlo):", pi_mc)
print("Weight:",W)

Value Function (TD(0)): [np.float64(-1.0724578835080645), np.float64(-0.9533612908049747), np.float64(-0.834264698101885), np.float64(-1.2770244004521643), np.float64(-1.1579278077490744), np.float64(-1.0388312150459846), np.float64(-1.481590917396264), np.float64(-1.3624943246931742), np.float64(-1.2433977319900844)]
Optimal policy (Monte Carlo): ['right', 'right', 'left', 'up', 'up', 'up', 'up', 'left', 'up']
Weight: [-0.20456652  0.11909659 -1.07245788]


You can extend the above algorithm to n steps as well.


##Chapter 10: On policy control (Using Q values) for Approximation



In [None]:
import random
import numpy as np

# Problem size for the 3x3 grid world (states are numbered row-major: 3*row + col)
num_states = 9
num_actions = 4  # up, down, left, right

# Weight vector of the linear action-value approximation, starting at zero
W = np.zeros(4)

# Numeric encoding of an action as two features: the first is non-zero for
# the vertical moves, the second for the horizontal moves.
actiondict1 = dict(up=1, down=-1, left=0, right=0)
actiondict2 = dict(up=0, down=0, left=-1, right=1)


W

array([0., 0., 0., 0.])

In [None]:
def best_action(state):
  """Pick an epsilon-greedy action for ``state`` (row-major index 0..8).

  With probability ``epsilon`` a uniformly random in-grid action is returned;
  otherwise the in-grid action with the largest approximate value
  ``W . [row, col, actiondict1[a], actiondict2[a]]`` is returned (the first
  one on ties).  Reads the globals ``epsilon``, ``grid``, ``W``,
  ``actiondict1`` and ``actiondict2``.
  """
  x = state//3
  y = state%3
  candidates = grid.get_possible_actions((x,y))
  if random.random() < epsilon:
    return random.choice(candidates)

  action = None
  buff_value = None
  for a in candidates:
    # Score the action in *this* state; the original used the leaked globals
    # newx/newy from whichever cell ran last instead of (x, y).
    q_value = W.dot(np.array([x,y,actiondict1[a],actiondict2[a]]))
    if action is None or q_value > buff_value:
      action = a
      buff_value = q_value
  return action

In [None]:

gamma = 0.9
i = 0
weightsum = 0  # not used by this cell
goal = 2       # index of the terminal state, cell (0, 2)
alpha = 0.01
epsilon = 0.2  # exploration rate read by best_action()

while i < 100000:
    # Start every episode in a uniformly random non-goal state
    state = goal
    while state == goal:
        state = random.randint(0, 8)
    i += 1

    action = best_action(state)

    while state != goal:
        x = state // 3
        y = state % 3

        features = np.array([x, y, actiondict1[action], actiondict2[action]])
        currentQ = W.dot(features)

        newx, newy = grid.get_next_state((x, y), action)
        reward = grid.get_reward((newx, newy))
        newstate = newx * 3 + newy

        # Terminal transition: the target is the reward alone.  The original
        # tested `state == goal` inside `while state != goal`, so this update
        # could never run and the goal step bootstrapped instead.
        if newstate == goal:
            W += alpha * (reward - currentQ) * features
            break

        # Ordinary SARSA step: bootstrap from the action actually chosen next
        newaction = best_action(newstate)
        nextQ = W.dot(np.array([newx, newy, actiondict1[newaction], actiondict2[newaction]]))
        W += alpha * (reward + gamma * nextQ - currentQ) * features

        state = newstate
        action = newaction

In [None]:
# Tabulate the approximate action values: entries start at 0.0 and only the
# in-grid actions of each state are overwritten with W . features.
Q = [dict.fromkeys(('up', 'down', 'right', 'left'), 0.0) for _ in range(num_states)]

for state in range(num_states):
  cell = divmod(state, 3)
  for action in grid.get_possible_actions(cell):
    features = np.array([*cell, actiondict1[action], actiondict2[action]])
    Q[state][action] = W.dot(features)

Maybe the features should be modified for a better result, but this is what I could come up with. Playing around with the rewards, for example by boosting the negative reward, can produce the desired results.

In [None]:
# Greedy policy from the tabulated Q values: per state, the in-grid action
# with the largest value (the first one on ties).
# The loop variables are deliberately NOT called `best_action`: that name
# belongs to the epsilon-greedy helper function, and rebinding it here would
# break a re-run of the SARSA training cell.
pi_mc = []
for s in range(num_states):
    greedy_action = None
    greedy_value = -float('inf')
    for a in grid.get_possible_actions((s // 3, s % 3)):
        value = Q[s][a]
        if value > greedy_value:
            greedy_value = value
            greedy_action = a
    pi_mc.append(greedy_action)

print("Optimal policy (Monte Carlo):", pi_mc)
print(Q)

Optimal policy (Monte Carlo): ['right', 'right', 'down', 'right', 'right', 'up', 'right', 'right', 'up']
[{'up': 0.0, 'down': np.float64(-0.6535574725522957), 'right': np.float64(0.9472149871672468), 'left': 0.0}, {'up': 0.0, 'down': np.float64(0.2213809055655951), 'right': np.float64(1.8221533652851376), 'left': np.float64(-0.07227660904935607)}, {'up': 0.0, 'down': np.float64(1.0963192836834859), 'right': 0.0, 'left': np.float64(0.8026617690685347)}, {'up': np.float64(0.2367900811781281), 'down': np.float64(-1.0703248639264633), 'right': np.float64(0.5304475957930792), 'left': 0.0}, {'up': np.float64(1.1117284592960188), 'down': np.float64(-0.19538648580857249), 'right': np.float64(1.40538597391097), 'left': np.float64(-0.48904400042352364)}, {'up': np.float64(1.9866668374139096), 'down': np.float64(0.6795518923093182), 'right': 0.0, 'left': np.float64(0.3858943776943671)}, {'up': np.float64(-0.17997731019603946), 'down': 0.0, 'right': np.float64(0.11368020441891169), 'left': 0.0}, {

The above algorithm can be extended toward n-step SARSA!!

There is also a differential (average-reward) version of Sarsa for continuing tasks, which replaces gamma discounting with the mean reward $\bar{R}$:

$$\delta = \sum_{i=t+1}^{t+n} \left(R_i - \bar{R}\right) + \hat{q}(S_{t+n}, A_{t+n}) - \hat{q}(S_t, A_t)$$
$$\bar{R} \leftarrow \bar{R} + \beta\,\delta$$
where $\beta > 0$ is some constant step size.

##TD(lambda)

In [None]:
# prompt: create a 3x3 grid on which I can perform value iteration

import numpy as np

class GridWorld:
    """Square grid whose cells store the reward for entering them.

    States are ``(row, col)`` tuples. Row 0 is the top row, so "up" lowers the
    row index and "left" lowers the column index.
    """

    # (name, (row delta, column delta)) for every recognised move
    _MOVES = (
        ("up", (-1, 0)),
        ("down", (1, 0)),
        ("left", (0, -1)),
        ("right", (0, 1)),
    )

    def __init__(self, size=3):
        self.size = size
        rewards = np.zeros((size, size))
        rewards[0, 2] = 1    # goal cell
        rewards[1, 1] = -1   # penalty cell
        rewards[2, 2] = -1   # second penalty cell
        self.grid = rewards

    def get_possible_actions(self, state):
        """Return the moves that stay on the grid, in up/down/left/right order."""
        row, col = state
        last = self.size - 1
        in_bounds = (
            ("up", row > 0),
            ("down", row < last),
            ("left", col > 0),
            ("right", col < last),
        )
        return [name for name, allowed in in_bounds if allowed]

    def get_next_state(self, state, action):
        """Return the cell reached from ``state`` by ``action``.

        No bounds check is done; an unrecognised action leaves the position
        unchanged.
        """
        row, col = state
        for name, (d_row, d_col) in self._MOVES:
            if action == name:
                return row + d_row, col + d_col
        return row, col

    def get_reward(self, state):
        """Return the reward stored at the ``(row, col)`` cell ``state``."""
        return self.grid[state]


# Smoke test: reward layout, moves available in the top-left corner,
# one transition, and the reward of the goal cell.
grid = GridWorld()
print(grid.grid)
print(grid.get_possible_actions((0, 0)))
print(grid.get_next_state((0, 0), "right"))
print(grid.get_reward((0, 2)))


[[ 0.  0.  1.]
 [ 0. -1.  0.]
 [ 0.  0. -1.]]
['down', 'right']
(0, 1)
1.0


In [None]:
import random
import numpy as np

# Problem size for the 3x3 grid world (states are numbered row-major: 3*row + col)
num_states = 9
num_actions = 4  # up, down, left, right

# Weight vector of the linear value approximation, starting at zero
W = np.zeros(2)

# Policy to evaluate: one probability per action for every state.  For all
# states except 2, moves that would leave the grid have probability 0.
target_policy = {
    0: dict(up=0, down=0.5, left=0, right=0.5),           # (0,0)
    1: dict(up=0, down=0.33, left=0.33, right=0.34),      # (0,1)
    2: dict(up=0, down=0, left=0, right=1.00),            # (0,2) goal cell
    3: dict(up=0.34, down=0.33, left=0, right=0.33),      # (1,0)
    4: dict(up=0.25, down=0.25, left=0.25, right=0.25),   # (1,1) reward -1 cell
    5: dict(up=0.33, down=0.34, left=0.33, right=0),      # (1,2)
    6: dict(up=0.5, down=0, left=0, right=0.5),           # (2,0)
    7: dict(up=0.33, down=0, left=0.33, right=0.34),      # (2,1)
    8: dict(up=0.5, down=0, left=0.5, right=0),           # (2,2) reward -1 cell
}
pi = target_policy


W

array([0., 0.])

In [None]:

gamma = 0.9
lamb_da = 0.8  # trace-decay parameter lambda
i = 0
weightsum = 0  # not used by this cell
goal = 2       # index of the terminal state, cell (0, 2)
alpha = 0.05

while i < 100000:
    # Start every episode in a uniformly random non-goal state
    state = goal
    while state == goal:
        state = random.randint(0, 8)
    i += 1
    Z = np.zeros(2)  # eligibility trace, reset at the start of each episode

    while state != goal:
        x = state // 3
        y = state % 3

        # One draw per step (the original drew a second, discarded sample)
        actions, probabilities = zip(*pi[state].items())
        action = random.choices(actions, probabilities)[0]

        newx, newy = grid.get_next_state((x, y), action)
        reward = grid.get_reward((newx, newy))
        newstate = newx * 3 + newy

        # The value of the terminal state is 0 by definition, so do not
        # bootstrap from the approximation once the goal is reached.
        if newstate == goal:
            V_next = 0.0
        else:
            V_next = W.dot(np.array([newx, newy]))

        # Accumulating trace over the linear features (row, column)
        features = np.array([x, y])
        Z = gamma * lamb_da * Z + features

        V = W.dot(features)
        W += alpha * (reward + gamma * V_next - V) * Z

        state = newstate


In [None]:
W

array([-0.35977965,  0.19676802])

In [None]:
# Evaluate the fitted linear value function at every cell (features: row, column).
# Note: the printed labels below still say "TD(0)" / "Monte Carlo".
V = []
for state in range(9):
  x = state//3
  y = state%3
  V.append(W.dot(np.array([x,y])))

print("Value Function (TD(0)):", V)

# Greedy one-step lookahead on V: score every in-grid move by
# reward(next cell) + gamma * V(next cell) and keep the best one.
# The loop variables are deliberately NOT called `best_action`: that name
# belongs to the epsilon-greedy helper function, and rebinding it here would
# break a re-run of the SARSA training cell.
pi_mc = []
for s in range(num_states):
    greedy_action = None
    greedy_value = -float('inf')
    for a in grid.get_possible_actions((s // 3, s % 3)):
        s_prime = grid.get_next_state((s // 3, s % 3), a)
        value = grid.get_reward(s_prime) + gamma * V[s_prime[0] * 3 + s_prime[1]]
        if value > greedy_value:
            greedy_value = value
            greedy_action = a
    pi_mc.append(greedy_action)

print("Optimal policy (Monte Carlo):", pi_mc)
print("Weight:",W)

Value Function (TD(0)): [np.float64(0.0), np.float64(0.19676801761779486), np.float64(0.3935360352355897), np.float64(-0.3597796527713165), np.float64(-0.16301163515352166), np.float64(0.033756382464273205), np.float64(-0.719559305542633), np.float64(-0.5227912879248382), np.float64(-0.3260232703070433)]
Optimal policy (Monte Carlo): ['right', 'right', 'left', 'up', 'up', 'up', 'up', 'left', 'up']
Weight: [-0.35977965  0.19676802]


#Online TD(lambda)

In [None]:
import random
import numpy as np

# Problem size for the 3x3 grid world (states are numbered row-major: 3*row + col)
num_states = 9
num_actions = 4  # up, down, left, right

# Weight vector of the linear value approximation, starting at zero
W = np.zeros(2)

# Policy to evaluate: one probability per action for every state.  For all
# states except 2, moves that would leave the grid have probability 0.
target_policy = {
    0: dict(up=0, down=0.5, left=0, right=0.5),           # (0,0)
    1: dict(up=0, down=0.33, left=0.33, right=0.34),      # (0,1)
    2: dict(up=0, down=0, left=0, right=1.00),            # (0,2) goal cell
    3: dict(up=0.34, down=0.33, left=0, right=0.33),      # (1,0)
    4: dict(up=0.25, down=0.25, left=0.25, right=0.25),   # (1,1) reward -1 cell
    5: dict(up=0.33, down=0.34, left=0.33, right=0),      # (1,2)
    6: dict(up=0.5, down=0, left=0, right=0.5),           # (2,0)
    7: dict(up=0.33, down=0, left=0.33, right=0.34),      # (2,1)
    8: dict(up=0.5, down=0, left=0.5, right=0),           # (2,2) reward -1 cell
}
pi = target_policy


W

array([0., 0.])

In [None]:

# True online TD(lambda) policy evaluation with a linear value function
# V(s) = W . [row, col], following the behaviour policy `pi`.
gamma = 0.9      # discount factor
lamb_da = 0.8    # trace-decay rate (lambda)
i=0              # number of episodes started so far
weightsum=0      # not read anywhere in this cell
goal =2          # flat index of the terminal cell (0, 2)
alpha = 0.05     # step size

while i<100000:
  # Start every episode from a uniformly random non-goal cell.
  state=goal
  while state==goal:
    state = random.randint(0,8)
  i+=1
  Z = np.zeros((2))   # dutch-style eligibility trace, reset each episode
  V_old = 0

  while state!=goal:
    x = state//3
    y = state%3
    features = np.array([x,y])

    # Sample exactly one action from the behaviour policy.
    actions, probabilities = zip(*pi[state].items())
    action = (random.choices(actions, probabilities)[0])

    newx,newy = grid.get_next_state((x,y),action)

    reward = grid.get_reward((newx,newy))

    newstate = newx*3+newy
    V = W.dot(features)
    # A terminal state is worth 0 by definition, so never bootstrap from the
    # linear estimate of the goal cell.
    V_next = 0.0 if newstate == goal else W.dot(np.array([newx,newy]))

    delta = reward + gamma*V_next - V

    Z = gamma*lamb_da*Z + (1 - alpha* gamma * lamb_da * Z.dot(features))*features

    W += alpha*(delta + V - V_old)*Z - alpha*(V-V_old)*features

    # V_old must hold the NEXT state's value computed with the pre-update
    # weights (V_old <- V'), not the current state's value.
    V_old = V_next

    state = newstate


In [None]:
# Tabulate the linear value estimate W . [row, col] for each of the nine cells
# (flat index = row * 3 + col).
V = []
for state in range(9):
  x, y = divmod(state, 3)
  V.append(W.dot(np.array([x,y])))

# NOTE(review): the labels printed below say "TD(0)" / "Monte Carlo" although W
# was produced by the TD(lambda) loop above - confirm before renaming them.
print("Value Function (TD(0)):", V)

# Greedy one-step lookahead: for every cell keep the first action that
# maximises reward(successor) + gamma * V(successor).
pi_mc = []
for s in range(num_states):
    cell = divmod(s, 3)
    best_action, best_value = None, -float('inf')
    for a in grid.get_possible_actions(cell):
        s_prime = grid.get_next_state(cell, a)
        value = grid.get_reward(s_prime) + gamma * V[s_prime[0] * 3 + s_prime[1]]
        if value > best_value:
            best_action, best_value = a, value
    pi_mc.append(best_action)

print("Optimal policy (Monte Carlo):", pi_mc)
print("Weight:",W)

Value Function (TD(0)): [np.float64(0.0), np.float64(-0.04182715310722768), np.float64(-0.08365430621445535), np.float64(-0.3445419060423065), np.float64(-0.3863690591495342), np.float64(-0.42819621225676185), np.float64(-0.689083812084613), np.float64(-0.7309109651918406), np.float64(-0.7727381182990684)]
Optimal policy (Monte Carlo): ['right', 'right', 'left', 'up', 'up', 'up', 'up', 'left', 'up']
Weight: [-0.34454191 -0.04182715]


#Sarsa(lambda)

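Same idea as TD(lambda), but applied to SARSA: we learn action values, so the weight vector W gets a third dimension, the extra one being for the action.

The eligibility trace follows the same pattern as in the other lambda methods: at every step the old trace is decayed, $z \leftarrow \gamma \lambda z$, before the gradient of the current state-action value estimate is added to it.

There is also a true online SARSA(lambda) approach.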

#Policy Gradient

This doesn't work well yet and has to be modified for a better result: the learned policy fails at state 7, where it does not give a good action.


The FEATURES are the concern: the (row, col) coordinates alone don't give the network much information to learn from. We could run a conv net over the grid to get more features, but I'm not going to. The problem doesn't require a neural network, so cool!!

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

In [None]:
class PolicyNetwork(nn.Module):
    """Two-layer MLP that maps a state feature vector to action probabilities.

    Args:
        input_dim: size of the state feature vector.
        output_dim: number of actions (size of the returned distribution).
        hidden_dim: width of the single ReLU hidden layer.
    """

    def __init__(self, input_dim=2, output_dim=4, hidden_dim=16):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, state):
        """Return softmax probabilities over the last dimension for `state`."""
        hidden = torch.relu(self.fc1(state))
        logits = self.fc2(hidden)
        return self.softmax(logits)

In [None]:
import numpy as np

class GridWorld:
    """Square grid world whose cells store the reward received on entering them.

    The layout puts +1 at (0, 2) and -1 at both (1, 1) and (2, 2); every other
    cell is 0.  Moves that would leave the grid keep the agent where it is.
    """

    def __init__(self, size=3):
        self.size = size
        self.grid = np.zeros((size, size))
        self.grid[0, 2] = 1   # goal cell
        self.grid[1, 1] = -1  # penalty cell
        self.grid[2, 2] = -1  # second penalty cell

    def get_possible_actions(self, state):
        """List the moves that stay on the grid, in up/down/left/right order."""
        row, col = state
        last = self.size - 1
        candidates = (
            ("up", row > 0),
            ("down", row < last),
            ("left", col > 0),
            ("right", col < last),
        )
        return [name for name, allowed in candidates if allowed]

    def get_next_state(self, state, action):
        """Return the cell reached by `action` from `state`.

        An unrecognised action leaves the position unchanged; a move that would
        step off the grid returns the `state` argument itself.
        """
        d_row, d_col = {
            "up": (-1, 0),
            "down": (1, 0),
            "left": (0, -1),
            "right": (0, 1),
        }.get(action, (0, 0))
        row, col = state
        row, col = row + d_row, col + d_col

        if 0 <= row < self.size and 0 <= col < self.size:
            return row, col
        return state

    def get_reward(self, state):
        """Look up the reward stored at `state`, used directly as an index into the grid."""
        return self.grid[state]

# Smoke-test the environment: reward layout, legal moves from the top-left
# corner, one transition to the right, and the reward stored at (0, 2).
grid = GridWorld()
origin = (0, 0)
for shown in (
    grid.grid,
    grid.get_possible_actions(origin),
    grid.get_next_state(origin, "right"),
    grid.get_reward((0, 2)),
):
    print(shown)

[[ 0.  0.  1.]
 [ 0. -1.  0.]
 [ 0.  0. -1.]]
['down', 'right']
(0, 1)
1.0


In [None]:
# Index -> action name; the training loop uses the sampled output index of the
# policy network to look up the move here.
action_map = ["up", "down", "left", "right"]

# Fresh policy network and the Adam optimiser that trains it.
policy_net = PolicyNetwork()
optimizer = optim.Adam(params=policy_net.parameters(), lr=1e-2)

In [None]:

# REINFORCE (Monte Carlo policy gradient): roll out one full episode with the
# current policy, then minimise  -sum_t log pi(a_t | s_t) * G_t,  where G_t is
# the discounted return from step t.
gamma = 0.9      # discount factor
i=0              # number of episodes started so far
weightsum=0      # not read anywhere in this cell
goal =2          # flat index of the terminal cell (0, 2)
alpha = 0.05     # not read in this cell; the step size comes from `optimizer`

while i<15000:
  # Start every episode from a uniformly random non-goal cell.
  state=goal
  while state==goal:
    state = random.randint(0,8)
  episodes = []      # visited states
  actionlist = []    # actions taken
  goallist = []      # rewards received
  logproblist = []   # log pi(a_t | s_t), kept attached to the autograd graph

  i+=1
  # NOTE(review): there is no cap on episode length; if the policy puts almost
  # all mass on a move that hits a wall the rollout can run for a very long time.
  while state!=goal:
    x = state//3
    y = state%3
    state_tensor = torch.tensor((x,y), dtype=torch.float32)
    action_probs = policy_net(state_tensor)
    action_dist = torch.distributions.Categorical(action_probs)
    action_index  = action_dist.sample()
    action = action_map[action_index.item()]
    log_prob = action_dist.log_prob(action_index)

    newx,newy = grid.get_next_state((x,y),action)

    reward = grid.get_reward((newx,newy))

    newstate = newx*3+newy
    # append() grows the trajectory in place instead of copying the whole
    # list on every step.
    episodes.append(state)
    actionlist.append(action)
    goallist.append(reward)
    logproblist.append(log_prob)
    state = newstate

  # Walk the trajectory backwards so each return is built from the next one:
  # G_t = r_t + gamma * G_{t+1}.
  G=0
  policy_loss = []
  for reward, log_prob in zip(reversed(goallist), reversed(logproblist)):
    G = gamma*G + reward
    policy_loss.append(-log_prob*G)
  optimizer.zero_grad()
  loss = torch.stack(policy_loss).sum()
  if i%1000==0:
    print(f"{i} episodes || Loss : {loss}")
  loss.backward()
  optimizer.step()


1000 episodes \|\| Loss : 2.3841860752327193e-07
2000 episodes \|\| Loss : 2.2649767572602286e-07
3000 episodes \|\| Loss : 4.732609113489161e-07
4000 episodes \|\| Loss : 2.2649767572602286e-07
5000 episodes \|\| Loss : 1.1920930376163597e-07
6000 episodes \|\| Loss : 2.0384790389016416e-07
7000 episodes \|\| Loss : 2.2649767572602286e-07
8000 episodes \|\| Loss : 3.2305720765180013e-07
9000 episodes \|\| Loss : 1.1920930376163597e-07
10000 episodes \|\| Loss : 2.2649767572602286e-07
11000 episodes \|\| Loss : 2.2649767572602286e-07
12000 episodes \|\| Loss : 1.1920930376163597e-07
13000 episodes \|\| Loss : 2.0384790389016416e-07
14000 episodes \|\| Loss : 2.0384790389016416e-07
15000 episodes \|\| Loss : 2.0384790389016416e-07


In [None]:
# For each of the nine cells, print its index, the most probable action and
# the full action distribution produced by the policy network.
for i in range(9):
  x, y = divmod(i, 3)
  state_tensor = torch.tensor((x,y), dtype=torch.float32)
  action_probs = policy_net(state_tensor)
  greedy_index = action_probs.detach().numpy().argmax()

  print(i)
  print('----------------------------------------------')
  print("Best Action :", action_map[greedy_index])
  print(action_probs)

0
----------------------------------------------
Best Action : right
tensor([7.1313e-08, 8.4415e-09, 9.1579e-09, 1.0000e+00],
       grad_fn=<SoftmaxBackward0>)
1
----------------------------------------------
Best Action : right
tensor([4.2570e-08, 4.5811e-10, 3.8859e-10, 1.0000e+00],
       grad_fn=<SoftmaxBackward0>)
2
----------------------------------------------
Best Action : right
tensor([4.7620e-08, 2.4785e-11, 1.5792e-11, 1.0000e+00],
       grad_fn=<SoftmaxBackward0>)
3
----------------------------------------------
Best Action : up
tensor([1.0000e+00, 1.5127e-08, 6.7268e-09, 8.9378e-08],
       grad_fn=<SoftmaxBackward0>)
4
----------------------------------------------
Best Action : up
tensor([1.0000e+00, 8.3336e-10, 2.5925e-10, 6.9470e-08],
       grad_fn=<SoftmaxBackward0>)
5
----------------------------------------------
Best Action : up
tensor([1.0000e+00, 4.5106e-11, 9.7784e-12, 6.0992e-08],
       grad_fn=<SoftmaxBackward0>)
6
-----------------------------------------

ACTOR-CRITIC

A fancy term for learning two sets of weights at once: one for value prediction (the critic) and one for policy prediction (the actor).

In [None]:
class PolicyNetwork(nn.Module):
    """Actor: a two-layer MLP producing a probability for each action.

    Args:
        input_dim: size of the state feature vector.
        output_dim: number of actions (size of the returned distribution).
        hidden_dim: width of the single ReLU hidden layer.
    """

    def __init__(self, input_dim=2, output_dim=4, hidden_dim=16):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, state):
        """Apply fc1 -> ReLU -> fc2 -> softmax (over the last dimension)."""
        return self.softmax(self.fc2(torch.relu(self.fc1(state))))

class ValueNetwork(nn.Module):
    """Critic: a single affine layer from state features to a value estimate.

    Args:
        input_dim: size of the state feature vector.
        output_dim: size of the output (1 for a scalar state value).
        hidden_dim: accepted but not used; no hidden layer is built.
    """

    def __init__(self, input_dim=2, output_dim=1, hidden_dim=9):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, state):
        """Return the linear value estimate for `state`."""
        return self.fc1(state)

In [None]:
# Index -> action name; the training loop uses the sampled output index of the
# policy network to look up the move here.
action_map = ["up", "down", "left", "right"]

# Actor first, then critic, each with its own Adam optimiser; the actor's
# learning rate is far smaller than the critic's.
policy_net = PolicyNetwork()
value_net =  ValueNetwork()
optimizer1, optimizer2 = (
    optim.Adam(params=policy_net.parameters(), lr=5e-5),  # actor
    optim.Adam(params=value_net.parameters(), lr=1e-2),   # critic
)

In [None]:

# One-step actor-critic: after every transition the critic is moved towards
# the TD target and the actor follows  I * delta * grad log pi(a | s),
# where delta is the TD error and I = gamma**t.
gamma = 0.9      # discount factor
i=0              # number of episodes started so far
weightsum=0      # not read anywhere in this cell
goal =2          # flat index of the terminal cell (0, 2)


while i<10000:
  # Start every episode from a uniformly random non-goal cell.
  state=goal
  while state==goal:
    state = random.randint(0,8)

  I = 1   # running discount gamma**t applied to the actor update
  i+=1

  if i%1000==0:
    print(i)
  while state!=goal:
    x = state//3
    y = state%3
    state_tensor = torch.tensor((x,y), dtype=torch.float32)
    action_probs = policy_net(state_tensor)
    action_dist = torch.distributions.Categorical(action_probs)
    action_index  = action_dist.sample()
    action = action_map[action_index.item()]
    log_prob = action_dist.log_prob(action_index)
    # With probability 0.3 the executed move is replaced by a uniformly random
    # one.  NOTE(review): log_prob still belongs to the sampled action, which
    # is consistent if the override is meant as environment noise - confirm.
    if random.random()>0.7:
      action = random.choice(action_map)

    newx,newy = grid.get_next_state((x,y),action)
    reward = grid.get_reward((newx,newy))

    newstate = newx*3+newy

    # Reuse the state tensor instead of building an identical one again.
    Vs = value_net(state_tensor)
    if newstate == goal:
      # The value of a terminal state is 0 by definition; do not bootstrap
      # from the critic's estimate of the goal cell.
      Vs_next = torch.zeros_like(Vs.detach())
    else:
      Vs_next = value_net(torch.tensor((newx,newy), dtype=torch.float32)).detach()
    delta = reward + gamma*Vs_next - Vs

    # Semi-gradient critic loss; the actor treats delta as a constant.
    value_loss= (delta**(2)).sum()
    policy_loss=(-log_prob*delta.detach()*I)

    optimizer1.zero_grad()
    policy_loss.backward()
    optimizer1.step()

    optimizer2.zero_grad()
    value_loss.backward()
    optimizer2.step()

    I*=gamma

    state = newstate

print(i)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
10000


In [None]:
# Show the episode counter left behind by the training loop.
print(i)

10000


In [None]:
# Report, cell by cell, the most probable action and the full action
# distribution produced by the trained policy network.
for i in range(9):
  x, y = divmod(i, 3)
  state_tensor = torch.tensor((x,y), dtype=torch.float32)
  action_probs = policy_net(state_tensor)
  probs_np = action_probs.detach().numpy()

  print(i)
  print('----------------------------------------------')
  print("Best Action :", action_map[probs_np.argmax()])
  print(action_probs)

0
----------------------------------------------
Best Action : right
tensor([0.1359, 0.0228, 0.0248, 0.8166], grad_fn=<SoftmaxBackward0>)
1
----------------------------------------------
Best Action : right
tensor([0.1436, 0.0045, 0.0048, 0.8471], grad_fn=<SoftmaxBackward0>)
2
----------------------------------------------
Best Action : right
tensor([0.2304, 0.0009, 0.0010, 0.7677], grad_fn=<SoftmaxBackward0>)
3
----------------------------------------------
Best Action : up
tensor([0.5902, 0.0115, 0.0119, 0.3864], grad_fn=<SoftmaxBackward0>)
4
----------------------------------------------
Best Action : up
tensor([0.7396, 0.0018, 0.0020, 0.2566], grad_fn=<SoftmaxBackward0>)
5
----------------------------------------------
Best Action : up
tensor([8.3611e-01, 2.5129e-04, 2.8964e-04, 1.6335e-01],
       grad_fn=<SoftmaxBackward0>)
6
----------------------------------------------
Best Action : up
tensor([0.9411, 0.0015, 0.0015, 0.0559], grad_fn=<SoftmaxBackward0>)
7
---------------------

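Replicate the eligibility-trace idea for actor-critic,

where `zw` is the trace that stores the pending modification of the value (critic) weights
and `z_theta` is the trace that stores the pending modification of the policy (actor) weights.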