<a href="https://colab.research.google.com/github/pooriaazami/deep_learning_class_notebooks/blob/main/23_Value_functions_and_policy_iteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

In [126]:
class Environment:
  """Grid world with two special cells, A and B, that teleport the agent.

  The agent position is held in ``self.y`` (row) and ``self.x`` (column).
  ``A`` and ``B`` are stored as ``(y, x)``; ``A_next`` and ``B_next`` are
  stored as ``(x, y)`` because ``calculate_next`` unpacks them in that order.
  """

  # (accepted spellings, dy, dx) per move; scanned in order, first match wins.
  _MOVE_DELTAS = (
    (('u', 'up', 0), -1, 0),
    (('d', 'down', 1), 1, 0),
    (('r', 'right', 2), 0, 1),
    (('l', 'left', 3), 0, -1),
  )

  def __init__(self):
    self.edge_size = 5
    self.A, self.A_next = (0, 1), (1, 4)
    self.B, self.B_next = (0, 3), (3, 2)
    self.x, self.y = 2, 2

  def calculate_next(self, y, x, move):
    """Return ``(new_y, new_x, reward)`` for taking ``move`` from ``(y, x)``.

    ``move`` may be a case-insensitive string ('u'/'up', 'd'/'down',
    'r'/'right', 'l'/'left') or the matching integer 0-3. Any other value
    leaves the position unchanged.

    Rewards: 10 for any move out of A, 5 for any move out of B, -1 when the
    move would leave the grid (the position is kept), 0 otherwise.
    """
    key = move.lower() if isinstance(move, str) else move

    dy = dx = 0
    for aliases, step_y, step_x in self._MOVE_DELTAS:
      if key in aliases:
        dy, dx = step_y, step_x
        break

    here = (y, x)
    if here == self.A:
      dest_x, dest_y = self.A_next
      return dest_y, dest_x, 10
    if here == self.B:
      dest_x, dest_y = self.B_next
      return dest_y, dest_x, 5

    cand_y, cand_x = y + dy, x + dx
    limit = self.edge_size
    if not (0 <= cand_x < limit and 0 <= cand_y < limit):
      # Bumped into a wall: stay in place and pay the penalty.
      return y, x, -1
    return cand_y, cand_x, 0

  def step(self, move):
    """Apply ``move`` to the agent's current position and return the reward."""
    self.y, self.x, reward = self.calculate_next(self.y, self.x, move)
    return reward

  def predict_reward(self, y, x, move):
    """Return ``(new_y, new_x, reward)`` for ``move`` from ``(y, x)`` without moving the agent."""
    return self.calculate_next(y, x, move)

  def reset(self):
    """Put the agent back on cell (y=2, x=2)."""
    self.x, self.y = 2, 2

  @property
  def moves(self):
    """Integer codes of the four moves (0=up, 1=down, 2=right, 3=left)."""
    return range(4)

  def __repr__(self):
    """Render the grid as text: '*' marks the agent, '_' every other cell."""
    lines = []
    for row in range(self.edge_size):
      cells = (
        '*' if row == self.y and col == self.x else '_'
        for col in range(self.edge_size)
      )
      lines.append(''.join(cells) + '\n')
    return ''.join(lines)

In [161]:
def argmax(values):
  """Return the indices of every occurrence of the largest element of ``values``.

  Ties are all reported, in ascending index order. An empty input gives ``[]``.
  """
  best = float('-inf')
  winners = []

  for index, candidate in enumerate(values):
    if candidate == best:
      # Another element tied with the current maximum.
      winners.append(index)
    elif candidate > best:
      # Strictly better: discard the previous ties.
      best, winners = candidate, [index]

  return winners

In [134]:
def calculate_value_function(env, policy, gamma=.9, iterations=50):
  """Estimate the state values of ``policy`` by repeated in-place sweeps.

  Args:
    env: object exposing ``moves`` (iterable of action indices) and
      ``predict_reward(y, x, move) -> (next_y, next_x, reward)``.
    policy: array indexed as ``policy[y, x, action]`` holding the probability
      of taking ``action`` in cell ``(y, x)``. Its first two dimensions
      define the grid size.
    gamma: discount factor applied to the value of the next state.
    iterations: number of full sweeps over the grid (default 50).

  Returns:
    2-D array of state values with the same grid shape as ``policy``.
  """
  # Take the grid size from the policy instead of assuming a 5x5 world.
  n_rows, n_cols = np.shape(policy)[:2]
  value_function = np.zeros((n_rows, n_cols))
  actions = list(env.moves)

  for _ in range(iterations):
    for i in range(n_rows):
      for j in range(n_cols):
        expected_return = 0
        for a in actions:
          next_y, next_x, reward = env.predict_reward(i, j, a)
          expected_return += policy[i, j, a] * (reward + gamma * value_function[next_y, next_x])
        # Written back immediately, so later cells in the same sweep
        # already see this updated value.
        value_function[i, j] = expected_return

  return value_function

In [171]:
def update_policy(value_function, environment=None, gamma=.9):
  """Return the policy that is greedy with respect to ``value_function``.

  For every cell each action is scored by its one-step return
  ``reward + gamma * value_function[next cell]``, and the probability mass is
  split evenly among the best-scoring actions. Scoring by the next cell's
  value alone would ignore the immediate reward of the move.

  Args:
    value_function: 2-D array of state values indexed as ``[y, x]``.
    environment: object exposing ``moves`` and
      ``predict_reward(y, x, move) -> (new_y, new_x, reward)``. When omitted,
      the module-level ``env`` is used so existing one-argument calls keep
      working.
    gamma: discount factor applied to the value of the next cell.

  Returns:
    Array of shape ``(rows, cols, number of moves)`` with action probabilities.
  """
  if environment is None:
    environment = env  # notebook-level global, resolved at call time

  n_rows, n_cols = np.shape(value_function)[:2]
  actions = list(environment.moves)
  new_policy = np.zeros((n_rows, n_cols, len(actions)))

  for i in range(n_rows):
    for j in range(n_cols):
      action_values = []
      for a in actions:
        new_y, new_x, reward = environment.predict_reward(i, j, a)
        action_values.append(reward + gamma * value_function[new_y, new_x])
      action_values = np.asarray(action_values)

      # Positions of all actions tied for the best score.
      best = np.flatnonzero(action_values == action_values.max())
      new_policy[i, j, best] = 1 / len(best)

  return new_policy

In [172]:
# Grid-world instance shared by the cells below (policy update and display read it).
env = Environment()

In [173]:
# Initial policy: each of the 4 moves has probability .25 in every cell of the 5x5 grid.
policy = np.full((5, 5, 4), .25)

In [174]:
# Policy iteration: 10 rounds of evaluating the current policy and then
# replacing it with the policy that update_policy derives from those values.
# After the loop, value_function is the evaluation made before the final update.
for _ in range(10):
  value_function = calculate_value_function(env, policy)
  policy = update_policy(value_function)

In [175]:
value_function # state values V(s), one per grid cell (not a per-action Q table)

array([[21.97748529, 24.4194281 , 21.97748529, 19.4194281 , 17.47748529],
       [19.77973676, 21.97748529, 19.77973676, 17.80176308, 16.02158677],
       [17.80176308, 19.77973676, 17.80176308, 16.02158677, 14.4194281 ],
       [16.02158677, 17.80176308, 16.02158677, 14.4194281 , 12.97748529],
       [14.4194281 , 16.02158677, 14.4194281 , 12.97748529, 11.67973676]])

In [181]:
# Letter shown for each move index (0=up, 1=down, 2=right, 3=left).
moves = ['U', 'D', 'R', 'L']

# One text row per grid row; each cell lists the moves with non-zero
# probability and leaves a blank where a move is never taken.
for i in range(5):
  row_cells = []
  for j in range(5):
    symbols = [moves[a] if policy[i, j, a] != 0 else ' ' for a in env.moves]
    row_cells.append(''.join(symbols))
  print('|' + '|'.join(row_cells) + '|')

|  R |UDRL|   L|UDRL|   L|
|  R |U   |U  L|   L|   L|
|  R |U   |U  L|U  L|U  L|
|  R |U   |U  L|U  L|U  L|
|  R |U   |U  L|U  L|U  L|
