Please go to the following link to run the code: https://colab.research.google.com/drive/1u7PHpb2CAS7DMYpy9sBmvOhdYCzVwZuC?usp=sharing

In [None]:
# -*- coding: utf-8 -*-
"""2D_Matrix.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1u7PHpb2CAS7DMYpy9sBmvOhdYCzVwZuC
"""

In [14]:
import numpy as np

In [15]:
# Board geometry: an 8 x 8 grid whose cells are addressed as (row, col).
BOARD_ROWS = BOARD_COLS = 8

# Cell used as the default starting position of a State.
START = (7, 0)

# Terminal cells: reaching any of them ends the current round.
WIN_STATE = (0, 7)  # reward +1
Zombie = (0, 6)     # reward -1
Wolf = (3, 5)       # reward -5

In [16]:
class State():
    """
    Position of the agent on the 2D grid, plus the rules of the grid.

    A State knows which cell the agent occupies, what reward that cell
    yields, whether the cell ends the round, and which cell a given move
    leads to.

    :param state: Cell (row, col) the agent occupies; defaults to START
    :type state: tuple, Optional
    """

    # Stepping onto a key cell moves the agent to the mapped cell instead.
    TELEPORTS = {(4, 2): (1, 7), (1, 2): (4, 2), (3, 4): (1, 7)}
    # Cells that cannot be entered: a move onto one leaves the agent in place.
    PULL_BACK = ((5, 2), (4, 7))

    def __init__(self, state = START):
        """
        Constructor Method
        """
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.state = state
        self.isEnd = False

    def giveReward(self):
        """
        Reward attached to the current cell.

        :return: 1 on WIN_STATE, -1 on Zombie, -5 on Wolf and 0 on any
                 other cell
        :rtype: int
        """
        if self.state == WIN_STATE:
            return 1
        elif self.state == Zombie:
            return -1
        elif self.state == Wolf:
            return -5
        else:
            return 0

    def isEndFunc(self):
        """
        Mark the round as finished when the current cell is one of
        WIN_STATE, Zombie or Wolf. The flag is only ever raised here, never
        cleared.

        :return: The (possibly updated) value of ``self.isEnd``
        :rtype: bool
        """
        if self.state in (WIN_STATE, Zombie, Wolf):
            self.isEnd = True
        return self.isEnd

    def nxtPosition(self, action):
        """
        Compute the cell reached by taking ``action`` from the current cell.
        ``self.state`` itself is not modified.

        Rules, in order:

        * a move that would leave the board keeps the agent where it is;
        * a move onto a key of ``TELEPORTS`` lands on the mapped cell;
        * a move onto a ``PULL_BACK`` cell is refused (a message is printed
          and the current cell is returned);
        * otherwise the adjacent cell is returned.

        :param action: One of "up", "down", "left", "right". Any other
                       value is handled like "right".
        :type action: String
        :return: The resulting cell as (row, col)
        :rtype: tuple
        """
        row, col = self.state[0], self.state[1]
        if action == "up":
            row -= 1
        elif action == "down":
            row += 1
        elif action == "left":
            col -= 1
        else:
            col += 1
        nxtState = (row, col)

        # Illegal move (off the board): stay put.
        on_board = (0 <= row <= BOARD_ROWS - 1) and (0 <= col <= BOARD_COLS - 1)
        if not on_board:
            return self.state

        if nxtState in self.TELEPORTS:
            return self.TELEPORTS[nxtState]

        if nxtState in self.PULL_BACK:
            print("I'm in pull back")
            return (self.state[0], self.state[1])

        return nxtState

In [17]:
class Agent():
    """
    Agent that plays rounds on the grid until each one ends (win or lose).

    The agent relies on the State class to find out where a move leads.
    After every finished round it updates its table of state values, and
    in later rounds it prefers moves that lead to higher-valued cells.
    """

    def __init__(self):
        """
        Constructor Method
        """
        self.states = []        # cells reached during the current round
        self.actions = ["up", "down", "left", "right"]
        self.State = State()
        self.lr = 0.2           # step size of the value update in play()
        self.exp_rate = 0.3     # probability of taking a random action

        # Every cell starts with a value of 0.
        self.state_values = {
            (i, j): 0 for i in range(BOARD_ROWS) for j in range(BOARD_COLS)
        }

    def chooseAction(self):
        """
        Pick the next action.

        With probability ``exp_rate`` (0.3 by default) a random action is
        drawn; otherwise the action whose resulting cell has the highest
        value in ``state_values`` is chosen. Ties go to the action listed
        last in ``self.actions``.

        :return: One of "up", "down", "left" or "right"
        :rtype: String
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            return np.random.choice(self.actions)

        # Greedy choice. Start from -inf so that a best action is still
        # found when every reachable cell has a negative value (starting
        # from 0 would leave the action empty in that case).
        best_action = ""
        best_value = float("-inf")
        for a in self.actions:
            value = self.state_values[self.State.nxtPosition(a)]
            if value >= best_value:
                best_action = a
                best_value = value
        return best_action

    def takeAction(self, action):
        """
        Build the State the agent is in after performing ``action``.
        The agent's own ``self.State`` is not replaced here; the caller
        does that.

        :param action: One of "up", "down", "left", "right"
        :type action: String
        :return: A new State positioned on the cell the action leads to
        :rtype: State
        """
        position = self.State.nxtPosition(action)
        return State(state=position)

    def reset(self):
        """
        Forget the trace of the finished round and put the agent back on a
        fresh default State. The learned ``state_values`` are kept.
        """
        self.states = []
        self.State = State()

    def play(self, rounds=10):
        """
        Play ``rounds`` rounds, updating ``state_values`` after each one.

        While a round is running, the agent chooses an action, moves, and
        records the cell it reached. When the round ends, the final cell's
        value is set to its reward and the trace is walked backwards: each
        visited cell's value is moved towards the (already updated) value
        of the cell visited after it, by a fraction ``lr``, and rounded to
        3 decimals.

        :param rounds: Total number of rounds to be played by the agent
        :type rounds: int
        """
        i = 0
        while i < rounds:
            if self.State.isEnd:
                reward = self.State.giveReward()
                # Pin the terminal cell to its reward before propagating.
                self.state_values[self.State.state] = reward
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    reward = self.state_values[s] + self.lr * (reward - self.state_values[s])
                    self.state_values[s] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                print("current position {} action {}".format(self.State.state, action))
                # Move once and record where the agent ended up, instead of
                # computing the next position a second time for the trace.
                self.State = self.takeAction(action)
                self.states.append(self.State.state)
                # Flag the new state as terminal if it is one.
                self.State.isEndFunc()
                print("nxt state", self.State.state)
                print("---------------------")

    def showValues(self):
        """
        Print the table of state values, one board row per line.
        Nothing is returned.
        """
        for i in range(0, BOARD_ROWS):
            print('----------------------------------')
            cells = ''.join(
                str(self.state_values[(i, j)]).ljust(6) + ' | '
                for j in range(0, BOARD_COLS)
            )
            print('| ' + cells)
        print('----------------------------------')

In [None]:
if __name__ == "__main__":
    # The agent starts at START (row 7, column 0) and a round is won by
    # reaching WIN_STATE (row 0, column 7), which carries a reward of 1.
    # At each step the agent either moves towards the neighbouring cell
    # with the highest learned value or takes a random action. After 50
    # rounds the learned value of every cell is printed as a table.
    ag = Agent()
    ag.play(50)
    # showValues() prints the table itself and returns nothing, so its
    # result is not passed to print().
    ag.showValues()