**A basic example with a grid, an initial position, and a target position. The goal is to train the agent to move from the initial position to the target position in an optimal way.**

In [2]:
import numpy as np
import time

In [5]:
class GridAgent:
    """Tabular Q-learning agent on a square grid.

    The agent starts every episode at (0, 0) and tries to reach the opposite
    corner (grid_size-1, grid_size-1). Moves are encoded as
    0 = up, 1 = down, 2 = left, 3 = right; moving into a wall leaves the
    agent where it is. Every step is rewarded with -1, except the step that
    lands on the target, which is rewarded with +1 and ends the episode.
    """

    def __init__(self, grid_size = 5, learning_rate = 0.1, discount = 0.9, explore_decay = 0.99):
        """Create an untrained agent.

        Args:
            grid_size: side length of the square grid (must be >= 1).
            learning_rate: step size of the Q-value update.
            discount: weight of the best next-state value in the update.
            explore_decay: factor applied to the exploration probability
                after every training episode.

        Raises:
            ValueError: if grid_size is smaller than 1.
        """
        if grid_size < 1:
            raise ValueError("grid_size must be at least 1")
        self.grid_size = grid_size
        self.learning_rate = learning_rate
        self.discount = discount
        self.explore_decay = explore_decay
        # One value per (row, col, move).
        self.value_table = np.zeros((grid_size, grid_size, 4))
        # Start fully random; learn() decays this after each episode.
        self.explore_prob = 1.0
        self.target = (grid_size-1, grid_size-1)
        # Defined up front so take_action/select_move work even if the caller
        # has not called initialize_position() yet.
        self.position = (0, 0)

    def initialize_position(self):
        """Put the agent back on the start cell (0, 0) and return that position."""
        self.position = (0, 0)
        return self.position

    def take_action(self, move):
        """Apply one move and return (new_position, reward, is_done).

        Moves are clamped to the grid, so stepping into a wall keeps the
        agent in place. Any move value other than 0-3 also leaves the
        position unchanged (and is still rewarded like a normal step).
        """
        row, col = self.position
        if move == 0: # move up
            row = max(0, row-1)
        elif move == 1: # move down
            row = min(self.grid_size-1, row+1)
        elif move == 2: # move left
            col = max(0, col-1)
        elif move == 3: # move right
            col = min(self.grid_size-1, col+1)

        self.position = (row, col)
        is_done = self.position == self.target
        reward = 1 if is_done else -1
        return self.position, reward, is_done

    def select_move(self):
        """Pick a move for the current position (epsilon-greedy).

        With probability explore_prob a uniformly random move is returned;
        otherwise the move with the highest stored value (ties resolve to
        the lowest move id).
        """
        if np.random.rand() < self.explore_prob:
            return int(np.random.randint(4))
        return int(np.argmax(self.value_table[self.position]))

    def learn(self, num_episodes = 500):
        """Run num_episodes training episodes, updating value_table in place.

        Each episode starts at (0, 0) and runs until the target is reached.
        After every episode explore_prob is multiplied by explore_decay.
        """
        for episode in range(num_episodes):
            current_pos = self.initialize_position()
            finished = False
            while not finished:
                move = self.select_move()
                next_pos, reward, finished = self.take_action(move)
                # Nothing follows the final step, so do not bootstrap from the
                # target cell. (On a 1x1 grid the target is also the start
                # cell, whose table entries are being updated.)
                best_future_val = 0.0 if finished else np.max(self.value_table[next_pos])
                self.value_table[current_pos][move] += self.learning_rate * (
                    reward + self.discount * best_future_val - self.value_table[current_pos][move]
                )
                current_pos = next_pos
            # reduce exploration probability
            self.explore_prob *= self.explore_decay

    def showcase(self, delay = 0.5, max_steps = None):
        """Print the greedy walk from (0, 0) towards the target.

        Args:
            delay: seconds to pause after each move; 0 disables the pause.
            max_steps: maximum number of moves to show. Defaults to
                grid_size**2, which is enough for any walk that does not
                revisit a cell; a greedy walk that needs more is stuck in a
                loop and would otherwise never terminate.
        """
        if max_steps is None:
            max_steps = self.grid_size ** 2
        current_pos = self.initialize_position()
        for _ in range(max_steps):
            print("Current position:", current_pos)
            move = int(np.argmax(self.value_table[current_pos]))
            print("Selected move:", move)
            current_pos, _, reached_target = self.take_action(move)
            if delay > 0:
                time.sleep(delay)
            if reached_target:
                print("Target rached at position:", current_pos)
                return
        print("Stopped after", max_steps, "moves without reaching the target; "
              "the agent may need more training.")


In [None]:
# Build an agent for a 6x6 grid; it starts at (0, 0) and its target is (5, 5).
navigator = GridAgent(grid_size=6)

# Train with learn()'s default of 500 episodes.
# NOTE(review): select_move draws from NumPy's global RNG and no seed is set
# in this cell, so the learned values (and the path printed below) can differ
# from run to run.
print("Training the agent on the grid....")
navigator.learn()
print("Training completed.\n")

# Replay the learned greedy policy from the start cell, printing every step.
print("Starting demostration....")
navigator.showcase()

Training the agent on the grid....
Training completed.
Starting demostration....
Current position: (0, 0)
Selected move: 3
Current position: (0, 1)
Selected move: 1
Current position: (1, 1)
Selected move: 3
Current position: (1, 2)
Selected move: 1
Current position: (2, 2)
Selected move: 1
Current position: (3, 2)
Selected move: 1
Current position: (4, 2)
Selected move: 3
Current position: (4, 3)
Selected move: 3
Current position: (4, 4)
Selected move: 1
Current position: (5, 4)
Selected move: 3
Target rached at position: (5, 5)
