## Code to simulate dynamics of an agent moving in a 2D grid and learning to move optimally from point A to B

Initial seed for the code comes from `chatGPT-3`

-- S Ganga prasath 21 Mar, 2023

In [2]:
import numpy as np
import random
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

In [3]:
# Parameters in the model
class params():
    def __init__(self) -> None:
        super().__init__()
        self.nPts = 20
        self.rInit = np.array([0, 0])
        self.rEnd = np.array([15, 5])
        self.eps = 0.25

In [4]:
# Define the action selection function using epsilon-greedy policy
def choose_action(state, Q, epsilon):
    if np.random.uniform() < epsilon:
        return np.random.randint(4)
    else:
        return np.argmax(Q[state[0], state[1]])

In [5]:
# Define the function to take an action and get the next state and reward
# action = (left, right, bottom, top)
def take_action(state, action, env, p):
    if action == 0 and state[0] > 0:
        next_state = (state[0]-1, state[1])
    elif action == 1 and state[0] < env.shape[0]-1:
        next_state = (state[0]+1, state[1])
    elif action == 2 and state[1] > 0:
        next_state = (state[0], state[1]-1)
    elif action == 3 and state[1] < env.shape[1]-1:
        next_state = (state[0], state[1]+1)
    else:
        next_state = state
    if (next_state == p.rEnd).all():
        reward = 10
    else:
        reward = -1
    return next_state, reward

In [6]:
# Define the SARSA algorithm
def sarsa(env, p, num_episodes=20000, alpha=0.1, gamma=0.99, epsilon=0.1):
    Q = np.zeros((env.shape[0], env.shape[1], 4))
    for i in range(num_episodes):
        state = p.rInit
        action = choose_action(state, Q, epsilon)
        while (state != p.rEnd).all():
            next_state, reward = take_action(state, action, env, p)
            next_action = choose_action(next_state, Q, epsilon)
            Q[state[0], state[1], action] += alpha * (reward + gamma * Q[next_state[0], next_state[1], next_action] - Q[state[0], state[1], action])
            state = next_state
            action = next_action
    return Q

In [7]:
def main():
    p = params()
    env = np.zeros((p.nPts, p.nPts))
    # Train the agent and get the optimal policy
    Q = sarsa(env, p)
    policy = np.zeros((env.shape[0], env.shape[1]))
    for i in range(p.nPts):
        for j in range(p.nPts):
            policy[i, j] = np.argmax(Q[i, j])

    # Print the optimal policy
    print("Optimal policy:")
    print(policy)

In [8]:
if __name__ == '__main__':
    main()

Optimal policy:
[[3. 3. 3. 3. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [3. 3. 3. 3. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 3. 3. 3. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [2. 2. 1. 3. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 3. 3. 1. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 3. 3. 1. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 3. 1. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 3. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [3. 0. 2. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 2. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 2. 2. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 3. 2. 0. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 3. 0. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [3. 3. 2. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0