# Prabal Ghosh

**RL LAB**

In [1]:
import numpy as np
import matplotlib.pyplot as plt

# 1. Consider a 1D grid with :

 one goal location (positive reward, e.g. +1)

 one trap location (negative reward, e.g.-1)

 a fixed move cost (e.g.-0.01)

 deterministic actions (probability to go left when trying left is 1)

### Define the environment

Here our environment is 1 dimensional

In [2]:
num_states = 6  # Number of states
trap_state = 0
goal_state = 5
start_state = 2  # Starting position
move_cost = 0.01   # cost of moving one step in any direction

In [3]:
actions = [-1, 1]  # Possible actions: left (-1), right (+1)

In [4]:
# Rewards
rewards = np.zeros(num_states)
rewards[trap_state] = -1
rewards[goal_state] = 1

### Hyperparameters


In [5]:
gamma = 0.9  # Discount factor
alpha = 0.1  # Learning rate
epsilon = 0.1  # Exploration rate
num_episodes = 500

### Q-table

In [6]:
# Initialize Q-table
Q = np.zeros((num_states, len(actions)))
Q

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

### epsilon greedy action selection

In [7]:
def epsilon_greedy(state, epsilon):
    if np.random.rand() < epsilon:
        return np.random.choice(len(actions))  # choose any random action
    else:
        return np.argmax(Q[state])  # choose the best action

In [8]:
# np.random.choice(len(actions))

In [9]:
# Q[3] = [3,1]
# Q

In [10]:
# np.argmax(Q[3])

Q-learning from the equation, with-greedy

In [11]:
for episode in range(num_episodes):
    state = start_state
    while state != trap_state and state != goal_state:
        action_idx = epsilon_greedy(state, epsilon)
        action = actions[action_idx]

        next_state = max(0, min(num_states - 1, state + action))
        reward = rewards[next_state] - move_cost

        # Update Q-value  (Q-learning from the equation, with-greedy)
        best_next_action = np.max(Q[next_state])
        Q[state, action_idx] += alpha * (reward + gamma * best_next_action - Q[state, action_idx])

        state = next_state  # Move to next state

In [12]:
policy = np.argmax(Q, axis=1)
policy_actions = [actions[a] for a in policy]

In [13]:
print("Optimal Q-table:")
print(Q)
print("\n")
print("\nOptimal Policy:")
print(policy_actions)


Optimal Q-table:
[[ 0.          0.        ]
 [-0.27371     0.63852427]
 [ 0.4381714   0.7829    ]
 [ 0.65815932  0.881     ]
 [ 0.70383523  0.99      ]
 [ 0.          0.        ]]



Optimal Policy:
[-1, 1, 1, 1, 1, -1]


In [14]:
# print("policy", policy)

 # 2. Extend to the 2D grid of the classical toy example (lecture 1)

### Define the environment

here the environment is 2 D Grid

In [16]:
grid_size = (3, 4)
trap_position = (1, 3)
goal_position = (0, 3)
start_position = (2, 0)
move_cost = 0.01   # cost of moving one step in any direction


In [17]:
actions = [(0, -1), (0, 1), (-1, 0), (1, 0)]  # LEFT, RIGHT, UP, DOWN


In [18]:
# Rewards
rewards = np.full(grid_size, -move_cost)  # Default move cost
rewards[trap_position] = -1
rewards[goal_position] = 1

In [19]:
rewards

array([[-0.01, -0.01, -0.01,  1.  ],
       [-0.01, -0.01, -0.01, -1.  ],
       [-0.01, -0.01, -0.01, -0.01]])

### Hyperparameters


Hyperparameters are same as the previous question

In [20]:
gamma = 0.9  # Discount factor
alpha = 0.1  # Learning rate
epsilon = 0.1  # Exploration rate
num_episodes = 1000

### Q-table

In [21]:
# Initialize Q-table
Q = np.zeros((*grid_size, len(actions)))
Q


array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]])

### epsilon greedy action selection

In [22]:
def epsilon_greedy(state, epsilon):
    if np.random.rand() < epsilon:
        return np.random.choice(len(actions))  # choose any random action
    else:
        return np.argmax(Q[state])  # choose the best action

In [23]:
def is_terminal(state):
    return state == trap_position or state == goal_position

Q-learning from the equation, with-greedy

In [24]:
for episode in range(num_episodes):
    state = start_position
    while not is_terminal(state):
        action_idx = epsilon_greedy(state, epsilon)
        action = actions[action_idx]

        next_state = (state[0] + action[0], state[1] + action[1])

        next_state = (
            max(0, min(grid_size[0] - 1, next_state[0])),
            max(0, min(grid_size[1] - 1, next_state[1]))
        )

        reward = rewards[next_state]   # get reward

        # Update Q-value  (Q-learning from the equation, with-greedy)
        best_next_action = np.max(Q[next_state])
        Q[state][action_idx] += alpha * (reward + gamma * best_next_action - Q[state][action_idx])

        state = next_state  # Move to next state




In [25]:
print("Optimal Q-table:")
print(Q)

Optimal Q-table:
[[[-0.00385219  0.7332382   0.10517699  0.10071765]
  [ 0.51927848  0.89        0.75236785  0.66440605]
  [ 0.75305216  1.          0.83249637  0.66934196]
  [ 0.          0.          0.          0.        ]]

 [[-0.00580175  0.69794129  0.00172136  0.09987948]
  [ 0.53487345  0.65921544  0.791       0.59173985]
  [-0.00109    -0.1         0.88455361  0.03174898]
  [ 0.          0.          0.          0.        ]]

 [[ 0.51335856  0.62171     0.52522491  0.47846276]
  [ 0.50957724  0.4168052   0.7019      0.57982255]
  [ 0.58232454 -0.00271     0.07582799 -0.00199   ]
  [ 0.03492463 -0.001      -0.1        -0.001     ]]]


In [26]:
policy = np.argmax(Q, axis=2)

policy_actions = [actions[a] for a in policy.flatten()]
print("policy_actions", policy_actions)

policy_actions [(0, 1), (0, 1), (0, 1), (0, -1), (0, 1), (-1, 0), (-1, 0), (0, -1), (0, 1), (-1, 0), (0, -1), (0, -1)]
