<a href="https://colab.research.google.com/github/srinijalanda93/Predictive_Analystics/blob/main/2448526_lab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random
import time

# Grid-world layout (4x4), addressed as env[row, col].
# Cell codes: 0 - Free cell, 1 - Obstacle, 2 - Goal
env = np.array([
    [0, 0, 0, 0],
    [0, 1, 0, 1],
    [0, 0, 0, 0],
    [1, 0, 2, 0]
])

# Action codes, in order: Up, Down, Left, Right
n_actions = 4
# Every cell of the grid is its own state
n_states = env.size

# Q-table: one row per state, one column per action, starting at zero
Q = np.zeros(shape=(n_states, n_actions))

In [2]:
# Learning parameters

# Number of training episodes to run
episodes = 500

# Learning rate: weight given to each new temporal-difference correction
alpha = 0.8
# Discount factor applied to the best value of the next state
gamma = 0.9
# Exploration rate: probability of taking a random action
epsilon = 0.1

In [3]:
# Helper functions
def state_to_index(state):
    """Flatten a (row, col) position into its row-major index in the grid.

    The index is ``row * number_of_columns + col``, where the column count
    is taken from the module-level ``env`` array.
    """
    row, col = state[0], state[1]
    n_cols = env.shape[1]
    return row * n_cols + col

def index_to_state(index):
    """Expand a row-major index into its (row, col) position.

    Inverse of ``state_to_index``: the quotient by the grid's column count
    is the row, the remainder is the column.
    """
    n_cols = env.shape[1]
    return divmod(index, n_cols)

def get_next_state(state, action):
    """Return the (row, col) reached from ``state`` by taking ``action``.

    Action codes: 0 = Up, 1 = Down, 2 = Left, 3 = Right. A move that would
    leave the grid (or an unrecognised action code) leaves the position
    unchanged. Cell contents are not consulted here, only the grid bounds.
    """
    row, col = state
    last_row = env.shape[0] - 1
    last_col = env.shape[1] - 1

    if action == 0:        # Up
        if row > 0:
            row -= 1
    elif action == 1:      # Down
        if row < last_row:
            row += 1
    elif action == 2:      # Left
        if col > 0:
            col -= 1
    elif action == 3:      # Right
        if col < last_col:
            col += 1

    return (row, col)

In [4]:


def get_reward(state):
    """Return the reward for landing on ``state`` (a (row, col) tuple).

    Looks up the cell code in ``env``: the goal (2) pays 10, an obstacle (1)
    costs 5, and any other cell costs 1 per step.
    """
    cell = env[state]
    if cell == 2:
        return 10  # Goal
    if cell == 1:
        return -5  # Obstacle
    return -1  # Normal step

# Training
# Tabular Q-learning: each episode starts at the top-left cell and runs until
# the agent steps onto the goal cell, updating Q in place after every move.

# Seed the stdlib RNG that drives exploration so the learned Q-table is the
# same on every fresh Restart & Run All.
random.seed(42)

for episode in range(episodes):
    state = (0, 0)  # start state
    done = False

    while not done:
        s_idx = state_to_index(state)

        # ε-greedy policy: explore with probability epsilon, otherwise take
        # the action with the highest current Q-value (ties -> lowest index)
        if random.random() < epsilon:
            action = random.randint(0, n_actions - 1)
        else:
            action = np.argmax(Q[s_idx, :])

        next_state = get_next_state(state, action)
        reward = get_reward(next_state)
        next_idx = state_to_index(next_state)

        # Q-learning update rule:
        #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        td_target = reward + gamma * np.max(Q[next_idx, :])
        Q[s_idx, action] += alpha * (td_target - Q[s_idx, action])

        # The episode ends as soon as the goal cell is reached
        done = env[next_state] == 2
        state = next_state

print("Training completed!")
print("\nFinal Q-Table:")
print(Q.round(2))


Training completed!

Final Q-Table:
[[ 1.81  3.12  1.81  0.63]
 [ 0.06 -4.    1.81 -2.1 ]
 [-2.21 -1.71 -1.77 -2.1 ]
 [-2.21 -4.   -2.14 -2.21]
 [ 1.81  4.58  3.12  0.56]
 [-0.96  6.2   3.12 -1.65]
 [-2.32  8.   -4.   -4.  ]
 [-0.8  -0.8  -0.8  -4.  ]
 [ 3.12 -0.88  4.58  6.2 ]
 [ 0.46  8.    4.58  8.  ]
 [ 6.2  10.    6.2   6.2 ]
 [-4.   -0.8   8.    0.  ]
 [ 4.58 -1.5  -4.   -0.8 ]
 [-0.8   6.24 -4.58 10.  ]
 [ 0.    0.    0.    0.  ]
 [-0.8   0.    0.    0.  ]]
