In [1]:
import numpy as np
import random

In [2]:
# Step 1: Environment Setup
class SimpleGridWorld:
    def __init__(self, size=5, goal_state=(4, 4), obstacle_states=[]):
        self.size = size
        self.goal_state = goal_state
        self.obstacle_states = obstacle_states
        self.actions = ['up', 'down', 'left', 'right']
        self.state = (0, 0)  # Start state

    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
        row, col = self.state
        if action == 'up':
            row = max(row - 1, 0)
        elif action == 'down':
            row = min(row + 1, self.size - 1)
        elif action == 'left':
            col = max(col - 1, 0)
        elif action == 'right':
            col = min(col + 1, self.size - 1)

        new_state = (row, col)
        reward = 1 if new_state == self.goal_state else -0.01
        done = new_state == self.goal_state
        self.state = new_state
        return new_state, reward, done

In [3]:
# Step 2: Preprocessing (Q-Table Initialization)
size = 5
env = SimpleGridWorld(size)
q_table = np.zeros((size, size, len(env.actions)))

# Step 3: Train-Test Split
train_episodes = 500
test_episodes = 50

In [4]:
# Step 4: Train Base Model (Q-Learning Algorithm)
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 1.0  # Exploration-exploitation balance
epsilon_decay = 0.995
min_epsilon = 0.01

for episode in range(train_episodes):
    state = env.reset()
    done = False
    while not done:
        if random.uniform(0, 1) < epsilon:
            action_idx = random.choice(range(len(env.actions)))  # Exploration
        else:
            action_idx = np.argmax(q_table[state[0], state[1]])  # Exploitation
        
        new_state, reward, done = env.step(env.actions[action_idx])
        q_table[state[0], state[1], action_idx] += alpha * (
            reward + gamma * np.max(q_table[new_state[0], new_state[1]]) - q_table[state[0], state[1], action_idx]
        )
        state = new_state
    
    epsilon = max(min_epsilon, epsilon * epsilon_decay)  # Decay epsilon

In [5]:
# Step 5: Evaluate
success_count = 0
for _ in range(test_episodes):
    state = env.reset()
    done = False
    while not done:
        action_idx = np.argmax(q_table[state[0], state[1]])  # Exploit learned policy
        state, reward, done = env.step(env.actions[action_idx])
        if done:
            success_count += 1

evaluation_score = success_count / test_episodes
print(f"Evaluation Success Rate: {evaluation_score * 100:.2f}%")

Evaluation Success Rate: 100.00%


In [6]:
# Step 6: Deploy Policy (Using Trained Q-Table for Action Selection)
def deploy_policy(state):
    action_idx = np.argmax(q_table[state[0], state[1]])
    return env.actions[action_idx]

# Example Usage
print("Deployed Policy Example: Starting at (0,0)")
deploy_state = (0, 0)
for _ in range(10):
    action = deploy_policy(deploy_state)
    deploy_state, _, done = env.step(action)
    print(f"State: {deploy_state}, Action Taken: {action}")
    if done:
        print("Goal reached!")
        break

Deployed Policy Example: Starting at (0,0)
State: (4, 4), Action Taken: down
Goal reached!
