In [1]:
import numpy as np
import random
from collections import defaultdict

In [2]:
# Environment Setup (Grid World)
# -------------------------
class GridWorld:
    """Square grid environment with one goal cell and penalized obstacle cells.

    States are ``(x, y)`` tuples with both coordinates in ``[0, size - 1]``.
    Obstacles do not block movement; stepping onto one only changes the reward.

    Args:
        size: Side length of the grid.
        goal_state: Cell that ends the episode when entered.
        obstacle_states: Iterable of cells that yield ``obstacle_penalty``
            when entered. Copied into a new list so instances never share
            (or mutate) the default or the caller's container.
        penalty: Reward for an ordinary move.
        reward: Reward for entering the goal cell.
        obstacle_penalty: Reward for entering an obstacle cell.
    """

    def __init__(self, size=5, goal_state=(4, 4), obstacle_states=((2, 2),), penalty=-1, reward=10,
                 obstacle_penalty=-5):
        self.size = size
        self.goal_state = goal_state
        # Copy: a mutable default (or a caller-owned list) must not be shared
        # between environments.
        self.obstacle_states = list(obstacle_states)
        self.penalty = penalty
        self.reward = reward
        self.obstacle_penalty = obstacle_penalty
        self.actions = ['up', 'down', 'left', 'right']

    def step(self, state, action):
        """Apply ``action`` to ``state`` and return ``(next_state, reward, done)``.

        'up'/'down' decrement/increment the first coordinate and
        'left'/'right' decrement/increment the second; moves that would leave
        the grid are clamped to the edge. An unrecognized action leaves the
        position unchanged. The obstacle check takes precedence over the goal
        check, so a cell listed as both is treated as an obstacle.
        """
        x, y = state
        if action == 'up':
            x = max(x - 1, 0)
        elif action == 'down':
            x = min(x + 1, self.size - 1)
        elif action == 'left':
            y = max(y - 1, 0)
        elif action == 'right':
            y = min(y + 1, self.size - 1)

        next_state = (x, y)
        reward = self.penalty
        done = False

        if next_state in self.obstacle_states:
            reward = self.obstacle_penalty
        elif next_state == self.goal_state:
            reward = self.reward
            done = True

        return next_state, reward, done

# -------------------------
# Preprocessing
# -------------------------
def preprocess_state(state, size=5):
    """Flatten a two-coordinate grid state into a single integer index.

    The first coordinate is scaled by ``size`` and the second is added to it.
    """
    major = state[0]
    minor = state[1]
    flat_index = major * size + minor
    return flat_index

# -------------------------
# Train-Test Split
# -------------------------
def train_test_split(states, train_ratio=0.8):
    """Randomly partition ``states`` into a training list and a test list.

    Args:
        states: Iterable of items to split. It is not modified; a shuffled
            copy is split instead, so tuples and other iterables also work.
        train_ratio: Fraction of items placed in the training list; the
            split index is ``int(len(states) * train_ratio)``.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    # Shuffle a copy so the caller's sequence keeps its original order.
    shuffled = list(states)
    random.shuffle(shuffled)
    split_index = int(len(shuffled) * train_ratio)
    return shuffled[:split_index], shuffled[split_index:]

# -------------------------
# Base Model (Tabular Q-Learning + Model Learning)
# -------------------------
class DynaQAgent:
    """Tabular Dyna-Q agent: Q-learning plus replay from a learned transition model.

    Args:
        state_size: Number of states. Accepted for interface compatibility;
            the Q-table is a ``defaultdict`` and does not need it.
        action_size: Number of discrete actions (indexed ``0..action_size-1``).
        alpha: Learning rate for the Q-update.
        gamma: Discount factor.
        epsilon: Probability of picking a uniformly random action.
        planning_steps: Number of simulated updates performed per ``planning`` call.
    """

    def __init__(self, state_size, action_size, alpha=0.1, gamma=0.95, epsilon=0.1, planning_steps=5):
        # Unseen states lazily get an all-zero action-value vector.
        self.q_table = defaultdict(lambda: np.zeros(action_size))
        # Maps (state, action) -> (reward, next_state); only the most recent
        # observation per pair is kept.
        self.model = {}
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.planning_steps = planning_steps
        self.actions = range(action_size)

    def choose_action(self, state):
        """Return an epsilon-greedy action index for ``state`` as a plain ``int``."""
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        # Cast so both branches return the same type (argmax yields a NumPy integer).
        return int(np.argmax(self.q_table[state]))

    def update(self, state, action, reward, next_state):
        """Apply one Q-learning update for the transition ``(state, action, reward, next_state)``."""
        best_next_value = np.max(self.q_table[next_state])
        td_target = reward + self.gamma * best_next_value
        self.q_table[state][action] += self.alpha * (td_target - self.q_table[state][action])

    def update_model(self, state, action, reward, next_state):
        """Record the observed outcome of taking ``action`` in ``state``."""
        self.model[(state, action)] = (reward, next_state)

    def planning(self):
        """Replay ``planning_steps`` randomly chosen remembered transitions.

        Does nothing if no transition has been recorded yet.
        """
        if not self.model:
            return
        # The model is not modified while planning, so snapshot it once
        # instead of rebuilding the list on every simulated step.
        transitions = list(self.model.items())
        for _ in range(self.planning_steps):
            (state, action), (reward, next_state) = random.choice(transitions)
            self.update(state, action, reward, next_state)

# -------------------------
# Training Loop
# -------------------------
def train_agent(env, agent, episodes=100, max_steps=None):
    """Train ``agent`` on ``env`` with Dyna-Q for a number of episodes.

    Every episode starts at grid cell ``(0, 0)``. After each real step the
    agent performs a direct Q-update, records the transition in its model,
    and runs its planning routine.

    Args:
        env: Environment exposing ``size``, ``actions`` and
            ``step(state, action) -> (next_state, reward, done)``.
        agent: Agent exposing ``choose_action``, ``update``, ``update_model``
            and ``planning``.
        episodes: Number of episodes to run.
        max_steps: Optional cap on steps per episode. ``None`` (the default)
            keeps the original behavior of running until the environment
            reports ``done``, which may not terminate for a policy that
            never reaches a terminal state.
    """
    for _ in range(episodes):
        state = preprocess_state((0, 0), env.size)
        done = False
        steps = 0

        while not done and (max_steps is None or steps < max_steps):
            action = agent.choose_action(state)
            action_str = env.actions[action]
            # Recover the two grid coordinates from the flattened state index.
            coords = (state // env.size, state % env.size)
            next_state_coords, reward, done = env.step(coords, action_str)
            next_state = preprocess_state(next_state_coords, env.size)

            agent.update(state, action, reward, next_state)
            agent.update_model(state, action, reward, next_state)
            agent.planning()

            state = next_state
            steps += 1

# -------------------------
# Evaluation
# -------------------------
def evaluate_agent(env, agent, episodes=10):
    """Run greedy rollouts from cell (0, 0) and print how many ended on the goal.

    Each rollout follows the highest-valued action in the agent's Q-table for
    at most 50 steps, stopping early when the environment reports completion.
    """
    successes = 0

    for _ in range(episodes):
        current = preprocess_state((0, 0), env.size)

        for _step in range(50):
            greedy = np.argmax(agent.q_table[current])
            # Unflatten the state index into its two grid coordinates.
            position = divmod(current, env.size)
            coords, _reward, finished = env.step(position, env.actions[greedy])
            current = preprocess_state(coords, env.size)
            if finished:
                break

        goal_index = preprocess_state(env.goal_state, env.size)
        if current == goal_index:
            successes += 1

    print(f"Success Rate: {successes}/{episodes}")

# -------------------------
# Deploy Policy (Inference Example)
# -------------------------
def deploy_policy(env, agent):
    """Follow the greedy policy from cell (0, 0) and print the visited cells.

    The rollout stops when the environment reports completion or once the
    recorded path holds 50 cells (the start cell included).
    """
    visited = [(0, 0)]
    current = preprocess_state(visited[0], env.size)
    finished = False

    while len(visited) < 50 and not finished:
        greedy = np.argmax(agent.q_table[current])
        # Unflatten the state index into its two grid coordinates.
        position = divmod(current, env.size)
        coords, _, finished = env.step(position, env.actions[greedy])
        visited.append(coords)
        current = preprocess_state(coords, env.size)

    print("Deployed Path:", visited)

In [3]:
# Main Workflow
# -------------------------
if __name__ == "__main__":
    # Build the environment with its default settings.
    env = GridWorld()

    # Train-Test Split
    # Enumerate every grid cell and split the cells 80/20 at random.
    # NOTE(review): train_states / test_states are never used below; training,
    # evaluation and deployment all start from (0, 0). Confirm whether the
    # split is still needed.
    all_states = [(i, j) for i in range(env.size) for j in range(env.size)]
    train_states, test_states = train_test_split(all_states)

    # Initialize Dyna-Q Agent
    # One Q-table row per visited flattened state, one column per action.
    agent = DynaQAgent(state_size=env.size * env.size, action_size=len(env.actions))

    # Train Base Model
    # Uses train_agent's default episode count.
    train_agent(env, agent)

    # Fine-Tuning (Adjust planning steps)
    # NOTE(review): planning_steps is only read by agent.planning(), which is
    # called from train_agent. No further training follows this assignment, so
    # it does not affect the evaluation or deployment below.
    agent.planning_steps = 10

    # Evaluation
    # Prints the number of greedy rollouts that end on the goal cell.
    evaluate_agent(env, agent)

    # Deploy Policy
    # Prints the sequence of cells visited by one greedy rollout.
    deploy_policy(env, agent)

Success Rate: 10/10
Deployed Path: [(0, 0), (0, 1), (1, 1), (1, 2), (1, 3), (1, 4), (2, 4), (3, 4), (4, 4)]
