In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk yields one (directory, subdirectories, files) triple for every directory
# beneath the root; the subdirectory list is not needed here, so it is bound to `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory and the file name so the full path is printed.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# FIT5226 Project Stage 1 - Tabular Q-Learning for a Single Agent

This notebook implements a tabular Q-learning algorithm to train an agent to complete a transport task in a grid world environment. The agent's goal is to pick up an item at location A and deliver it to location B in as few steps as possible.

In [2]:
class GridWorld:
    """Square grid world for a single-agent pick-up-and-deliver task.

    Positions are 2-tuples. Moving 'north' decreases the first coordinate,
    'south' increases it, 'east' increases the second coordinate and 'west'
    decreases it. Moves that would leave the ``size`` x ``size`` grid, and
    unrecognised actions, leave the agent where it is.

    Rewards returned by :meth:`step`:
        * ``50`` when the agent stands on the goal while holding the item
          (this also ends the episode),
        * ``10`` on the step in which the item is picked up,
        * ``-1`` for every other step.
    """

    def __init__(self, size=5, start=None, item_location=None, goal_location=None):
        """Create the environment.

        Args:
            size: side length of the grid.
            start: agent start position; defaults to ``(0, 0)``.
            item_location: cell holding the item; defaults to the grid centre
                ``(size // 2, size // 2)``.
            goal_location: delivery cell; defaults to ``(size - 1, size - 1)``.
        """
        self.size = size
        self.start = start or (0, 0)
        self.item_location = item_location or (size // 2, size // 2)
        self.goal_location = goal_location or (size - 1, size - 1)
        self.agent_position = self.start
        self.has_item = False

    def reset(self):
        """Put the agent back on the start cell without the item and return the state."""
        self.agent_position = self.start
        self.has_item = False
        return self.get_state()

    def get_state(self):
        """Return the state tuple ``(agent_position, item_location, has_item)``."""
        return (self.agent_position, self.item_location, self.has_item)

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        Args:
            action: one of ``'north'``, ``'south'``, ``'east'``, ``'west'``.
                Any other value results in no movement.

        Returns:
            A tuple ``(next_state, reward, done)`` where ``next_state`` is the
            value of :meth:`get_state` after the move, ``reward`` follows the
            scheme in the class docstring and ``done`` is ``True`` only when
            the item has been delivered to the goal.
        """
        # Movement: each direction is allowed only if it stays inside the grid.
        if action == 'north' and self.agent_position[0] > 0:
            self.agent_position = (self.agent_position[0] - 1, self.agent_position[1])
        elif action == 'south' and self.agent_position[0] < self.size - 1:
            self.agent_position = (self.agent_position[0] + 1, self.agent_position[1])
        elif action == 'east' and self.agent_position[1] < self.size - 1:
            self.agent_position = (self.agent_position[0], self.agent_position[1] + 1)
        elif action == 'west' and self.agent_position[1] > 0:
            self.agent_position = (self.agent_position[0], self.agent_position[1] - 1)

        # Decide whether this step is a pickup BEFORE flipping has_item.
        # Checking "not has_item" after setting it to True can never succeed,
        # which would make the pickup bonus unreachable.
        picked_up = self.agent_position == self.item_location and not self.has_item
        if picked_up:
            self.has_item = True

        # Delivery takes precedence over the pickup bonus (relevant when the
        # item and the goal share a cell).
        if self.agent_position == self.goal_location and self.has_item:
            return self.get_state(), 50, True

        reward = 10 if picked_up else -1
        return self.get_state(), reward, False

In [3]:
import numpy as np
import random

class QLearningAgent:
    """Tabular Q-learning agent that acts epsilon-greedily.

    Q-values are kept in a dictionary keyed by ``(state, action)``. A pair
    that has never been written is treated as having the value ``0.0``.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1, grid_size=5):
        """Store the hyper-parameters and start from an empty Q-table.

        Args:
            alpha: step size applied to each temporal-difference correction.
            gamma: discount applied to the best next-state value.
            epsilon: probability of picking a uniformly random action.
            grid_size: kept on the instance; not used by the methods below.
        """
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.q_table = {}
        self.grid_size = grid_size
        self.actions = ['north', 'south', 'east', 'west']

    def get_q_value(self, state, action):
        """Look up Q(state, action), falling back to 0.0 for unseen pairs."""
        key = (state, action)
        return self.q_table.get(key, 0.0)

    def set_q_value(self, state, action, value):
        """Store ``value`` as Q(state, action)."""
        key = (state, action)
        self.q_table[key] = value

    def choose_action(self, state):
        """Pick an action for ``state`` with the epsilon-greedy rule.

        With probability ``epsilon`` a random action is returned; otherwise
        the action with the highest Q-value is returned.
        """
        exploring = random.uniform(0, 1) < self.epsilon
        if exploring:
            return random.choice(self.actions)
        # max() returns the first of several equal maxima, so ties are
        # resolved in the order the actions appear in self.actions.
        return max(self.actions, key=lambda candidate: self.get_q_value(state, candidate))

    def update_q_value(self, state, action, reward, next_state):
        """Move Q(state, action) towards ``reward + gamma * max_a Q(next_state, a)``."""
        current = self.get_q_value(state, action)
        lookahead = max(self.get_q_value(next_state, option) for option in self.actions)
        td_target = reward + self.gamma * lookahead
        self.set_q_value(state, action, current + self.alpha * (td_target - current))

In [4]:
def train_agent(agent, environment, episodes=1000):
    """Run ``episodes`` training episodes, updating the agent after every step.

    Each episode starts from ``environment.reset()`` and continues until
    ``environment.step`` reports that the episode is done. A progress line is
    printed after every 100th episode.
    """
    for episode_number in range(1, episodes + 1):
        current_state = environment.reset()
        while True:
            chosen_action = agent.choose_action(current_state)
            following_state, reward, finished = environment.step(chosen_action)
            agent.update_q_value(current_state, chosen_action, reward, following_state)
            current_state = following_state
            if finished:
                break
        if episode_number % 100 == 0:
            print(f"Episode {episode_number} completed.")

In [5]:
def test_agent(agent, environment, trials=100):
    """Tests the Q-learning agent after training."""
    total_steps = 0
    for trial in range(trials):
        state = environment.reset()
        done = False
        steps = 0
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done = environment.step(action)
            state = next_state
            steps += 1
        total_steps += steps
    average_steps = total_steps / trials
    print(f"Average steps taken: {average_steps}")
    return average_steps

In [6]:
import matplotlib.pyplot as plt

def visualize_agent(environment, agent, episodes=10):
    """Roll the agent out and draw one visit-count heat map per episode.

    For each episode the positions the agent occupies (including the starting
    position) are collected, counted per grid cell, and shown with
    ``plt.imshow`` together with a colour bar and a title.
    """
    for episode_index in range(episodes):
        # Roll out one full episode, remembering every position visited.
        current = environment.reset()
        visited = [current[0]]
        while True:
            move = agent.choose_action(current)
            current, _reward, finished = environment.step(move)
            visited.append(current[0])
            if finished:
                break

        # Count how many times each cell appears on the path.
        visit_counts = np.zeros((environment.size, environment.size))
        for cell in visited:
            visit_counts[cell] += 1

        plt.imshow(visit_counts, cmap='Blues', origin='upper')
        plt.colorbar(label='Number of visits')
        plt.title(f'Path in episode {episode_index + 1}')
        plt.show()