# Zadanie 6

Celem ćwiczenia jest implementacja algorytmu Q-learning.

Następnie należy stworzyć agenta rozwiązującego problem [Taxi](https://gymnasium.farama.org/environments/toy_text/taxi/). Problem dostępny jest w pakiecie **gymnasium** (następcy pakietu **gym**).

Punktacja (max 7 pkt):
- Implementacja algorytmu Q-learning. [3 pkt]
- Eksperymenty dla różnych wartości hiperparametrów [2 pkt]
- Jakość kodu [1 pkt]
- Wnioski [1 pkt]

Polecane źródła - teoria + intuicja:
- https://distill.pub/2019/paths-perspective-on-value-learning/
- https://www.youtube.com/watch?v=0iqz4tcKN58&ab_channel=SteveBrunton

In [13]:
import numpy as np
import gymnasium
from gymnasium.core import Env

In [14]:
class QLearningSolver:
    """Tabular Q-learning agent for environments with discrete states and actions.

    The action-value estimates live in ``self.Q``, a NumPy array of shape
    ``(observation_space, action_space)`` that starts out filled with zeros.
    """

    def __init__(self, observation_space: int, action_space: int, learning_rate: float = 0.1,
                 gamma: float = 0.9, epsilon: float = 0.1):
        """Store the hyperparameters and allocate a zero-initialised Q-table.

        Args:
            observation_space: Number of discrete states (rows of the Q-table).
            action_space: Number of discrete actions (columns of the Q-table).
            learning_rate: Step size applied to the temporal-difference error.
            gamma: Discount factor applied to the best next-state value.
            epsilon: Probability of picking a uniformly random action in
                ``get_best_action``.
        """
        self.observation_space = observation_space
        self.action_space = action_space
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        # Single source of truth for how the Q-table is (re)created.
        self.reset()

    def __call__(self, state: int, action: int) -> float:
        """Return the Q-value stored for the given state and action."""
        return self.Q[state, action]

    def update(self, state: int, action: int, next_state: int, reward: float) -> None:
        """Apply one Q-learning update for the transition (state, action, reward, next_state).

        Q[s, a] <- Q[s, a] + learning_rate * (reward + gamma * max_a' Q[s', a'] - Q[s, a])
        """
        current_value = self.Q[state, action]
        td_target = reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state, action] = current_value + self.learning_rate * (td_target - current_value)

    def get_best_action(self, state: int) -> int:
        """Choose an action for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value for ``state`` is returned
        (``np.argmax`` picks the lowest index on ties).
        """
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.randint(0, self.action_space)
        return np.argmax(self.Q[state])

    def reset(self) -> None:
        """Discard everything learned so far by zeroing the Q-table."""
        self.Q = np.zeros((self.observation_space, self.action_space))

    def __repr__(self) -> str:
        """Return a multi-line summary of the hyperparameters and the Q-table.

        The table itself is included only when it has fewer than 100 rows and
        fewer than 30 columns; otherwise only its shape is reported.
        """
        lines = [
            f"Observation space: {self.observation_space},",
            f"Action space: {self.action_space},",
            f"Learning rate: {self.learning_rate},",
            f"Gamma: {self.gamma},",
            # Trailing comma + newline keep the Q-table section on its own line.
            f"Epsilon: {self.epsilon},",
        ]
        rows, columns = self.Q.shape
        if rows < 100 and columns < 30:
            lines.append(f"Q matrix shape: {self.Q.shape}")
            lines.append(str(self.Q))
        else:
            lines.append(f"Q matrix shape: {self.Q.shape}, too big to display")
        return "\n".join(lines)

    def __str__(self) -> str:
        """Return the same text as ``__repr__``."""
        return self.__repr__()

In [15]:
class TaxiQLearningSolver:
    """Runs training and test episodes of a ``QLearningSolver`` on an environment.

    The environment must expose ``observation_space.n`` and ``action_space.n``
    and follow the ``reset()`` / ``step(action)`` protocol used below.

    Verbosity levels: ``0`` prints nothing, any truthy value prints a summary
    after a run, and ``2`` additionally prints one line per episode.
    """

    def __init__(self, environment: Env, learning_rate: float = 0.1, gamma: float = 0.9, epsilon: float = 0.1, verbose=0):
        """Wrap ``environment`` and create a solver sized to its state/action spaces."""
        self.environment = environment
        self.verbose = verbose
        self.solver = QLearningSolver(observation_space=self.environment.observation_space.n, action_space=self.environment.action_space.n,
                                      learning_rate=learning_rate, gamma=gamma, epsilon=epsilon)

    def set_solver_parameters(self, learning_rate: float, gamma: float, epsilon: float):
        """Replace the solver's hyperparameters and reset its Q-table."""
        self.solver.learning_rate = learning_rate
        self.solver.gamma = gamma
        self.solver.epsilon = epsilon
        # Values learned under the old hyperparameters are discarded.
        self.solver.reset()

    def set_verbose(self, verbose: int):
        """Change the verbosity level used by subsequent runs."""
        self.verbose = verbose

    def train(self, epochs: int, steps_per_epoch: int):
        """Run ``epochs`` episodes while updating the Q-table; see ``_solve``."""
        return self._solve(epochs, steps_per_epoch, True)

    def test(self, tests_number: int, steps_per_test: int):
        """Run ``tests_number`` episodes without updating the Q-table; see ``_solve``.

        Actions still come from ``QLearningSolver.get_best_action``, so the
        solver's ``epsilon`` applies during testing as well.
        """
        return self._solve(tests_number, steps_per_test, False)

    def _solve(self, iterations_number: int, steps_per_iteration: int, is_training: bool):
        """Run episodes and collect per-episode statistics.

        Args:
            iterations_number: Number of episodes to run.
            steps_per_iteration: Maximum number of steps in a single episode.
            is_training: When true, the Q-table is updated after every step.

        Returns:
            A ``(rewards, results)`` pair of lists with one entry per episode:
            the summed reward and whether the episode reached a terminal state.
        """
        rewards = []
        results = []
        steps = []
        successful_iterations = 0

        for iteration in range(1, iterations_number + 1):
            state = self.environment.reset()[0]
            iteration_reward = 0
            # Initialised per episode so that an episode with zero allowed
            # steps neither fails nor inherits the previous episode's outcome.
            step = 0
            is_done = False
            for step in range(1, steps_per_iteration + 1):
                chosen_action = self.solver.get_best_action(state)
                next_state, reward, is_done, is_truncated, _ = self.environment.step(int(chosen_action))
                iteration_reward += reward
                if is_training:
                    self.solver.update(state, chosen_action, next_state, reward)
                state = next_state

                # A truncated episode is over too and must not be stepped any
                # further, but only a terminated episode counts as a success.
                if is_done or is_truncated:
                    break

            if is_done:
                successful_iterations += 1
                success_status = "Success"
            else:
                success_status = "Failure"
            if self.verbose == 2:
                if is_training:
                    print(f"Epoch: {iteration:<8} Steps: {step:<8} Epoch reward: {iteration_reward:<13} {success_status:<8}")
                else:
                    print(f"Test number: {iteration:<8} Steps: {step:<8} Test reward: {iteration_reward:<13} {success_status:<8}")
            rewards.append(iteration_reward)
            results.append(is_done)
            steps.append(step)

        # The summary needs at least one episode (max/min/average of empty
        # lists and the success ratio are undefined otherwise).
        if self.verbose and rewards:
            print()
            # ``.2%`` scales the ratio by 100 and appends the percent sign.
            success_ratio = successful_iterations / iterations_number
            if is_training:
                print(f"Successful epochs: {successful_iterations} ({success_ratio:.2%})")
            else:
                print(f"Successful tests: {successful_iterations} ({success_ratio:.2%})")
            print(f"Max reward: {max(rewards)}; Min reward: {min(rewards)}; Average reward: {sum(rewards)/len(rewards)}")
            print(f"Max steps number: {max(steps)}; Min steps number: {min(steps)}; Average steps number: {sum(steps)/len(steps)}")

        return rewards, results


# Eksperymenty

In [16]:
# Baseline experiment: train on Taxi-v3 for 1000 epochs of at most 100 steps
# each, printing one line per epoch (verbose=2).
environment = gymnasium.make("Taxi-v3")
testing_solver = TaxiQLearningSolver(
    environment,
    learning_rate=0.1,
    gamma=0.9,
    epsilon=0.1,
    verbose=2,
)
rewards, results = testing_solver.train(
    epochs=1000,
    steps_per_epoch=100,
)

Epoch: 1        Steps: 100      Epoch reward: -280          Failure 
Epoch: 2        Steps: 100      Epoch reward: -226          Failure 
Epoch: 3        Steps: 100      Epoch reward: -262          Failure 
Epoch: 4        Steps: 100      Epoch reward: -280          Failure 
Epoch: 5        Steps: 100      Epoch reward: -235          Failure 
Epoch: 6        Steps: 100      Epoch reward: -262          Failure 
Epoch: 7        Steps: 100      Epoch reward: -262          Failure 
Epoch: 8        Steps: 100      Epoch reward: -244          Failure 
Epoch: 9        Steps: 100      Epoch reward: -289          Failure 
Epoch: 10       Steps: 100      Epoch reward: -253          Failure 
Epoch: 11       Steps: 100      Epoch reward: -181          Failure 
Epoch: 12       Steps: 100      Epoch reward: -361          Failure 
Epoch: 13       Steps: 100      Epoch reward: -316          Failure 
Epoch: 14       Steps: 100      Epoch reward: -271          Failure 
Epoch: 15       Steps: 100      Ep

# Wnioski