####  $ Tabular \ Q-learning $

In [5]:
import gymnasium as gym
import collections
from tensorboardX import SummaryWriter



In [18]:
class AgentClass:
    """Tabular Q-learning agent for an environment with a discrete action space.

    The agent keeps two pieces of state:

    * ``self.state``  -- the current observation of the training environment.
    * ``self.values`` -- the Q-table, a ``defaultdict(float)`` keyed by the
      string form of the ``(state, action)`` tuple, e.g. ``"(0, 1)"``.

    Args:
        train_env: Environment used to sample random training transitions.
            It must follow the Gymnasium API: ``reset()`` returns
            ``(observation, info)`` and ``step()`` returns
            ``(observation, reward, terminated, truncated, info)``.
        alpha_: Learning rate used to blend the old and the new estimate.
        gamma_: Discount factor applied to the best next-state value.
    """

    def __init__(self, train_env: "gym.Env", alpha_: float, gamma_: float):
        self.train_env = train_env
        # reset() returns (observation, info); only the observation is the
        # state. Storing the whole tuple would give the start state a
        # different table key than the same state reached through step().
        self.state, _ = self.train_env.reset()
        self.alpha_ = alpha_
        self.gamma_ = gamma_
        self.values = collections.defaultdict(float)

    @staticmethod
    def _key(state, action) -> str:
        """Return the Q-table key for a state/action pair."""
        return f"{(state, action)}"

    def sample_env(self):
        """Take one random action in the training environment.

        Returns:
            ``(old_state, action, reward, new_state)`` for the transition.
            When the episode ended with this step, the environment is reset
            and ``self.state`` becomes the fresh start observation, while the
            returned ``new_state`` is still the observation produced by the
            step itself.
        """
        action = self.train_env.action_space.sample()
        old_state = self.state

        new_state, reward, terminated, truncated, _ = self.train_env.step(action)
        # An episode is over when it terminated OR when it was truncated;
        # in both cases the environment has to be reset before the next step.
        if terminated or truncated:
            self.state, _ = self.train_env.reset()
        else:
            self.state = new_state
        return old_state, action, reward, new_state

    def best_value_and_action_on_test_environment(self, state):
        """Find the greedy action for ``state`` according to the Q-table.

        Returns:
            ``(best_value, best_action)``. Ties are resolved in favour of the
            lowest action index; unseen pairs count as ``0.0``. Both entries
            are ``None`` if the action space is empty.
        """
        best_value, best_action = None, None
        for action in range(self.train_env.action_space.n):
            action_value = self.values[self._key(state, action)]
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_value, best_action

    def value_update(self, s, a, r, next_s):
        """Apply one Q-learning update for the transition ``(s, a, r, next_s)``.

        The target is ``r + gamma * max_a' Q(next_s, a')`` and the stored
        value is moved towards it by the learning rate:
        ``Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * target``.
        """
        best_v, _ = self.best_value_and_action_on_test_environment(next_s)
        new_v = r + self.gamma_ * best_v
        key = self._key(s, a)
        old_v = self.values[key]
        self.values[key] = old_v * (1 - self.alpha_) + new_v * self.alpha_

    def play_episode(self, test_env: "gym.Env"):
        """Play one full episode in ``test_env`` following the greedy policy.

        The Q-table is not updated by the environment's rewards here; the
        episode is only used for evaluation.

        Returns:
            The sum of rewards collected during the episode.
        """
        total_reward = 0.0
        state, _ = test_env.reset()
        while True:
            _, action = self.best_value_and_action_on_test_environment(state)
            state, reward, terminated, truncated, _ = test_env.step(action)
            total_reward += reward

            # Stop on truncation as well, otherwise the loop keeps stepping
            # an environment whose episode has already been cut off.
            if terminated or truncated:
                break

        return total_reward
    
# Experiment configuration
ENV_NAME = "FrozenLake-v1"
GAMMA = 0.9            # discount factor passed to the agent
ALPHA = 0.2            # learning rate passed to the agent
TEST_EPISODE = 20      # number of evaluation episodes averaged per iteration
SOLVED_REWARD = 0.9    # average evaluation reward above which training stops

# Two separate environments: one is stepped randomly for training, the other
# is used for greedy evaluation episodes so the two do not share episode state.
train_env = gym.make(id = ENV_NAME)
test_env = gym.make(id = ENV_NAME, render_mode = None)
Agent = AgentClass(train_env=train_env, alpha_=ALPHA, gamma_=GAMMA)
writer = SummaryWriter(comment = "-FrozenLake-Q-Learning")

iter_no = 0
best_reward = 0

try:
    while True:
        iter_no += 1

        # One random training step followed by one Q-table update.
        state_, action_, reward_, new_state_ = Agent.sample_env()
        Agent.value_update(state_, action_, reward_, new_state_)

        # Evaluate the current greedy policy: mean reward over TEST_EPISODE episodes.
        reward = 0.0
        for _ in range(TEST_EPISODE):
            reward += Agent.play_episode(test_env)
        reward /= TEST_EPISODE

        writer.add_scalar("reward", reward, iter_no)
        if reward > best_reward:
            print(f"Best reward updated : {best_reward} -> {reward}")
            best_reward = reward

        if reward > SOLVED_REWARD:
            print(f"Solved in {iter_no} iterations")
            break
finally:
    # Release the writer and BOTH environments even if the loop is
    # interrupted or raises.
    writer.close()
    train_env.close()
    test_env.close()

Best reward updated : 0 -> 0.1
Best reward updated : 0.1 -> 0.2
Best reward updated : 0.2 -> 0.25
Best reward updated : 0.25 -> 0.35
Best reward updated : 0.35 -> 0.4
Best reward updated : 0.4 -> 0.45
Best reward updated : 0.45 -> 0.5
Best reward updated : 0.5 -> 0.55
Best reward updated : 0.55 -> 0.65
Best reward updated : 0.65 -> 0.75
Best reward updated : 0.75 -> 0.8
Best reward updated : 0.8 -> 0.9
Best reward updated : 0.9 -> 0.95
Solved in 8019 iterations
