# Implement Reinforcement Learning Algorithm

## Definition of RL terminologies 

In [1]:
import numpy as np
from tqdm import tqdm

# Define your QLearning class
class QLearning:
    def __init__(self, state_space, action_space, learning_rate, discount_factor):
        self.state_space = state_space
        self.action_space = action_space
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_table = np.zeros((state_space, action_space))

    def get_action(self, state, epsilon):
        if np.random.rand() < epsilon:
            return np.random.choice(self.action_space)  # Random action (exploration)
        else:
            return np.argmax(self.q_table[state])  # Exploitation using learned Q-values

    def update_q_table(self, state, action, reward, next_state):
        old_value = self.q_table[state, action]
        next_max = np.max(self.q_table[next_state])
        new_value = (1 - self.learning_rate) * old_value + self.learning_rate * (reward + self.discount_factor * next_max)
        self.q_table[state, action] = new_value


## Q-Learning Algorithm

In [2]:
# Define your specific state space, action space, learning rate, and discount factor
state_space = 9  # number of states
action_space = 4  # number of actions
learning_rate = 0.2
discount_factor = 0.99

# Create an instance of the QLearning class
ql = QLearning(state_space, action_space, learning_rate, discount_factor)

# Simulate a number of episodes using tqdm for progress visualization
num_episodes = 10000
epsilon = 0.5  # Initial exploration rate
min_epsilon = 0.01  # Minimum exploration rate
decay_rate = 0.001  # Exploration rate decay
rewards = np.random.randn(state_space)  # Define random rewards for each state

for episode in tqdm(range(num_episodes)):
    state = np.random.randint(0, state_space)
    done = False

    while not done:
        action = ql.get_action(state, epsilon)

        # Simulate a transition - here, just a random transition for demonstration
        next_state = np.random.randint(0, state_space)
        reward = rewards[next_state]  # Use the defined rewards

        ql.update_q_table(state, action, reward, next_state)
        state = next_state

        if state == state_space - 1:
            done = True

    # Decay exploration rate epsilon
    epsilon = min_epsilon + (1 - min_epsilon) * np.exp(-decay_rate * episode)

print("Final Q-table:")
print(ql.q_table)


100%|██████████| 10000/10000 [00:00<00:00, 18544.14it/s]

Final Q-table:
[[26.59612402 26.62478417 26.598401   27.02019464]
 [26.54215703 26.5325388  26.88811188 26.53009904]
 [26.53513699 26.4886326  26.52049472 27.32266776]
 [26.6036865  26.56117206 26.80018192 26.66500551]
 [27.08214962 26.55504005 26.55245073 26.4486053 ]
 [26.53188503 26.71258365 26.54135352 26.82801822]
 [26.59497875 26.54551248 26.92728842 26.57705163]
 [26.5434918  26.92533249 26.47621987 26.50384238]
 [24.22957191 25.01815374 26.83266818 21.58941471]]



