# In [1]:
import numpy as np

class QLearning:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The agent keeps a ``(num_states, num_actions)`` table of action-value
    estimates, initialised to zero, and indexes it directly with the state
    and action values it is given, so both must be valid integer indices
    into that table.

    The environment objects passed to :meth:`train` and :meth:`test` must
    provide ``reset()`` returning the initial state and ``step(action)``
    returning a ``(next_state, reward, done, info)`` 4-tuple.
    """

    def __init__(self, num_states, num_actions, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        """Create an agent with an all-zero Q-table.

        Args:
            num_states: Number of rows in the Q-table.
            num_actions: Number of columns in the Q-table.
            learning_rate: Step size applied to each TD error.
            discount_factor: Weight given to the bootstrapped next-state value.
            epsilon: Probability of picking a uniformly random action in
                :meth:`choose_action`.
        """
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.q_table = np.zeros((num_states, num_actions))

    def choose_action(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the action with the highest current Q-value
        (first one on ties, as ``np.argmax`` does).
        """
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.choice(self.num_actions)
        return np.argmax(self.q_table[state, :])

    def update_q_table(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for a single transition.

        Args:
            state: State the action was taken in.
            action: Action that was taken.
            reward: Reward received for the transition.
            next_state: State reached after the action.
            done: True when the transition ended the episode. In that case
                there is no future return to bootstrap from, so the target
                is just ``reward``. Defaults to False, which keeps the
                original four-argument call behaving as before.
        """
        if done:
            # Terminal transition: the value of what follows is zero by
            # definition, so do not read the next state's row.
            td_target = reward
        else:
            td_target = reward + self.discount_factor * np.max(self.q_table[next_state, :])
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * td_error

    def train(self, env, num_episodes, max_steps=None):
        """Run ``num_episodes`` episodes of epsilon-greedy Q-learning on ``env``.

        Args:
            env: Environment exposing ``reset()`` and ``step(action)``.
            num_episodes: Number of episodes to run.
            max_steps: Optional cap on steps per episode. ``None`` (the
                default) runs each episode until ``env`` reports ``done``.
        """
        for _ in range(num_episodes):
            state = env.reset()
            done = False
            steps = 0
            while not done and (max_steps is None or steps < max_steps):
                action = self.choose_action(state)
                next_state, reward, done, _info = env.step(action)
                # Pass ``done`` through so terminal transitions are not
                # bootstrapped from the next state's Q-values.
                self.update_q_table(state, action, reward, next_state, done)
                state = next_state
                steps += 1

    def test(self, env, num_episodes, max_steps=None):
        """Roll out the greedy policy and return the summed reward.

        Args:
            env: Environment exposing ``reset()`` and ``step(action)``.
            num_episodes: Number of evaluation episodes.
            max_steps: Optional cap on steps per episode. A purely greedy
                policy can cycle forever in an environment it has not
                learned to finish; the cap guards against that. ``None``
                (the default) runs until ``env`` reports ``done``.

        Returns:
            The sum of all rewards collected over every episode.
        """
        total_rewards = 0
        for _ in range(num_episodes):
            state = env.reset()
            done = False
            steps = 0
            while not done and (max_steps is None or steps < max_steps):
                action = np.argmax(self.q_table[state, :])
                next_state, reward, done, _info = env.step(action)
                total_rewards += reward
                state = next_state
                steps += 1
        return total_rewards

# Example usage
if __name__ == "__main__":

    class SimpleEnv:
        """One-dimensional corridor of five cells used as a demo environment.

        The agent starts in cell 0. Action 0 moves one cell left, any other
        action moves one cell right, and moves are clamped to the corridor.
        Reaching the last cell yields reward 1 and ends the episode.
        """

        def __init__(self):
            self.num_states = 5
            self.num_actions = 2
            self.state = 0

        def reset(self):
            """Put the agent back in cell 0 and return that state."""
            self.state = 0
            return self.state

        def step(self, action):
            """Move one cell and return ``(state, reward, done, info)``."""
            goal = self.num_states - 1
            delta = -1 if action == 0 else 1
            # Clamp to [0, goal] so moves at either end leave the agent in place.
            self.state = min(goal, max(0, self.state + delta))
            done = self.state == goal
            reward = 1 if done else 0
            return self.state, reward, done, {}

    # Train on the corridor, then evaluate the greedy policy.
    env = SimpleEnv()
    agent = QLearning(num_states=env.num_states, num_actions=env.num_actions)
    agent.train(env, num_episodes=1000)
    total_rewards = agent.test(env, num_episodes=100)
    print("Total rewards:", total_rewards)


# Output:
# Total rewards: 100
