In [3]:
from collections import deque
import numpy as np
import argparse
import os
import time
from termcolor import colored

In [1]:
import numpy as np

class Qworld:
    def __init__(self):
        self.col = 4
        self.row = 6
        self.q_table = np.zeros([self.row, self.col])
        self.init_transition_table()
        self.init_reward_table()
        self.gamma = 0.9
        self.epsilon = 0.9
        self.epsilon_decay = 0.9
        self.epsilon_min = 0.1
        self.reset()

    def reset(self):
        self.state = 0
        return self.state

    def is_in_win_state(self):
        return self.state == 2

    def init_reward_table(self):
        self.reward_table = np.zeros([self.row, self.col])
        self.reward_table[1, 2] = 100
        self.reward_table[4, 2] = -100

    def init_transition_table(self):
        self.transition_table = np.zeros([self.row, self.col], dtype=int)
        self.transition_table[0] = [0, 3, 1, 0]
        self.transition_table[1] = [0, 4, 2, 1]
        self.transition_table[2] = [2, 2, 2, 2]
        self.transition_table[3] = [3, 3, 4, 0]
        self.transition_table[4] = [3, 4, 5, 1]
        self.transition_table[5] = [5, 5, 5, 5]

    def step(self, action):
        next_state = self.transition_table[self.state, action]
        done = next_state == 2 or next_state == 5
        reward = self.reward_table[self.state, action]
        return next_state, reward, done

    def act(self):
        if np.random.rand() < self.epsilon:
            return np.random.randint(4)
        return np.argmax(self.q_table[self.state])

    def update_q_table(self, state, action, reward, next_state):
        best_next = np.max(self.q_table[next_state])
        self.q_table[state, action] = reward + self.gamma * best_next

    def update_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


q_world = Qworld()
episode_count = 100
maxwins = 10
wins = 0
scores = []

for episode in range(episode_count):
    state = q_world.reset()
    done = False
    step = 0

    while not done and step < 50:
        action = q_world.act()
        next_state, reward, done = q_world.step(action)
        q_world.update_q_table(state, action, reward, next_state)
        state = next_state
        step += 1

    if q_world.is_in_win_state():
        wins += 1
        scores.append(step)
        if wins >= maxwins:
            break

    q_world.update_epsilon()

print("Training finished. Scores (steps to win):", scores)
print("Learned Q-table:\n", q_world.q_table)


Training finished. Scores (steps to win): []
Learned Q-table:
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [2]:
!pip install gym

Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
     ---------------------------------------- 0.0/721.7 kB ? eta -:--:--
     ------------------------------------- 721.7/721.7 kB 14.8 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting cloudpickle>=1.2.0 (from gym)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting gym_notices>=0.0.4 (from gym)
  Downloading gym_notices-0.0.8-py3-none-any.whl.metadata (1.0 kB)
Downloading cloudpickle-3.1.1-py3-none-any.whl (20 kB)
Downloading gym_notices-0.0.8-py3-none-any.whl (3.0 kB)
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml): started
  Building wheel for gym (pyproject.tom


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\prave\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
