In [1]:
import gym
from gym import envs

In [2]:
import numpy as np
import time
from IPython.display import clear_output

In [10]:
from typing import Dict
class QTable:
    """Tabular Q-learning agent for environments with discrete spaces.

    The table has one row per observation and one column per action, sized
    from ``env.observation_space.n`` and ``env.action_space.n``.  All
    environment interaction uses the classic gym protocol in which
    ``env.reset()`` returns the initial observation and ``env.step(action)``
    returns ``(observation, reward, done, info)``.
    """

    def __init__(self, env):
        """Create an all-zero Q-table matching the spaces of ``env``."""
        self.n_observations = env.observation_space.n
        self.n_actions = env.action_space.n
        self.table = np.zeros((self.n_observations, self.n_actions))

    def get_action(self, state, epsilon=0.0):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value (first one on ties,
        as given by ``np.argmax``).  Randomness comes from NumPy's global
        RNG, so ``np.random.seed`` controls it.
        """
        if np.random.rand() < epsilon:
            return np.random.randint(0, self.n_actions)
        return np.argmax(self.table[state])

    def update_table(self, prev_state, next_state, action, reward, alpha=0.1, gamma=0.5):
        """Apply one Q-learning update for an observed transition.

        ``Q[s, a] <- Q[s, a] + alpha * (reward + gamma * max(Q[s']) - Q[s, a])``

        Args:
            prev_state: State index the action was taken in.
            next_state: State index reached after the action.
            action: Index of the action taken.
            reward: Reward received for the transition.
            alpha: Learning rate.
            gamma: Discount factor applied to the best next-state value.
        """
        q = self.table
        ps = prev_state
        ns = next_state
        a = action
        q[ps, a] = q[ps, a] + alpha*(reward + gamma*np.max(q[ns]) - q[ps, a])

    def evaluate(self, env, n, max_turns=200):
        """Return the average total reward of the greedy policy over ``n`` games.

        Args:
            env: Environment to play in.
            n: Number of games to play.
            max_turns: Upper bound on the number of steps per game.
        """
        scores = []
        for i_game in range(n):
            # Track the observation returned by the env itself instead of
            # reaching into wrapper internals (``env.env.s``).
            state = env.reset()
            total_score = 0
            for turn in range(max_turns):
                action = self.get_action(state, 0.0)
                state, reward, done, info = env.step(action)
                total_score += reward
                if done:
                    break
            scores.append(total_score)
        return np.average(scores)

    def animate(self, env, delay=0.5, max_turns=200):
        """Render one greedy game frame by frame in the notebook output.

        Args:
            env: Environment to play in; must support ``render()``.
            delay: Seconds to pause between frames.
            max_turns: Upper bound on the number of steps shown.
        """
        state = env.reset()
        env.render()
        time.sleep(delay)
        for turn in range(max_turns):
            clear_output(True)
            action = self.get_action(state, 0.0)
            state, reward, done, info = env.step(action)
            env.render()
            print(turn, action)
            if done:
                break
            time.sleep(delay)

    @classmethod
    def learn_from_env(cls, env, n, alpha, gamma, epsilon, max_turns=200):
        """Train a new table with epsilon-greedy Q-learning and return it.

        Prints ``episode total_score`` every 1000 episodes as a progress log.

        Args:
            env: Environment to learn from.
            n: Number of training episodes.
            alpha: Learning rate passed to ``update_table``.
            gamma: Discount factor passed to ``update_table``.
            epsilon: Exploration probability passed to ``get_action``.
            max_turns: Upper bound on the number of steps per episode.

        Returns:
            A trained instance of ``cls``.
        """
        # Instantiate via ``cls`` so subclasses get an instance of themselves.
        table = cls(env)
        for episode in range(n):
            ps = env.reset()
            total_score = 0
            for turn in range(max_turns):
                action = table.get_action(ps, epsilon=epsilon)
                ns, reward, done, info = env.step(action)
                total_score += reward
                table.update_table(ps, ns, action, reward, alpha=alpha, gamma=gamma)
                ps = ns
                if done:
                    break
            if episode%1000 == 0:
                print(episode, total_score)
        return table

In [11]:
# Seed NumPy's global RNG, which QTable.get_action draws from for exploration.
# NOTE(review): the environment's own RNG is not seeded here - confirm whether
# initial states are reproducible across runs.
np.random.seed(333)
env = gym.make('Taxi-v3')

# Train for 100000 episodes; progress is printed every 1000 episodes.
table = QTable.learn_from_env(
    env,
    n=100000,
    alpha=0.1,
    gamma=0.5,
    epsilon=0.2,
)



0 -479
1000 -168
2000 -12
3000 2
4000 -3
5000 7
6000 8
7000 -31
8000 -7
9000 -16
10000 -8
11000 6
12000 10
13000 -35
14000 -17
15000 12
16000 -17
17000 10
18000 -2
19000 -16
20000 -22
21000 -6
22000 -3
23000 6
24000 -22
25000 4
26000 -15
27000 11
28000 -22
29000 -9
30000 -4
31000 -12
32000 -9
33000 5
34000 0
35000 3
36000 -8
37000 -5
38000 -11
39000 -10
40000 -20
41000 0
42000 9
43000 -7
44000 6
45000 -10
46000 -5
47000 8
48000 2
49000 4
50000 -4
51000 -2
52000 10
53000 -42
54000 12
55000 12
56000 8
57000 -7
58000 -16
59000 9
60000 -36
61000 10
62000 -1
63000 5
64000 -19
65000 -9
66000 -4
67000 -9
68000 1
69000 -27
70000 -2
71000 -4
72000 6
73000 -15
74000 -15
75000 -20
76000 6
77000 -30
78000 -2
79000 -5
80000 13
81000 3
82000 7
83000 -26
84000 1
85000 4
86000 4
87000 -20
88000 -4
89000 -2
90000 1
91000 -13
92000 10
93000 9
94000 11
95000 8
96000 -11
97000 8
98000 3
99000 1


In [None]:
# Replay 20 greedy games; the pending clear wipes the previous game's last
# frame once the next game renders its first one.
for i in range(20):
    clear_output(wait=True)
    table.animate(env)

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
2 1


In [12]:
# Average total reward of the greedy policy over 1000 games.
table.evaluate(env, n=1000)

7.875