In [1]:
import sys
import gym
import numpy as np
import random
import time
from io import StringIO
from IPython.display import clear_output

In [2]:
# Terminal colour code for each supported colour name.
color2num = {
    "gray": 30,
    "red": 31,
    "green": 32,
    "yellow": 33,
    "blue": 34,
    "magenta": 35,
    "cyan": 36,
    "white": 37,
    "crimson": 38,
}


def colorize(string, color, bold=False, highlight=False):
    """Wrap ``string`` in terminal escape codes so that it prints colorized.

    Args:
        string: Text to wrap; it is interpolated with ``%s``.
        color: A key of ``color2num`` (gray, red, green, yellow, blue,
            magenta, cyan, white, crimson).
        bold: When true, the attribute ``1`` is appended to the code list.
        highlight: When true, 10 is added to the colour code before use.

    Returns:
        ``string`` preceded by ``ESC[<attrs>m`` and followed by the reset
        sequence ``ESC[0m``, where ``<attrs>`` is the ``;``-joined code list.

    Raises:
        KeyError: If ``color`` is not a key of ``color2num``.
    """
    code = color2num[color] + (10 if highlight else 0)
    pieces = [str(code)]
    if bold:
        pieces.append('1')
    return '\x1b[%sm%s\x1b[0m' % (';'.join(pieces), string)

In [3]:
# Q-Learning setup: random seeds, environment, Q-table and hyperparameters.

# Seed every random source so that a fresh "Restart & Run All" reproduces the
# same training run.
random_seed = 42
random.seed(random_seed)      # random.uniform draw used for explore/exploit
np.random.seed(random_seed)   # global NumPy RNG, in case library code draws from it

env = gym.make("FrozenLake-v0")
env.seed(random_seed)               # environment transition randomness
env.action_space.seed(random_seed)  # env.action_space.sample() while exploring

# One Q-table row per state and one column per action, initialised to zero.
action_space_size = env.action_space.n
state_space_size = env.observation_space.n
q_table = np.zeros((state_space_size, action_space_size))

# Training budget.
num_episodes = 10000
max_steps_per_episode = 100

# Q-learning update parameters.
learning_rate = 0.1
discount_rate = 0.99

# Exploration schedule: starts at the maximum rate and decays exponentially
# towards the minimum rate as episodes progress.
exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

# Total reward collected in each training episode, in episode order.
rewards_all_episodes = []

In [4]:
# Train the Q-table with epsilon-greedy Q-learning.
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0
    for step in range(max_steps_per_episode):
        # Exploit only when the random draw exceeds the current exploration
        # rate AND this state's row has at least one non-zero Q-value; an
        # all-zero row carries no preference, so argmax would always return
        # action 0 and we sample a random action instead.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate and np.any(q_table[state, :] != 0):
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()
        new_state, reward, done, info = env.step(action)
        # Blend the old estimate with the new target
        # (reward + discounted best value of the next state).
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))
        state = new_state
        rewards_current_episode += reward
        if done:
            break
    # Exponential decay of the exploration rate from max towards min.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    rewards_all_episodes.append(rewards_current_episode)

# Report the mean reward over consecutive chunks of `report_interval` episodes.
# Slicing (instead of np.split with a float section count) also handles an
# episode count that is not a multiple of the interval, and each average is
# divided by the actual chunk length.
report_interval = 1000
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + report_interval]
    for start in range(0, len(rewards_array), report_interval)
]
count = 0
print('Avg reward per thousand episodes. . .')
for r in rewards_per_thousand_episodes:
    count += len(r)
    print(str(count) + " : " + str(r.sum() / len(r)))
print('Final Q-table. . .')
print(q_table)

Avg reward per thousand episodes. . .
1000 : 0.047
2000 : 0.239
3000 : 0.395
4000 : 0.585
5000 : 0.625
6000 : 0.628
7000 : 0.655
8000 : 0.674
9000 : 0.656
10000 : 0.657
Final Q-table. . .
[[0.57458138 0.47220179 0.47440993 0.46100452]
 [0.2755426  0.24692047 0.20380788 0.45404105]
 [0.40447248 0.28380283 0.26674807 0.28307903]
 [0.05105296 0.20669668 0.03184018 0.1004704 ]
 [0.62331392 0.38040862 0.32412654 0.40928712]
 [0.         0.         0.         0.        ]
 [0.15360931 0.15024066 0.33161897 0.09272874]
 [0.         0.         0.         0.        ]
 [0.33794472 0.3923832  0.22171078 0.66283106]
 [0.47758364 0.70038628 0.32993201 0.4554218 ]
 [0.70571168 0.39486607 0.39731624 0.27979856]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.56894628 0.6020162  0.7921399  0.57388378]
 [0.71696434 0.88976479 0.7656382  0.72932225]
 [0.         0.         0.         0.        ]]


In [5]:
# Persist the learned Q-table to 'q_table.npy' (path is relative to the working directory).
np.save('q_table.npy', q_table)

In [6]:
# Reload the Q-table from 'q_table.npy', rebinding q_table, and show it as the cell output.
q_table = np.load('q_table.npy')
q_table

array([[0.57458138, 0.47220179, 0.47440993, 0.46100452],
       [0.2755426 , 0.24692047, 0.20380788, 0.45404105],
       [0.40447248, 0.28380283, 0.26674807, 0.28307903],
       [0.05105296, 0.20669668, 0.03184018, 0.1004704 ],
       [0.62331392, 0.38040862, 0.32412654, 0.40928712],
       [0.        , 0.        , 0.        , 0.        ],
       [0.15360931, 0.15024066, 0.33161897, 0.09272874],
       [0.        , 0.        , 0.        , 0.        ],
       [0.33794472, 0.3923832 , 0.22171078, 0.66283106],
       [0.47758364, 0.70038628, 0.32993201, 0.4554218 ],
       [0.70571168, 0.39486607, 0.39731624, 0.27979856],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.56894628, 0.6020162 , 0.7921399 , 0.57388378],
       [0.71696434, 0.88976479, 0.7656382 , 0.72932225],
       [0.        , 0.        , 0.        , 0.        ]])

In [7]:
def get_position(env, mode='human'):
    """Return the agent's current grid cell as ``(row, col)``.

    The cell is derived from the flat state index ``env.s`` and the grid
    width ``env.ncol``. Nothing is rendered or printed.

    Args:
        env: Environment object exposing the attributes ``s`` and ``ncol``.
        mode: Unused; retained so callers that pass it keep working.

    Returns:
        Tuple ``(row, col)`` equal to ``(env.s // env.ncol, env.s % env.ncol)``.
    """
    # divmod yields the floor quotient (row) and the remainder (column) together.
    return divmod(env.s, env.ncol)

In [9]:
# Replay the greedy policy from the learned Q-table for a few attempts,
# printing the agent's (row, col) before every move.
for episode in range(5):
    state = env.reset()
    done = False
    print("Attempt Number : ", episode + 1)
    time.sleep(1)

    # Default outcome: also covers running out of steps before the environment
    # reports done, so every attempt ends with a message and a clear.
    outcome = "Mission failed. . ."
    for step in range(max_steps_per_episode):
        print(get_position(env))
        time.sleep(0.3)
        # Always take the highest-valued action for the current state.
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            if reward == 1:
                outcome = "Goal reached. . ."
            break

        state = new_state

    print(outcome)
    # Keep the outcome visible for a moment: clear_output(wait=True) defers the
    # clear until the next output arrives, which is the next attempt's header.
    time.sleep(1)
    clear_output(wait=True)
env.close()

Attempt Number :  5
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(2, 0)
(1, 0)
(2, 0)
(2, 1)
(2, 2)
(3, 2)
(3, 2)
Goal reached. . .
