In [1]:
import gym
import numpy as np
import random
import time
from IPython.display import clear_output

In [2]:
# Q-Learning
def q_learn(num_episodes=10000, max_steps_per_episode=100,
            learning_rate=0.1, discount_rate=0.99,
            max_exploration_rate=1, min_exploration_rate=0.01,
            exploration_decay_rate=0.005):
    """Train a tabular Q-learning agent on gym's "FrozenLake-v0" environment.

    Uses the classic gym API in which ``env.reset()`` returns the state and
    ``env.step()`` returns ``(new_state, reward, done, info)``.

    Parameters
    ----------
    num_episodes : int
        Number of training episodes.
    max_steps_per_episode : int
        Upper bound on the steps taken in one episode.
    learning_rate : float
        Weight given to the new estimate in each Q-value update.
    discount_rate : float
        Discount applied to the best Q-value of the next state.
    max_exploration_rate, min_exploration_rate : float
        Bounds of the epsilon-greedy exploration rate; training starts at the
        maximum and decays towards the minimum.
    exploration_decay_rate : float
        Exponential decay constant applied per episode.

    Returns
    -------
    numpy.ndarray
        The learned Q-table, shape ``(n_states, n_actions)``.

    Side effects: prints the average reward of every block of 1000 episodes
    (the last block may be shorter) followed by the final Q-table.
    """
    report_every = 1000

    env = gym.make("FrozenLake-v0")
    try:
        action_space_size = env.action_space.n
        state_space_size = env.observation_space.n
        q_table = np.zeros((state_space_size, action_space_size))

        exploration_rate = max_exploration_rate
        rewards_all_episodes = []
        for episode in range(num_episodes):
            state = env.reset()
            rewards_current_episode = 0
            for step in range(max_steps_per_episode):
                # Epsilon-greedy choice. A row that is still all zeros carries
                # no information, so argmax would always pick action 0 there;
                # explore instead.
                exploration_rate_threshold = random.uniform(0, 1)
                row_is_untrained = bool(np.all(q_table[state, :] == 0))
                if exploration_rate_threshold > exploration_rate and not row_is_untrained:
                    action = np.argmax(q_table[state, :])
                else:
                    action = env.action_space.sample()

                new_state, reward, done, info = env.step(action)

                # Blend the old estimate with the bootstrapped target
                # reward + discount * max_a' Q(new_state, a').
                target = reward + discount_rate * np.max(q_table[new_state, :])
                q_table[state, action] = (
                    q_table[state, action] * (1 - learning_rate)
                    + learning_rate * target
                )

                state = new_state
                rewards_current_episode += reward
                if done:
                    break

            # Exponentially decay exploration from max towards min.
            exploration_rate = min_exploration_rate + \
                (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
            rewards_all_episodes.append(rewards_current_episode)
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    # Slice explicitly instead of np.split(..., num_episodes/1000): that passed
    # a float section count and only worked for exact multiples of 1000.
    rewards = np.array(rewards_all_episodes)
    print('Avg reward per thousand episodes. . .')
    for start in range(0, len(rewards), report_every):
        chunk = rewards[start:start + report_every]
        print(str(start + len(chunk)) + " : " + str(chunk.sum() / len(chunk)))
    print('Final Q-table. . .')
    print(q_table)
    return q_table

In [3]:
# Train the agent: prints the average reward per thousand episodes and the final Q-table.
q_learn()

Avg reward per thousand episodes. . .
1000 : 0.073
2000 : 0.176
3000 : 0.458
4000 : 0.653
5000 : 0.681
6000 : 0.683
7000 : 0.672
8000 : 0.7
9000 : 0.677
10000 : 0.671
Final Q-table. . .
[[0.52361128 0.4926338  0.4754489  0.47199896]
 [0.28347797 0.25482439 0.17058587 0.43988673]
 [0.12898662 0.35450424 0.20427519 0.2764791 ]
 [0.09509045 0.18622603 0.1688444  0.33386656]
 [0.55181839 0.3240733  0.40375176 0.32693301]
 [0.         0.         0.         0.        ]
 [0.15307972 0.15600793 0.29315357 0.04904804]
 [0.         0.         0.         0.        ]
 [0.42649661 0.34041215 0.3337783  0.60812451]
 [0.35568093 0.66722211 0.3667487  0.45235653]
 [0.60141024 0.36947384 0.26862358 0.31393099]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.43333664 0.47940719 0.78101413 0.34111795]
 [0.6957428  0.88678738 0.69898502 0.70256575]
 [0.         0.         0.         0.        ]]


In [None]:
def play(q_table=None, env=None, max_steps_per_episode=None,
         num_attempts=3, step_delay=0.3, attempt_delay=1):
    """Render a few episodes played greedily from a Q-table.

    Parameters
    ----------
    q_table : array indexed as ``q_table[state, action]``, optional
        Q-values to act on. If omitted, a global named ``q_table`` is used.
    env : gym-style environment, optional
        Must provide ``reset()``, ``step(action)`` returning
        ``(new_state, reward, done, info)``, and ``render()``. If omitted, a
        global named ``env`` is used; failing that, a fresh "FrozenLake-v0"
        environment is created and closed when play finishes.
    max_steps_per_episode : int, optional
        Step limit per attempt. If omitted, a global of the same name is used,
        else 100.
    num_attempts : int
        Number of episodes to play.
    step_delay, attempt_delay : float
        Seconds to pause after each rendered frame / before each attempt.

    Raises
    ------
    ValueError
        If no Q-table is passed and no global ``q_table`` exists.
    """
    # Backward compatibility: this function used to read these three names as
    # notebook globals, so fall back to them when an argument is omitted.
    notebook_globals = globals()
    if q_table is None:
        q_table = notebook_globals.get("q_table")
    if q_table is None:
        raise ValueError(
            "play() needs a Q-table: pass q_table=... or define a global `q_table`."
        )
    q_table = np.asarray(q_table)
    if max_steps_per_episode is None:
        max_steps_per_episode = notebook_globals.get("max_steps_per_episode", 100)
    if env is None:
        env = notebook_globals.get("env")
    created_env = env is None
    if created_env:
        env = gym.make("FrozenLake-v0")

    try:
        for episode in range(num_attempts):
            state = env.reset()
            print("Attempt Number : ",episode+1)
            time.sleep(attempt_delay)

            for step in range(max_steps_per_episode):
                # Redraw the board in place so the output reads as an animation.
                clear_output(wait=True)
                env.render()
                time.sleep(step_delay)

                # Act greedily on the Q-values of the *current* state.
                action = np.argmax(q_table[state, :])
                new_state, reward, done, info = env.step(action)

                if done:
                    clear_output(wait=True)
                    env.render()
                    if reward==1:
                        print("Goal reached. . .")
                    else:
                        print("Mission failed. . .")
                    # The episode is over; do not step a finished environment.
                    break

                # Advance, otherwise every action is chosen from the start state.
                state = new_state
    finally:
        # Only close an environment this function created itself.
        if created_env:
            env.close()

In [None]:
# Play game
