In [5]:
# You only need to run this cell once (per time you launch the notebook, in colab or jupyterlab)
# Once you have installed gymnasium, you can comment out the next line
!pip install gymnasium
!pip install "gymnasium[classic-control]" # use quotes on mac
# !pip install "gymnasium[box2d]" # has known installation problems

In [2]:
import gymnasium as gym
import random
import time
import numpy as np

In [40]:
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False)  ##, render_mode = 'human'
action_space_size= env.action_space.n
obs_space_size= env.observation_space.n

q_table= np.zeros((obs_space_size, action_space_size))
print (q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [41]:
num_episodes= 100
t= 1000 ##Steps per episode
lr= 0.1
dr= 0.99
exp_r= 1
max_exp_r= 1
min_exp_r=0.01
exploration_decay_rate= 0.01


In [42]:
##training
total_rewards=[]
episode= 1
while episode<= num_episodes:
    total= 0
    observation, info = env.reset()
    reward_arr=[]
    for steps in range(t):
        exp_r_threshold = random.uniform (0,1)
        if exp_r_threshold > exp_r:
            action=np.argmax(q_table[observation, :])
        else:
            action = env.action_space.sample()
        new_observation, reward, terminated, truncated, info= env.step(action)
        print ("New Observation:", new_observation)
        print ("Action:", action)
        ##Update q_table
        q_table[observation, action]= (q_table[observation, action]*(1-lr)) + (lr* (reward+ (dr * np.max(q_table[new_observation, :]))))
        observation= new_observation
        total = total+reward
        reward_arr.append(reward)
        if terminated or truncated:
           observation, info= env.reset()
    mean_reward= np.mean(reward_arr)
    print (f"Average reward for {episode} episode is: ", mean_reward)
    print ("Cumulative Reward:", total)
    print ("Episode:", episode)
    exp_r= min_exp_r + (max_exp_r- min_exp_r)* np.exp(-exploration_decay_rate*episode)
    episode=episode+1
    total_rewards.append(total)

print ("Average rewards:", np.mean(total_rewards))
print ("Updated Q table:", q_table)
# Close the environment
env.close()

New Observation: 4
Action: 1
New Observation: 5
Action: 2
New Observation: 0
Action: 3
New Observation: 4
Action: 1
New Observation: 5
Action: 2
New Observation: 4
Action: 1
New Observation: 4
Action: 0
New Observation: 4
Action: 0
New Observation: 5
Action: 2
New Observation: 0
Action: 3
New Observation: 0
Action: 3
New Observation: 0
Action: 0
New Observation: 0
Action: 0
New Observation: 0
Action: 0
New Observation: 0
Action: 0
New Observation: 0
Action: 3
New Observation: 0
Action: 0
New Observation: 1
Action: 2
New Observation: 2
Action: 2
New Observation: 6
Action: 1
New Observation: 7
Action: 2
New Observation: 0
Action: 0
New Observation: 1
Action: 2
New Observation: 0
Action: 0
New Observation: 0
Action: 3
New Observation: 0
Action: 3
New Observation: 1
Action: 2
New Observation: 0
Action: 0
New Observation: 0
Action: 0
New Observation: 4
Action: 1
New Observation: 8
Action: 1
New Observation: 4
Action: 3
New Observation: 4
Action: 0
New Observation: 5
Action: 2
New Observatio

In [43]:
##Testing
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False, render_mode = 'human')  ##
episode=0
while episode<= 4:
    observation, info = env.reset()
    for steps in range(100):
        action=np.argmax(q_table[observation, :])   ##Decides whether at that position, it should go up, down left or right depending on q score
        new_observation, reward, terminated, truncated, info= env.step(action)
        observation= new_observation
        if terminated or truncated:
            if reward==1:
                print ("You have reached the goal!")
            else:
                ##observation, info= env.reset()
                print ("Oops! You are in the hole!")
            break
                
    episode= episode+1

env.close()

You have reached the goal!
You have reached the goal!
You have reached the goal!
You have reached the goal!
You have reached the goal!


In [44]:
q_table

array([[0.94148015, 0.95099005, 0.95099005, 0.94148015],
       [0.94148015, 0.        , 0.96059601, 0.95099005],
       [0.95099005, 0.970299  , 0.95099005, 0.96059601],
       [0.96059601, 0.        , 0.95099005, 0.95099005],
       [0.95099005, 0.96059601, 0.        , 0.94148015],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9801    , 0.        , 0.96059601],
       [0.        , 0.        , 0.        , 0.        ],
       [0.96059601, 0.        , 0.970299  , 0.95099005],
       [0.96059601, 0.9801    , 0.9801    , 0.        ],
       [0.970299  , 0.99      , 0.        , 0.970299  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9801    , 0.99      , 0.970299  ],
       [0.9801    , 0.99      , 1.        , 0.9801    ],
       [0.        , 0.        , 0.        , 0.        ]])

In [51]:
updated_q=np.zeros((16, 4))
for r in range (len(q_table)):
    if q_table[r].any()>0:
        c= np.argmax(q_table[r])
        updated_q[r][c]= 1
    else:
        c= 4
    print (f"Row {r}:", c)
print (updated_q)
        

Row 0: 1
Row 1: 2
Row 2: 1
Row 3: 0
Row 4: 1
Row 5: 4
Row 6: 1
Row 7: 4
Row 8: 2
Row 9: 1
Row 10: 1
Row 11: 4
Row 12: 4
Row 13: 2
Row 14: 2
Row 15: 4
[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 0.]]
