In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
#Making the environment
import gymnasium as gym #alias
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True,render_mode="rgb_array") #for rendering inside the notebook display, use render_mode ="ansi"

In [3]:
#How to make your own
#desc=["SFFH","HFFF","FFHF","FFFG"]

In [4]:
#Number of states and actions
n_observations = env.observation_space.n #States
n_actions = env.action_space.n           #Action

In [5]:
n_rewards= env.reward_range
print("Range of rewards",n_rewards)

Range of rewards (0, 1)


In [6]:
print('Number of States',n_observations)
print('Number of possible actions',n_actions)

Number of States 16
Number of possible actions 4


In [7]:
#Initialize the Q-table to 0
import numpy as np
Q_table = np.zeros((n_observations,n_actions))
print(Q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [8]:
Q_table.shape #16 States with 4 actions each

(16, 4)

In [9]:
Q_table[9,:] #Q-values for State 10

array([0., 0., 0., 0.])

In [10]:
#Initialising variables for the process
#number of episode we will run
n_episodes = 50000

#maximum of iteration per episode- Or number of steps per episode
steps_allowed = 100

#initialize the exploration probability to 1
epsilon = 1

#exploartion decay rate for exponential decreasing
decay_rate = 0.001

# minimum of exploration proba
min_epsilon = 0.01

#max exploration rate
max_epsilon=1

#discounted factor
gamma = 0.99

#learning rate
lr = 0.1

In [11]:
#Storing rewards after each episode in a list
rewards_per_episode = list()

In [12]:
env.action_space.sample()

1

In [13]:
#we iterate over episodes
for e in range(n_episodes):   # loop over 10000 episode
  #we initialize the first state of the episode
  state = env.reset()[0]
  done = False       #This will keep track of whether the episode has been completed

  #sum the rewards that the agent gets from the environment
  total_reward = 0

  for step in range(steps_allowed):
    # epsilon greedy strategy
    # we initiate a random number between 0 and 1
    # if the random_number is less than the exploration proba(epsilon)
    #     the agent explores
    # else
    #     he exploits his knowledge

    if np.random.uniform(0,1) < epsilon:
        action = env.action_space.sample()
    else: #Exploit
        action = np.argmax(Q_table[state,:])

    # The environment runs the chosen action and returns
    # the next state, a reward and true if the episode is ended.
    next_state, reward, done, truncated, info = env.step(action)

    # We update our Q-table using the Q-learning iteration
    Q_table[state, action] = (1-lr) * Q_table[state, action] \
                       + lr*(reward + gamma* np.max(Q_table[next_state,:]))
    total_reward = total_reward + reward

    state = next_state
    # If the episode is finished, we leave the for loop
    if done:
        break

  #We update the exploration proba using exponential decay formula
  epsilon = exploration_rate = min_epsilon + \
    (max_epsilon-min_epsilon) * np.exp(-decay_rate*e)
  rewards_per_episode.append(total_reward)

In [14]:
len(rewards_per_episode)

50000

In [15]:
rewards_per_thousand_episodes=np.split(np.array(rewards_per_episode),n_episodes/1000)

In [16]:
count=1000
print('-----Average reward per thousand episodes-------')
for r in rewards_per_thousand_episodes:
  print(count,':' ,str(sum(r/1000)))
  count+=1000

-----Average reward per thousand episodes-------
1000 : 0.05100000000000004
2000 : 0.20800000000000016
3000 : 0.4290000000000003
4000 : 0.5430000000000004
5000 : 0.6230000000000004
6000 : 0.6370000000000005
7000 : 0.6970000000000005
8000 : 0.6600000000000005
9000 : 0.6880000000000005
10000 : 0.6800000000000005
11000 : 0.6720000000000005
12000 : 0.6820000000000005
13000 : 0.6950000000000005
14000 : 0.6790000000000005
15000 : 0.6630000000000005
16000 : 0.6580000000000005
17000 : 0.6730000000000005
18000 : 0.6700000000000005
19000 : 0.6940000000000005
20000 : 0.6800000000000005
21000 : 0.6970000000000005
22000 : 0.6820000000000005
23000 : 0.6750000000000005
24000 : 0.6790000000000005
25000 : 0.6970000000000005
26000 : 0.6850000000000005
27000 : 0.6560000000000005
28000 : 0.6990000000000005
29000 : 0.7000000000000005
30000 : 0.6940000000000005
31000 : 0.6670000000000005
32000 : 0.6620000000000005
33000 : 0.6760000000000005
34000 : 0.6480000000000005
35000 : 0.7250000000000005
36000 : 0.684

In [17]:
print('-----------Updated Q Table-----------')
print(Q_table)

-----------Updated Q Table-----------
[[0.52839502 0.49667213 0.49820403 0.49610166]
 [0.41211341 0.33916459 0.34036414 0.48468881]
 [0.38528509 0.42250591 0.4325106  0.45179409]
 [0.38540798 0.29153475 0.3577074  0.43922083]
 [0.54720286 0.46101725 0.36664834 0.43328481]
 [0.         0.         0.         0.        ]
 [0.38790748 0.17546262 0.19124391 0.17660036]
 [0.         0.         0.         0.        ]
 [0.42335323 0.39271393 0.34474778 0.59034506]
 [0.43468399 0.64327112 0.43602673 0.38364287]
 [0.65568866 0.44259121 0.26482783 0.39520299]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.46099164 0.543079   0.74083842 0.5656257 ]
 [0.72097017 0.89694691 0.77688188 0.75921604]
 [0.         0.         0.         0.        ]]


In [18]:
import time
from IPython.display import clear_output

In [19]:
#Visualising the game
for episode in range(3):
  state=env.reset()[0]
  done=False
  print('----------EPISODE:',episode+1,'---------\n\n\n\n')
  time.sleep(0.5)

  for step in range(steps_allowed):
    clear_output(wait=True)
    print(env.render())
    time.sleep(0.4)

    action = np.argmax(Q_table[state,:])
    new_state, reward, done, truncated, info = env.step(action)

    if done:
      clear_output(wait=True)
      print(env.render())
      if reward == 1:
          print("****You reached the goal!****")
          time.sleep(3)
      else:
          print("****You fell through a hole!****")
          time.sleep(3)
      clear_output(wait=True)
      break

    state=new_state

env.close()

[[[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]

 [[180 200 230]
  [204 230 255]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 ...

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [235 245 249]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]]
****You reached the goal!****


In [20]:
#Visualising in video
%pip install gymnasium[classic_control] comet_ml
import comet_ml
comet_ml.init(project_name="frozen_lake")
env = gym.wrappers.RecordVideo(env, 'gameplay video')

Collecting comet_ml
  Downloading comet_ml-3.47.0-py3-none-any.whl.metadata (3.9 kB)
Collecting everett<3.2.0,>=1.0.1 (from everett[ini]<3.2.0,>=1.0.1->comet_ml)
  Downloading everett-3.1.0-py2.py3-none-any.whl.metadata (17 kB)
Collecting python-box<7.0.0 (from comet_ml)
  Downloading python_box-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.8 kB)
Collecting requests-toolbelt>=0.8.0 (from comet_ml)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting semantic-version>=2.8.0 (from comet_ml)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting sentry-sdk>=1.1.0 (from comet_ml)
  Downloading sentry_sdk-2.14.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting simplejson (from comet_ml)
  Downloading simplejson-3.19.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting wurlitzer>=1.0.2 (from comet_ml)
  Downloading wurl



Please paste your Comet API key from https://www.comet.com/api/my/settings/
(api key may not show as you type)
Comet API key: ··········


[1;38;5;39mCOMET INFO:[0m Valid Comet API Key saved in /root/.comet.config (set COMET_CONFIG to change where it is saved).


In [21]:
#Visualising the game
for episode in range(3):
  state=env.reset()[0]
  done=False
  print('----------EPISODE:',episode+1,'---------\n\n\n\n')
  time.sleep(1)

  for step in range(steps_allowed):
    clear_output(wait=True)
    print(env.render())
    time.sleep(0.4)

    action = np.argmax(Q_table[state,:])
    new_state, reward, done, truncated, info = env.step(action)

    if done:
      clear_output(wait=True)
      print(env.render())
      if reward == 1:
          print("****You reached the goal!****")
          time.sleep(3)
      else:
          print("****You fell through a hole!****")
          time.sleep(3)
      clear_output(wait=True)
      break

    state=new_state

env.close()

[[[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]

 [[180 200 230]
  [204 230 255]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [204 230 255]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 ...

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [235 245 249]
  [180 200 230]]

 [[180 200 230]
  [235 245 249]
  [235 245 249]
  ...
  [204 230 255]
  [204 230 255]
  [180 200 230]]

 [[180 200 230]
  [180 200 230]
  [180 200 230]
  ...
  [180 200 230]
  [180 200 230]
  [180 200 230]]]
****You fell through a hole!****
