Installing dependencies

In [1]:
!pip install numpy
!pip install gym



Importing necessary libraries

In [2]:
import numpy as np
import gym
import random

Creating the environment

In [3]:
env = gym.make("FrozenLake-v0")

Initiating Q table

In [4]:
action_size = env.action_space.n
state_size = env.observation_space.n

In [5]:
# create our Q table with state_size rows and action_size column(64x4)
qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


Creating hyper-parameters

In [6]:
total_episodes = 20000 #Total episodes
learning_rate = 0.7 #Learning rate
max_steps = 99 #Max steps per episode
gamma = 0.95 #Discounting rate

#Exploration parameters
epsilon = 1.0 #Exploration rate
max_epsilon = 1.0 #Exploration probability at start
min_epsilon = 0.01 #Minimum exploration probability
decay_rate = 0.005 #Exponential decay rate for exploration prob

Q-Learning algorithm

In [9]:
#List of rewards 
rewards = []

#2 for life or until learning is stopped
for episode in range(total_episodes):
  # Reset the environment
  state = env.reset()
  step = 0
  done = False
  total_rewards = 0
  
  for step in range(max_steps):
    #3. Choose an action a in the current world state (s)
    ## First we randonize a number
    exp_exp_tradeoff = random.uniform(0, 1)

    ## if this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
    if exp_exp_tradeoff > epsilon:
      action = np.argmax(qtable[state, :])
      #print(exp_exp_tradeot, "action", action)

    # Else doing o randon cholce --> exploration
    else:
      action = env.action_space.sample()
      #print("action randon", action)

    # Take the action (a) and observe the outcome state(s') and reward (r)
    new_state, reward, done, info = env.step(action)
    
    # Update Q(s,a):= Q(s,a) + Ir [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
    # qtable[new_state, :] : all the actions we can take from new state
    qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
    
    total_rewards += reward
    
    # our new state is state
    state = new_state
    
    # if done (1F we're dead) : finish episode
    if done == True:
      break
    
    # Reduce epsilon (because we need less and less exploration) 
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards) 

print ("Score over time: " + str(sum(rewards)/total_episodes))
print(qtable) 
      


Score over time: 0.0
[[2.59279733e-01 8.04156935e-02 5.99814719e-02 7.40982763e-02]
 [4.90424888e-03 2.75377439e-03 4.02054285e-03 1.26455676e-01]
 [3.66005916e-02 5.14043257e-03 4.24936124e-03 5.70128363e-03]
 [4.86536754e-03 2.65784154e-03 3.47465990e-03 5.64553223e-03]
 [2.66966749e-01 2.76411591e-02 3.94492604e-02 9.58064885e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.29378672e-04 3.60892121e-05 1.44959663e-01 2.47248178e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.56318288e-02 1.06548618e-01 9.61038065e-02 3.68840321e-01]
 [4.82132273e-02 4.39660718e-01 1.72248494e-02 1.94575708e-02]
 [7.77430377e-01 1.26066064e-01 1.81698802e-02 7.19659713e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.40677970e-01 2.14671305e-02 6.45848430e-01 1.36547490e-01]
 [1.54882829e-01 8.92863718e-01 4.21762021e-01 3.88792123e-01]
 [0.00000000e+00 0.00000000e+00 0.

Using Q table to play now

In [10]:
env.reset()

for episode in range(5):
  state = env.reset()
  step = 0
  done = False
  print("********************************")
  print("EPISODE ", episode)
  
  for step in range(max_steps):
    
    # Take the action (index) that have the maximum expected future reward given that state
    action = np.argmax(qtable[state, :])
    
    new_state, reward, done, info = env.step(action)
    
    if done:
      # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
      env.render()
      if new_state == 15:
        print("We reached our Goal ")
      else:
        print("We fell into a hole ")
      
      # We print the number of step it took.
      print("Number of steps", step)
      
      break
    state = new_state
env.close()



********************************
EPISODE  0
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
We fell into a hole 
Number of steps 29
********************************
EPISODE  1
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
We reached our Goal 
Number of steps 6
********************************
EPISODE  2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
We reached our Goal 
Number of steps 33
********************************
EPISODE  3
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
We fell into a hole 
Number of steps 65
********************************
EPISODE  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
We reached our Goal 
Number of steps 34
