# Q-Learning with OpenAI's Frozen Lake
## A tabular approach to solving a simple RL environment
### Adapted from https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0


In [1]:
import gym
import numpy as np

 

## First, load the environment from OpenAI's Gym

In [12]:
# Build the 'FrozenLake-v0' environment through Gym's registry. The cells below
# read its observation/action space sizes and call reset() and step() on it.
environment = gym.make('FrozenLake-v0')

## Set up the learning parameters


In [78]:
# Hyperparameters for the tabular Q-learning run
num_episodes = 2000   # how many episodes to train for
learning_rate = 0.8   # step size of each Q-value update
gamma = 0.95          # discount factor

# Q-table with one row per state and one column per action, initialised to zero
Q = np.zeros(
    shape=(environment.observation_space.n, environment.action_space.n)
)

# Bellman Equation

$Q(s,a) = r + \gamma \max_{a'} Q(s',a')$


In [1]:
rewards = []                   # total reward collected in each episode
max_steps_per_episode = 99     # hard cap on the number of steps in one episode

for i in range(num_episodes):
    s = environment.reset()
    episode_rewards = 0
    dead = False

    # Q-learning
    for j in range(1, max_steps_per_episode + 1):
        # Greedy choice over noise-perturbed Q-values. Note this is NOT epsilon-greedy:
        # Gaussian noise scaled by 1/(i+1) is added to every action value, so the
        # amount of exploration shrinks as the episode index i grows.
        noise = np.random.randn(1, environment.action_space.n) * (1. / (i + 1))
        a = np.argmax(Q[s, :] + noise)

        # Obtain next state, reward and the episode-finished flag from the environment
        s_next, r, dead, _ = environment.step(a)

        # Move Q[s, a] towards the target r + gamma * max_a' Q[s_next, a']
        Q[s, a] += learning_rate * (r + gamma * np.max(Q[s_next, :]) - Q[s, a])

        episode_rewards += r
        s = s_next

        # The flag marks the end of the episode; presumably it is set on any
        # terminal state (goal as well as hole), not only on "dying".
        if dead:
            break

    rewards.append(episode_rewards)

NameError: name 'num_episodes' is not defined

In [80]:
# Average reward per episode. Divide by the number of episodes actually recorded in
# `rewards` (not by `num_episodes`) so the figure stays correct even if `num_episodes`
# was edited after training; report NaN rather than crash when nothing was recorded.
average_reward = sum(rewards) / len(rewards) if rewards else float("nan")
print("Score over time: " + str(average_reward))

Score over time: 0.5755


In [81]:
# Print a heading followed by the learned Q-table (rows index states, columns index actions).
print("Final Q-Table Values", Q, sep="\n")

Final Q-Table Values
[[1.56786150e-01 1.20493667e-02 1.59964412e-02 7.84281891e-03]
 [1.57986526e-03 7.60872946e-05 3.41776748e-04 1.88285371e-01]
 [3.34543937e-03 5.05446600e-03 8.89221903e-03 2.15764209e-01]
 [1.88620218e-03 5.24769308e-05 4.64019862e-04 9.08262139e-02]
 [1.95922395e-01 6.20259374e-03 2.41488712e-04 2.35492666e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.31216545e-01 1.17066461e-05 1.06573426e-06 2.44585133e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.03230996e-03 5.87196530e-03 2.36439058e-03 2.02143983e-01]
 [3.92068206e-04 6.22622475e-01 2.78508657e-03 1.64853752e-05]
 [8.78431646e-02 9.89828842e-04 3.57954750e-04 7.34154236e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.32729862e-03 0.00000000e+00 8.85690855e-01 5.99737559e-03]
 [0.00000000e+00 9.93029302e-01 1.84302579e-03 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.