<a href="https://colab.research.google.com/github/sebastianoscarlopez/learning-deep-learning/blob/master/Q_learning_OpenAI_Gym.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Q-Learning from scratch

Reinforcement learning to solve the easy text-based games from OpenAI Gym.

Choose between Frozen Lake 4x4 or 8x8, Taxi, NChain and Roulette.

# Base Library

In [0]:
import numpy as np
import gym

In [2]:
# Select ONE environment: leave exactly one gym.make(...) line uncommented.
#env = gym.make('FrozenLake-v0')
#env = gym.make('FrozenLake8x8-v0')
env = gym.make('Taxi-v3')
#env = gym.make('NChain-v0')
#env = gym.make('Roulette-v0')
# Step cap stored on the environment; read back later as max_steps_per_episode.
env._max_episode_steps = 500
# Show the initial board. WARNING: NChain-v0 and Roulette-v0 do not provide a render.
env.reset()
env.render()

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [3]:
# Q-table: one row per discrete state, one column per discrete action.
# Example for FrozenLake-v0: 16 rows (4 x 4 board cells) and 4 columns
# (Up, Right, Down, Left).
n_states = env.observation_space.n
n_actions = env.action_space.n
Q = np.zeros((n_states, n_actions))
print('q_table shape', Q.shape)

q_table shape (500, 6)


# Generic code for Q-learning algorithm

In [0]:
# Training budget
num_episodes = 20000
# Per-episode step budget: reuse the step cap stored on the environment
max_steps_per_episode = env._max_episode_steps

# Q-learning update parameters
alpha = 0.1   # learning rate
gamma = 0.98  # discount rate

# Exploration schedule: the rate starts at exploration_max and decays
# towards exploration_min at a speed set by exploration_decay
exploration_max = 1
exploration_min = 0.01
exploration_decay = 0.001
exploration = exploration_max  # current exploration rate

Update q_table  

$Q(s,a) = (1 - \alpha) \, Q(s,a) + \alpha \left(R_t + \gamma \max_{a'} Q(s',a')\right)$

In [0]:
def update_q_table(s, a, s_new, reward, q_table=None, lr=None, discount=None):
  """Apply one tabular Q-learning update for the transition (s, a) -> s_new.

  The entry for (s, a) is moved towards the target
  ``reward + discount * max_a' Q(s_new, a')`` with step size ``lr``.
  The table is modified in place; nothing is returned.

  Args:
    s: index of the state the action was taken in.
    a: index of the action taken.
    s_new: index of the state reached after the action.
    reward: reward received for the transition.
    q_table: table to update, indexed as [state, action]. Defaults to the
      notebook-level ``Q``.
    lr: learning rate. Defaults to the notebook-level ``alpha``.
    discount: discount rate. Defaults to the notebook-level ``gamma``.
  """
  # Fall back to the notebook globals so existing four-argument calls keep working.
  table = Q if q_table is None else q_table
  step_size = alpha if lr is None else lr
  discount_rate = gamma if discount is None else discount

  # Bootstrapped target: immediate reward plus discounted best value of the next state.
  target = reward + discount_rate * table[s_new].max()
  table[s, a] = (1 - step_size) * table[s, a] + step_size * target

In [6]:
# Train the agent: epsilon-greedy action selection plus a Q-table update after every step.
rewards_all_episodes = []  # total reward collected in each episode
total_steps = []           # number of environment steps taken in each episode

# Restart the exploration schedule so re-running this cell repeats the same decay.
# Note: Q itself is not reset here; re-run the q-table cell to learn from scratch.
exploration = exploration_max

for episode in range(num_episodes):
    # Initialize new episode params
    state = env.reset()
    done = False
    reward_episode = 0
    steps_taken = 0

    for step in range(max_steps_per_episode):
        # Exploration-exploitation trade-off: exploit the best known action when the
        # random draw exceeds the current exploration rate, otherwise act randomly.
        random_draw = np.random.uniform()
        if random_draw > exploration:
            action = Q[state].argmax()
        else:
            action = env.action_space.sample()

        # Take the action and learn from the observed transition
        state_new, reward, done, info = env.step(action)
        update_q_table(state, action, state_new, reward)

        # Move to the new state and accumulate the episode reward
        state = state_new
        reward_episode += reward

        # `step` is a 0-based index, so the number of steps taken so far is step + 1
        steps_taken = step + 1

        # Did the game end?
        if done:
            break
    total_steps.append(steps_taken)

    # Exploration rate decay: exponential in the episode index, bounded below by exploration_min
    exploration = exploration_min + (exploration_max - exploration_min) * np.exp(-exploration_decay*episode)

    # Add current episode reward to total rewards list
    rewards_all_episodes.append(reward_episode)

    # Progress report every 1000 episodes: average reward over the last 1000 episodes
    if episode > 0 and episode % 1000 == 0:
        print('episode', episode, 'exploration', exploration, 'avg', np.average(rewards_all_episodes[-1000:]))

print('max steps', np.max(total_steps))
print(Q)


episode 1000 exploration 0.3742006467597279 avg -466.156
episode 2000 exploration 0.1439819304042466 avg -11.258
episode 3000 exploration 0.05928919768418531 avg 3.007
episode 4000 exploration 0.028132482499846838 avg 5.75
episode 5000 exploration 0.016670567529094613 avg 6.91
episode 6000 exploration 0.012453964654899695 avg 7.093
episode 7000 exploration 0.010902763145898971 avg 7.591
episode 8000 exploration 0.010332108001623487 avg 7.472
episode 9000 exploration 0.010122175706045813 avg 7.531
episode 10000 exploration 0.010044945930464861 avg 7.278
episode 11000 exploration 0.010016534683782344 avg 7.51
episode 12000 exploration 0.010006082770229794 avg 7.406
episode 13000 exploration 0.010002237726112912 avg 7.353
episode 14000 exploration 0.010000823213431913 avg 7.45
episode 15000 exploration 0.010000302843297297 avg 7.41
episode 16000 exploration 0.010000111409822971 avg 7.199
episode 17000 exploration 0.010000040985383415 avg 7.591
episode 18000 exploration 0.01000001507767994

# One play

With the Frozen Lake game, bear in mind that the lake is frozen, so the agent doesn't always move in the direction it chooses.

In [8]:
# Play a single episode, always taking the action with the highest learned
# Q-value for the current state, and draw the board after every move.
state = env.reset()
env.render()
done = False
while not done:
    action = np.argmax(Q[state])
    state, reward, done, info = env.step(action)
    env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |[35mB[0m: |
+---------+

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[34;1mG[0m|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[42mG[