<a href="https://colab.research.google.com/github/sahilfaizal01/Reinforcement-Learning/blob/main/Discrete_Q_Learning_on_FrozenLake.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Importing Libraries**

In [65]:
%matplotlib notebook
from IPython.display import clear_output
import time  # slow the game down a little bit
import gym
import numpy as np  # used for all kinds of matrix / vector operations
import matplotlib.pyplot as plt  # for plotting

In [66]:
# SFFF       (S: starting point, safe)
# FHFH       (F: frozen surface, safe)
# FFFH       (H: hole, fall to your doom)
# HFFG       (G: goal, where the frisbee is located)

# **Registering a new environment**

In [67]:
from gym.envs.registration import register

register(
  id='FrozenLakeNotSlippery-v3',
  entry_point='gym.envs.toy_text:FrozenLakeEnv',
  kwargs={'map_name' : '4x4', 'is_slippery': False},
  max_episode_steps=100,
  reward_threshold=.8196,
)

## **Testing environment with some random actions.**

In [68]:
from IPython.display import clear_output

In [69]:
env = gym.make('FrozenLakeNotSlippery-v3')

# Reset the environment to its initial state
state = env.reset()

# Define the number of steps for the agent to take
num_steps = 20

In [70]:
for _ in range(num_steps):
  action = env.action_space.sample()
  next_state, reward, done, _ = env.step(action)
  env.render()
  if done:
    print("Episode finished")
    state = env.reset()
  else:
    state = next_state

env.close()

If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Episode finished
Episode finished


In [71]:
action_size = env.action_space.n
state_size = env.observation_space.n

## **Initial Q-Table**

In [72]:
# Start with very small values for all our Q(s,a)
q_table = np.zeros([state_size, action_size])

In [73]:
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [74]:
q_table.shape

(16, 4)

# **Hyperparameters**

In [75]:
EPOCHS=20000  # number of epochs/episodes to train for
ALPHA = 0.8 # aka the learning rate
GAMMA = 0.95 # aka the discount rate

In [76]:
# Exploration vs. Exploitation parameters
epsilon = 1.0 # Exploration rate
max_epsilon = 1.0 # Exploration probability at start
min_epsilon = 0.01 # Minimum exploration probability
decay_rate = 0.001 # Exponential decay rate for exploration prob

# **Q-Table Update Functions Methodology**

In [77]:
def epsilon_greedy_action_selection(epsilon, q_table, discrete_state):
  random_number = np.random.random()
  # EXPLOITATION, USE BEST Q(s,a) Value
  if random_number > epsilon:
    # Action row for a particular state
    state_row = q_table[discrete_state,:]
    # Index of highest action for state
    # Recall action is mapped to index (e.g. 0=LEFT, 1=DOWN, etc..)
    action = np.argmax(state_row)
  # EXPLORATION, USE A RANDOM ACTION
  else:
    # Return a random 0,1,2,3 action
    action = env.action_space.sample()
  return action

## **FUNCTION FOR Q_VALUE COMPUTATION**



In [78]:
def compute_next_q_value(old_q_value, reward, next_optimal_q_value):
  return old_q_value +  ALPHA * (reward + GAMMA * next_optimal_q_value - old_q_value)


## **FUNCTION TO REDUCE EPSILON**

In [79]:
def reduce_epsilon(epsilon,epoch):
  return min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*epoch)

# **Training of Agent and Updating Q-Table**

In [80]:
q_table = np.zeros([state_size, action_size])
total_reward = 0
epsilon = 1

In [81]:
EPOCHS

20000

In [82]:
# List of rewards
rewards = []

# Play 20k games
for episode in range(EPOCHS):
  # Reset the environment
  state = env.reset()
  done = False
  total_rewards = 0

  while not done:
    action = epsilon_greedy_action_selection(epsilon,q_table, state)
    # Take the action (a) and observe the outcome state(s') and reward (r)
    new_state, reward, done, info = env.step(action)
    # Look up current/old qtable value Q(s_t,a_t)
    old_q_value =  q_table[state,action]
    # Get the next optimal Q-Value
    next_optimal_q_value = np.max(q_table[new_state, :])
    # Compute next q value
    next_q = compute_next_q_value(old_q_value, reward, next_optimal_q_value)
    # Update Q Table
    q_table[state,action] = next_q
    total_rewards = total_rewards + reward
    # Our new state is state
    state = new_state

  episode += 1
  # Reduce epsilon - less exploration needed
  epsilon = reduce_epsilon(epsilon,episode)
  rewards.append(total_rewards)
env.close()

In [83]:
q_table

array([[0.73509189, 0.77378094, 0.6983373 , 0.73509189],
       [0.73509189, 0.        , 0.66268191, 0.69833551],
       [0.69826931, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.95      , 0.857375  ],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

# **Using Learned Q Table Results**

In [85]:
env = gym.make('FrozenLakeNotSlippery-v3')

state = env.reset()
rewards = 0

for _ in range(30):
  action = np.argmax(q_table[state])  # and chose action from the Q-Table
  state, reward, done, info = env.step(action) # Finally perform the action
  env.render(mode='human')
  if done:
    print("Episode finished")
    state = env.reset()
  else:
    state = next_state

env.close()

Episode finished
Episode finished
Episode finished
Episode finished
Episode finished
Episode finished
Episode finished
Episode finished
Episode finished
Episode finished
