In [4]:
#FROZEN LAKE GAME [Q LEARNING]

import numpy as np
import gym
import random

# --ENVIRONMENT CREATION--
env = gym.make("FrozenLake-v0")

# --Q-TABLE INITIALISATION--
# One row per discrete state and one column per discrete action, all starting at zero.
state_size = env.observation_space.n
action_size = env.action_space.n
qtable = np.zeros(shape=(state_size, action_size))

# --HYPERPARAMETERS--
total_episodes = 10 ** 6	# Number of training episodes
max_steps = 1000			# Upper bound on steps within a single episode
learning_rate = 0.2			# Step size of each Q-value update
# NOTE(review): with a discount this small, a reward n steps away is scaled by 0.1**n;
# the Q-table saved in this cell's output has start-state entries around 1e-7.
# Confirm that such a short-sighted agent is intended.
gamma = 0.1					# Discount applied to the best next-state Q-value

# Exploration parameters (epsilon-greedy schedule)
max_epsilon = 1.0			# Exploration probability at start
min_epsilon = 0.01			# Lowest exploration probability the schedule decays to
decay_rate = 0.01			# Exponential decay rate for exploration probability
epsilon = max_epsilon		# Current exploration rate, updated after every episode

# --Q-LEARNING ALGORITHM--
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads `env`, `qtable` and the hyperparameters defined earlier in this cell,
# updates `qtable` in place, and leaves the per-episode reward totals in
# `rewards` and the last exploration rate in `epsilon`.
rewards = [] # Total reward collected in each episode

for episode in range(total_episodes):
	# Start every episode from a freshly reset environment
	state = env.reset()
	total_rewards = 0 # Total rewards obtained in the episode

	for step in range(max_steps):
		# Epsilon-greedy choice: exploit with probability (1 - epsilon), explore otherwise
		if random.uniform(0, 1) > epsilon:
			# Exploitation --> taking the biggest Q-value for this state
			action = np.argmax(qtable[state, :])
		else:
			# Exploration --> uniformly sampled action
			action = env.action_space.sample()

		# Take the action (a) and observe the outcome state (s') and reward (r)
		new_state, reward, done, info = env.step(action)

		# Update Q(s,a) := Q(s,a) + lr * [reward + gamma * max_a' Q(s',a') - Q(s,a)]
		# qtable[new_state, :] holds the values of all actions available from s'
		td_target = reward + gamma * np.max(qtable[new_state, :])
		qtable[state, action] += learning_rate * (td_target - qtable[state, action])

		total_rewards += reward

		# Move to new state
		state = new_state

		# The environment signalled the end of the episode --> stop stepping
		if done:
			break

	rewards.append(total_rewards)

	# Reduce epsilon (because we need less and less exploration).
	# The exponent uses the number of completed episodes (episode + 1) rather
	# than reassigning the loop variable, which keeps the same decay schedule.
	epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode + 1))

print("Score over time: " + str(sum(rewards)/total_episodes))
print(qtable)



# Play Frozen Lake!
# Run a few episodes acting greedily on the learned Q-table (no exploration and
# no Q-table updates), rendering every step including the one the episode ends on.

# The table is not modified while playing, so it is shown once instead of per episode.
print(qtable)

for episode in range(5):
	# Each episode starts from a freshly reset environment
	state = env.reset()
	print("**********************************************************")
	print("EPISODE ", episode)

	for step in range(max_steps):
		env.render()
		# Take the action (index) that has the maximum expected future reward given that state
		action = np.argmax(qtable[state, :])

		new_state, reward, done, info = env.step(action)

		if done:
			# Render once more: the render at the top of the loop only shows the
			# state *before* the step, so the final state would otherwise be skipped.
			env.render()
			print("Number of steps", step + 1, "- final reward", reward)
			break

		state = new_state

env.close()

Score over time: 0.054376
[[6.96889543e-09 1.27536126e-07 1.47825720e-08 1.36166980e-08]
 [9.68820356e-09 7.10020509e-07 1.07225104e-08 1.05912529e-08]
 [1.37134110e-06 7.04172843e-08 1.67850429e-05 4.37170648e-07]
 [5.24918780e-07 7.17659046e-09 1.75521931e-08 4.81145547e-08]
 [9.92779555e-08 7.01781387e-08 1.13742246e-06 2.37133918e-08]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.56918638e-05 5.47424706e-04 3.52125695e-05 2.95505749e-07]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.30533873e-07 2.43596490e-06 1.04291019e-05 1.75848144e-05]
 [4.95649497e-05 2.98436152e-04 8.44564858e-04 2.01391291e-04]
 [1.91756156e-03 3.69726536e-03 2.27674082e-02 3.64161432e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.41580364e-04 7.75232042e-04 1.90313448e-03 1.40945697e-02]
 [5.04245070e-03 2.97642118e-02 1.37993546e-01 3.61683260e-01]
 [0.00000000e+00 0.00000000e+