# Terminology
* Environment 
    * Agent
    * State 
    * Action 
    * Reward

* Agent: tries to maximize the total reward it collects by taking actions in the states of the environment

# Q-Learning

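$$Q[state, action] \leftarrow Q[state, action] + \alpha \left( reward + \gamma \max_{a'} Q[newState, a'] - Q[state, action] \right)$$

where $\alpha$ is the learning rate and $\gamma$ is the discount factor (`LEARNING_RATE` and `GAMMA` in the code below).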

In [None]:
import gym
# Create the FrozenLake environment and print the sizes of its discrete spaces.
env = gym.make('FrozenLake-v0')
print(env.observation_space.n)  # number of states
print(env.action_space.n)  # number of actions
# Keep the initial observation instead of discarding it: the illustrative
# cells further down index the Q-table with `state`, which would otherwise
# be undefined when the notebook is run from top to bottom.
state = env.reset()
action = env.action_space.sample()  # get a random action
# Take the action; step() returns the new state, the reward, a done flag and extra info.
new_state, reward, done, info = env.step(action)
env.render()  # render the GUI for the environment

# Frozen Lake Environment

In [None]:
import gym
import numpy as np
import time


# Build the environment once and read both Q-table dimensions from it:
# STATES is the size of the observation space, ACTIONS the size of the action space.
env = gym.make('FrozenLake-v0')
STATES, ACTIONS = env.observation_space.n, env.action_space.n

In [None]:
# Q-table with one row per state and one column per action, all values starting at zero.
Q = np.zeros(shape=(STATES, ACTIONS))
Q  # last expression of the cell, so the notebook displays the table

# Constants

In [None]:
# Training-length settings.
EPISODES = 10_000  # how many training episodes to run
MAX_STEPS = 100  # cap on the number of steps taken in a single episode

# Q-learning hyperparameters.
LEARNING_RATE = 0.81  # weight given to each new estimate in the Q-update
GAMMA = 0.96  # discount factor applied to the best next-state value

# Picking an Action

In [None]:
epsilon = 0.9  # start with a 90% chance of picking a random action

# Epsilon-greedy choice: draw once, then either explore with a random action
# or exploit the Q-table by taking the best-valued action for the current state.
explore = np.random.uniform(0, 1) < epsilon
action = env.action_space.sample() if explore else np.argmax(Q[state, :])

In [None]:
# One Q-learning update, split into its two parts:
# the target is the reward plus the discounted best value of the new state...
td_target = reward + GAMMA * np.max(Q[new_state, :])
# ...and the stored value moves toward that target by a LEARNING_RATE-sized step.
Q[state, action] += LEARNING_RATE * (td_target - Q[state, action])

# Putting it together

In [None]:
import gym
import numpy as np
import time


# Fresh environment; the Q-table dimensions come from its discrete spaces.
env = gym.make('FrozenLake-v0')
STATES, ACTIONS = env.observation_space.n, env.action_space.n

# Q-table initialised to zeros: rows are states, columns are actions.
Q = np.zeros(shape=(STATES, ACTIONS))

EPISODES = 1_500  # how many training episodes to run
MAX_STEPS = 100  # cap on the number of steps taken in a single episode

LEARNING_RATE = 0.81  # weight given to each new estimate in the Q-update
GAMMA = 0.96  # discount factor applied to the best next-state value

RENDER = False  # set to True to render the environment at every training step
epsilon = 0.9  # initial probability of taking a random action

In [None]:
from numpy.lib.function_base import average
# Train the Q-table with epsilon-greedy Q-learning.
# `rewards` collects the last reward of every episode, in order, so that
# entry k always corresponds to episode k.
rewards = []
for episode in range(EPISODES):
    state = env.reset()
    reward = 0  # value recorded if the episode takes no step at all

    for _ in range(MAX_STEPS):
        if RENDER:
            env.render()

        # Explore with probability epsilon, otherwise exploit the Q-table.
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])

        next_state, reward, done, _ = env.step(action)

        # Move Q[state, action] toward reward + discounted best next-state value.
        Q[state, action] = Q[state, action] + LEARNING_RATE * (
            reward + GAMMA * np.max(Q[next_state, :]) - Q[state, action]
        )

        state = next_state

        if done:
            break

    # Record the outcome and decay epsilon once per episode, whether the
    # episode ended with `done` or was cut off by MAX_STEPS. Doing this only
    # on `done` would silently drop truncated episodes from the statistics.
    rewards.append(reward)
    # Clamp at zero so epsilon stays a valid probability after many episodes.
    epsilon = max(epsilon - 0.001, 0.0)


print(Q)
# Guard the division so an empty run (EPISODES == 0) reports nan instead of crashing.
average_reward = sum(rewards) / len(rewards) if rewards else float("nan")
print(f"Average reward: {average_reward}:")



In [None]:
import matplotlib.pyplot as plt

def get_average(values):
    """Return the arithmetic mean of *values*.

    *values* must support ``sum()`` and ``len()``; an empty input raises
    ``ZeroDivisionError`` because the total is divided by the length.
    """
    total = sum(values)
    count = len(values)
    return total / count


# Average the recorded rewards in consecutive buckets of 100 entries
# (the last bucket may be shorter), then plot one point per bucket.
avg_rewards = [
    get_average(rewards[start:start + 100])
    for start in range(0, len(rewards), 100)
]

plt.plot(avg_rewards)
plt.xlabel('episodes (100\'s)')
plt.ylabel('average reward')
plt.show()