# Q_Learning
#### Terminology in Q-Learning
- *States(S)* : the current position of the agent in the environment.
- *Action(A)* : a step taken by the agent in a particular state.
- *Rewards* : for every action, the agent receives a reward or a penalty from the environment.
- *Episodes* : an episode ends when the agent can't take a new action. This happens when the agent has either achieved the goal or failed.
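- $Q(S_{t+1}, a)$ : the expected optimal Q-value of taking action $a$ in the next state $S_{t+1}$.
- $Q(S_t, A_t)$ : the current estimate of the Q-value of taking action $A_t$ in state $S_t$; it is updated towards $R_{t+1} + \gamma \max_a Q(S_{t+1}, a)$.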
- *Q-Table* : the agent maintains a Q-Table that stores one Q-value for every (state, action) pair.


In [1]:
# Create and start an off-screen virtual display of 1400x900 pixels (visible=0).
# NOTE(review): presumably required so the environment can render frames on a
# headless machine — confirm against the rendering code later in the notebook.
from pyvirtualdisplay import Display
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7f788b1f3490>

In [2]:
import numpy as np
import gym
import random
import imageio
from tqdm.notebook import trange

In [3]:
# Slippery variant of the same map, kept for reference:
# env = gym.make("FrozenLake-v1", desc=None, map_name="4x4", is_slippery=True)
# Build the 4x4 FrozenLake environment with slipping disabled.
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)
# Reset once; with this gym version reset() returns an (observation, info) tuple.
env.reset()

(0, {'prob': 1})

In [4]:
# Q-table: one row per state and one column per action, all initialised to zero.
Q = np.zeros([env.observation_space.n, env.action_space.n])

In [5]:
# Hyperparameters passed to train().
# Number of training episodes.
tre = 10000
# Epsilon (exploration rate) bounds: epsilon starts at max_e and decays
# exponentially towards min_e with per-episode decay rate dr.
max_e = 1.0
min_e = .05
dr = 0.0005
# Maximum number of steps taken in a single episode.
ms = 100
# Gamma: discount factor applied to the best next-state Q-value.
g = .95
# Learning rate: step size of each Q-value update.
lr = 0.7

In [6]:
def train(tre, min_e, max_e, dr, ms, env, Q, lr, g):
    """
    Train a tabular Q-learning agent on ``env`` (the Frozen Lake model).

    Each episode resets the environment and then takes up to ``ms``
    epsilon-greedy steps, applying the Q-learning update after every step.

    Parameters
    ----------
    tre : int
        Number of training episodes.
    min_e, max_e : float
        Lower and upper bound of the exploration rate epsilon.
    dr : float
        Exponential decay rate of epsilon, applied per episode.
    ms : int
        Maximum number of steps per episode.
    env : gym environment
        Must follow the API in which ``reset()`` returns
        ``(observation, info)`` and ``step()`` returns
        ``(observation, reward, terminated, truncated, info)``.
    Q : numpy.ndarray
        Q-table indexed as ``Q[state, action]``. It is updated in place.
    lr : float
        Learning rate of the Q-value update.
    g : float
        Discount factor (gamma).

    Returns
    -------
    numpy.ndarray
        The same ``Q`` array that was passed in, after training.
    """
    for episode in trange(tre):
        # Epsilon decays exponentially from max_e towards min_e.
        e = min_e + (max_e - min_e) * np.exp(-dr * episode)
        s = env.reset()[0]
        for _ in range(ms):
            # Epsilon-greedy action: exploit the table with probability 1 - e,
            # otherwise explore with a random action.
            if random.uniform(0, 1) > e:
                a = np.argmax(Q[s])
            else:
                a = env.action_space.sample()
            # New state
            s_, r, terminated, truncated, _info = env.step(a)
            # Bellman equation. A terminal state has no future return, so the
            # bootstrap term is dropped when the episode has terminated.
            future = 0.0 if terminated else np.max(Q[s_, :])
            Q[s, a] = Q[s, a] + lr * (r + g * future - Q[s, a])
            # The episode is over on termination (goal/hole) or on truncation
            # (time limit); stepping a finished environment is not allowed.
            if terminated or truncated:
                break
            # Continue from the new state
            s = s_
    return Q

In [7]:
# Run training; train() updates Q in place and also returns it.
Q =train(tre, min_e, max_e, dr, ms, env, Q, lr, g)

  0%|          | 0/10000 [00:00<?, ?it/s]

  if not isinstance(terminated, (bool, np.bool8)):


In [8]:
# Show the learned Q-table (rows: states, columns: actions).
Q

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77378094],
       [0.77378094, 0.857375  , 0.77378094, 0.81450625],
       [0.81450625, 0.        , 0.7737808 , 0.77378089],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.81450625],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.95      , 0.857375  ],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])