## FrozenLake-v0

In [0]:
import gym
import numpy as np
import sys
from collections import defaultdict

In [0]:
#Loading the envrionment

env = gym.make('FrozenLake-v0')

In [3]:
print(env.action_space)
print(env.observation_space)

Discrete(4)
Discrete(16)


## MDP

So, the above environment is a ***finite Markov Decision Process***, with
- a finite set of states  (in this case 16)
- a finite set of actions (in this case 4)
- a finite set of Rewards (in this case 1 if you reach the goal, 0 otherwise)
- one-step dynamics of the environment (basically the fact that the environment changes from one state to the next one by an action taken by our agent)
- a discount rate  $\gamma\   \epsilon [0,1]$ (a higher gamma corresponds to higher priority of the agent to attain future rewards along with present one)

## Q-Learning

It is an RL algorithm which tries to find the optimal policy for the MDP at hand. It does this by making an ***action-value table***  - a table which has for every state and action, a corresponding value which is the sum of all rewards the agent would get if it takes the specific action for the specific state and follows the policy from thereafter. 

In [0]:
def action_from_eps_greedy_policy(state, Q, epsilon, nA):
    #greedy action
    max_ = np.argmax(Q[state])
    prob = get_prob(nA, epsilon, max_)
    #choosing a random action with probabilities (1 - epsilon) for action having max Q-value
    #and epsilon for all other actions
    #thus giving agent to both exploit it's knowledge and explore the environment
    action = np.random.choice(np.arange(nA), p=prob)
    return action

def get_prob(nA, epsilon, max_):
    #initial probablities
    prob = np.ones(nA)*epsilon/(nA-1)
    prob[max_] = 1 - epsilon
    return prob
    
def q_learning(env, num_episodes, alpha, nActions, gamma=1.0, eps_start=1, eps_decay=0.9999, eps_min=0.05):
    # initialize action-value function (empty dictionary of numpy arrays)
    Q = defaultdict(lambda: np.ones(nActions))
    epsilon = eps_start
    # loop over episodes
    for i_episode in range(1, num_episodes+1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()   
        #decaying epsilon
        epsilon = max(epsilon*eps_decay, eps_min)
        #observing state s0 and taking action
        state_prev = env.reset()
        action_prev = action_from_eps_greedy_policy(state_prev, Q, epsilon, nActions)
        #loop over SARSA
        while True:
            state, reward, done, info = env.step(action_prev)
            Q[state_prev][action_prev] += alpha*(reward + gamma*np.max(Q[state]) - Q[state_prev][action_prev])
            if done:
                break
            #update state and action
            state_prev = state
            action_prev = action_from_eps_greedy_policy(state, Q, epsilon, nActions)
        
    return Q

## Training the agent

In [38]:
Q_table = q_learning(env, num_episodes=30000, alpha=0.01, nActions=4)

Episode 30000/30000

In [0]:
def get_Policy(Q_table):
    nActions = Q_table[0].shape[0]
    #initialising the policy
    policy = defaultdict(lambda: -1)
    for state,action in Q_table.items():
        #choosing the action with highes Q-value
        policy[state] = np.argmax(action)
    return policy

In [0]:
policy = get_Policy(Q_table)

## Testing the agent

In [31]:
num_episodes = 100
total_reward = 0
for i_episode in range(num_episodes):
    state = env.reset()
    while True:
        #env.render()
        action = policy[state]        #choose an action from the policy
        state, reward, done, info = env.step(action)   #perform chosen action to get next_state and reward
        total_reward += reward
        if done:
            break

print('Average reward =',total_reward/num_episodes)

Average reward = 0.75


So the agent reaches the goal in 75 of 100 episodes. Since the environment has random reactions, that is the ice on which agent is walking is slippery, this is acceptable.

Let's see it in action for one episode by printing every state and action.

In [33]:
state = env.reset()
while True:
        env.render()
        action = policy[state]        #choose an action from the policy
        state, reward, done, info = env.step(action)   #perform chosen action to get next_state and reward
        if done:
            break


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FF

Github doesn't render the agent's action properly. You can visit [NbViewer](https://nbviewer.jupyter.org/github/Sachinkumar04428/OpenAI-Gym-Solutions/blob/master/Toy%20text%20Easy/FrozenLake_v0.ipynb) to view it properly.