<a href="https://colab.research.google.com/github/shashwat-dubey/ML/blob/master/FrozenLake.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import gym
import random
import time
import numpy as  np
from IPython.display import clear_output

In [0]:
# --- Training schedule -------------------------------------------------
# Upper bound on environment steps taken inside a single episode.
NumOfSteps = 100
# Total number of episodes the training loop runs.
NumOfEpisodes = 10000

# --- Q-learning update -------------------------------------------------
# Multiplier applied to the best next-state Q-value in the update target.
discountedReward = 0.99
# Weight given to the new target when blending it with the old Q-value.
learningRate = 0.2

# --- Epsilon-greedy exploration ----------------------------------------
# Bounds and decay constant of the per-episode exploration schedule.
ExploreRateMax = 1.0
ExploreRateMin = 0.001
ExploreDecayRate = 0.001
# Current exploration probability; starts at 1 and is overwritten after
# every training episode.
ExploreProbability = 1

# Initialization:
1. Initialize Gym environment
2. Query state_space & action_space
3. Init Q_Table with zeros

In [3]:
# Create the FrozenLake environment and put it into its start state.
env = gym.make("FrozenLake-v0")
env.reset()

# Sizes of the discrete spaces:
#   states  -> 4x4 positions : 16 states
#   actions -> 4 actions     : {L, R, U, D}
states, actions = env.observation_space.n, env.action_space.n

# Q-table with one row per state and one column per action, all zeros.
q_table = np.zeros(shape=(states, actions))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


Q-Learning Algorithm

In [0]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# RewardsPerEpisode collects the summed reward of every episode, in order.
RewardsPerEpisode = []
for ep in range(NumOfEpisodes):
    state, done, epReward = env.reset(), False, 0

    for step in range(NumOfSteps):
        # One uniform draw per step decides the branch: a draw at or below
        # the current exploration probability samples a random action,
        # anything above it takes the greedy action from the Q-table.
        if random.uniform(0, 1) <= ExploreProbability:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        # Advance the environment by one step.
        new_state, reward, done, info = env.step(action)

        # Blend the previous estimate with the bootstrapped target
        # (immediate reward plus discounted best value of the next state).
        best_next_value = np.max(q_table[new_state])
        td_target = reward + discountedReward * best_next_value
        q_table[state, action] = (
            (1.0 - learningRate) * q_table[state, action]
            + learningRate * td_target
        )

        epReward += reward
        state = new_state

        if done:
            break

    # Exponential decay of the exploration probability from ExploreRateMax
    # towards ExploreRateMin, driven by the episode index.
    decay = np.exp(-ExploreDecayRate * ep)
    ExploreProbability = ExploreRateMin + (ExploreRateMax - ExploreRateMin) * decay

    RewardsPerEpisode.append(epReward)
    

In [5]:
# Average reward per block of `count` consecutive training episodes.
# The chunk size is defined once and used for slicing, labelling and
# averaging, so changing `count` keeps the printed numbers consistent.
count = 1000
episode_rewards = np.array(RewardsPerEpisode)

# Consecutive slices of `count` episodes; a trailing partial chunk is kept
# (and averaged over its own length) if the total is not a multiple of
# `count`.
RewardsPerKEpisodes = [
    episode_rewards[start:start + count]
    for start in range(0, len(episode_rewards), count)
]

episodes_seen = 0
for r in RewardsPerKEpisodes:
    episodes_seen += len(r)
    # ndarray.mean divides by the real chunk length and avoids the rounding
    # noise of summing many pre-divided terms with the builtin sum.
    print(episodes_seen, '  : ', str(r.mean()))

1000   :  0.060000000000000046
2000   :  0.19000000000000014
3000   :  0.3960000000000003
4000   :  0.5330000000000004
5000   :  0.6630000000000005
6000   :  0.6900000000000005
7000   :  0.6850000000000005
8000   :  0.7200000000000005
9000   :  0.7170000000000005
10000   :  0.7180000000000005


In [6]:
# Show the Q-table as it stands after training.
print(q_table)

[[0.5012777  0.4677381  0.4686373  0.4584095 ]
 [0.36687664 0.33698971 0.38050301 0.4904394 ]
 [0.36050889 0.36978885 0.38379681 0.44613126]
 [0.31762081 0.27543078 0.26449779 0.43186085]
 [0.51730091 0.41696842 0.31780403 0.28590211]
 [0.         0.         0.         0.        ]
 [0.34319984 0.05132503 0.05668858 0.05885511]
 [0.         0.         0.         0.        ]
 [0.38996655 0.40982834 0.40886871 0.54552593]
 [0.38751819 0.62145562 0.41010838 0.20755688]
 [0.63937937 0.27238297 0.22368844 0.26865321]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.52955046 0.46392684 0.78423053 0.51208529]
 [0.6684947  0.93321236 0.66302417 0.66774514]
 [0.         0.         0.         0.        ]]


In [7]:
# Play three episodes with the greedy policy read from the Q-table,
# redrawing the board in place after every move.
for episode in range(3):
    done = False
    state = env.reset()
    # Draw the starting board; render must be *called* to produce output.
    env.render()
    time.sleep(0.3)
    for step in range(NumOfSteps):
        # wait=True delays clearing until the next frame is ready to print.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Always exploit: take the highest-valued action for this state.
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            # Show the final board and hold it before the next episode.
            clear_output(wait=True)
            env.render()
            time.sleep(3)
            break

        state = new_state

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
