In [3]:
import numpy as np
# Seed NumPy's global RNG; fit() draws its epsilon-greedy coin flip from np.random.random().
# NOTE(review): env.action_space.sample() and the environment's own transitions are
# presumably driven by gym-side RNGs that this seed does not control -- confirm before
# relying on run-to-run reproducibility.
np.random.seed(42)
import gym

In [43]:
class QLearnFrozenLake():
    """Tabular Q-learning agent for gym's ``FrozenLake-v0`` environment.

    The agent keeps one Q-value per (state, action) pair in ``lookup_table``
    and, after training, the greedy action for every state in ``best_actions``.
    """

    def __init__(self):
        """Create the environment and a zero-initialised Q-table."""
        self.env = gym.make("FrozenLake-v0")
        # Take the table dimensions from the spaces so the constructor does not
        # depend on the environment exposing the nS/nA attributes.
        num_of_states = self.env.observation_space.n
        num_of_actions = self.env.action_space.n
        self.lookup_table = np.zeros((num_of_states, num_of_actions))
        # Greedy policy of the untrained table, so test() is callable before fit().
        self.best_actions = list(np.argmax(self.lookup_table, axis=1))

    def exploit(self, state):
        """Return the action with the highest Q-value for ``state``.

        Ties resolve to the lowest action index (``np.argmax`` semantics).
        """
        return np.argmax(self.lookup_table[state, :])

    def explore(self):
        """Return a random action sampled from the environment's action space."""
        return self.env.action_space.sample()

    def fit(self, epochs, max_eps, alpha, gamma,
            max_steps=150, min_eps=0.011, eps_decay=0.001):
        """Train the Q-table with epsilon-greedy Q-learning.

        Args:
            epochs: Number of training episodes.
            max_eps: Initial exploration probability (epsilon).
            alpha: Learning rate of the Q-value update.
            gamma: Discount factor applied to the next state's best Q-value.
            max_steps: Upper bound on steps per episode.
            min_eps: Epsilon is only decayed while it is strictly above this value.
            eps_decay: Amount subtracted from epsilon after each finished episode.

        Prints the mean episode reward and stores the greedy action of every
        state in ``self.best_actions``.
        """
        env = self.env
        lookup_table = self.lookup_table  # same array object; updated in place
        eps = max_eps
        rewards = []
        for _epoch in range(epochs):
            state = env.reset()
            running_reward = 0
            for _step in range(max_steps):
                # Epsilon-greedy: exploit with probability (1 - eps), else explore.
                if np.random.random() > eps:
                    action = self.exploit(state)
                else:
                    action = self.explore()
                new_state, reward, won_lost, _info = env.step(action)
                # Q-learning update toward the bootstrapped one-step target.
                td_target = reward + gamma * np.max(lookup_table[new_state, :])
                lookup_table[state, action] += alpha * (td_target - lookup_table[state, action])
                running_reward += reward
                state = new_state
                if won_lost:
                    # Epsilon only decays when an episode actually terminates.
                    if eps > min_eps:
                        eps = eps - eps_decay
                    break
            rewards.append(running_reward)

        # Guard the average so a zero-episode call reports 0.0 instead of raising.
        mean_reward = sum(rewards) / epochs if epochs > 0 else 0.0
        print ("mean reward is: " +  str(mean_reward))
        self.best_actions = list(np.argmax(lookup_table, axis=1))

    def test(self, num_of_trials, max_steps=150):
        """Run the greedy policy and print the percentage of successful episodes.

        Args:
            num_of_trials: Number of evaluation episodes.
            max_steps: Upper bound on steps per episode.

        An episode counts as a success when it terminates with a non-zero
        reward. The environment is closed once evaluation finishes.
        """
        env = self.env
        best_actions = self.best_actions
        # Count successes as an integer and divide once at the end; summing
        # 1 / num_of_trials repeatedly accumulates floating-point error.
        successes = 0
        for _trial in range(num_of_trials):
            state = env.reset()
            for _step in range(max_steps):
                new_state, reward, won_lost, _info = env.step(best_actions[state])
                if won_lost:
                    if reward != 0:
                        successes += 1
                    break
                state = new_state
        success_pct = 100 * successes / num_of_trials if num_of_trials > 0 else 0
        print(f"success rate is: {success_pct}% ")
        env.close()

In [44]:
# Build the agent: the constructor creates the FrozenLake-v0 environment and a
# zero-initialised Q-table with one row per state and one column per action.
qlearn = QLearnFrozenLake()

In [45]:
# Train for 25000 episodes. Exploration starts fully random (max_eps=1) and is
# reduced by 0.001 after each finished episode until it is no longer above 0.011.
qlearn.fit(epochs=25000, max_eps=1, alpha=0.08, gamma=0.95)

mean reward is: 0.60408


In [46]:
# Evaluate the greedy policy stored by fit() over 1000 episodes and print the
# share of episodes that ended with a non-zero reward.
qlearn.test(num_of_trials=1000)

success rate is: 73.40000000000005% 
