In [None]:
import gym
import numpy as np
import math

In [None]:
def statesIndex2states(stateIndex, numStates):
    """Split a flat state index into an (x, y) pair.

    The index is decomposed as ``stateIndex == stateY * numStates + stateX``,
    i.e. ``numStates`` acts as the row length of a row-major layout.

    Parameters
    ----------
    stateIndex : int
        Flat state index. Intended for non-negative values.
    numStates : int
        Divisor used to split the index (must be non-zero).

    Returns
    -------
    tuple of (int, int)
        ``(stateX, stateY)``: the remainder and the quotient of the division.

    Raises
    ------
    ZeroDivisionError
        If ``numStates`` is zero.
    """
    # divmod keeps integer inputs in exact integer arithmetic; the previous
    # int(stateIndex / numStates) went through a float and silently lost
    # precision for large indices.
    stateY, stateX = divmod(stateIndex, numStates)
    # Cast so callers always get plain Python ints, even for NumPy inputs.
    return int(stateX), int(stateY)

def testCaseGenerator(q, rendering=False, epsilon=0):
    obs = FL.reset()
    totalReward = 0
    
    while True:
        # find optimal action based on E-greedy policy
        # If epsilon is equal to 0, the policy is essentially argmax(q) over actions
        qMax = np.amax(q[obs])
        greedyActions = q[obs] == qMax
        numGreedyActions = np.sum(greedyActions)
        policy = epsilon/4*np.ones((4,))
        policy[greedyActions] = (1-epsilon)/numGreedyActions + epsilon/4
        action = np.random.choice(4,1,p=policy.tolist())
        action= action[0]
        
        # Take action and generate next states and reward
        obs, reward, done, _ = FL.step(action)
        
        if rendering:
            FL.render()
        totalReward += reward  
        if done:
            break 
    print("Total reward:  ", totalReward)
    return totalReward

In [None]:
# Initialization
# Tabular Q-learning on FrozenLake with an epsilon-greedy behaviour policy
# whose exploration rate decays exponentially with the episode index.

FL = gym.make('FrozenLake-v0') # OpenAI environment

numStates = FL.observation_space.n
numActions = FL.action_space.n

q = np.zeros((numStates, numActions)) # action values initialization
alpha = 0.5 # Learning rate
gamma = 1 # Discount factor

numIterations = 10000 # number of training episodes
epsilon = 1 # exploration probability; the first episode is fully random
rewardWindow = 100 # number of most recent episodes whose rewards are tracked
reportEvery = 100 # print progress every this many episodes (avoids flooding the output)

obs = FL.reset()
totalReward = np.zeros((rewardWindow,))
for i in range(0, numIterations):
    rewardLastEpisode = 0
    # Generate Episode
    while True:
        # Epsilon-greedy action selection: every action gets epsilon/numActions,
        # and the greedy actions share the remaining (1 - epsilon) mass equally.
        qMax = np.amax(q[obs])
        greedyActions = q[obs] == qMax
        numGreedyActions = np.sum(greedyActions)
        policy = epsilon/numActions*np.ones((numActions,))
        policy[greedyActions] = (1-epsilon)/numGreedyActions + epsilon/numActions
        action = np.random.choice(numActions, 1, p=policy.tolist())[0]

        obsNext, reward, done, _ = FL.step(action)

        # Q-learning update: move q[obs, action] towards the bootstrapped target
        # reward + gamma * max_a q[obsNext, a].
        tdTarget = reward + gamma*np.amax(q[obsNext])
        q[obs, action] = q[obs, action] + alpha*(tdTarget - q[obs, action])

        rewardLastEpisode += reward
        if done:
            break
        obs = obsNext

    # Sliding window of the rewards of the last `rewardWindow` episodes:
    # drop the oldest entry and append the newest.
    totalReward = np.append(totalReward[1:], rewardLastEpisode)

    # Reset environment
    obs = FL.reset()
    if (i + 1) % reportEvery == 0:
        print("Reward over last 100 episodes:  ", np.sum(totalReward))

    # update epsilon (exploration factor): exp(-i/100) decays from 1 towards 0
    epsilon = math.exp(-i/100)

np.savetxt('actionValueFunction.txt',q)
print("Reward over last 100 episodes:  ", np.sum(totalReward))
testCaseGenerator(q, rendering = True)