In [1]:
import gym
import numpy as np
import math

In [2]:
def statesIndex2states(stateIndex, numStates):
    """Convert a flat, row-major state index into (x, y) grid coordinates.

    Parameters
    ----------
    stateIndex : int
        Flat index of the state.
    numStates : int
        Divisor used to split the index, i.e. the number of states per row
        (the grid width), despite what the name suggests.

    Returns
    -------
    (stateX, stateY) : tuple of int
        Column (remainder) and row (quotient) of ``stateIndex``.

    Raises
    ------
    ZeroDivisionError
        If ``numStates`` is 0.
    """
    # divmod keeps the arithmetic exact for integers; the previous
    # int(stateIndex / numStates) went through float division and lost
    # precision for very large indices.
    stateY, stateX = divmod(stateIndex, numStates)
    return int(stateX), int(stateY)

def testCaseGenerator(q, rendering=False, epsilon=0):
    obs = FL.reset()
    totalReward = 0
    
    while True:
        # find optimal action based on E-greedy policy
        # If epsilon is equal to 0, the policy is essentially argmax(q) over actions
        qMax = np.amax(q[obs])
        greedyActions = q[obs] == qMax
        numGreedyActions = np.sum(greedyActions)
        policy = epsilon/4*np.ones((4,))
        policy[greedyActions] = (1-epsilon)/numGreedyActions + epsilon/4
        action = np.random.choice(4,1,p=policy.tolist())
        action= action[0]
        
        # Take action and generate next states and reward
        obs, reward, done, _ = FL.step(action)
        
        if rendering:
            FL.render()
        totalReward += reward  
        if done:
            break 
    print("Total reward:  ", totalReward)
    return totalReward

In [6]:
# Initialization
# Tabular Q-learning on FrozenLake with an epsilon-greedy behaviour policy
# whose exploration rate decays exponentially with the episode index.

FL = gym.make('FrozenLake-v0') # OpenAI environment

numStates = FL.observation_space.n
numActions = FL.action_space.n

q = np.zeros((numStates, numActions)) # action values initialization
alpha = 0.5 # Learning Rate
gamma = 1 # Discount Factor

numIterations = 2000
epsilon = 1 # exploration (fully random for the first episodes)
rewardWindow = 100 # number of most recent episodes in the running reward sum
epsilonDecayEpisodes = 100 # time constant of the exponential epsilon decay

# Sliding window holding the reward of each of the last `rewardWindow` episodes.
totalReward = np.zeros((rewardWindow,))
for i in range(numIterations):
    # Every episode starts from a freshly reset environment.
    obs = FL.reset()
    rewardLastEpisode = 0

    # Generate Episode
    while True:
        # Epsilon-greedy distribution over the actions of the current state:
        # every action gets epsilon/numActions, the greedy ones share the rest.
        qMax = np.amax(q[obs])
        greedyActions = q[obs] == qMax
        numGreedyActions = np.sum(greedyActions)
        policy = epsilon/numActions*np.ones((numActions,))
        policy[greedyActions] = (1-epsilon)/numGreedyActions + epsilon/numActions
        action = int(np.random.choice(numActions, p=policy))

        obsNext, reward, done, _ = FL.step(action)

        # Q-learning update: bootstrap from the best action value of the next state.
        tdTarget = reward + gamma*np.amax(q[obsNext])
        q[obs][action] = q[obs][action] + alpha*(tdTarget - q[obs][action])

        rewardLastEpisode += reward
        if done:
            break
        obs = obsNext

    # Reward over last 100 episodes. Determine the % efficiency of learning
    # (drop the oldest entry, append the newest; the length stays rewardWindow).
    totalReward = np.append(totalReward[1:], rewardLastEpisode)
    print("Reward over last 100 episodes:  ", np.sum(totalReward))

    # update epsilon (exploration factor) used by the next episode
    epsilon = math.exp(-i/epsilonDecayEpisodes)

[2017-07-09 01:01:47,332] Making new env: FrozenLake-v0


Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
Reward over last 100 episodes:   0.0
R

Reward over last 100 episodes:   23.0
Reward over last 100 episodes:   23.0
Reward over last 100 episodes:   24.0
Reward over last 100 episodes:   23.0
Reward over last 100 episodes:   22.0
Reward over last 100 episodes:   22.0
Reward over last 100 episodes:   22.0
Reward over last 100 episodes:   22.0
Reward over last 100 episodes:   22.0
Reward over last 100 episodes:   23.0
Reward over last 100 episodes:   22.0
Reward over last 100 episodes:   23.0
Reward over last 100 episodes:   23.0
Reward over last 100 episodes:   24.0
Reward over last 100 episodes:   24.0
Reward over last 100 episodes:   24.0
Reward over last 100 episodes:   24.0
Reward over last 100 episodes:   24.0
Reward over last 100 episodes:   24.0
Reward over last 100 episodes:   25.0
Reward over last 100 episodes:   26.0
Reward over last 100 episodes:   27.0
Reward over last 100 episodes:   27.0
Reward over last 100 episodes:   27.0
Reward over last 100 episodes:   27.0
Reward over last 100 episodes:   27.0
Reward over 

Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   72.0
Reward over last 100 episodes:   72.0
Reward over last 100 episodes:   71.0
Reward over last 100 episodes:   72.0
Reward over last 100 episodes:   72.0
Reward over last 100 episodes:   72.0
Reward over 

Reward over last 100 episodes:   52.0
Reward over last 100 episodes:   52.0
Reward over last 100 episodes:   53.0
Reward over last 100 episodes:   52.0
Reward over last 100 episodes:   52.0
Reward over last 100 episodes:   52.0
Reward over last 100 episodes:   51.0
Reward over last 100 episodes:   50.0
Reward over last 100 episodes:   50.0
Reward over last 100 episodes:   51.0
Reward over last 100 episodes:   50.0
Reward over last 100 episodes:   51.0
Reward over last 100 episodes:   51.0
Reward over last 100 episodes:   50.0
Reward over last 100 episodes:   49.0
Reward over last 100 episodes:   49.0
Reward over last 100 episodes:   50.0
Reward over last 100 episodes:   50.0
Reward over last 100 episodes:   51.0
Reward over last 100 episodes:   51.0
Reward over last 100 episodes:   51.0
Reward over last 100 episodes:   52.0
Reward over last 100 episodes:   52.0
Reward over last 100 episodes:   53.0
Reward over last 100 episodes:   53.0
Reward over last 100 episodes:   54.0
Reward over 

Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   73.0
Reward over last 100 episodes:   73.0
Reward over 

Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   78.0
Reward over last 100 episodes:   78.0
Reward over last 100 episodes:   78.0
Reward over last 100 episodes:   79.0
Reward over last 100 episodes:   78.0
Reward over last 100 episodes:   78.0
Reward over last 100 episodes:   78.0
Reward over last 100 episodes:   78.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over 

Reward over last 100 episodes:   63.0
Reward over last 100 episodes:   64.0
Reward over last 100 episodes:   65.0
Reward over last 100 episodes:   64.0
Reward over last 100 episodes:   64.0
Reward over last 100 episodes:   64.0
Reward over last 100 episodes:   63.0
Reward over last 100 episodes:   64.0
Reward over last 100 episodes:   64.0
Reward over last 100 episodes:   65.0
Reward over last 100 episodes:   65.0
Reward over last 100 episodes:   65.0
Reward over last 100 episodes:   66.0
Reward over last 100 episodes:   66.0
Reward over last 100 episodes:   66.0
Reward over last 100 episodes:   66.0
Reward over last 100 episodes:   66.0
Reward over last 100 episodes:   66.0
Reward over last 100 episodes:   67.0
Reward over last 100 episodes:   67.0
Reward over last 100 episodes:   67.0
Reward over last 100 episodes:   68.0
Reward over last 100 episodes:   68.0
Reward over last 100 episodes:   69.0
Reward over last 100 episodes:   70.0
Reward over last 100 episodes:   69.0
Reward over 

Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   78.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   77.0
Reward over last 100 episodes:   76.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   75.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   74.0
Reward over last 100 episodes:   75.0
Reward over 

In [7]:
# Persist the learned action-value table as plain text.
np.savetxt(fname='actionValueFunction.txt', X=q)
# Report the running reward sum kept by the training loop.
print("Reward over last 100 episodes:  ", totalReward.sum())
# Play one rendered episode with the default (greedy) epsilon; the bare call
# is left as the last expression so the notebook displays its return value.
testCaseGenerator(q, rendering=True)

Reward over last 100 episodes:   80.0
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH

1.0