# In [1]:
import numpy as np
from numpy import random

class SARSA:
    """On-policy SARSA agent backed by a tabular action-value function.

    The table is indexed as ``Q[row, col, action]`` with four actions per
    grid cell.  State objects passed to the methods only need ``row`` and
    ``col`` attributes.
    """

    def __init__(self, row, col, gamma=0.9, alpha=1.0, epsilon=0.1, episodios=30):
        """Allocate the value tables for a ``row`` x ``col`` grid.

        Args:
            row: Size of the first Q-table dimension.
            col: Size of the second Q-table dimension.
            gamma: Discount factor applied to the next state-action value.
            alpha: Learning rate (step size) of the update in ``learn``.
            epsilon: Probability of a random action in ``chooseAction``.
            episodios: Episode count, stored as ``episodeCount``; this class
                does not read it itself.
        """
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon
        self.episodeCount = episodios
        self.Q = np.zeros((row, col, 4))
        # The attributes below are only initialised here; no method of this
        # class reads them.
        self.newQ = np.full((row, col, 4), float('-inf'))
        self.temp = np.zeros((row, col, 8))
        self.action = -1
        self.nextAction = -1

    def chooseAction(self, state):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the first action holding the largest Q-value is returned.

        Args:
            state: Object exposing ``row`` and ``col`` indices into ``Q``.

        Returns:
            The index of the selected action.
        """
        action_values = self.Q[state.row, state.col, :]

        if random.random() < self.epsilon:
            return random.choice(range(len(action_values)))

        # argmax returns the first maximal index, like list.index(max(...)).
        return int(np.argmax(action_values))

    def learn(self, state1, action1, reward, state2, action2):
        """Apply one SARSA update to ``Q[state1, action1]``.

        The update is ``Q += alpha * (reward + gamma * Q[state2, action2] - Q)``
        and is applied uniformly.  Entries equal to 0.0 are no longer special
        cased, because a zero estimate is indistinguishable from an unvisited
        one and overwriting it with the bare reward discards the bootstrap
        term.

        Args:
            state1: State in which ``action1`` was taken.
            action1: Index of the action that was taken.
            reward: Reward received for the transition.
            state2: State reached after the transition.
            action2: Action already selected for ``state2`` (on-policy).
        """
        next_q = self.Q[state2.row, state2.col, action2]
        current_q = self.Q[state1.row, state1.col, action1]

        td_target = reward + self.gamma * next_q
        self.Q[state1.row, state1.col, action1] = (
            current_q + self.alpha * (td_target - current_q)
        )

class QL:
    """Off-policy Q-learning agent backed by a tabular action-value function.

    The table is indexed as ``Q[row, col, action]`` with four actions per
    grid cell.  State objects passed to the methods only need ``row`` and
    ``col`` attributes.
    """

    def __init__(self, row, col, gamma=0.9, alpha=1.0, epsilon=0.1, episodios=30):
        """Allocate the value tables for a ``row`` x ``col`` grid.

        Args:
            row: Size of the first Q-table dimension.
            col: Size of the second Q-table dimension.
            gamma: Discount factor applied to the best next-state value.
            alpha: Learning rate (step size) of the update in ``learn``.
            epsilon: Probability of a random action.
            episodios: Episode count, stored as ``episodeCount``; this class
                does not read it itself.
        """
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon
        self.episodeCount = episodios
        self.Q = np.zeros((row, col, 4))
        # newQ, temp and action are only initialised here; no method of this
        # class reads them.  nextAction is overwritten by learn().
        self.newQ = np.full((row, col, 4), float('-inf'))
        self.temp = np.zeros((row, col, 8))
        self.action = -1
        self.nextAction = -1

    def chooseAction(self, state):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        Args:
            state: Object exposing ``row`` and ``col`` indices into ``Q``.

        Returns:
            int: The first action with the largest Q-value when the random
            draw exceeds ``epsilon``, otherwise a uniformly random action.
        """
        action_values = self.Q[state.row, state.col, :]

        if random.random() > self.epsilon:
            # argmax returns the first maximal index, like list.index(max(...)).
            return int(np.argmax(action_values))

        # Uniform draw over all actions.  round(random() * 3) picked the
        # first and last action only half as often as the middle ones.
        return int(random.randint(len(action_values)))

    def learn(self, state1, action1, reward, state2):
        """Apply one Q-learning update to ``Q[state1, action1]``.

        The update bootstraps from the largest Q-value of ``state2``:
        ``Q += alpha * (reward + gamma * max(Q[state2]) - Q)``.  As a side
        effect ``self.nextAction`` is set to an epsilon-greedy action for
        ``state2``; that choice does not influence the update.

        Args:
            state1: State in which ``action1`` was taken.
            action1: Index of the action that was taken.
            reward: Reward received for the transition.
            state2: State reached after the transition.
        """
        next_values = self.Q[state2.row, state2.col, :]
        best_next = max(next_values)
        self.nextAction = int(np.argmax(next_values))

        if random.random() <= self.epsilon:
            # Uniform exploratory action (see chooseAction for the rationale).
            self.nextAction = int(random.randint(len(next_values)))

        current_q = self.Q[state1.row, state1.col, action1]
        td_target = reward + self.gamma * best_next
        self.Q[state1.row, state1.col, action1] = (
            current_q + self.alpha * (td_target - current_q)
        )