In [1]:
import numpy as np

# Grid dimensions. Cells are addressed as (row, col); the "U" action decreases
# the row index and "L" decreases the column index.
BOARD_ROWS, BOARD_COLS = 3, 4

# Terminal cells: State.giveReward returns 1 on WIN_STATE and -1 on LOSE_STATE.
WIN_STATE, LOSE_STATE = (0, 3), (1, 3)

# Cell that State.nxtPosition never lets the agent move into.
BLOCKED_STATE = (1, 1)

# Position used when a State is created without an explicit position.
START = (2, 0)

# Copied into State.determine. When False, every requested move is replaced by
# a random draw: the requested direction with probability 0.8, or one of the
# two perpendicular directions with probability 0.1 each.
DETERMINISTIC = False

In [18]:
class State:
    """A position on the grid together with the environment's transition rules.

    Positions are ``(row, col)`` tuples. The board size, terminal cells, blocked
    cell and default start position come from the module-level constants
    ``BOARD_ROWS``, ``BOARD_COLS``, ``WIN_STATE``, ``LOSE_STATE``,
    ``BLOCKED_STATE`` and ``START``.

    Attributes:
        state: current ``(row, col)`` position.
        isEnd: True once ``isEndFunc`` has seen a terminal position.
        determine: when True, moves are executed exactly as requested; when
            False, the requested action is first passed through
            ``_chooseActionProb``. Initialised from ``DETERMINISTIC``.
    """

    # (row, col) offset applied by each action.
    _MOVES = {"U": (-1, 0), "D": (1, 0), "L": (0, -1), "R": (0, 1)}
    # For each requested action, the two perpendicular actions it can slip to.
    _SLIPS = {"U": ("L", "R"), "D": ("L", "R"), "L": ("U", "D"), "R": ("U", "D")}

    def __init__(self, state = START):
        self.state = state
        self.isEnd = False
        self.determine = DETERMINISTIC

    def giveReward(self):
        """Return 1 on the winning cell, -1 on the losing cell, 0 elsewhere."""
        if self.state == WIN_STATE:
            return 1
        if self.state == LOSE_STATE:
            return -1
        return 0

    def isEndFunc(self):
        """Set ``isEnd`` to True when the current position is terminal.

        ``isEnd`` is only ever switched on here; it is never reset to False.
        """
        if self.state in (WIN_STATE, LOSE_STATE):
            self.isEnd = True

    def _chooseActionProb(self, action):
        """Sample the action that is actually executed for a requested action.

        The requested action is kept with probability 0.8; each of the two
        perpendicular actions is taken with probability 0.1.

        Returns:
            The sampled action as a ``str``, or None when ``action`` is not one
            of "U", "D", "L", "R" (no random number is drawn in that case).
        """
        slips = self._SLIPS.get(action)
        if slips is None:
            return None
        # Candidate order (requested action first) matches the probability
        # vector and keeps the random draw identical to the earlier if-chain.
        return str(np.random.choice([action, *slips], p = [0.8, 0.1, 0.1]))

    def nxtPosition(self, action):
        """Return the position reached by taking ``action`` from ``self.state``.

        Neither ``self.state`` nor ``self.determine`` is modified, so a State
        keeps its deterministic/stochastic mode across repeated calls.

        Args:
            action: one of "U", "D", "L", "R".

        Returns:
            The new ``(row, col)`` tuple, or the current position when the move
            would leave the board or enter ``BLOCKED_STATE``.
        """
        if not self.determine:
            action = self._chooseActionProb(action)

        # Unrecognised actions (including None from _chooseActionProb) fall
        # back to a move to the right, as the former trailing `else` did.
        d_row, d_col = self._MOVES.get(action, self._MOVES["R"])
        row = self.state[0] + d_row
        col = self.state[1] + d_col
        candidate = (row, col)

        # Bounds come from the board constants rather than literal 2 / 3 so the
        # check stays correct if the board is resized.
        on_board = (0 <= row < BOARD_ROWS) and (0 <= col < BOARD_COLS)
        if on_board and candidate != BLOCKED_STATE:
            return candidate
        return self.state

In [19]:
class Agent:
    """Tabular agent that learns action values on the grid world.

    The agent keeps a Q-table ``Q_values[(row, col)][action]`` and, at the end
    of each episode, propagates the terminal reward backwards through the
    (position, action) pairs it visited.

    Attributes:
        states: list of ``[position, action]`` pairs visited this episode.
        actions: the available actions, in tie-breaking order.
        State: the current ``State`` object.
        isEnd: mirror of ``self.State.isEnd``.
        lr: learning rate used in the backward update.
        decay_gamma: discount applied to the reward at each backward step.
        exp_rate: probability of choosing a uniformly random action.
        Q_values: nested dict mapping position -> action -> value.
    """

    def __init__(self, exp_rate = 0.0):
        """Create an agent at the start position with an all-zero Q-table.

        Args:
            exp_rate: probability in [0, 1] of taking a random action instead
                of the greedy one. The default 0.0 means purely greedy action
                selection with no random draw.

        Raises:
            ValueError: if ``exp_rate`` is outside [0, 1].
        """
        if not 0.0 <= exp_rate <= 1.0:
            raise ValueError("exp_rate must be between 0 and 1")

        self.states = []
        self.actions = ["U", "D", "L", "R"]
        self.State = State()
        self.isEnd = self.State.isEnd
        self.lr = 0.2
        self.decay_gamma = 0.9
        self.exp_rate = exp_rate

        # One entry per board cell, each holding a value per action.
        self.Q_values = {
            (i, j): {a: 0 for a in self.actions}
            for i in range(BOARD_ROWS)
            for j in range(BOARD_COLS)
        }

    def chooseAction(self):
        """Pick the next action for the current position.

        With probability ``exp_rate`` a uniformly random action is returned.
        Otherwise the action with the highest Q-value is returned; ties go to
        the action that appears last in ``self.actions``.

        Returns:
            The chosen action string.
        """
        # Skip the random draw entirely when exploration is off so the default
        # agent consumes no extra random numbers.
        if self.exp_rate > 0 and np.random.uniform(0, 1) <= self.exp_rate:
            return str(np.random.choice(self.actions))

        q_row = self.Q_values[self.State.state]
        # Start below every possible value: starting at 0 returned "" whenever
        # all Q-values of a state were negative.
        best_value = float("-inf")
        action = ""
        for a in self.actions:
            value = q_row[a]
            if value >= best_value:
                action = a
                best_value = value
        return action

    def takeAction(self, action):
        """Return a new ``State`` at the position reached by ``action``."""
        position = self.State.nxtPosition(action)
        return State(state = position)

    def reset(self):
        """Clear the episode history and return to a fresh start ``State``."""
        self.states = []
        self.State = State()
        self.isEnd = self.State.isEnd

    def play(self, episodes = 10):
        """Run ``episodes`` complete episodes, updating ``Q_values`` after each.

        When a terminal position is reached, all of its action values are set
        to its reward. The reward is then walked backwards through the visited
        (position, action) pairs: each value moves towards
        ``decay_gamma * reward`` by a fraction ``lr``. The updated value becomes
        the reward for the preceding step and is stored rounded to 3 decimals.

        Args:
            episodes: number of episodes to complete.
        """
        finished = 0
        while finished < episodes:
            if self.State.isEnd:
                reward = self.State.giveReward()
                for a in self.actions:
                    self.Q_values[self.State.state][a] = reward
                for position, action in reversed(self.states):
                    current_q_value = self.Q_values[position][action]
                    reward = current_q_value + self.lr * (self.decay_gamma * reward - current_q_value)
                    # The unrounded value is what propagates to earlier steps.
                    self.Q_values[position][action] = round(reward, 3)
                self.reset()
                finished += 1
            else:
                action = self.chooseAction()
                self.states.append([(self.State.state), action])
                self.State = self.takeAction(action)
                self.State.isEndFunc()
                self.isEnd = self.State.isEnd

In [20]:
# Seed NumPy's global RNG so the stochastic transitions, and therefore the
# learned Q-values printed below, are the same on every Restart & Run All.
np.random.seed(0)

ag = Agent()

ag.play(1000)
print("latest Q-values ... \n")
print(ag.Q_values)

latest Q-values ... 

{(0, 0): {'U': 0, 'D': 0, 'L': 0, 'R': 0.559}, (0, 1): {'U': 0, 'D': 0, 'L': 0, 'R': 0.762}, (0, 2): {'U': 0, 'D': 0, 'L': 0, 'R': 0.88}, (0, 3): {'U': 1, 'D': 1, 'L': 1, 'R': 1}, (1, 0): {'U': 0, 'D': 0, 'L': 0, 'R': 0.126}, (1, 1): {'U': 0, 'D': 0, 'L': 0, 'R': 0}, (1, 2): {'U': 0, 'D': 0, 'L': 0.348, 'R': -0.18}, (1, 3): {'U': -1, 'D': -1, 'L': -1, 'R': -1}, (2, 0): {'U': 0, 'D': 0, 'L': 0.081, 'R': -0.001}, (2, 1): {'U': 0, 'D': 0, 'L': 0.091, 'R': -0.005}, (2, 2): {'U': 0, 'D': 0, 'L': 0.164, 'R': -0.03}, (2, 3): {'U': 0, 'D': 0, 'L': 0, 'R': -0.173}}
