In [None]:
import numpy as np

# global variables
BOARD_ROWS = 3
BOARD_COLS = 4
WIN_STATE = (0, 3)
LOSE_STATE = (1, 3)
START = (2, 0)
DETERMINISTIC = True


class State:
    def __init__(self, state=START):
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.board[1, 1] = -1
        self.state = state
        self.isEnd = False
        self.determine = DETERMINISTIC

    def giveReward(self):
        if self.state == WIN_STATE:
            return 1
        elif self.state == LOSE_STATE:
            return -1
        else:
            return 0

    def isEndFunc(self):
        if (self.state == WIN_STATE) or (self.state == LOSE_STATE):
            self.isEnd = True

    def nxtPosition(self, action):
        """
        action: up, down, left, right
        -------------
        0 | 1 | 2| 3|
        1 |
        2 |
        return next position
        """
        if self.determine:
            if action == "up":
                nxtState = (self.state[0] - 1, self.state[1])
            elif action == "down":
                nxtState = (self.state[0] + 1, self.state[1])
            elif action == "left":
                nxtState = (self.state[0], self.state[1] - 1)
            else:
                nxtState = (self.state[0], self.state[1] + 1)
            # if next state legal
            if (nxtState[0] >= 0) and (nxtState[0] <= (BOARD_ROWS -1)):
                if (nxtState[1] >= 0) and (nxtState[1] <= (BOARD_COLS -1)):
                  if nxtState != (1, 1):
                      return nxtState
            return self.state

    def showBoard(self):
        self.board[self.state] = 1
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.board[i, j] == 1:
                    token = '*'
                if self.board[i, j] == -1:
                    token = 'z'
                if self.board[i, j] == 0:
                    token = '0'
                out += token + ' | '
            print(out)
        print('-----------------')


# Agent of player

class Agent:

    def __init__(self):
        self.states = []
        self.actions = ["up", "down", "left", "right"]
        self.State = State()
        self.lr = 0.2
        self.exp_rate = 0.3

        # initial state reward
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0  # set initial value to 0

    def chooseAction(self):
        # choose action with most expected value
        mx_nxt_reward = 0
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            for a in self.actions:
                # if the action is deterministic
                nxt_reward = self.state_values[self.State.nxtPosition(a)]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
        return action

    def takeAction(self, action):
        position = self.State.nxtPosition(action)
        return State(state=position)

    def reset(self):
        self.states = []
        self.State = State()

    def play(self, rounds=10):
        i = 0
        while i < rounds:
            # to the end of game back propagate reward
            if self.State.isEnd:
                # back propagate
                reward = self.State.giveReward()
                # explicitly assign end state to reward values
                self.state_values[self.State.state] = reward  # this is optional
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    reward = self.state_values[s] + self.lr * (reward - self.state_values[s])
                    self.state_values[s] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                # append trace
                self.states.append(self.State.nxtPosition(action))
                print("current position {} action {}".format(self.State.state, action))
                # by taking the action, it reaches the next state
                self.State = self.takeAction(action)
                # mark is end
                self.State.isEndFunc()
                print("nxt state", self.State.state)
                print("---------------------")

    def showValues(self):
        for i in range(0, BOARD_ROWS):
            print('----------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                out += str(self.state_values[(i, j)]).ljust(6) + ' | '
            print(out)
        print('----------------------------------')


if __name__ == "__main__":
    ag = Agent()
    ag.play(50)
    print(ag.showValues())

current position (2, 0) action right
nxt state (2, 1)
---------------------
current position (2, 1) action right
nxt state (2, 2)
---------------------
current position (2, 2) action right
nxt state (2, 3)
---------------------
current position (2, 3) action right
nxt state (2, 3)
---------------------
current position (2, 3) action right
nxt state (2, 3)
---------------------
current position (2, 3) action right
nxt state (2, 3)
---------------------
current position (2, 3) action right
nxt state (2, 3)
---------------------
current position (2, 3) action right
nxt state (2, 3)
---------------------
current position (2, 3) action right
nxt state (2, 3)
---------------------
current position (2, 3) action right
nxt state (2, 3)
---------------------
current position (2, 3) action right
nxt state (2, 3)
---------------------
current position (2, 3) action right
nxt state (2, 3)
---------------------
current position (2, 3) action right
nxt state (2, 3)
---------------------
current posi

In [4]:
#######################################################################
# Copyright (C)                                                       #
# 2016-2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com)             #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

matplotlib.use('Agg')

WORLD_SIZE = 5
A_POS = [0, 1]
A_PRIME_POS = [4, 1]
B_POS = [0, 3]
B_PRIME_POS = [2, 3]
DISCOUNT = 0.9

# left, up, right, down
ACTIONS = [np.array([0, -1]),
           np.array([-1, 0]),
           np.array([0, 1]),
           np.array([1, 0])]
ACTIONS_FIGS=[ '←', '↑', '→', '↓']


ACTION_PROB = 0.25


def step(state, action):
    if state == A_POS:
        return A_PRIME_POS, 10
    if state == B_POS:
        return B_PRIME_POS, 5

    next_state = (np.array(state) + action).tolist()
    x, y = next_state
    if x < 0 or x >= WORLD_SIZE or y < 0 or y >= WORLD_SIZE:
        reward = -1.0
        next_state = state
    else:
        reward = 0
    return next_state, reward


def draw_image(image):
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = image.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Add cells
    for (i, j), val in np.ndenumerate(image):

        # add state labels
        if [i, j] == A_POS:
            val = str(val) + " (A)"
        if [i, j] == A_PRIME_POS:
            val = str(val) + " (A')"
        if [i, j] == B_POS:
            val = str(val) + " (B)"
        if [i, j] == B_PRIME_POS:
            val = str(val) + " (B')"
        
        tb.add_cell(i, j, width, height, text=val,
                    loc='center', facecolor='white')
        

    # Row and column labels...
    for i in range(len(image)):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')
        tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
                    edgecolor='none', facecolor='none')

    ax.add_table(tb)

def draw_policy(optimal_values):
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = optimal_values.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Add cells
    for (i, j), val in np.ndenumerate(optimal_values):
        next_vals=[]
        for action in ACTIONS:
            next_state, _ = step([i, j], action)
            next_vals.append(optimal_values[next_state[0],next_state[1]])

        best_actions=np.where(next_vals == np.max(next_vals))[0]
        val=''
        for ba in best_actions:
            val+=ACTIONS_FIGS[ba]
        
        # add state labels
        if [i, j] == A_POS:
            val = str(val) + " (A)"
        if [i, j] == A_PRIME_POS:
            val = str(val) + " (A')"
        if [i, j] == B_POS:
            val = str(val) + " (B)"
        if [i, j] == B_PRIME_POS:
            val = str(val) + " (B')"
        
        tb.add_cell(i, j, width, height, text=val,
                loc='center', facecolor='white')

    # Row and column labels...
    for i in range(len(optimal_values)):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')
        tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
                   edgecolor='none', facecolor='none')

    ax.add_table(tb)


def figure_3_2():
    value = np.zeros((WORLD_SIZE, WORLD_SIZE))
    while True:
        # keep iteration until convergence
        new_value = np.zeros_like(value)
        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                for action in ACTIONS:
                    (next_i, next_j), reward = step([i, j], action)
                    # bellman equation
                    new_value[i, j] += ACTION_PROB * (reward + DISCOUNT * value[next_i, next_j])
        if np.sum(np.abs(value - new_value)) < 1e-4:
            draw_image(np.round(new_value, decimals=2))
            plt.savefig('figure_3_21.png')
            plt.close()
            break
        value = new_value

def figure_3_2_linear_system():
    '''
    Here we solve the linear system of equations to find the exact solution.
    We do this by filling the coefficients for each of the states with their respective right side constant.
    '''
    A = -1 * np.eye(WORLD_SIZE * WORLD_SIZE)
    b = np.zeros(WORLD_SIZE * WORLD_SIZE)
    for i in range(WORLD_SIZE):
        for j in range(WORLD_SIZE):
            s = [i, j]  # current state
            index_s = np.ravel_multi_index(s, (WORLD_SIZE, WORLD_SIZE))
            for a in ACTIONS:
                s_, r = step(s, a)
                index_s_ = np.ravel_multi_index(s_, (WORLD_SIZE, WORLD_SIZE))

                A[index_s, index_s_] += ACTION_PROB * DISCOUNT
                b[index_s] -= ACTION_PROB * r

    x = np.linalg.solve(A, b)
    draw_image(np.round(x.reshape(WORLD_SIZE, WORLD_SIZE), decimals=2))
    plt.savefig('figure_3_21_linear_system.png')
    plt.close()

def figure_3_5():
    value = np.zeros((WORLD_SIZE, WORLD_SIZE))
    while True:
        # keep iteration until convergence
        new_value = np.zeros_like(value)
        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                values = []
                for action in ACTIONS:
                    (next_i, next_j), reward = step([i, j], action)
                    # value iteration
                    values.append(reward + DISCOUNT * value[next_i, next_j])
                new_value[i, j] = np.max(values)
        if np.sum(np.abs(new_value - value)) < 1e-4:
            draw_image(np.round(new_value, decimals=2))
            plt.savefig('figure_3_51.png')
            plt.close()
            draw_policy(new_value)
            plt.savefig('figure_3_51_policy.png')
            plt.close()
            break
        value = new_value


if __name__ == '__main__':
    figure_3_2_linear_system()
    figure_3_2()
    figure_3_5()

In [None]:
import numpy as np


size = 5
a = [0, 1]
aPrime = [4, 1]
b = [0, 3]
bPrime = [2, 3]
discount = 0.9
actions = [[0, -1], [-1, 0], [0, 1],[1, 0]]
actionArrows = [ '←', '↑', '→', '↓']
actionProb = 0.25
grid = np.zeros((size,size))
copygrid = np.copy(grid)
states = [[i,j] for i in range(size) for j in range (size)]
numIterations = 1000
theta = 0.1
pi = np.ones((size,size))/4
pi1 = np.array((size, size))
#pi1[:] = 'a'

def actionValue(state, action):
  if state == a:
        return aPrime, 10
  if state == b:
        return bPrime, 5

  next_state = (np.array(state) + action)
  x, y = next_state
  if x < 0 or x >= size or y < 0 or y >= size:
      reward = -1.0
      next_state = state
  else:
      reward = 0
  
  return next_state, reward


def policyEvaluation(numIterations, discount, theta, grid):
  for i in range(numIterations):
    delta = 0
    for state in states:
      weightedRewards = 0
      for action in actions:
        finalPosition, reward = actionValue(state, action)
        weightedRewards += (1/4 * (reward + discount * grid [finalPosition[0], finalPosition[1]]))
      copygrid[state[0], state[1]] = weightedRewards
      delta = max(delta,abs(weightedRewards-grid[state[0],state[1]]))
    grid = np.copy(copygrid)
    if (delta < 0.01):
      print(grid)
      break

def policy_evaluate(states,actions,discount, grid):
    for state in states:
        weightedRewards=0
        for action in actions:
            finalPosition,reward = actionValue(state,action)
            weightedRewards += 1/4* (reward + discount * grid[finalPosition[0],finalPosition][1])
        copygrid[state[0],state[1]]=weightedRewards
    grid = np.copy(copygrid)
    return(grid)

def argmax(q_values):
    index=np.argmax(q_values)
    return(np.random.choice(np.where(a==a[index])[0].tolist()))


# Compute the best action in each state
def greedify_policy(state,pi,pi1,discount,grid):  
        q_values=np.zeros(len(actions))
        for index,action in enumerate(actions):
            finalPosition,reward = actionValue(state,action)
            q_values[index] += 1/4* (reward + discount * grid[finalPosition[0],finalPosition][1])
        # Find the index of the action for which the q_value is 
        index=q_values.argmax()
        pi[state[0],state[1]]=index 
        if(index == 0):
            pi1[state[0],state[1]]= 'up'
        elif(index == 1):
            pi1[state[0],state[1]]= 'down'
        elif(index == 2):
            pi1[state[0],state[1]]= 'right'
        elif(index == 3):
            pi1[state[0],state[1]]= 'left'

def improve_policy(pi, pi1, discount, grid):
    policy_stable = True
    for state in states:
        old = pi[state].copy()
        # Greedify policy for state
        greedify_policy(state,pi,pi1,discount,grid)
        if not np.array_equal(pi[state], old):
            policy_stable = False
    #print(pi)
    print(pi1)
    return pi, pi1, policy_stable


def policy_iteration(discount, theta):
    grid = np.zeros((size, size))
    pi = np.ones((size,size))/4
    pi1 = np.chararray((size, size))
    pi1[:] = 'a'
    policy_stable = False
    print("Policy Iteration")
    while not policy_stable:
        grid = policy_evaluate(states,actions,discount,grid)
        pi,pi1, policy_stable = improve_policy(pi,pi1,discount,grid)
    return grid, pi,pi1


def bellman_optimality_update(grid, state, discount):

    q_values=np.zeros(len(actions))
    
    for index,action in enumerate(actions):
        finalPosition,reward = actionValue(state,action)
        q_values[index] += 1/4* (reward + discount * grid[finalPosition[0],finalPosition[1]]) 
    index=q_values.argmax()           
    max = np.argmax(q_values)
    grid[state[0],state[1]] = q_values[max]    
    


def value_iteration(discount, theta):
    grid = np.zeros((size, size))
    while True:
        delta = 0
        for state in states:
            v_old = grid[state[0],state[1]]
            bellman_optimality_update(grid, state, discount)
            delta = max(delta, abs(v_old - grid[state[0],state[1]]))
        if delta < theta:
            break
    pi = np.ones((size, size))/4
    for state in states:
        greedify_policy(state,pi,pi1,discount,grid)
    #print(pi)
    print("Value Iteration")
    print(pi1)
    return grid, pi,pi1



policyEvaluation(numIterations, discount, theta, grid)
grid, pi,pi1 = policy_iteration(discount, theta)
grid,pi,pi1=value_iteration(discount, theta)


In [3]:
#######################################################################
# Copyright (C)                                                       #
# 2016-2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com)             #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

matplotlib.use('Agg')
ROWS = 4
COLUMNS = 2
DISCOUNT = 0.6666

# left, up, right, down
ACTIONS = [np.array([0, -1]),
           np.array([-1, 0]),
           np.array([0, 1]),
           np.array([1, 0])]
ACTIONS_FIGS=[ '←', '↑', '→', '↓']


ACTION_PROB = 0.25

def is_terminal(state):
  x, y = state
  return(x == 0 and y == 0) or (x == 1 and y == 2) or (x == 1 and y == 3) or (x == COLUMNS - 1 and y == ROWS - 1)

def step(state, action):
  if is_terminal(state):
    return state, 0

  next_state = (np.array(state) + action).tolist()
  x, y = next_state
   
  if x < 0 or x >= COLUMNS or y < 0 or y >= ROWS:
    next_state = state
    
  reward = -3.0
    
  return next_state, reward


def draw_image(image):
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = image.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Add cells
    for (i, j), val in np.ndenumerate(image):        
        tb.add_cell(i, j, width, height, text=val,
                    loc='center', facecolor='white')
        

    # Row and column labels...
    for i in range(len(image)):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')
        tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
                    edgecolor='none', facecolor='none')

    ax.add_table(tb)

def draw_policy(optimal_values):
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = optimal_values.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Add cells
    for (i, j), val in np.ndenumerate(optimal_values):
        next_vals=[]
        for action in ACTIONS:
            next_state, _ = step([i, j], action)
            next_vals.append(optimal_values[next_state[0],next_state[1]])

        best_actions=np.where(next_vals == np.max(next_vals))[0]
        val=''
        for ba in best_actions:
            val+=ACTIONS_FIGS[ba]        
        tb.add_cell(i, j, width, height, text=val,
                loc='center', facecolor='white')

    # Row and column labels...
    for i in range(len(optimal_values)):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')
        tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
                   edgecolor='none', facecolor='none')

    ax.add_table(tb)


def figure_3_2():
    value = np.zeros((COLUMNS, ROWS))
    while True:
        # keep iteration until convergence
        new_value = np.zeros_like(value)
        for i in range(COLUMNS):
            for j in range(ROWS):
                for action in ACTIONS:
                    (next_i, next_j), reward = step([i, j], action)
                    # bellman equation
                    new_value[i, j] += ACTION_PROB * (reward + DISCOUNT * value[next_i, next_j])
        if np.sum(np.abs(value - new_value)) < 1e-4:
            draw_image(np.round(new_value, decimals=2))
            plt.savefig('figure_3_2.png')
            plt.close()
            break
        value = new_value

def figure_3_2_linear_system():
    '''
    Here we solve the linear system of equations to find the exact solution.
    We do this by filling the coefficients for each of the states with their respective right side constant.
    '''
    A = -1 * np.eye(COLUMNS * ROWS)
    b = np.zeros(COLUMNS * ROWS)
    for i in range(COLUMNS):
        for j in range(ROWS):
            s = [i, j]  # current state
            index_s = np.ravel_multi_index(s, (COLUMNS, ROWS))
            for a in ACTIONS:
                s_, r = step(s, a)
                index_s_ = np.ravel_multi_index(s_, (COLUMNS, ROWS))

                A[index_s, index_s_] += ACTION_PROB * DISCOUNT
                b[index_s] -= ACTION_PROB * r

    x = np.linalg.solve(A, b)
    draw_image(np.round(x.reshape(COLUMNS, ROWS), decimals=2))
    plt.savefig('figure_3_2_linear_system.png')
    plt.close()

def figure_3_5():
    value = np.zeros((COLUMNS, ROWS))
    while True:
        # keep iteration until convergence
        new_value = np.zeros_like(value)
        for i in range(COLUMNS):
            for j in range(ROWS):
                values = []
                for action in ACTIONS:
                    (next_i, next_j), reward = step([i, j], action)
                    # value iteration
                    values.append(reward + DISCOUNT * value[next_i, next_j])
                new_value[i, j] = np.max(values)
        if np.sum(np.abs(new_value - value)) < 1e-4:
            draw_image(np.round(new_value, decimals=2))
            plt.savefig('figure_3_5.png')
            plt.close()
            draw_policy(new_value)
            plt.savefig('figure_3_5_policy.png')
            plt.close()
            break
        value = new_value


if __name__ == '__main__':
    figure_3_2_linear_system()
    figure_3_2()
    figure_3_5()

In [None]:
import numpy as np

# global variables
BOARD_ROWS = 4
BOARD_COLS = 2
WIN_STATE = (0, 3)
#LOSE_STATE = (1, 3)
START = (2, 0)
DETERMINISTIC = True


class State:
    def __init__(self, state=START):
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.board[1, 1] = -1
        self.state = state
        self.isEnd = False
        self.determine = DETERMINISTIC

    def giveReward(self):
        if self.state == WIN_STATE:
            return 1
        elif self.state == LOSE_STATE:
            return -1
        else:
            return 0

    def isEndFunc(self):
        if (self.state == WIN_STATE) or (self.state == LOSE_STATE):
            self.isEnd = True

    def nxtPosition(self, action):
        """
        action: up, down, left, right
        -------------
        0 | 1 | 2| 3|
        1 |
        2 |
        return next position
        """
        if self.determine:
            if action == "up":
                nxtState = (self.state[0] - 1, self.state[1])
            elif action == "down":
                nxtState = (self.state[0] + 1, self.state[1])
            elif action == "left":
                nxtState = (self.state[0], self.state[1] - 1)
            else:
                nxtState = (self.state[0], self.state[1] + 1)
            # if next state legal
            if (nxtState[0] >= 0) and (nxtState[0] <= (BOARD_ROWS -1)):
                if (nxtState[1] >= 0) and (nxtState[1] <= (BOARD_COLS -1)):
                  if nxtState != (1, 1):
                      return nxtState
            return self.state

    def showBoard(self):
        self.board[self.state] = 1
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.board[i, j] == 1:
                    token = '*'
                if self.board[i, j] == -1:
                    token = 'z'
                if self.board[i, j] == 0:
                    token = '0'
                out += token + ' | '
            print(out)
        print('-----------------')


# Agent of player

class Agent:

    def __init__(self):
        self.states = []
        self.actions = ["up", "down", "left", "right"]
        self.State = State()
        self.lr = 0.2
        self.exp_rate = 0.3

        # initial state reward
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0  # set initial value to 0

    def chooseAction(self):
        # choose action with most expected value
        mx_nxt_reward = 0
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            for a in self.actions:
                # if the action is deterministic
                nxt_reward = self.state_values[self.State.nxtPosition(a)]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
        return action

    def takeAction(self, action):
        position = self.State.nxtPosition(action)
        return State(state=position)

    def reset(self):
        self.states = []
        self.State = State()

    def play(self, rounds=10):
        i = 0
        while i < rounds:
            # to the end of game back propagate reward
            if self.State.isEnd:
                # back propagate
                reward = self.State.giveReward()
                # explicitly assign end state to reward values
                self.state_values[self.State.state] = reward  # this is optional
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    reward = self.state_values[s] + self.lr * (reward - self.state_values[s])
                    self.state_values[s] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                # append trace
                self.states.append(self.State.nxtPosition(action))
                print("current position {} action {}".format(self.State.state, action))
                # by taking the action, it reaches the next state
                self.State = self.takeAction(action)
                # mark is end
                self.State.isEndFunc()
                print("nxt state", self.State.state)
                print("---------------------")

    def showValues(self):
        for i in range(0, BOARD_ROWS):
            print('----------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                out += str(self.state_values[(i, j)]).ljust(6) + ' | '
            print(out)
        print('----------------------------------')


if __name__ == "__main__":
    ag = Agent()
    ag.play(50)
    print(ag.showValues())

In [None]:
import numpy as np
import tensorflow as tf
from math import log2

W = np.array([[1, -1, 1],[1, -2, -1]])
b = np.array([[1], [2], [0]])
x = np.array([[2], [-1]])
z = np.array(np.dot(np.transpose(W), x) + b)
y = np.array([[0.0], [0.0], [1.0]])

f = np.array([[0.2119] , [0.2119], [-0.4239]])
g = (np.dot(f,np.reshape(x, (1,2)))) * 0.1
g = np.reshape(g, (2,3))

#j = W - g
h = b - (f * 0.1) 

c = np.array(np.dot(np.transpose(j), x) + h)
d= softmax(c)


def softmax(z):
  return np.exp(z) / np.sum(np.exp(z))

a = softmax(z)

def cross_entropy(p, q):
	return -sum([p[i]*log2(q[i]) for i in range(len(p))])

b = cross_entropy(y, a)


print(c, d)

[[1.87286]
 [1.93641]
 [3.00002]] [[0.19408092]
 [0.2068151 ]
 [0.59910398]]


In [2]:
import numpy as np


size = 5
a = [0, 1]
aPrime = [4, 1]
b = [0, 3]
bPrime = [2, 3]
discount = 0.9
actions = [[0, -1], [-1, 0], [0, 1],[1, 0]]
actionArrows = [ '←', '↑', '→', '↓']
actionProb = 0.25
grid = np.zeros((size,size))
states = [[i,j] for i in range(size) for j in range (size)]
numIterations = 1000
theta = 1e-4


def step(state, action):
  if state == a:
        return aPrime, 10
  if state == b:
        return bPrime, 5

  next_state = (np.array(state) + action)
  x, y = next_state
  if x < 0 or x >= size or y < 0 or y >= size:
      reward = -1.0
      next_state = state
  else:
      reward = 0
  return next_state, reward

def stateValuefunc(grid, discount):
  copygrid = np.copy(grid)
  while True:
    for i in range(size):
      for j in range(size):
        for action in actions:
          finalPosition, reward = step(state, action)
          copygrid = actionProb * (reward + discount * grid [finalPosition[0], finalPosition[1]])
    if np.sum(np.abs(grid - copygrid)) < 1e-4:
      print('Random Policy')
      print(copygrid)
      break
  grid = copygrid
  return grid
    
def policyIteration(grid, discount, numIterations):
  deltas = []
  for i in range(numIterations):
    copygrid = np.copy(grid)
    deltaState = []
    for state in states:
      weightedRewards = 0
      for action in actions:
        finalPosition, reward = step(state, action)
        weightedRewards += (1/len(actions) * (reward + discount * grid [finalPosition[0], finalPosition[1]]))
        deltaState.append(np.abs(copygrid[state[0], state[1]]- weightedRewards))
        copygrid[state[0], state[1]] = weightedRewards
    deltas.append(deltaState)
    grid = copygrid
    if i in [0,1,2,9, 99, numIterations-1]:
      print("Iteration {}".format(i+1))
      print(grid)
      print("")


def valueIteration(grid, discount, theta):
  copygrid = np.copy(grid)
  while True:
    for i in range(size):
      for j in range(size):
        values = []
        for state in states:
          for action in actions:
            finalPosition, reward = step(state, action)
            values.append(reward + (discount * grid [finalPosition[0], finalPosition[1]]))
            copygrid[i][j] = np.max(values)
          if np.sum(np.abs(grid - copygrid)) < theta:
            print('Value Iteration')
            print(copygrid)
            break
            grid = copygrid
  

#stateValuefunc(grid, discount)
policyIteration(grid, discount, numIterations)
valueIteration(grid, discount, theta)

    


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Value Iteration
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Value Iteration
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Value Iteration
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Value Iteration
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Value Iteration
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Value Iteration
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Value Iteration
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Value Iteration
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
Value Iteration
[[0. 0. 0. 0. 0.]
 [0. 0. 

KeyboardInterrupt: ignored