In [1]:
import numpy as np

from IPython.display import clear_output
import time

In [2]:
class Environment:
    """Square grid world with a single agent and optional "jump" cells.

    Positions are ``(x, y)`` pairs. ``x`` is the row index (action ``'D'``
    increases it, ``'U'`` decreases it) and ``y`` is the column index
    (``'R'`` increases it, ``'L'`` decreases it), which matches the layout
    printed by :meth:`show`.

    Parameters
    ----------
    jumps : iterable of ((src_x, src_y), (dst_x, dst_y), reward)
        When a step starts on ``(src_x, src_y)`` the agent is placed on
        ``(dst_x, dst_y)`` and receives ``reward``; the chosen action is
        ignored for that step.
    board_size : int
        Side length of the board; valid indices are ``0 .. board_size - 1``.
    """

    def __init__(self, jumps, board_size):
        self.__board_size = board_size
        self.__jumps = jumps
        self.__x = board_size // 2
        self.__y = board_size // 2

    def reset(self):
        """Move the agent back to the centre cell used by ``__init__``."""
        # Derived from the board size rather than a literal, so that reset()
        # and __init__ agree for every board size and never leave the board.
        self.__x = self.__board_size // 2
        self.__y = self.__board_size // 2

    def __apply_move(self, x, y, action):
        """Return the unclamped position reached from ``(x, y)`` by ``action``.

        An unrecognised action leaves the position unchanged.
        """
        new_x, new_y = x, y

        if action == 'L':
            new_y = y - 1
        elif action == 'R':
            new_y = y + 1
        elif action == 'D':
            new_x = x + 1
        elif action == 'U':
            new_x = x - 1

        return new_x, new_y

    def __check_jump(self, current_x, current_y):
        """Return the first jump whose source is ``(current_x, current_y)``, else ``None``."""
        for jump in self.__jumps:
            source = jump[0]
            if source[0] == current_x and source[1] == current_y:
                return jump

        return None

    def __get_new_pos(self, new_x, new_y):
        """Clamp a position onto the board.

        Returns ``(x, y, reward)`` where ``reward`` is ``-1`` if either
        coordinate had to be clamped (the move left the board) and ``0``
        otherwise.
        """
        last_index = self.__board_size - 1
        clamped_x = max(min(new_x, last_index), 0)
        clamped_y = max(min(new_y, last_index), 0)

        reward = -1 if (clamped_x, clamped_y) != (new_x, new_y) else 0

        return clamped_x, clamped_y, reward

    def predict_reward(self, current_x, current_y, action):
        """Return ``(new_x, new_y, reward)`` for taking ``action`` at a position.

        This is a pure lookup: the agent's stored position is not modified.
        A jump registered at the current position takes precedence over the
        action.
        """
        jump = self.__check_jump(current_x, current_y)
        if jump is not None:
            return jump[1][0], jump[1][1], jump[2]

        new_x, new_y = self.__apply_move(current_x, current_y, action)
        return self.__get_new_pos(new_x, new_y)

    def move(self, action):
        """Apply ``action`` to the agent, update its position, return the reward."""
        # Reuse predict_reward so the simulated and the actual transition
        # can never drift apart.
        self.__x, self.__y, reward = self.predict_reward(self.__x, self.__y, action)

        return reward

    def show(self):
        """Print the board: ``A`` marks the agent, ``_`` every other cell."""
        for i in range(self.__board_size):
            for j in range(self.__board_size):
                if i == self.__x and j == self.__y:
                    print('A', end=' ')
                else:
                    print('_', end=' ')
            # '\n' plus print's own newline leaves a blank line between rows.
            print('\n')

    def percept(self):
        """Return the agent's current ``(x, y)`` position."""
        return self.__x, self.__y

    @property
    def actions(self):
        """List of action labels, freshly built on every access."""
        return ['U', 'D', 'L', 'R']

    @property
    def num_actions(self):
        """Number of available actions."""
        return len(self.actions)

    @property
    def board_size(self):
        """Side length of the board."""
        return self.__board_size

In [21]:
def choose_an_action(env, state_probs):
    """Randomly draw one action for a single state.

    ``state_probs`` maps each label in ``env.actions`` to the probability of
    picking it; the draw is made with ``np.random.choice``.
    """
    candidates = env.actions

    weights = []
    for label in candidates:
        weights.append(state_probs[label])

    return np.random.choice(candidates, p=weights)

In [22]:
def calculate_bellman_equation(env, discount_factor, values, policy, i, j):
    """Return the one-step expected backup for cell ``(i, j)``.

    For every action the successor cell and reward are obtained from
    ``env.predict_reward``; the terms
    ``policy[i][j][action] * (reward + discount_factor * values[successor])``
    are summed over all actions in ``env.actions`` order.
    """
    def weighted_backup(move):
        # Contribution of a single action to the expectation.
        next_row, next_col, gain = env.predict_reward(i, j, move)
        return policy[i][j][move] * (gain + discount_factor * values[next_row, next_col])

    return sum(weighted_backup(move) for move in env.actions)

In [23]:
def update_values(env, discount_factor, current_values, policy):
    """Run one synchronous sweep over the whole board.

    Every cell's new value is computed by ``calculate_bellman_equation`` from
    ``current_values``; the input table is not modified and a new
    ``board_size x board_size`` float array is returned.
    """
    size = env.board_size
    swept = np.zeros((size, size))

    for row in range(size):
        # Fill one board row at a time, left to right.
        swept[row] = [
            calculate_bellman_equation(env, discount_factor, current_values, policy, row, col)
            for col in range(size)
        ]

    return swept

In [24]:
def calculate_value_function(env, policy, discount_factor, num_sweeps=10):
    """Approximate the value table of ``policy`` by repeated sweeps.

    Starts from an all-zero ``board_size x board_size`` table and applies
    ``update_values`` ``num_sweeps`` times.

    Parameters
    ----------
    env : object exposing ``board_size`` (and whatever ``update_values`` needs)
    policy : per-cell action probabilities, forwarded to ``update_values``
    discount_factor : float forwarded to ``update_values``
    num_sweeps : int, optional
        Number of sweeps to run. Defaults to 10, the count that was
        previously hard-coded; ``0`` returns the all-zero table.

    Returns
    -------
    numpy.ndarray
        The value table after the last sweep.
    """
    values = np.zeros((env.board_size, env.board_size))

    for _ in range(num_sweeps):
        values = update_values(env, discount_factor, values, policy)

    return values

In [25]:
def flatten_values(env, policy, values):
    """Build a per-cell table of successor values for every action.

    For each cell ``(i, j)`` and each action, the entry is ``values`` at the
    cell that ``env.predict_reward(i, j, action)`` leads to (the reward is
    ignored). The ``policy`` argument is accepted but never read.

    Returns a ``board_size x board_size`` nested list of ``{action: value}``
    dicts.
    """
    size = env.board_size

    def successor_value(row, col, move):
        next_row, next_col, _ = env.predict_reward(row, col, move)
        return values[next_row, next_col]

    return [
        [
            {move: successor_value(row, col, move) for move in env.actions}
            for col in range(size)
        ]
        for row in range(size)
    ]

In [26]:
def multi_max(env, state_values):
    """Return every action that attains the largest value in ``state_values``.

    Actions are scanned in ``env.actions`` order and ties are all kept, in
    that same order.
    """
    top_score = float('-inf')
    winners = []

    for candidate in env.actions:
        score = state_values[candidate]

        if score == top_score:
            # Tie with the current best: keep both.
            winners.append(candidate)
        elif score > top_score:
            # Strictly better: discard everything collected so far.
            top_score, winners = score, [candidate]

    return winners

In [27]:
def update_policy(env, policy):
    """Turn per-cell action scores into a greedy probability table.

    ``policy[i][j]`` is read as a ``{action: score}`` mapping. The actions
    returned by ``multi_max`` for that cell share probability 1 equally and
    every other action gets 0.

    Returns a new ``board_size x board_size`` nested list of
    ``{action: probability}`` dicts.
    """
    size = env.board_size

    def greedy_distribution(action_scores):
        best = multi_max(env, action_scores)
        share = 1 / len(best)
        return {move: (share if move in best else 0) for move in env.actions}

    return [
        [greedy_distribution(policy[row][col]) for col in range(size)]
        for row in range(size)
    ]

In [28]:
def print_policy(env, policy):
    """Print the board with the top-scoring action(s) of every cell.

    Each cell shows the labels returned by ``multi_max`` joined by spaces,
    centred in a 10-character field and followed by ``|``; one text line is
    printed per board row.
    """
    size = env.board_size

    for row_idx in range(size):
        for col_idx in range(size):
            best_labels = multi_max(env, policy[row_idx][col_idx])
            cell_text = ' '.join(best_labels).center(10)
            print(cell_text, end='|')

        # Terminate the current board row.
        print()

In [29]:
def policy_iteration(env, discount_factor, iterations, print_steps=False):
    """Alternate policy evaluation and greedy improvement.

    Starts from the uniform random policy and repeats ``iterations`` times:
    evaluate the current policy (``calculate_value_function``), look up the
    successor values per action (``flatten_values``) and make the policy
    greedy with respect to them (``update_policy``).

    Parameters
    ----------
    env : environment exposing ``actions`` and ``board_size``
    discount_factor : float forwarded to the evaluation step
    iterations : int
        Number of evaluate/improve rounds. With ``0`` the uniform policy and
        an all-zero value table are returned.
    print_steps : bool, optional
        When true, print the policy after every round, separated by a line
        of dashes.

    Returns
    -------
    (policy, values)
        ``policy`` is the table produced by the last improvement step;
        ``values`` is the table computed in the last evaluation step, i.e.
        for the policy as it was *before* that final improvement.
    """
    actions = env.actions
    size = env.board_size

    # Uniform over however many actions the environment exposes.
    policy = [
        [{action: 1 / len(actions) for action in actions} for _ in range(size)]
        for _ in range(size)
    ]
    # Defined up front so the return below is valid even when iterations == 0.
    values = np.zeros((size, size))

    for i in range(iterations):
        values = calculate_value_function(env, policy, discount_factor)
        state_values = flatten_values(env, policy, values)
        policy = update_policy(env, state_values)

        if print_steps:
            print_policy(env, policy)

            if i != iterations - 1:
                print('-' * 100)

    return policy, values

In [30]:
def default_env():
    """Create the standard 5x5 board with its two jump cells.

    Standing on (0, 1) moves the agent to (4, 1) with reward 10; standing on
    (0, 3) moves it to (2, 3) with reward 5.
    """
    jump_table = [
        ((0, 1), (4, 1), 10),
        ((0, 3), (2, 3), 5),
    ]

    return Environment(jump_table, 5)

In [31]:
def run_episode(env, num_steps, policy, plot=False):
    """Reset ``env``, follow ``policy`` for ``num_steps`` steps, return the reward sum.

    At every step the action is sampled by ``choose_an_action`` from
    ``policy[x][y]`` for the agent's current position. With ``plot`` set, the
    board is redrawn (clearing the previous cell output and pausing a quarter
    of a second) after the reset and after each move.
    """
    def draw_frame():
        # Replace the previous frame in the notebook output with the new one.
        clear_output(wait=True)
        env.show()
        time.sleep(.25)
        print()

    env.reset()
    if plot:
        draw_frame()

    total_reward = 0

    for _ in range(num_steps):
        row, col = env.percept()
        step_reward = env.move(choose_an_action(env, policy[row][col]))

        if plot:
            draw_frame()

        total_reward += step_reward

    return total_reward

In [56]:
# Discount factor handed to policy_iteration / update_values in the cells below.
discount_factor = .9

In [57]:
# 5x5 board with the two default jump cells.
env = default_env()

# Uniform random policy: each of the four actions has probability 1/4 in every cell.
policy = [[{action: 1/4 for action in env.actions} for _ in range(env.board_size)] for _ in range(env.board_size)]

In [59]:
# 20-step episode with rendering on (plot=True); the cell's value is the summed reward.
# Actions are sampled randomly, so the result varies between runs.
run_episode(env, 20, policy, True)

_ _ _ _ _ 

_ A _ _ _ 

_ _ _ _ _ 

_ _ _ _ _ 

_ _ _ _ _ 




4

In [60]:
# Show the current policy, run 10 rounds of policy iteration (printing the
# policy after each round), then show the resulting policy and value table.
# Note: `policy` is rebound here, so later cells see the improved policy.
print_policy(env, policy)
print()
policy, values = policy_iteration(env, discount_factor, 10, True)
print()
print_policy(env, policy)
values

 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |

    R     | U D L R  |    L     | U D L R  |    L     |
    U     |    U     |    U     |    U     |    L     |
    U     |    U     |    U     |    U     |    U     |
    U     |    U     |    U     |    U     |    U     |
    U     |    U     |    U     |    U     |    U     |
----------------------------------------------------------------------------------------------------
    R     | U D L R  |    L     | U D L R  |    L     |
   U R    |    U     |   U L    |    U     |   U L    |
   U R    |    U     |   U L    |    L     |   U L    |
   U R    |    U     |   U L    |    L     |   U L    |
   U R    |    U     |   U L    |    U     |   U L    |
----------------------------------------------------------

array([[14.31441   , 15.9049    , 14.31441   , 10.9049    ,  9.81441   ],
       [12.882969  , 14.31441   , 12.882969  , 11.5946721 , 10.43520489],
       [11.5946721 , 12.882969  , 11.5946721 , 10.43520489,  5.9049    ],
       [10.43520489, 11.5946721 , 10.43520489,  5.9049    ,  5.31441   ],
       [ 5.9049    , 10.43520489,  5.9049    ,  5.31441   ,  4.782969  ]])

In [62]:
# Same 20-step rendered episode, now with whatever `policy` currently holds
# (the result of policy_iteration if the preceding cell was run).
run_episode(env, 20, policy, True)

_ _ _ _ _ 

_ _ _ _ _ 

_ _ _ _ _ 

_ A _ _ _ 

_ _ _ _ _ 




40

In [63]:
# Second scenario: 5x5 board where two opposite corners are jump cells,
# (0, 0) -> (0, 4) and (4, 4) -> (4, 0), each with reward 10.
env = Environment(
    [
        ((0, 0), (0, 4), 10),
        ((4, 4), (4, 0), 10),
    ],
    5
 )

# Start again from the uniform random policy and display it.
policy = [[{action: 1/4 for action in env.actions} for _ in range(env.board_size)] for _ in range(env.board_size)]
print_policy(env, policy)

 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |


In [64]:
# 20-step rendered episode on the current `env` with the current `policy`;
# the cell's value is the summed reward.
run_episode(env, 20, policy, True)

_ _ _ _ _ 

_ _ _ _ _ 

_ _ _ _ _ 

_ _ A _ _ 

_ _ _ _ _ 




-3

In [65]:
# 10 rounds of policy iteration on the current `env`, printing each round's
# policy; then show the final policy and the value table. Rebinds `policy`.
policy, values = policy_iteration(env, discount_factor, 10, True)
print()
print_policy(env, policy)
values

 U D L R  |    L     |    L     |    L     |   D L    |
    U     |   U L    |    L     |    L     |    D     |
    U     |    U     |    U     |    D     |    D     |
    U     |   U R    |    R     |   D R    |    D     |
   U R    |    R     |    R     |    R     | U D L R  |
----------------------------------------------------------------------------------------------------
 U D L R  |    L     |    L     |    L     |   D L    |
    U     |   U L    |   U L    | U D L R  |    D     |
    U     |   U L    | U D L R  |   D R    |    D     |
    U     | U D L R  |   D R    |   D R    |    D     |
   U R    |    R     |    R     |    R     | U D L R  |
----------------------------------------------------------------------------------------------------
 U D L R  |    L     |    L     |    L     |   D L    |
    U     |   U L    |   U L    | U D L R  |    D     |
    U     |   U L    | U D L R  |   D R    |    D     |
    U     | U D L R  |   D R    |   D R    |    D     |
   U R    |   

array([[15.9049    , 14.31441   , 12.882969  , 11.5946721 , 10.43520489],
       [14.31441   , 12.882969  , 11.5946721 , 10.43520489, 11.5946721 ],
       [12.882969  , 11.5946721 , 10.43520489, 11.5946721 , 12.882969  ],
       [11.5946721 , 10.43520489, 11.5946721 , 12.882969  , 14.31441   ],
       [10.43520489, 11.5946721 , 12.882969  , 14.31441   , 15.9049    ]])

In [22]:
# Third scenario: 5x5 board with six jump cells.
env = Environment(
    [
        # Two corner jumps, each with reward -10.
        ((0, 0), (0, 4), -10),
        ((4, 4), (4, 0), -10),
        
        # Four jumps with reward 5 whose targets chain into a cycle:
        # (0, 2) -> (2, 3) -> (4, 2) -> (2, 1) -> (0, 2).
        ((0, 2), (2, 3), 5),
        ((2, 3), (4, 2), 5),
        ((4, 2), (2, 1), 5),
        ((2, 1), (0, 2), 5),
    ],
    5
 )

# Uniform random starting policy.
policy = [[{action: 1/4 for action in env.actions} for _ in range(env.board_size)] for _ in range(env.board_size)]

In [23]:
# 20-step rendered episode on the current `env` with the current `policy`;
# the cell's value is the summed reward.
run_episode(env, 20, policy, True)

_ _ _ _ _ 

_ _ _ _ _ 

_ _ _ _ _ 

_ _ _ _ _ 

_ _ A _ _ 




95

In [24]:
# Show the current policy, run 10 rounds of policy iteration with per-round
# printing, then show the resulting policy and value table. Rebinds `policy`.
print_policy(env, policy)
print()
policy, values = policy_iteration(env, discount_factor, 10, True)
print()
print_policy(env, policy)
values

 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |
 U D L R  | U D L R  | U D L R  | U D L R  | U D L R  |

 U D L R  |    R     | U D L R  |    L     |    L     |
    R     |    D     |    U     |    D     |    L     |
    R     | U D L R  |   L R    | U D L R  |    L     |
    R     |    U     |    D     |    U     |    L     |
    R     |    R     | U D L R  |    L     | U D L R  |
----------------------------------------------------------------------------------------------------
 U D L R  |    R     | U D L R  |    L     |    L     |
   D R    |    D     |    U     |    D     |   D L    |
    R     | U D L R  |   L R    | U D L R  |    L     |
   U R    |    U     |    D     |    U     |   U L    |
    R     |    R     | U D L R  |    L     | U D L R  |
----------------------------------------------------------

array([[ 9.01607799, 27.566078  , 32.566078  , 27.566078  , 23.066078  ],
       [23.066078  , 27.566078  , 27.566078  , 27.566078  , 23.066078  ],
       [27.566078  , 32.566078  , 27.566078  , 32.566078  , 27.566078  ],
       [23.066078  , 27.566078  , 27.566078  , 27.566078  , 23.066078  ],
       [23.066078  , 27.566078  , 32.566078  , 27.566078  ,  9.01607799]])

In [25]:
# 20-step rendered episode using whatever `policy` currently holds
# (the result of policy_iteration if the preceding cell was run).
run_episode(env, 20, policy, True)

_ _ A _ _ 

_ _ _ _ _ 

_ _ _ _ _ 

_ _ _ _ _ 

_ _ _ _ _ 




95

In [5]:
# Scratch section: rebuild the default 5x5 environment for manual inspection.
env = default_env()

In [6]:
# List the instance's attributes; the double-underscore members of Environment
# show up under their name-mangled `_Environment__...` names.
dir(env)

['_Environment__apply_move',
 '_Environment__board_size',
 '_Environment__check_jump',
 '_Environment__get_new_pos',
 '_Environment__jumps',
 '_Environment__x',
 '_Environment__y',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'actions',
 'board_size',
 'move',
 'num_actions',
 'percept',
 'predict_reward',
 'reset',
 'show']

In [7]:
# Print the board; 'A' marks the agent's current cell.
env.show()

_ _ _ _ _ 

_ _ _ _ _ 

_ _ A _ _ 

_ _ _ _ _ 

_ _ _ _ _ 



In [8]:
# The action labels the environment understands.
env.actions

['U', 'D', 'L', 'R']

In [19]:
# Step left and show the reward; -1 is what the environment returns when a
# move is clamped at the board edge.
# NOTE(review): this cell mutates `env`, so its result depends on how many
# move cells were executed before it (the execution counter jumps here).
env.move('L')

-1

In [20]:
# Print the board again to see where the agent ended up.
env.show()

_ _ _ _ _ 

_ _ _ _ _ 

_ _ _ _ _ 

_ _ _ _ _ 

A _ _ _ _ 



In [32]:
# Uniform random policy for the step-by-step evaluation below.
policy = [[{action: 1/4 for action in env.actions} for _ in range(env.board_size)] for _ in range(env.board_size)]

In [34]:
# All-zero value table, the starting point for the sweeps in the next cell.
values = np.zeros((env.board_size, env.board_size))
values

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [55]:
# Apply one evaluation sweep to `values` and display the result.
# NOTE(review): `values` is both input and output, so every execution of this
# cell advances the table by one more sweep; the displayed numbers depend on
# how many times it has been run since `values` was zeroed.
discount_factor = .9
values = update_values(env, discount_factor, values, policy)
values

array([[ 3.32950088,  8.81232386,  4.44174139,  5.33239658,  1.49339085],
       [ 1.53892828,  3.00916308,  2.26201892,  1.91483565,  0.55011646],
       [ 0.06767379,  0.75382695,  0.68710607,  0.36862986, -0.39461048],
       [-0.95469859, -0.41624036, -0.33705503, -0.56804394, -1.16679105],
       [-1.8360794 , -1.32347617, -1.20689835, -1.40079456, -1.95294622]])