In [1]:
import numpy as np
from matplotlib import pyplot

In [2]:
state_space = np.array([0,1,2,3,4,5])
action_space = np.array([0,1])

In [3]:
reward_matrix = np.zeros((6,2))

In [4]:
reward_matrix[0,0] = 50 # 50 - 2x0
reward_matrix[1,0] = 48 # 50 - 2x1
reward_matrix[2,0] = 42 # 50 - 2x4
reward_matrix[3,0] = 32 # 50 - 2x9
reward_matrix[4,0] = 18 # 50 - 2x16
reward_matrix[5,0] = 0
reward_matrix[0,1] = -100
reward_matrix[1,1] = -100
reward_matrix[2,1] = -100
reward_matrix[3,1] = -100
reward_matrix[4,1] = -100
reward_matrix[5,1] = -100

In [5]:
transition_probability_matrix = np.zeros((2,6,6))
t = np.array([[0.7, 0.3, 0, 0, 0, 0], [0, 0.6, 0.4, 0, 0, 0], [0, 0, 0.5, 0.5, 0, 0], [0, 0, 0, 0.4, 0.6, 0], [0, 0, 0, 0, 0.3, 0.7], [0, 0, 0, 0, 0, 1]])
transition_probability_matrix[0, :, :] = t
t = np.array([[1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]])
transition_probability_matrix[1, :, :] = t

In [6]:
transition_probability_matrix

array([[[0.7, 0.3, 0. , 0. , 0. , 0. ],
        [0. , 0.6, 0.4, 0. , 0. , 0. ],
        [0. , 0. , 0.5, 0.5, 0. , 0. ],
        [0. , 0. , 0. , 0.4, 0.6, 0. ],
        [0. , 0. , 0. , 0. , 0.3, 0.7],
        [0. , 0. , 0. , 0. , 0. , 1. ]],

       [[1. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. ]]])

In [7]:
class MachineRepair_Environment:
    def __init__(self, transition_probability_matrix, reward_matrix, initial_state):
        self.transition_probability_matrix = transition_probability_matrix
        self.reward_matrix = reward_matrix
        self.current_state = initial_state
    
    def get_current_state(self):
        return self.current_state
        
    def run_one_step(self, action):
        transition_probability = transition_probability_matrix[action, self.current_state, :]
        # obtain the reward
        reward = reward_matrix[self.current_state, action]
        # update the state
        next_state = np.random.choice(state_space, p = transition_probability)
        self.current_state = next_state
        return [next_state, reward]
        

In [8]:
num_episodes = 2000
horizon = 100
discount_factor = 0.8
step_size = 0.01
epsilon = 0.8

every_visit_estimate = np.zeros([len(state_space),len(action_space)])
every_visit_count = np.zeros([len(state_space),len(action_space)])

for e in range(num_episodes):
    initial_state = np.random.choice(state_space) # with exploring starts
    # initial_state = 0
    environment = MachineRepair_Environment(transition_probability_matrix, reward_matrix, initial_state)
    state_sequence = []
    action_sequence = []
    reward_sequence = []

    for t in range(horizon):
        state = environment.get_current_state()
        state_sequence.append(state)
        # if np.random.rand() <= epsilon:
        #    action = np.random.choice(action_space)
        # else:
        #    action = action_space[np.argmax(every_visit_estimate[state, :])]
        action = np.random.choice(action_space)
        action_sequence.append(action)
        [state, reward] = environment.run_one_step(action)
        reward_sequence.append(reward)
 
    for t,s in enumerate(state_sequence):
        if t == len(state_sequence) - 1:
            break
        a = action_sequence[t]
        qlearning_estimate = np.max(every_visit_estimate[state_sequence[t + 1], :])        
        every_visit_return = reward_sequence[t] + discount_factor * qlearning_estimate
        every_visit_estimate[s, a] = every_visit_estimate[s, a] + step_size * (every_visit_return - every_visit_estimate[s, a])

In [9]:
every_visit_estimate

array([[207.77267125,  66.4391158 ],
       [176.6807793 ,  67.08998329],
       [134.76696199,  67.32968041],
       [ 96.72238067,  67.15474737],
       [ 71.71567834,  66.6064594 ],
       [ 52.87544225,  66.9275585 ]])