In [1]:
import numpy as np
from matplotlib import pyplot

In [2]:
# States are labelled 0..5 and actions 0..1; both are used as integer array indices below.
state_space = np.arange(6)
action_space = np.arange(2)

In [3]:
# reward_matrix[s, a]: immediate reward for taking action a in state s (6 states x 2 actions).
reward_matrix = np.zeros(shape=(6, 2), dtype=float)

In [4]:
# Column 0: action 0 in state s earns 50 - 2*s**2, i.e. 50, 48, 42, 32, 18, 0.
reward_matrix[:, 0] = [50 - 2 * s ** 2 for s in range(6)]
# Column 1: action 1 yields -100 regardless of the state.
reward_matrix[:, 1] = -100

In [5]:
# transition_probability_matrix[a, s, s2] = P(next state s2 | current state s, action a).

# Action 0: the state either stays where it is or advances by one; state 5 is absorbing.
action0_transitions = np.array([
    [0.7, 0.3, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.6, 0.4, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.5, 0.5, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.4, 0.6, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.3, 0.7],
    [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
])

# Action 1: every state moves to state 0 with probability 1.
action1_transitions = np.zeros((6, 6))
action1_transitions[:, 0] = 1.0

transition_probability_matrix = np.array([action0_transitions, action1_transitions])

# Each row is used as the `p` argument of np.random.choice, which rejects
# vectors that do not sum to 1 -- fail here, at definition time, instead.
assert transition_probability_matrix.shape == (2, 6, 6)
assert np.allclose(transition_probability_matrix.sum(axis=2), 1.0), "rows must sum to 1"

In [6]:
# Inspect the full transition tensor, indexed as [action, current_state, next_state].
transition_probability_matrix

array([[[0.7, 0.3, 0. , 0. , 0. , 0. ],
        [0. , 0.6, 0.4, 0. , 0. , 0. ],
        [0. , 0. , 0.5, 0.5, 0. , 0. ],
        [0. , 0. , 0. , 0.4, 0.6, 0. ],
        [0. , 0. , 0. , 0. , 0.3, 0.7],
        [0. , 0. , 0. , 0. , 0. , 1. ]],

       [[1. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. ],
        [1. , 0. , 0. , 0. , 0. , 0. ]]])

In [7]:
# Next-state distribution when action 0 is taken in state 1.
transition_probability_matrix[0, 1]

array([0. , 0.6, 0.4, 0. , 0. , 0. ])

In [10]:
# Deterministic policy: entry s is the action taken in state s
# (action 1 in states 3 and 5, action 0 in every other state).
policy_function = np.array([0,0,0,1,0,1])

In [11]:
class MachineRepair_Environment:
    """Finite Markov decision process simulator.

    The model is described by two arrays:

    * ``transition_probability_matrix[a, s, :]`` -- probability distribution
      over the next state when action ``a`` is taken in state ``s``.
    * ``reward_matrix[s, a]`` -- immediate reward for taking action ``a`` in
      state ``s``.

    States and actions are integer indices into those arrays.
    """

    def __init__(self, transition_probability_matrix, reward_matrix, initial_state):
        """Store the model and set the starting state.

        Args:
            transition_probability_matrix: array-like indexed as
                [action, state, next_state]; each [action, state] row must
                sum to 1.
            reward_matrix: array-like indexed as [state, action].
            initial_state: index of the state the episode starts in.
        """
        # np.asarray is a no-op for ndarrays and lets nested lists be indexed
        # with the tuple indices used in run_one_step.
        self.transition_probability_matrix = np.asarray(transition_probability_matrix)
        self.reward_matrix = np.asarray(reward_matrix)
        self.current_state = initial_state

    def get_current_state(self):
        """Return the index of the state the environment is currently in."""
        return self.current_state

    def run_one_step(self, action):
        """Apply ``action`` in the current state and advance the environment.

        The reward is read for the state the action is taken *from*; the next
        state is then sampled from this instance's own transition model (not
        from any module-level variables).

        Args:
            action: index of the action to apply.

        Returns:
            list: ``[next_state, reward]``.
        """
        transition_probability = self.transition_probability_matrix[action, self.current_state, :]
        # obtain the reward
        reward = self.reward_matrix[self.current_state, action]
        # update the state: sample an index in 0..n_states-1, where n_states is
        # taken from the model itself rather than from a global state list
        next_state = np.random.choice(len(transition_probability), p=transition_probability)
        self.current_state = next_state
        return [next_state, reward]
        

In [12]:
# First-visit Monte Carlo estimate of the discounted return obtained by
# following `policy_function` from each state.
num_episodes = 100
horizon = 100             # steps simulated per episode; returns are truncated at this length
discount_factor = 0.8
exploring_starts = False  # True: draw each episode's initial state uniformly from state_space

first_visit_estimate = np.zeros([len(state_space), 1])  # running mean of first-visit returns
first_visit_count = np.zeros([len(state_space), 1])     # number of episodes in which the state occurred

for e in range(num_episodes):
    initial_state = np.random.choice(state_space) if exploring_starts else 0
    environment = MachineRepair_Environment(transition_probability_matrix, reward_matrix, initial_state)
    state_sequence = []
    reward_sequence = []

    # Roll out one episode; reward_sequence[k] is the reward for the action
    # taken in state_sequence[k].
    for step in range(horizon):
        state = environment.get_current_state()
        state_sequence.append(state)
        action = policy_function[state]
        next_state, reward = environment.run_one_step(action)
        reward_sequence.append(reward)

    for s in state_space:
        try:
            first_visit_time = state_sequence.index(s)
        except ValueError:
            # State s did not occur in this episode, so the episode carries no
            # information about it: leave both its count and its estimate
            # untouched (updating here would reuse a stale return value).
            continue

        # Discounted sum of the rewards collected from the first visit onwards.
        rewards_from_visit = np.asarray(reward_sequence[first_visit_time:], dtype=float)
        discounts = np.power(discount_factor, np.arange(len(rewards_from_visit)))
        first_visit_return = float(np.dot(rewards_from_visit, discounts))

        # Incremental mean: new_mean = old_mean + (sample - old_mean) / n.
        first_visit_count[s] += 1.0
        first_visit_estimate[s] += (first_visit_return - first_visit_estimate[s]) / first_visit_count[s]
   

In [13]:
# Estimated discounted return per state (row s = state s). Rows whose visit
# count is 0 were never updated and still hold their initial value of 0.
first_visit_estimate

array([[202.92314911],
       [158.16765374],
       [111.98460215],
       [ 63.6900119 ],
       [  0.        ],
       [  0.        ]])

In [14]:
# Number of episodes in which each state was visited at least once (row s = state s).
first_visit_count

array([[100.],
       [100.],
       [100.],
       [100.],
       [  0.],
       [  0.]])