In [1]:
import numpy as np

In [2]:
# R Matrix: immediate rewards, indexed as R[current_state, action].
#   -1  -> the action is not available from that state (filtered out by `>= 0`)
#    0  -> allowed move with no immediate reward
#  100  -> allowed move into state 5, the goal
# NOTE(review): the graph is not symmetric (e.g. 0 -> 4 is allowed but
# 4 -> 0 is not) -- confirm this is intended.
R = np.matrix([[-1,-1,-1,-1,0,-1],
              [-1,-1,-1,0,-1,100],
              [-1,-1,-1,0,-1,-1],
              [-1,0,0,-1,0,-1],
              [-1,0,0,-1,-1,100],
              [-1,0,-1,-1,0,100]])

In [3]:
# Q Matrix: learned action values, indexed as Q[state, action].
# Starts as all zeros (6 states x 6 actions) and is filled in by training.
Q = np.matrix(np.zeros((6, 6), dtype=float))

In [4]:
# Gamma: discount factor applied to the best value of the next state
# in the Q-learning update
gamma = 0.8

In [5]:
# Initial state (usually chosen at random)
initial_state = 1

In [6]:
# Return every action that can be taken from the given state
def available_action(state):
    """Return the indices of the actions allowed in ``state``.

    An action is allowed when its entry in row ``state`` of the reward
    matrix ``R`` is non-negative.

    Parameters
    ----------
    state : int
        Row index into ``R``.

    Returns
    -------
    numpy.ndarray
        1-D array of column indices whose reward is ``>= 0``.
    """
    reward_row = R[state, :]
    # R is an np.matrix, so the row is still 2-D: np.where yields
    # (row_indices, column_indices) and only the columns are of interest.
    _row_idx, action_idx = np.where(reward_row >= 0)
    return action_idx

In [7]:
# Actions that can be taken from the initial state
available_act = available_action(initial_state)

In [8]:
# Choose, uniformly at random, one of the actions that are currently available
def sample_next_action(available_act):
    """Pick one action at random from ``available_act``.

    Parameters
    ----------
    available_act : array-like of int
        1-D collection of candidate action indices; must not be empty.

    Returns
    -------
    int
        The selected action as a plain Python ``int``.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when ``available_act`` is empty.
    """
    # Leaving out `size` makes np.random.choice return a scalar rather than
    # a 1-element array; calling int() on such an array is deprecated since
    # NumPy 1.25.
    return int(np.random.choice(available_act))

In [9]:
# Sample the next action at random from those available in the initial state
action = sample_next_action(available_act)

In [10]:
# This function update the Q Matrix according to the path selected and the Q-Learning algorithm
def update(current_state, action, gamma):
    max_index = np.where(Q[action,] == np.max(Q[action,]))[1]
    
    if max_index.shape[0]>1:
        max_index =int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]
    
    
    # Q learning formula
    
    Q[current_state, action] = R[current_state, action] + gamma*max_value
    
    

In [11]:
# Update the Q Matrix once for the sampled (initial_state, action) pair
update(initial_state, action, gamma)

In [12]:
# Training

# Repeat the sample-and-update step above 10000 times, each time starting
# from a state drawn uniformly at random.
num_states = Q.shape[0]
for i in range(10000):
    current_state = np.random.randint(0, num_states)
    available_act = available_action(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

# Normalize the Q Matrix so that its largest entry becomes 100
print("Trained Q Matrix:")
normalized_Q = Q / np.max(Q) * 100
print(normalized_Q)
#______________________________

Trained Q Matrix:
[[  0.    0.    0.    0.   80.    0. ]
 [  0.    0.    0.   64.    0.  100. ]
 [  0.    0.    0.   64.    0.    0. ]
 [  0.   80.   51.2   0.   80.    0. ]
 [  0.   80.   51.2   0.    0.  100. ]
 [  0.   80.    0.    0.   80.  100. ]]


In [None]:
# Testing

# Goal state = 5 (the column of R that holds the 100 rewards)

# Follow the highest-valued action from the start state until the goal is
# reached. With the trained Q Matrix printed above, starting from 1 gives
# [1, 5]; starting from 2 gives 2, 3, 1, 5 or 2, 3, 4, 5 (states 1 and 4
# tie in row 3 and the tie is broken at random).

current_state = 1
steps = [current_state]

while current_state != 5:

    # Column indices of the best-valued action(s) for the current state
    next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]

    # Break ties at random. Leaving out `size` returns a scalar, which avoids
    # the deprecated int() conversion of a 1-element array.
    next_step_index = int(np.random.choice(next_step_index))

    steps.append(next_step_index)

    # Move to the chosen state; without this the loop would never terminate
    current_state = next_step_index

# Print Selected sequence of steps

print("Selected path: ")
print(steps)
    