# Example Code For Q-Learning
## (Book: Mastering Machine Learning with Python in Six Steps)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import random

In [6]:
# Reward matrix, which also encodes the connection graph between the six
# states. Entry [s, a] is -1 where no link exists from s to a, 0 for an
# ordinary link, and 100 for a link that leads into state 5.
_reward_rows = [
    [-1, -1, -1, -1,  0,  -1],   # from state 0
    [-1, -1, -1,  0, -1, 100],   # from state 1
    [-1, -1, -1,  0, -1,  -1],   # from state 2
    [-1,  0,  0, -1,  0,  -1],   # from state 3
    [ 0, -1, -1,  0, -1, 100],   # from state 4
    [-1,  0, -1, -1,  0, 100],   # from state 5
]
R = np.matrix(_reward_rows, dtype="float32")

# Q starts as all zeros with the same shape, dtype and matrix type as R
Q = np.zeros_like(R)

In [16]:
# Learning parameter: discount factor that scales the best Q value of the
# next state in the Q-learning update
gamma = 0.8

# Seed both random number generators used in this notebook (the stdlib
# `random` module and NumPy's global generator) so that a
# Restart Kernel -> Run All reproduces the same run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Initialize random state; randint is inclusive on both ends, so this draws
# one of the states 0..4
initial_state = random.randint(0, 4)

def available_actions(state):
    """Return the column indices of every action allowed in `state`.

    An action is allowed when its entry in row `state` of the global reward
    matrix `R` is non-negative.
    """
    # R[state, :] is a 1 x N row, so np.where yields (row_idx, col_idx);
    # only the column indices identify the actions.
    allowed_mask = R[state, :] >= 0
    return np.where(allowed_mask)[1]

def sample_next_action(available_actions_range):
    """Pick one action uniformly at random from the given candidates.

    Args:
        available_actions_range: 1-D sequence or array of candidate action
            indices to choose from.

    Returns:
        int: the selected action index.
    """
    # Sample from the argument itself, so the result depends only on what the
    # caller passes in and not on any notebook-level variable.
    # Omitting `size` makes np.random.choice return a scalar, which converts
    # to int cleanly.
    return int(np.random.choice(available_actions_range))

In [17]:
def update(current_state, action, gamma):
    """Apply one Q-learning update to the global `Q` for (current_state, action).

    The row of `Q` indexed by `action` is used as the successor state's row:
    the new value is the immediate reward from `R` plus the discounted best
    value found in that row.

    Args:
        current_state: row index of the state the action is taken from.
        action: column index of the action taken; also the row index of the
            successor state.
        gamma: discount factor applied to the successor's best Q value.

    Returns:
        None. `Q[current_state, action]` is modified in place.
    """
    # Best value in the successor row. Ties need no special handling: every
    # tied entry holds this same maximum, so the value is all that matters.
    max_value = np.max(Q[action,])

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

In [18]:
# Try the helpers once: list the actions available from the randomly drawn
# initial state, then pick one of them at random.
available_act = available_actions(initial_state)
action = sample_next_action(available_act)

In [19]:
# Train for 100 iterations. Each iteration draws a random state, samples one
# of the actions available from it, and applies the Q-learning update to that
# (state, action) pair.
state_count = int(Q.shape[0])
for _iteration in range(100):
    current_state = np.random.randint(0, state_count)
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

In [20]:
# Rescale the "trained" Q matrix so that its largest entry becomes 100
q_scaled = Q / np.max(Q) * 100
print("Trained Q matrix: \n", q_scaled)

Trained Q matrix: 
 [[  0.           0.           0.           0.          80.00000119
    0.        ]
 [  0.           0.           0.          46.55193686   0.
   97.62266278]
 [  0.           0.           0.          63.32377791   0.
    0.        ]
 [  0.          78.09813023  50.65902472   0.          79.15472388
    0.        ]
 [ 63.99999857   0.           0.          62.47850657   0.
  100.        ]
 [  0.          78.09813023   0.           0.          80.00000119
   98.94340634]]


In [21]:
# Walk greedily through the trained Q matrix from the start state until the
# goal state is reached, recording every state visited.
start_state = 2
goal_state = 5
# Upper bound on the walk length: if Q does not lead to the goal, the loop
# stops with an error instead of spinning forever.
max_steps = 100

current_state = start_state
steps = [current_state]

while current_state != goal_state:
    if len(steps) > max_steps:
        raise RuntimeError(
            f"No path to state {goal_state} found within {max_steps} steps; "
            "the Q matrix is probably under-trained."
        )
    # Columns holding the largest Q value in the current state's row
    best_columns = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]
    # Break ties at random. Without `size`, np.random.choice returns a scalar,
    # so int() never has to convert a 1-element array.
    next_step_index = int(np.random.choice(best_columns))
    steps.append(next_step_index)
    current_state = next_step_index

In [22]:
# Show the sequence of states visited by the greedy walk
best_path_message = f"Best sequence path: {steps}"
print(best_path_message)

Best sequence path: [2, 3, 4, 5]
