In [1]:
import numpy as np

# Grid dimensions of the Frozen Lake environment
environment_rows = 4
environment_columns = 4

# Move names; an action index used elsewhere is a position in this list
actions = ['up', 'right', 'down', 'left']

# Q-table indexed as [row, column, action]; every entry starts at zero
q_values = np.zeros((environment_rows, environment_columns, len(actions)))

# Reward received on entering each cell. Any cell with a non-zero reward
# ends the episode (see is_terminal_state).
rewards = np.array([
    [0, 0, 0, 0],
    [0, -1, 0, -1],
    [0, 0, 0, -1],
    [-1, 0, 0, 1],
])

# Define a function to check if a state is terminal
def is_terminal_state(row_index, column_index):
    """Tell whether the cell at (row_index, column_index) ends an episode.

    A cell is terminal when its entry in the module-level ``rewards`` grid
    is anything other than zero.
    """
    cell_reward = rewards[row_index, column_index]
    return cell_reward != 0

# Define a function to choose a starting location
def get_starting_location():
    """Pick a random cell of the grid and return it as (row, column).

    The row is drawn first, then the column, each uniformly over its
    range. No check is made here for terminal cells, so the returned
    location may itself be terminal.
    """
    start_row = np.random.randint(environment_rows)
    start_column = np.random.randint(environment_columns)
    return start_row, start_column

# Define an epsilon-greedy action selection function
def get_next_action(current_row_index, current_column_index, epsilon):
    """Choose an action index for the given cell.

    NOTE: in this file ``epsilon`` is the probability of taking the
    *greedy* action (the index of the largest Q-value for the cell), which
    is the reverse of the usual epsilon-greedy convention. Otherwise a
    random action index in 0..3 is returned. Passing ``epsilon=1.`` always
    yields the greedy action.
    """
    take_greedy = np.random.random() < epsilon
    if not take_greedy:
        # Explore: any of the four actions
        return np.random.randint(4)
    # Exploit: best-known action for this cell
    return np.argmax(q_values[current_row_index, current_column_index])

# Define a function to get the next location based on the action taken
def get_next_location(current_row_index, current_column_index, action_index):
    """Return the (row, column) reached by taking an action from a cell.

    ``action_index`` is looked up in the module-level ``actions`` list.
    A move that would leave the grid is clamped, so the agent stays on the
    edge cell instead.
    """
    move = actions[action_index]
    next_row, next_column = current_row_index, current_column_index
    if move == 'up':
        next_row = max(0, current_row_index - 1)
    elif move == 'right':
        next_column = min(environment_columns - 1, current_column_index + 1)
    elif move == 'down':
        next_row = min(environment_rows - 1, current_row_index + 1)
    elif move == 'left':
        next_column = max(0, current_column_index - 1)
    else:
        # Only reached if `actions` holds a name other than the four above
        return None
    return next_row, next_column

# Define a function to find the path from a given starting location
def get_path(start_row_index, start_column_index):
    """Follow the greedy policy from a starting cell and return the route.

    Returns a list of ``[row, column]`` pairs beginning with the start
    cell. If the start cell is terminal the list is empty. Otherwise the
    walk normally ends on the first terminal cell it enters.

    The walk is deterministic: ``get_next_action`` is called with
    ``epsilon=1.``, which always selects the highest-valued action. If the
    Q-table therefore leads back to a cell that was already visited, the
    walk could never reach a terminal cell. In that case the walk stops,
    and the returned path ends with the repeated cell so the loop is
    visible to the caller.
    """
    if is_terminal_state(start_row_index, start_column_index):
        return []

    current_row_index, current_column_index = start_row_index, start_column_index
    path = [[current_row_index, current_column_index]]
    visited = {(current_row_index, current_column_index)}

    while not is_terminal_state(current_row_index, current_column_index):
        # epsilon = 1. -> always the best-known action for this cell
        action_index = get_next_action(current_row_index, current_column_index, 1.)
        current_row_index, current_column_index = get_next_location(
            current_row_index, current_column_index, action_index)
        path.append([current_row_index, current_column_index])

        location = (current_row_index, current_column_index)
        if location in visited:
            # Same cell + same policy would repeat forever; stop here
            break
        visited.add(location)

    return path

# Hyperparameters
epsilon = 0.9          # probability of taking the greedy action (see get_next_action)
discount_factor = 0.9  # weight applied to the best Q-value of the next cell
learning_rate = 0.9    # fraction of the temporal difference applied per update
num_episodes = 1000

# Q-learning: each episode starts in a random cell and moves until a
# terminal cell is entered, updating the Q-table after every step.
for episode in range(num_episodes):
    state = get_starting_location()
    while not is_terminal_state(state[0], state[1]):
        action_index = get_next_action(state[0], state[1], epsilon)
        next_state = get_next_location(state[0], state[1], action_index)

        # Reward for entering the new cell and best value reachable from it
        reward = rewards[next_state[0], next_state[1]]
        best_next_q = np.max(q_values[next_state[0], next_state[1]])

        # Move the old estimate toward reward + discounted future value
        old_q_value = q_values[state[0], state[1], action_index]
        temporal_difference = reward + (discount_factor * best_next_q) - old_q_value
        q_values[state[0], state[1], action_index] = (
            old_q_value + (learning_rate * temporal_difference))

        state = next_state

# Show the reward grid, one row per line
print("Frozen Lake")
for reward_row in rewards:
    print(reward_row)
print()

# Show the learned Q-table and the greedy route from the top-left cell
print('Training complete!')
print('Q-values:')
print(q_values)
path_from_start = get_path(0, 0)
print('Final path from Start state [0, 0]:', path_from_start)

Frozen Lake
[0 0 0 0]
[ 0 -1  0 -1]
[ 0  0  0 -1]
[-1  0  0  1]

Training complete!
Q-values:
[[[ 0.53138786  0.47351393  0.59049     0.53143569]
  [ 0.4778186   0.42616254 -1.          0.531441  ]
  [ 0.43003674  0.34832951  0.38703307  0.4782969 ]
  [ 0.38354628  0.         -1.          0.43046721]]

 [[ 0.53129698 -1.          0.6561      0.5845851 ]
  [ 0.          0.          0.          0.        ]
  [ 0.43046721 -1.          0.80839539 -1.        ]
  [ 0.          0.          0.          0.        ]]

 [[ 0.59043095  0.729      -0.9999      0.65609934]
  [-1.          0.65609934  0.81        0.65603439]
  [ 0.34867844 -0.99        0.9         0.729     ]
  [ 0.          0.          0.          0.        ]]

 [[ 0.          0.          0.          0.        ]
  [ 0.72899993  0.9         0.80999992 -1.        ]
  [ 0.80999846  1.          0.9         0.80999919]
  [ 0.          0.          0.          0.        ]]]
Final path from Start state [0, 0]: [[0, 0], [1, 0], [2, 0], [2, 1