In [None]:
import numpy as np

# Grid world: a 3x3 board with -1 stored in every cell
env = np.full((3, 3), -1)

# Reward collected on entering each cell: -1 everywhere, 10 in the
# bottom-right corner
rewards = np.full((3, 3), -1)
rewards[2, 2] = 10

# Q-table: one value per (row, column, action), initialised to zero
q_table = np.zeros(env.shape + (4,))

# Hyperparameters
alpha, gamma = 0.1, 0.99                          # learning rate, discount factor
epsilon, epsilon_decay, epsilon_min = 1.0, 0.99, 0.01  # exploration schedule
episodes = 100_000                                # number of training episodes

# Action index -> human-readable name
actions = dict(enumerate(['Up', 'Down', 'Left', 'Right']))

# Train the Q-table with tabular Q-learning under an epsilon-greedy policy.
#
# Reads:   env, rewards, actions, alpha, gamma, epsilon_decay, epsilon_min,
#          episodes
# Updates: q_table (in place) and epsilon (decayed once per episode)
#
# Grid limits and the number of actions are derived from the array shapes
# instead of being hard-coded, so they stay in sync with how q_table is sized.
max_row = env.shape[0] - 1
max_col = env.shape[1] - 1
# Terminal cell: the bottom-right corner ([2, 2] on the 3x3 grid).
goal = [max_row, max_col]
n_actions = q_table.shape[-1]

# (row, column) displacement for each named action
moves = {
    'Up': (-1, 0),
    'Down': (1, 0),
    'Left': (0, -1),
    'Right': (0, 1),
}

for episode in range(episodes):
    # Every episode starts in the top-left cell
    state = [0, 0]
    done = False
    while not done:
        # Epsilon-greedy choice: explore with probability epsilon,
        # otherwise take the action with the highest current Q-value
        if np.random.random() > epsilon:
            action = np.argmax(q_table[state[0], state[1]])
        else:
            action = np.random.randint(0, n_actions)

        # Apply the move; a step that would leave the grid is clamped,
        # so the agent stays on the border cell
        d_row, d_col = moves[actions[action]]
        next_state = [
            min(max(state[0] + d_row, 0), max_row),
            min(max(state[1] + d_col, 0), max_col),
        ]
        reward = rewards[next_state[0], next_state[1]]

        # Q-learning update: blend the old estimate with the bootstrapped
        # target (reward plus discounted best value of the next state)
        old_value = q_table[state[0], state[1], action]
        best_next = np.max(q_table[next_state[0], next_state[1]])
        td_target = reward + gamma * best_next
        q_table[state[0], state[1], action] = (1 - alpha) * old_value + alpha * td_target

        # Move on; the episode ends once the terminal cell is reached
        state = next_state
        done = state == goal

    # Decay exploration, but never below the configured floor. A plain
    # "if epsilon > epsilon_min: epsilon *= epsilon_decay" overshoots and
    # leaves epsilon slightly under epsilon_min.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Print the learned Q-Table
print(q_table)


[[[ 5.6655611   6.73289     5.6655611   6.73289   ]
  [ 6.73289     7.811       5.6655611   7.811     ]
  [ 1.8932734   8.9         0.85844384  1.13640991]]

 [[ 1.67282844  7.811       1.83144887  3.68324788]
  [ 6.73289     8.9         6.73289     8.9       ]
  [ 1.96891838 10.          2.66852709  5.2705602 ]]

 [[ 1.83126381  5.51732074  5.29369585  8.9       ]
  [ 7.811       8.9         7.811      10.        ]
  [ 0.          0.          0.          0.        ]]]
