In [None]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Reward matrix: R[s, a] is the immediate reward for taking action a from state s.
# Negative entries (-1) are filtered out as unavailable by `available_actions`,
# which keeps only entries >= 0.
# NOTE(review): row 4 is not symmetric with the other rows (0->4 and 3->4 are
# allowed, but 4->0 and 4->3 are not; 4->1 and 4->2 are allowed, but 1->4 and
# 2->4 are not) -- confirm this is intended.
reward_rows = [
    [-1, -1, -1, -1,  0,  -1],
    [-1, -1, -1,  0, -1, 100],
    [-1, -1, -1,  0, -1,  -1],
    [-1,  0,  0, -1,  0,  -1],
    [-1,  0,  0, -1, -1, 100],
    [-1,  0, -1, -1,  0, 100],
]
R = np.array(reward_rows)

print(f'Reward Matrix \n \n {R}')

In [None]:
# Q matrix: same 6x6 layout as R, starting from all zeros (float)
Q = np.zeros((6, 6))
print(f'Q Matrix \n \n {Q}')

In [None]:
# Discount factor (gamma) applied to the best future value in the Q update
gamma = 0.8

# State used for the single-step walkthrough below (fixed value, not sampled)
initial_state = 1

In [None]:
# Column indices of row 1 of R whose reward is non-negative
np.flatnonzero(R[1] >= 0)

In [None]:
def available_actions(state):
    """Return the actions that can be taken from ``state``.

    An action is available when its entry in row ``state`` of the global
    reward matrix ``R`` is non-negative.

    Parameters
    ----------
    state
        Row index into ``R``.

    Returns
    -------
    numpy.ndarray
        Column indices of the non-negative entries of ``R[state]``.
    """
    return np.where(R[state] >= 0)[0]

# Actions that can be taken from the initial state
available_act = available_actions(state=initial_state)
available_act

In [None]:
def next_action(available_action_range):
    """Pick one action uniformly at random from ``available_action_range``.

    Parameters
    ----------
    available_action_range : array-like of int
        Candidate actions, e.g. the result of ``available_actions(state)``.

    Returns
    -------
    int
        The sampled action.

    Raises
    ------
    ValueError
        If ``available_action_range`` is empty (raised by ``np.random.choice``).
    """
    # Sample from the argument rather than a notebook-level global, so the
    # result depends only on what the caller passed in. Calling `choice`
    # without `size` yields a scalar, which converts to int cleanly.
    return int(np.random.choice(available_action_range))

# Sample the action to take from the ones currently available
action = next_action(available_action_range=available_act)
action

In [None]:
def update(current_state, action, gamma):
    """Apply one Q-learning update to the global ``Q`` matrix.

    Taking ``action`` leads to the state with the same index, so the best
    future value is the largest entry in row ``action`` of ``Q``:

        Q[s, a] = R[s, a] + gamma * max(Q[a, :])

    Parameters
    ----------
    current_state
        Row index of ``Q`` / ``R`` being updated.
    action
        Column index being updated; also used as the row of the next state.
    gamma
        Discount factor multiplied with the best future value.

    Returns
    -------
    None
        ``Q`` is modified in place.
    """
    # Every index tied for the maximum holds the same value, so the maximum is
    # read directly. This avoids a random tie-break that cannot affect the
    # result, and the 1-element-array-to-int conversion NumPy has deprecated.
    max_value = Q[action].max()

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value
    
# Apply a single update for the initial state and the sampled action
update(current_state=initial_state, action=action, gamma=gamma)

In [None]:
# Train Q with 10000 updates, each from a uniformly random state and a random
# action available in that state
n_states = Q.shape[0]

for i in range(10000):
    current_state = np.random.randint(n_states)
    available_act = available_actions(current_state)
    action = next_action(available_act)
    update(current_state, action, gamma)

# Rescale so the largest entry of Q becomes 100, then show the result
normalized_Q = Q / Q.max() * 100
print(f'Trained Q-Matrix \n \n {normalized_Q}')

In [None]:
# Greedy walk: start in state 2 and repeatedly move to the highest-valued
# action in Q until state 5 is reached.
current_state = 2
steps = [current_state]

# Upper bound on the walk length, so that an untrained Q (or one whose greedy
# choices form a cycle) raises instead of looping forever.
max_path_len = Q.size

while current_state != 5:
    if len(steps) > max_path_len:
        raise RuntimeError(
            f'No path to state 5 found within {max_path_len} steps; '
            'has the Q matrix been trained?'
        )

    row = Q[current_state]
    best_next = np.flatnonzero(row == row.max())

    if best_next.size > 1:
        # Several actions share the maximum: break the tie at random
        next_step_index = int(np.random.choice(best_next))
    else:
        # Index the single element explicitly; int() on a 1-element array is
        # deprecated in NumPy
        next_step_index = int(best_next[0])

    steps.append(next_step_index)
    current_state = next_step_index

In [None]:
# Report the sequence of states visited by the greedy walk
path_message = f'Selected Path {steps}'
print(path_message)

In [None]:
import gymnasium as gym

# With render_mode="human" the environment draws itself on every reset/step,
# so no explicit env.render() call is needed.
env = gym.make("CartPole-v1", render_mode="human")

# Take 200 random actions, starting a new episode whenever the current one
# ends: stepping a finished episode without reset() is not valid.
env.reset()
for _ in range(200):
    _obs, _reward, terminated, truncated, _info = env.step(env.action_space.sample())  # take a random action
    if terminated or truncated:
        env.reset()

In [None]:
# Action space of the environment (bare expression, so the notebook shows its repr)
env.action_space

In [None]:
# State (observation) space of the environment (bare expression, so the notebook shows its repr)
env.observation_space

In [None]:
# Show the upper and lower bounds of the observation space
obs_space = env.observation_space
print(f'High: {obs_space.high}')
print(f'Low: {obs_space.low}')

In [None]:
# Play one episode with random actions for up to 200 steps. The attempt counts
# as failed when the episode ends before step 200.
n_steps = 200

env.reset()
for e in range(1, n_steps + 1):                  # steps 1..200 inclusive
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)  # stepping forward one step
    print(f'step={e:2d} | state={observation} | action={action} | reward={reward}')
    if terminated or truncated:
        if e < n_steps:                          # failure if less than 200 steps
            print('*** FAILED ***')
        # Never step an episode that has already ended
        break