In [115]:
#import libraries
import numpy as np

In [116]:
# Environment size: a square grid, 7 rows by 7 columns.
env_rows = env_cols = 7

# Q-table indexed as [row, column, action]; the last axis has 4 slots
# (one per action) and every entry starts at 1.
Q_values = np.ones(shape=(env_rows, env_cols, 4))

In [117]:
#define actions
# The position of each name in this list is the integer action index that
# next_location() looks up: 0='left', 1='down', 2='right', 3='up'.
actions = ['left', 'down', 'right', 'up']

In [118]:
#define tuples for tile types
# Each group is a tuple of [row, col] pairs, so membership can be tested with
# `[row, col] in Group`.
SuperSlippery = ([2,0],[5,5])
NonSlippery = ([6,0],[1,2],[1,3],[1,4],[4,2],[5,4])
# The trailing comma is required: without it the parentheses are only grouping
# and Goal would be the bare list [3, 3] rather than a 1-tuple of coordinates,
# which makes `[3, 3] in Goal` evaluate to False.
Goal = ([3,3],)
Pitfalls = ([0,3],[1,1],[1,5],[3,2],[3,4],[3,6],[4,3],[5,1],[6,5])

In [119]:
# Reward grid: every tile costs -1 by default.
rewards = np.full(shape=(env_rows, env_cols), fill_value=-1.0)
# The goal tile at (3, 3) pays +100.
rewards[3, 3] = 100.0
# Pitfalls pay -100. The two lists are parallel row / column indices, i.e. the
# cells (0,3) (1,1) (1,5) (3,2) (3,4) (3,6) (4,3) (5,1) (6,5).
rewards[[0, 1, 1, 3, 3, 3, 4, 5, 6],
        [3, 1, 5, 2, 4, 6, 3, 1, 5]] = -100.0
print(rewards)

[[  -1.   -1.   -1. -100.   -1.   -1.   -1.]
 [  -1. -100.   -1.   -1.   -1. -100.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.]
 [  -1.   -1. -100.  100. -100.   -1. -100.]
 [  -1.   -1.   -1. -100.   -1.   -1.   -1.]
 [  -1. -100.   -1.   -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1. -100.   -1.]]


In [120]:
def tile_type(current_row,current_col):
    """Classify the tile at (current_row, current_col).

    Returns one of 'super_slippery_tile', 'non_slippery_tile', 'pitfall',
    'goal' or 'std_slippery_tile'. The coordinate groups are checked in that
    order; any cell that matches none of them is a standard slippery tile.
    """
    position = [current_row, current_col]
    # (coordinate group, label) pairs, in lookup order.
    labelled_groups = (
        (SuperSlippery, 'super_slippery_tile'),
        (NonSlippery, "non_slippery_tile"),
        (Pitfalls, 'pitfall'),
    )
    for members, label in labelled_groups:
        if position in members:
            return label
    if position == [3, 3]:
        return 'goal'
    return 'std_slippery_tile'

In [121]:
def valid_starting(current_row,current_col):
    """Return True only when the given cell is a standard slippery tile.

    Every other tile kind reported by tile_type() is rejected as a starting
    cell.
    """
    return tile_type(current_row, current_col) == 'std_slippery_tile'

In [122]:
def start_position():
    """Draw random grid cells until one passes valid_starting().

    Returns:
        (row, col) of the accepted cell.
    """
    while True:
        # Row is always drawn before column, one pair per attempt.
        candidate_row = np.random.randint(env_rows)
        candidate_col = np.random.randint(env_cols)
        if valid_starting(candidate_row, candidate_col):
            return candidate_row, candidate_col

In [123]:
def intended_action(current_row, current_col, eps):
    """Pick the action index the agent wants to take in the given cell.

    A uniform draw from [0, 1) is compared against `eps`: when the draw is
    below `eps` the action with the largest Q-value for the cell is returned,
    otherwise a uniformly random action index in 0..3 is returned. Note that
    `eps` is therefore the probability of acting greedily.
    """
    act_greedily = np.random.random() < eps
    if not act_greedily:
        return np.random.randint(4)
    return np.argmax(Q_values[current_row, current_col])

In [124]:
def next_action(tile_type,intended_action):
    """Return the action actually executed, given the tile's slipperiness.

    Args:
        tile_type: 'std_slippery_tile', 'non_slippery_tile' or
            'super_slippery_tile'.
        intended_action: action index (0..3) the agent wants to take.

    Returns:
        The executed action index as a NumPy integer scalar, so callers can
        keep calling `.item()` on the result.
        - non-slippery: always the intended action.
        - standard slippery: intended with probability 0.8, otherwise one of
          the two neighbouring action indices (0.1 each).
        - super slippery: intended with probability 0.5, otherwise one of the
          two neighbouring action indices (0.25 each).
        With the action order left/down/right/up the neighbouring indices are
        the two directions perpendicular to the intended one.

    Raises:
        ValueError: for any other tile type (e.g. 'pitfall' or 'goal'), where
            no move is defined; previously this silently returned None.
    """
    if tile_type == 'non_slippery_tile':
        return np.int64(intended_action)

    if tile_type == 'std_slippery_tile':
        slip_probabilities = [0.8, 0.1, 0.1]
    elif tile_type == 'super_slippery_tile':
        slip_probabilities = [0.5, 0.25, 0.25]
    else:
        raise ValueError("no action is defined for tile type %r" % (tile_type,))

    # `p` must be passed by keyword: the third positional parameter of
    # np.random.choice is `replace`, so a positional list is silently ignored
    # as a probability vector and the draw becomes uniform.
    offset = np.random.choice([0, -1, 1], p=slip_probabilities)
    # Wrap around the 4 action indices so that 'left' (0) and 'up' (3) are
    # neighbours as well.
    return np.int64((int(intended_action) + int(offset)) % 4)

In [125]:
def next_location(current_row, current_col, action_index):
    """Return the cell reached by applying one action from the given cell.

    The action name is looked up in `actions`. A move that would leave the
    grid (column below 0 or above env_cols - 1, row below 0 or above
    env_rows - 1) is ignored and the current cell is returned unchanged.

    Returns:
        (new_row, new_col)
    """
    move = actions[action_index]
    if move == 'left':
        if current_col > 0:
            return current_row, current_col - 1
    elif move == 'down':
        if current_row < env_rows - 1:
            return current_row + 1, current_col
    elif move == 'right':
        if current_col < env_cols - 1:
            return current_row, current_col + 1
    elif move == 'up':
        if current_row > 0:
            return current_row - 1, current_col
    # Blocked by a wall (or unknown action name): stay in place.
    return current_row, current_col

In [126]:
def terminal_state(current_row,current_col):
    """Return True when the cell's reward differs from -1.

    Any cell whose entry in `rewards` is not the default step cost of -1 ends
    the episode.
    """
    # bool(...) keeps the return a plain Python bool rather than a NumPy bool.
    return bool(rewards[current_row, current_col] != -1)

In [127]:
eps = 0.9 #the percentage of time when we should take the best action (instead of a random action)
gamma = 0.9 #discount factor for future rewards
alpha = 0.9 #the rate at which the AI agent should learn
num_episodes = 15500 #number of training episodes


for episode in range(num_episodes):
    row, col = start_position() #get the starting location for this episode
    #continue taking actions until we reach a terminal state
    while not terminal_state(row, col):
        # The tile under the agent decides how slippery this step is, so it
        # has to be looked up again on every step (not once per episode).
        current_tile = tile_type(row, col)
        #choose action to take
        chosen_action = intended_action(row, col, eps)
        # np.asarray(...).item() accepts a length-1 array, a NumPy scalar or a
        # plain int, whichever next_action hands back.
        action = np.asarray(next_action(current_tile, chosen_action)).item()
        #perform the chosen action, and transition to the next state
        old_row, old_col = row, col #store the old row and column indexes
        row, col = next_location(row, col, action)

        reward = rewards[row, col]
        # NOTE(review): the entry updated is the action that was actually
        # executed (after slipping), not the one the agent chose; confirm
        # this is the intended learning target.
        old_Q_value = Q_values[old_row, old_col, action]
        # A terminal cell has no future: its Q-values are never updated and
        # still hold their initial value, so they must not be bootstrapped.
        if terminal_state(row, col):
            future_value = 0.0
        else:
            future_value = np.max(Q_values[row, col])
        T_D = reward + (gamma * future_value) - old_Q_value

        new_Q_value = old_Q_value + (alpha * T_D)
        Q_values[old_row, old_col, action] = new_Q_value

In [128]:
def shortest_path(row,col, max_steps=1000):
    """Walk from (row, col) following the greedy action of the Q-table.

    At every step the greedy action for the *current* cell is taken (epsilon
    of 1 makes intended_action always return the arg-max) and passed through
    next_action(), so the walk is still subject to the current tile's
    slipperiness.

    Args:
        row, col: starting cell.
        max_steps: upper bound on the number of moves, so a policy that
            cycles cannot loop forever.

    Returns:
        A list of [row, col] cells starting with the start cell. The walk
        ends when the goal or a pitfall is reached (that cell is included) or
        after `max_steps` moves. An empty list is returned when the start
        cell is not a valid starting cell.
    """
    if not valid_starting(row, col):
        return []

    current_row, current_col = row, col
    path = [[current_row, current_col]]
    for _ in range(max_steps):
        current_tile = tile_type(current_row, current_col)
        # Goal and pitfall tiles end the walk; next_action has no move for them.
        if current_tile in ('goal', 'pitfall'):
            break
        greedy_action = intended_action(current_row, current_col, 1)
        # np.asarray(...).item() accepts a length-1 array, a NumPy scalar or a
        # plain int, whichever next_action hands back.
        action = np.asarray(next_action(current_tile, greedy_action)).item()
        current_row, current_col = next_location(current_row, current_col, action)
        path.append([current_row, current_col])
    return path

In [None]:
# Show the path found when starting from row 0, column 0.
shortest_path(0,0)