In [65]:
import numpy as np
import random 

def path_cost(path):
    def euclidean_dist(x,y):
        return ((graph[x][0]-graph[y][0])**2+ (graph[x][1]-graph[y][1])**2)**0.5
    cost = 0 
    for i in range(1, len(path)): 
        cost+=euclidean_dist(path[i], path[i-1])
    cost+=euclidean_dist(path[-1], path[0])
    return cost 

def dist(x,y):  
    return ((graph[x][0]-graph[y][0])**2+ (graph[x][1]-graph[y][1])**2)**0.5
 
#define an epsilon greedy algorithm that will choose which action to take next (i.e., where to move next)
def get_next_location(current_index, epsilon):
    #if a randomly chosen value between 0 and 1 is less than epsilon,
    #then choose the most promising value from the Q-table for this state.
    
    candidate_next_locations = [x for x in range(len(q_values[current_index])) if visited[x] == False and x!=current_index] 
    
#     print('current_index: ', current_index, ' candidate_next_locations: ', candidate_next_locations)
    
#     print('visited: ', visited)
    
#     print('q_values[current_index][candidate_next_locations]: ', q_values[current_index][candidate_next_locations])
    
    if np.random.random() < epsilon:
        return candidate_next_locations[np.argmax(q_values[current_index][candidate_next_locations])]  
    else: #choose a random action
        return random.choice(candidate_next_locations)  
 
if __name__ == '__main__': 
    
    #define training parameters
    epsilon = 0.9 #the percentage of time when we should take the best action (instead of a random action)
    discount_factor = 0.9 #discount factor for future rewards
    learning_rate = 0.1 #the rate at which the AI agent should learn
     
    #graph  = [(20833.3333, 17100.0000),(20900.0000, 17066.6667),(21300.0000, 13016.6667)]
    #graph = [(1,2), (2,3), (4,5), (14,1), (8,6), (1,3)]
     
    graph = [(1,2), (2,3), (4,5), (14,1), (8,6), (1,3)]
    
    graph = [(1,2), (2,3), (4,5), (14,1), (8,6)] 
    
    q_values = np.full((len(graph), len(graph)), 0)

    actions = [i for i in range(len(graph))]

    visited = {i:False for i in range(len(graph))}

    depot = 0 
    
    #run through training episodes
    for episode in range(1000):
        #get the starting location for this episode
        current_index = 0 
        visited = [False for i in range(len(graph))]
        num_unique_visited = 0 
        #continue taking actions (i.e., moving) until we reach a terminal state
        #(i.e., until we reach the item packaging area or crash into an item storage location)
        while num_unique_visited < len(graph):
            #choose which location to go to (i.e., where to move next)
            
            old_index = current_index
            current_index = get_next_location(current_index, epsilon)

            reward = -dist(old_index, current_index) #aim is to maximize reward 
 
            if visited[current_index] == False: 
                num_unique_visited+=1 
                visited[current_index] = True 
                    
            #calculate the temporal difference 
            old_q_value = q_values[old_index, current_index]
            
            #print('x: ', x)
            #print('visited: ', visited)
            #print('num_unique_visited: ', num_unique_visited, ' len(graph): ', len(graph))
            #print([q_values[current_index][x] for x in range(len(q_values[current_index])) if visited[x] == False and x!=current_index])
            
            if num_unique_visited == len(graph): 
                q_values[old_index, current_index]+=learning_rate*(discount_factor*q_values[current_index][depot] - old_q_value)
                break 
            
            temporal_difference = reward + (discount_factor * np.max([q_values[current_index][x] for x in range(len(q_values[current_index])) if visited[x] == False and x!=current_index])  - old_q_value)
            
            #update the Q-value for the previous state and action pair
            new_q_value = old_q_value + (learning_rate * temporal_difference)
            q_values[old_index, current_index] = new_q_value
            print('q_values: ', q_values)
            
    print('q_values: ', q_values)
    
    optimal_path = [depot]
    
    visited = [False for i in range(len(graph))]
    visited[depot] = True
    
    for i in range(len(graph)-1): 
        candidate_next_locations = [x for x in range(len(q_values[optimal_path[-1]])) if x!=optimal_path[-1] and visited[x] == False] 
        next_loc = candidate_next_locations[np.argmax(q_values[optimal_path[-1]][candidate_next_locations])]
        visited[next_loc] = True 
        
        optimal_path.append(next_loc)
    
    print('optimal_path: ', optimal_path)
    
    print('cost: ', path_cost(optimal_path))
    
    

q_values:  [[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
q_values:  [[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
q_values:  [[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
q_values:  [[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]]
q_values:  [[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]]
q_values:  [[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]]
q_values:  [[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]]
q_values:  [[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]]
q_values:  [[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]]
q_values:  [[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0  0 -1  0]
 [ 0  0  0  0  0]
 [ 0  0  0  0  0]]
q_values:  [[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0 