# Pipeline Learning (PL) Allocation Algorithm

## The Algorithm Itself

In [1]:
# Reminder of what each state needs:
# state0_PL()
# state1_PL(available_processors, current_layer, current_path, paths)
# state2_PL(selected_processor, available_processors, current_layer, current_path, paths)
# state3_PL(train_times, available_processors, current_layer, current_path, paths)
# state4_PL(available_processors, current_layer, current_path, paths)
def PL_allocation_alg(event_layers, event_processors, event_links, event_split):
    """Run the Pipeline Learning allocation state machine for one event.

    The arguments are published as module-level globals because every
    ``stateN_PL`` function reads its inputs from there. The machine starts in
    state 0 and keeps calling the function that belongs to the current global
    ``state`` until a terminal state (5 or above) is reached.

    Args:
        event_layers: Stored in the global ``layers``.
        event_processors: Stored in the global ``processors``.
        event_links: Stored in the global ``links``.
        event_split: Stored in the globals ``min_split`` and ``needed_split``.

    Returns:
        A four-element list ``[status, route, total, runtime]``:

        * state 5 -> ``["Success", route, total, runtime]`` where ``route`` and
          ``total`` are elements 1 and 2 of the tuple returned by ``state4_PL``;
        * state 6 -> ``["Unfeasible Type A", None, None, runtime]``
          (set by ``state0_PL`` when no processor can hold the first layer);
        * state 7 -> ``["Unfeasible Type B", None, None, runtime]``
          (set by ``state1_PL`` when no start processor is left and no path
          was completed);
        * state 8 -> ``["Unfeasible Type C", None, None, runtime]``
          (set by ``state0_PL`` when ``min_split`` exceeds the layer count).

        ``runtime`` is the elapsed time of the state machine loop in seconds.
        ``None`` is returned for any other terminal state.

    Raises:
        RuntimeError: If the state functions leave a ``(state, previous_state)``
            combination that has no handler. Without this check the loop
            would spin forever.
    """
    # Set global variables so all states run correctly
    global layers, processors, links, state, previous_state, min_split, needed_split
    layers = event_layers
    processors = event_processors
    links = event_links
    state = 0
    previous_state = 0
    min_split = event_split
    needed_split = event_split

    # For each non-initial state: the predecessor states it may be entered from.
    expected_previous = {1: (0, 3), 2: (1, 3), 3: (2, 3), 4: (1, 3)}

    # Every state function writes its own number into previous_state before
    # handing over, so the most recent return value is always exactly the
    # argument tuple of the next state. One variable is enough.
    result = None

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments.
    start = time.perf_counter()

    while state < 5:
        if state == 0:
            result = state0_PL()
        elif previous_state not in expected_previous.get(state, ()):
            raise RuntimeError(
                "PL state machine reached state %r from unexpected state %r"
                % (state, previous_state)
            )
        elif state == 1:
            result = state1_PL(*result)
        elif state == 2:
            result = state2_PL(*result)
        elif state == 3:
            result = state3_PL(*result)
        else:
            result = state4_PL(*result)

    runtime = time.perf_counter() - start

    # Return a specific result depending on the final state of the algorithm
    if state == 5:
        return ["Success", result[1], result[2], runtime]

    unfeasible_labels = {
        6: "Unfeasible Type A",
        7: "Unfeasible Type B",
        8: "Unfeasible Type C",
    }
    if state in unfeasible_labels:
        return [unfeasible_labels[state], None, None, runtime]

    return None
    
    # Alternative ways to time the algorithm in a notebook: the %%timeit cell magic
    # (multiple runs, reports mean and standard deviation) or %%time (single run)

## Each individual state_PL() method

In [2]:
def state0_PL():
    """State 0: reset the bookkeeping and collect the candidate start processors.

    Reads the globals ``processors``, ``layers``, ``min_split``, ``max_paths``
    and ``total_batch_size``; writes ``state``, ``previous_state`` and
    ``needed_split``, and restores ``residual`` of every processor to its
    ``initial_residual``.

    Returns:
        ``(available_processors, current_layer, current_path, paths)`` with
        ``current_layer == 0`` and two empty lists when at least one processor
        can hold the first layer (next state: 1).
        ``None`` when ``min_split`` exceeds the number of layers (state 8) or
        when no processor qualifies (state 6).
    """
    global state, previous_state, needed_split

    # Fresh monitor values for a new run
    state = 0
    previous_state = 0

    # Start every processor from its full memory budget
    for proc in processors:
        proc.residual = proc.initial_residual

    # Start the split counter from the configured minimum
    needed_split = min_split

    # More splits requested than there are layers: nothing to allocate
    if min_split > len(layers):
        state = 8
        return

    first_layer = 0
    candidates = []

    # Keep the processors that can hold the first layer plus the batch,
    # capped at "max_paths" entries (heuristic limiting the explored paths)
    for proc in processors:
        fits = proc.residual >= (layers[first_layer].memory + total_batch_size)
        if fits and len(candidates) < max_paths:
            candidates.append(proc)

    if candidates:
        # This state is done; state 1 picks the starting processor
        previous_state = 0
        state = 1
        return (candidates, first_layer, [], [])

    # No processor can host the first layer: stop, leaving the residuals clean
    # so later algorithms using them are not affected
    state = 6
    for proc in processors:
        proc.residual = proc.initial_residual
    return

In [3]:
def state1_PL(available_processors, current_layer, current_path, paths):
    """State 1: pick the next starting processor, or finish when none is left.

    Reads the globals ``processors``, ``layers``, ``min_split`` and
    ``total_batch_size``; writes ``state``, ``previous_state`` and
    ``needed_split``.

    Args:
        available_processors: Start candidates still to be tried; the first
            entry is the one selected here.
        current_layer: Index of the layer placed on the selected processor.
        current_path: Path under construction (passed through unchanged).
        paths: Paths completed so far (passed through unchanged).

    Returns:
        * ``(available_processors, current_layer, current_path, paths)`` when
          no candidate is left but at least one path exists (next state: 4);
        * ``None`` when no candidate is left and no path exists (state 7);
        * ``(selected_processor, available_processors, current_layer,
          current_path, paths)`` otherwise (next state: 2).
    """
    global previous_state, state, needed_split

    if not available_processors:
        if paths:
            # Every start candidate has been tried and at least one path was
            # finished: go pick the best of the paths we already have
            previous_state = 1
            state = 4
            return (available_processors, current_layer, current_path, paths)

        # Every start candidate has been tried and none produced a path
        state = 7

        # Leave the residual memory clean so it does not affect later
        # algorithms that use this variable
        for processor in processors:
            processor.residual = processor.initial_residual
        return None

    # Normal case: start a fresh path attempt.
    # Reset the residual memory of ALL processors, not only the remaining
    # start candidates. A previous attempt may have consumed or zeroed memory
    # on processors that have since been popped from available_processors,
    # or were never in it, and those must not leak into this attempt.
    for processor in processors:
        processor.residual = processor.initial_residual

    # Reset the split counter in case we are coming back from an earlier attempt
    needed_split = min_split

    # Always start from the first candidate; reserve memory for the layer and the batch
    selected_processor = available_processors[0]
    selected_processor.residual = (
        selected_processor.residual - layers[current_layer].memory - total_batch_size
    )

    # This state is done; state 2 computes the training times
    previous_state = 1
    state = 2
    return (selected_processor, available_processors, current_layer, current_path, paths)

In [4]:
def state2_PL(selected_processor, available_processors, current_layer, current_path, paths):
    """State 2: time the current layer on the selected processor for every destination.

    For each candidate destination index ``j`` (the first
    ``min(max_paths, len(processors))`` processors) one entry
    ``[real_train_time, alg_train_time, origin_ID, destination_ID]`` is built:

    * ``real_train_time`` = fetch + 2.5 * compute + write to ``j``; it is the
      value summed later for the final path total.
    * ``alg_train_time`` is the value state 3 minimises to choose the
      destination. For a non-final layer it is
      fetch + compute + 2 * write to ``j`` + the next layer's compute on ``j``;
      for the final layer it is fetch + compute + write over
      ``queue[0].links[0]``.

    Reads the globals ``layers``, ``processors``, ``links``, ``max_paths`` and
    ``queue`` and calls ``find_in_list``; writes ``state`` and
    ``previous_state``.

    Args:
        selected_processor: Processor the current layer runs on.
        available_processors: Passed through unchanged.
        current_layer: Index into ``layers``.
        current_path: Selections made so far; element 2 of the last entry is
            used as the origin processor ID for layers after the first.
        paths: Passed through unchanged.

    Returns:
        ``(train_times, available_processors, current_layer, current_path,
        paths)``; the next state is always 3.
    """
    global previous_state
    global state

    # Establish where the data for the current layer is coming from: the
    # selected processor itself for the first layer, otherwise the processor
    # recorded in the previous selection of the path
    if current_layer == 0:
        origin_processor_ID = selected_processor.ID
    else:
        origin_processor_ID = current_path[-1][2]

    layer = layers[current_layer]
    # NOTE(review): processor IDs are used as indices into `processors`, and
    # link names concatenate two IDs without a separator (ambiguous once IDs
    # reach two digits) -- assumed to match how links are named elsewhere.
    host = processors[selected_processor.ID]

    # Everything below is independent of the destination, so compute it once
    fetch_link = find_in_list(
        links, "link_" + str(selected_processor.ID) + str(origin_processor_ID)
    )
    fetch_time = layer.fetch / fetch_link.value
    compute_time = layer.flops / host.power

    has_next_layer = current_layer < (len(layers) - 1)
    if not has_next_layer:
        final_bandwidth = queue[0].links[0].value

    # The "max_paths" heuristic also caps the number of destinations considered
    destination_count = min(max_paths, len(processors))

    train_times = []
    for j in range(destination_count):
        # j is the processor the layer output is written to
        write_link = find_in_list(links, "link_" + str(selected_processor.ID) + str(j))
        write_time = layer.write / write_link.value

        # Real training time: only used at the very end, if this destination
        # ends up being part of the chosen path
        real_train_time = (1.0 * fetch_time
                           + (1 + 1.5) * compute_time
                           + 1.0 * write_time)

        # Algorithm training time: used to decide WHERE to process the next layer
        if has_next_layer:
            alg_train_time = (fetch_time + compute_time + 2 * write_time
                              + layers[current_layer + 1].flops / processors[j].power)
        else:
            alg_train_time = fetch_time + compute_time + (layer.write / final_bandwidth)

        train_times.append([real_train_time,    # For final training time calculation
                            alg_train_time,     # For algorithm destination processor selection
                            host.ID,            # To keep a log of the layer origin
                            processors[j].ID])  # To log the layer destination for this selection

    # This state is done; state 3 selects the destination
    previous_state = 2
    state = 3
    return (train_times, available_processors, current_layer, current_path, paths)

In [5]:
def state3_PL(train_times, available_processors, current_layer, current_path, paths):
    """State 3: commit the best destination for the current layer, or back off.

    The entry of ``train_times`` with the smallest algorithm training time
    (element 1) is selected. Entry layout:
    ``[real_time, alg_time, processor running the layer, candidate destination]``.

    Reads the globals ``layers``, ``processors`` and ``min_split``; writes
    ``state``, ``previous_state`` and ``needed_split``. Mutates
    ``train_times``, ``available_processors``, ``current_path`` and ``paths``
    in place.

    Returns:
        * last layer: the selection is appended to ``current_path``, the path
          is appended to ``paths`` and the first start candidate is removed;
          returns ``(available_processors, current_layer, [], paths)`` with
          ``current_layer`` reset to 0 and next state 1 when candidates
          remain, otherwise unchanged ``current_layer`` and next state 4;
        * destination can hold the next layer: returns ``(destination,
          available_processors, current_layer + 1, current_path, paths)``
          (next state: 2);
        * destination cannot hold the next layer: the entry is removed from
          ``train_times``; returns ``(train_times, available_processors,
          current_layer, current_path, paths)`` (state 3 again) while entries
          remain, otherwise drops the first start candidate and returns
          ``(available_processors, 0, [], paths)`` (next state: 1).
    """
    global previous_state, state, needed_split

    # Complete entry with the minimum Algorithm Training Time (NOT the real one)
    minimum_train_time = min(train_times, key=itemgetter(1))

    if current_layer == len(layers) - 1:
        # Last layer: close this path and store it in the list of finished paths
        current_path.append(minimum_train_time)
        paths.append(current_path)

        # Done with this start processor; the next path starts from the next one
        available_processors.pop(0)

        previous_state = 3
        if len(available_processors) > 0:
            # Loop back to state 1 with a clean path and the first layer
            state = 1
            return (available_processors, 0, [], paths)

        # No start processor left: proceed to the final best-path calculation
        state = 4
        return (available_processors, current_layer, [], paths)

    destination = processors[minimum_train_time[3]]
    next_layer_memory = layers[current_layer + 1].memory

    if destination.residual >= next_layer_memory:
        # Destination CAN house the next layer, so the selection joins the path
        current_path.append(minimum_train_time)

        # Processors used so far, as plain values so they can be de-duplicated
        # with set() for the minimum-split constraint
        processors_used = [selection[2] for selection in current_path]

        # The `needed_split > 0` guard must be evaluated FIRST: it protects
        # the division below, which raises ZeroDivisionError for
        # needed_split == 0 (for example when min_split is 0).
        split_due = (
            needed_split > 0
            and len(processors_used) >= int(len(layers) / needed_split)
            and len(set(processors_used)) <= min_split
        )
        if split_due:
            # Zero the destination's residual so that nothing more fits on it
            # after the next layer, forcing a change of processor
            destination.residual = 0
            needed_split = needed_split - 1
        else:
            # Reserve the memory of the next layer on the destination
            destination.residual = destination.residual - next_layer_memory

        # The destination becomes the "current" processor of the next layer
        previous_state = 3
        state = 2
        return (destination, available_processors, current_layer + 1, current_path, paths)

    # The destination behind this minimum cannot house the next layer:
    # discard this entry and try the next-best one
    train_times.remove(minimum_train_time)

    if len(train_times) < 1:
        # NO processor can house the next layer, so this start processor is a
        # dead end. It is always the first list element because state1_PL()
        # always selects the first one.
        available_processors.pop(0)

        previous_state = 3
        state = 1
        return (available_processors, 0, [], paths)

    # Normally we just run this state again on the reduced list
    previous_state = 3
    state = 3
    return (train_times, available_processors, current_layer, current_path, paths)

In [1]:
def state4_PL(available_processors, current_layer, current_path, paths):
    """State 4: choose the finished path with the smallest total real training time.

    ``paths`` is a list of finished paths; each path is a list of selections
    ``[real train time, alg train time, origin, dest]``. Only element 0
    (real time) and element 2 (origin) of each selection are used here.

    Reads the globals ``processors``, ``epochs``, ``total_batches_PL`` and
    ``time_factor``; sets the global ``state`` to 5 and restores ``residual``
    of every processor to its ``initial_residual``.

    Args:
        available_processors: Unused.
        current_layer: Unused.
        current_path: Unused.
        paths: Finished paths to choose from.

    Returns:
        ``(state, best_path_route, scaled_total)`` where ``best_path_route``
        lists element 2 of every selection of the winning path and
        ``scaled_total`` is
        ``(best total * epochs * total_batches_PL) / time_factor``.
        On a tie the earliest path in ``paths`` wins.
    """
    global previous_state
    global state

    def real_time_total(path):
        # Plain left-to-right accumulation of the real training times
        total = 0
        for selection in path:
            total = total + selection[0]
        return total

    # One total per finished path, in the same order as `paths`
    path_totals = [real_time_total(path) for path in paths]

    # The cheapest total, and the first path that achieves it
    best_path_value = min(path_totals)
    best_path = paths[path_totals.index(best_path_value)]

    # Easier-to-read route: where each layer of the best path is run
    best_path_route = [selection[2] for selection in best_path]

    # Program is done
    state = 5

    # Leave the residual memory clean so it does not affect later algorithms
    # that use this variable
    for proc in processors:
        proc.residual = proc.initial_residual

    # best_path_value is a single-pass total. Per the original note, it is in
    # seconds for ONE epoch and is scaled by "epochs" (config.ipynb) and
    # converted to hours via time_factor. An earlier variant of this formula
    # additionally multiplied by batch_size.
    scaled_total = (best_path_value * epochs * total_batches_PL) / time_factor
    return (state, best_path_route, scaled_total)