### 10.1 Merge sorted files

In [1]:
import heapq


### Remarks

I guess that I don't know heaps well enough yet, because I immediately looked for an approach that did not require a heap.  Solution 1 below appears to run in O(n * k) time, where n is the total number of elements and k is the number of lists.  That is wasteful when k is large, since extracting each element requires comparing the first entries of all k lists.  If we kept a data structure like a min-heap to track the smallest value, each extraction would cost only O(log k) instead of O(k).

#### Solution 1

In [7]:
def solution_1(lists, combined):
    """Move the smallest head element of ``lists`` onto ``combined``.

    Performs a single merge step: looks at the first element of every list,
    removes the smallest one from its list and appends it to ``combined``.
    A list that becomes empty is removed from ``lists``, so a caller can loop
    with ``while lists:`` until everything has been merged.

    Args:
        lists: List of lists, each expected to be sorted ascending. Mutated
            in place.
        combined: List that receives the extracted element. Mutated in place.

    Returns:
        None. When ``lists`` holds no elements at all, nothing is appended.
    """
    # Discard lists that are already empty; reading lists[i][0] on one of
    # them would raise IndexError. Slice assignment keeps the caller's object.
    lists[:] = [candidate for candidate in lists if candidate]
    if not lists:
        return

    # min() returns the first minimal index, which matches a left-to-right
    # scan using a strict "<" comparison.
    smallest_index = min(range(len(lists)), key=lambda i: lists[i][0])

    # list.pop(0) shifts the remaining elements, so it is linear in the
    # length of that list.
    smallest = lists[smallest_index].pop(0)
    if not lists[smallest_index]:
        lists.pop(smallest_index)
    combined.append(smallest)

# Exercise Solution 1: extract one element per call until every list is drained,
# printing the remaining input and the merged output around each step.
lists = [[2, 5, 7, 8, 9], [1, 1, 4], [0, 2, 2, 5]]
combined = []
while len(lists) > 0:
    print("Lists: {}".format(lists))
    solution_1(lists, combined)
    print("Output: {}".format(combined))


Lists: [[2, 5, 7, 8, 9], [1, 1, 4], [0, 2, 2, 5]]
Output: [0]
Lists: [[2, 5, 7, 8, 9], [1, 1, 4], [2, 2, 5]]
Output: [0, 1]
Lists: [[2, 5, 7, 8, 9], [1, 4], [2, 2, 5]]
Output: [0, 1, 1]
Lists: [[2, 5, 7, 8, 9], [4], [2, 2, 5]]
Output: [0, 1, 1, 2]
Lists: [[5, 7, 8, 9], [4], [2, 2, 5]]
Output: [0, 1, 1, 2, 2]
Lists: [[5, 7, 8, 9], [4], [2, 5]]
Output: [0, 1, 1, 2, 2, 2]
Lists: [[5, 7, 8, 9], [4], [5]]
Output: [0, 1, 1, 2, 2, 2, 4]
Lists: [[5, 7, 8, 9], [5]]
Output: [0, 1, 1, 2, 2, 2, 4, 5]
Lists: [[7, 8, 9], [5]]
Output: [0, 1, 1, 2, 2, 2, 4, 5, 5]
Lists: [[7, 8, 9]]
Output: [0, 1, 1, 2, 2, 2, 4, 5, 5, 7]
Lists: [[8, 9]]
Output: [0, 1, 1, 2, 2, 2, 4, 5, 5, 7, 8]
Lists: [[9]]
Output: [0, 1, 1, 2, 2, 2, 4, 5, 5, 7, 8, 9]


### Remarks
After reading more about the solution in the book, I see how the runtime could be improved from O(n * k) to O(n log k) by using a min-heap.  The O(log k) factor is the cost of inserting into (or removing from) a min-heap of k elements, and n still refers to the total number of elements.

#### Solution 2

In [9]:
class HeapItem:
    """A value paired with an index, ordered by the value alone.

    The merge functions in this file build items from an element taken out
    of ``lists[i]`` together with ``i``, so the index records which list the
    value came from.

    Attributes:
        data: The value; the only thing ``<`` comparisons look at.
        index: The index stored alongside the value.
    """

    def __init__(self, data, index):
        self.data, self.index = data, index

    def __lt__(self, rhs):
        # Only the payload takes part in ordering; the index is ignored, so
        # two items with equal data are never "less than" each other.
        return self.data < rhs.data

    def __repr__(self):
        # Rendered as "(data, index)".
        return "({}, {})".format(self.data, self.index)

def solution_2(lists):
    """Merge sorted lists into a single sorted list using a min-heap.

    The heap holds at most one ``HeapItem`` per input list: the element most
    recently taken from the front of that list. Each step pops the smallest
    item, appends its value to the output, and refills the heap from the list
    that item came from.

    Side effects: the input lists are emptied as they are consumed, and the
    intermediate state is printed before seeding and before every step.

    Args:
        lists: List of lists, each expected to be sorted ascending. Empty
            lists are skipped.

    Returns:
        A new list containing every element of ``lists`` in ascending order.
    """
    output = []
    min_heap = []

    def status():
        print("Lists: {}".format(lists))
        print("Output {}".format(output))
        print("Min Heap: {}\n".format(min_heap))

    # Initialize the min-heap with the head of every non-empty list.
    status()
    for i, source in enumerate(lists):
        # An empty list contributes nothing; pop(0) on it would raise
        # IndexError.
        if source:
            min_heap.append(HeapItem(source.pop(0), i))
    heapq.heapify(min_heap)

    # Drain the heap, refilling from the list the popped item came from.
    while min_heap:
        status()
        smallest = heapq.heappop(min_heap)
        output.append(smallest.data)
        source = lists[smallest.index]
        if source:
            # NOTE: list.pop(0) is linear in the list length; it is kept so
            # that the printed state shows the lists being consumed.
            heapq.heappush(min_heap, HeapItem(source.pop(0), smallest.index))

    return output

# Exercise Solution 2 on the sample input and print the merged result
# (the function also prints its intermediate state on every step).
lists = [[2, 5, 7, 8, 9], [1, 1, 4], [0, 2, 2, 5]]
print(solution_2(lists))


Lists: [[2, 5, 7, 8, 9], [1, 1, 4], [0, 2, 2, 5]]
Output []
Min Heap: []

Lists: [[5, 7, 8, 9], [1, 4], [2, 2, 5]]
Output []
Min Heap: [(0, 2), (1, 1), (2, 0)]

Lists: [[5, 7, 8, 9], [1, 4], [2, 5]]
Output [0]
Min Heap: [(1, 1), (2, 0), (2, 2)]

Lists: [[5, 7, 8, 9], [4], [2, 5]]
Output [0, 1]
Min Heap: [(1, 1), (2, 2), (2, 0)]

Lists: [[5, 7, 8, 9], [], [2, 5]]
Output [0, 1, 1]
Min Heap: [(2, 2), (2, 0), (4, 1)]

Lists: [[5, 7, 8, 9], [], [5]]
Output [0, 1, 1, 2]
Min Heap: [(2, 0), (4, 1), (2, 2)]

Lists: [[7, 8, 9], [], [5]]
Output [0, 1, 1, 2, 2]
Min Heap: [(2, 2), (4, 1), (5, 0)]

Lists: [[7, 8, 9], [], []]
Output [0, 1, 1, 2, 2, 2]
Min Heap: [(4, 1), (5, 0), (5, 2)]

Lists: [[7, 8, 9], [], []]
Output [0, 1, 1, 2, 2, 2, 4]
Min Heap: [(5, 0), (5, 2)]

Lists: [[8, 9], [], []]
Output [0, 1, 1, 2, 2, 2, 4, 5]
Min Heap: [(5, 2), (7, 0)]

Lists: [[8, 9], [], []]
Output [0, 1, 1, 2, 2, 2, 4, 5, 5]
Min Heap: [(7, 0)]

Lists: [[9], [], []]
Output [0, 1, 1, 2, 2, 2, 4, 5, 5, 7]
Min Heap: [(8

### Remarks
Notice how processing the last list was sort of wasteful.  We can do a little better in the worst case (all elements are in a single list): once only a single element remains in the heap, we append that element and then the remainder of its list directly to the output.

#### Solution 3

In [11]:
def solution_3(lists):
    """Merge sorted lists with a min-heap, bulk-copying the final list.

    Works like a heap-based k-way merge while at least two items are on the
    heap. Once a single item is left, every other list has been used up, so
    that item and the remainder of its list are appended to the output
    directly instead of going through the heap one element at a time.

    Side effects: the input lists are consumed as elements are moved to the
    heap (the remainder of the final list is copied, not removed), and the
    intermediate state is printed along the way.

    Args:
        lists: List of lists, each expected to be sorted ascending. Empty
            lists are skipped.

    Returns:
        A new list containing every element of ``lists`` in ascending order;
        an empty list when there is nothing to merge.
    """
    output = []
    min_heap = []

    def status():
        print("Lists: {}".format(lists))
        print("Output {}".format(output))
        print("Min Heap: {}\n".format(min_heap))

    # Initialize the min-heap with the head of every non-empty list.
    status()
    for i, source in enumerate(lists):
        # An empty list contributes nothing; pop(0) on it would raise
        # IndexError.
        if source:
            min_heap.append(HeapItem(source.pop(0), i))
    heapq.heapify(min_heap)

    # Regular heap merge while more than one list is still represented.
    while len(min_heap) > 1:
        status()
        smallest = heapq.heappop(min_heap)
        output.append(smallest.data)
        source = lists[smallest.index]
        if source:
            heapq.heappush(min_heap, HeapItem(source.pop(0), smallest.index))

    # No input elements at all: heappop on the empty heap would raise
    # IndexError, so return the empty result.
    if not min_heap:
        return output

    # Eat the remaining heap element, then copy the rest of its list as-is.
    smallest = heapq.heappop(min_heap)
    status()
    output.append(smallest.data)
    status()
    output.extend(lists[smallest.index])
    status()

    return output

# Exercise Solution 3 with a long first list, so the final bulk copy of the
# last remaining list is visible in the printed state.
lists = [[2, 5, 7, 8, 9, 9, 9], [1, 1, 4], [0, 2, 2, 5]]
print(solution_3(lists))

Lists: [[2, 5, 7, 8, 9, 9, 9], [1, 1, 4], [0, 2, 2, 5]]
Output []
Min Heap: []

Lists: [[5, 7, 8, 9, 9, 9], [1, 4], [2, 2, 5]]
Output []
Min Heap: [(0, 2), (1, 1), (2, 0)]

Lists: [[5, 7, 8, 9, 9, 9], [1, 4], [2, 5]]
Output [0]
Min Heap: [(1, 1), (2, 0), (2, 2)]

Lists: [[5, 7, 8, 9, 9, 9], [4], [2, 5]]
Output [0, 1]
Min Heap: [(1, 1), (2, 2), (2, 0)]

Lists: [[5, 7, 8, 9, 9, 9], [], [2, 5]]
Output [0, 1, 1]
Min Heap: [(2, 2), (2, 0), (4, 1)]

Lists: [[5, 7, 8, 9, 9, 9], [], [5]]
Output [0, 1, 1, 2]
Min Heap: [(2, 0), (4, 1), (2, 2)]

Lists: [[7, 8, 9, 9, 9], [], [5]]
Output [0, 1, 1, 2, 2]
Min Heap: [(2, 2), (4, 1), (5, 0)]

Lists: [[7, 8, 9, 9, 9], [], []]
Output [0, 1, 1, 2, 2, 2]
Min Heap: [(4, 1), (5, 0), (5, 2)]

Lists: [[7, 8, 9, 9, 9], [], []]
Output [0, 1, 1, 2, 2, 2, 4]
Min Heap: [(5, 0), (5, 2)]

Lists: [[8, 9, 9, 9], [], []]
Output [0, 1, 1, 2, 2, 2, 4, 5]
Min Heap: [(5, 2), (7, 0)]

Lists: [[8, 9, 9, 9], [], []]
Output [0, 1, 1, 2, 2, 2, 4, 5, 5]
Min Heap: []

Lists: [[8, 

### Concluding remarks
It seems to work fine with the worst case.  Not sure it matters, but such an operation could also potentially be map-reduced.