### With brute force algorithm we see that time complexity of O(N^3) is too big to be practical. Trying new approach: sorting adducts and using binary search on difference between signal and metabolite

In [7]:
import numpy as np

orig = [20, 10, 15, 30, 25]
# ind = [1, 2, 0, 4, 3]

print(f'orig: {orig}')
print()

ind = np.argsort(orig)

for i in range(len(orig)):
    print(f'{orig[ind[i]]}')

print(f'orig: {orig}')

orig: [20, 10, 15, 30, 25]

10
15
20
25
30
orig: [20, 10, 15, 30, 25]


In [16]:
M = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047]
K = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008] 
N = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]

k_indices = np.argsort(K)  # returns indices
print(f'k_indices: {k_indices}')

k_sorted = []
for i in range(len(K)):
    k_sorted.append(K[k_indices[i]])
print(f'k_sorted: {k_sorted}')

print()
m_indices = np.argsort(M)
print(f'm_indices: {m_indices}')

m_sorted = []
for i in range(len(M)):
    m_sorted.append(M[m_indices[i]])
print(f'm_sorted: {m_sorted}')

k_indices: [2 4 7 3 9 0 1 5 8 6]
k_sorted: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]

m_indices: [0 1 5 8 7 4 2 6 3]
m_sorted: [3e-06, 1.2e-05, 4.5e-05, 4.7e-05, 6.8e-05, 7.6e-05, 8.1e-05, 9.2e-05, 9.9e-05]


In [13]:
def binary_search(array, low, high, x):
    if high >= low:
        midpt = (high + low) // 2
        
        if array[midpt] == x:
            # found: stop recursion
            return midpt

        # continue search in the half that contains element
        
        if array[midpt] > x:
            return binary_search(array, low, midpt - 1, x)
        else:
            return binary_search(array, midpt + 1, high, x)

    else:
        return -1

# array = [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
# x = 4.5e-5
array = [ 2, 3, 4, 10, 40, 50]
x = 10
x = 2
x = 50
x = 10
index = binary_search(array, 0, len(array) -1, x)
print(f'found {x} at index {index}')

found 10 at index 3


In [75]:
# added epsilon to account for float type of array elements

def binary_search(array, low, high, x):
    if high >= low:
        midpt = (high + low) // 2
        
        # eps = 1e-12
        eps = 1e-3
        # if array[midpt] == x:  # this does not work with float numbers
        if abs(array[midpt] - x) < eps:
            # found: stop recursion
            return midpt

        # continue search in the half that contains element
        
        if array[midpt] > x:
            return binary_search(array, low, midpt - 1, x)
        else:
            return binary_search(array, midpt + 1, high, x)

    else:
        return -1

# array = [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
# x = 4.5e-5
array = [ 2, 3, 1.333333, 10, 40, 50]
x = 10
x = 2
x = 50
x = 4 / 3
index = binary_search(array, 0, len(array) -1, x)
print(f'found {x} at index {index}')

found 1.3333333333333333 at index 2


In [None]:
# this process_test_sorted uses dictionary

def binary_search(array, low, high, x):
    if high >= low:
        midpt = (high + low) // 2
        
        eps = 1e-7
        if array[midpt] == x:
            # found: stop recursion
            return midpt

        # continue search in the half that contains element
        
        if array[midpt] > x:
            return binary_search(array, low, midpt - 1, x)
        else:
            return binary_search(array, midpt + 1, high, x)

    else:
        return -1

def process_test(M, K, N):
    """
    M is database of metabolites
    K is database of adducts
    N is our measured signals
    """
    list_pairs = []
    for isignal in range(len(N)):
        meta_min = None
        adduct_min = None
        # delta_min = abs(M[meta_min] + K[adduct_min] - N[isignal])
        delta_min = 1e6
        for imeta in range(len(M)):
            for iadduct in range(len(K)):
                sum_curr = M[imeta] + K[iadduct]
                if sum_curr < 0:
                    continue
                delta_curr = abs(sum_curr - N[isignal])
                if delta_curr < delta_min:
                    delta_min = delta_curr
                    meta_min = imeta
                    adduct_min = iadduct

        # here we selected pair that gives closest sum to our signal
        if meta_min is None:
            print(f'isignal: {isignal} meta_min is None')
        if adduct_min is None:
            print(f'isignal: {isignal} adduct_min is None')
        list_pairs.append((meta_min, adduct_min))

    return list_pairs


def process_test_sorted(M, K_sorted, k_indices, N):
    """
    M is database of metabolites
    K_sorted is database of adducts sorted
    k_indices is list of indices returned by np.argsort
    N is our measured signals
    """
    list_pairs = []
    adduct_dict = {}  # key: meta, value: adduct that when added to meta gives signal
    k_ind_dict = {}  # key: sorted indices, value: orig indices
    
    # find imeta in M, iadduct in sorted array
    for isignal in range(len(N)):
        for imeta in range(len(M)):
            diff_curr = N[isignal] - M[imeta]
            if diff_curr not in adduct_dict:
                adduct_dict[M[imeta]] = diff_curr

            adduct_index = binary_search(K_sorted, 0, len(K_sorted) -1, diff_curr)  # indices in sorted array
    
    list_pairs.append(imeta, adduct_dict[iadduct])
    return imeta, iadduct

In [1]:
# testing getting indices of original array from array of sorted indices

import numpy as np

orig = [20, 10, 15, 30, 25]
# ind = [1, 2, 0, 4, 3]

print(f'orig: {orig}')
print()

ind = np.argsort(orig)
print(f'indices: {ind}')

for i in range(len(orig)):
    print(f'{orig[ind[i]]}')

print(f'orig: {orig}')

orig: [20, 10, 15, 30, 25]

indices: [1 2 0 4 3]
10
15
20
25
30
orig: [20, 10, 15, 30, 25]


# Modify Binary Search
Use two arrays: *original* array and _sorted_ indices.  
Toy example.

In [1]:
import numpy as np

orig = [20, 10, 15, 30, 25]
sort_indices = np.argsort(orig)
print(f'orig: {orig}')
print(f'sort_indices: {sort_indices}')

sorted_list = []
dct = {}
for i in range(len(orig)):
    sorted_list.append(orig[sort_indices[i]])
    # dct[sort_indices[i]] = i
    dct[i] = sort_indices[i]  # this way of creating k, v gives us wrong key, value pairs
print(f'sorted_list: {sorted_list}')
print(f'dct: {dct}')

orig: [20, 10, 15, 30, 25]
sort_indices: [1 2 0 4 3]
sorted_list: [10, 15, 20, 25, 30]
dct: {0: 1, 1: 2, 2: 0, 3: 4, 4: 3}


## When we use adduct[indices_sort[i]], we effectively go from sorted array to original without using dictionary

In [45]:
import numpy as np

adduct = [20, 10, 15, 30, 25]
print(f'adduct orig: {adduct}')

# adduct_sort
# i_orig = d[i_sort]

# adduct_sort
# adduct_sort = adduct.sort()  # sorts array/list in place
# print(f'adduct_sort: {adduct_sort}')
# print(f'adduct sorted: {adduct.sort()}')
# print(f'adduct orig: {adduct}')
# print(f'sorted adduct: {adduct}')

# adduct = np.array([20, 10, 15, 30, 25])
# sorting
indices_sort = np.argsort(adduct)
print(f'indices of sorted array: {indices_sort}')

print(f'adduct sort: {[adduct[indices_sort[i]] for i in range(len(adduct))]}')

# i_orig = d[i_sort]
d = {}  # keys need to be indices of sorted array, values - corresponding indices of original array
print(f'first element of adduct: {adduct[indices_sort[0]]}')

for i_sort in range(len(indices_sort)):
    d[i_sort] = adduct[i_sort]
print(f'd using for loop: {d}')

keys = indices_sort
values = [indices_sort[i] for i in range(len(adduct))]
print(f'keys: {keys}, values: {values}')
d = dict(zip(indices_sort, values))
# d = {keys[i]: values[i] for i in range(len(keys))}
d = {indices_sort[i_orig]: adduct[indices_sort[i_orig]] for i_orig in range(len(adduct))}
print(f'dictionary of sorted indices: {d}')

adduct orig: [20, 10, 15, 30, 25]
indices of sorted array: [1 2 0 4 3]
adduct sort: [10, 15, 20, 25, 30]
first element of adduct: 10
d using for loop: {0: 20, 1: 10, 2: 15, 3: 30, 4: 25}
keys: [1 2 0 4 3], values: [1, 2, 0, 4, 3]
dictionary of sorted indices: {1: 10, 2: 15, 0: 20, 4: 25, 3: 30}


In [39]:
adduct = np.array([20, 10, 15, 30, 25])
print(f'adduct orig: {adduct}')
indices_orig = np.array([i for i in range(len(adduct))])
print(f'indices of orig array:   {indices_orig}')
# sorting
indices_sort = np.argsort(adduct)
print(f'indices of sorted array: {indices_sort}')

#
# when we use adduct[indices_sort[i]], we effectively go from sorted array to original without using dictionary
#
print(f'adduct sort: {[adduct[indices_sort[i]] for i in range(len(adduct))]}')
print()
# get list of orig indices from sorted indices
# orig array[sorted indices]
iadduct = [[i for i in range(len(indices_sort))]]
print(f'iadduct: {iadduct}')
print(f'orig array: {[adduct[iadduct] for iadduct in range(len(adduct))]}')

adduct orig: [20 10 15 30 25]
indices of orig array:   [0 1 2 3 4]
indices of sorted array: [1 2 0 4 3]
adduct sort: [10, 15, 20, 25, 30]

iadduct: [[0, 1, 2, 3, 4]]
orig array: [20, 10, 15, 30, 25]


In [1]:
# testing how to get original indices from sorted indices

import numpy as np

adduct = [20, 10, 15, 30, 25]
print(f'adduct orig: {adduct}')

indices_sort = np.argsort(adduct)
print(f'indices of sorted array: {indices_sort}')

#
# when we use adduct[indices_sort[i]], we effectively go from sorted array to original without using dictionary
#
adduct_sort = [adduct[indices_sort[i]] for i in range(len(adduct))]
print(f'adduct_sort: {adduct_sort}')

n = 3
print(f'Example: the adduct_sort[{n}] has value {adduct_sort[n]} has orig index {indices_sort[n]}')

n = 4
print(f'Example: the adduct_sort[{n}] with value {adduct_sort[n]} has orig index {indices_sort[n]}')

n = 2
print(f'Example: the adduct_sort[{n}] with value {adduct_sort[n]} has orig index {indices_sort[n]}')

adduct orig: [20, 10, 15, 30, 25]
indices of sorted array: [1 2 0 4 3]
adduct_sort: [10, 15, 20, 25, 30]
Example: the adduct_sort[3] has value 25 has orig index 4
Example: the adduct_sort[4] with value 30 has orig index 3
Example: the adduct_sort[2] with value 20 has orig index 0


In [31]:
def binary_search_sort(array_ind, array, low, high, x):
    if high >= low:
        midpt = (high + low) // 2
        
        if array[array_ind[midpt]] == x:
            # found: stop recursion
            print(f'array[array_ind[{midpt}]]: {array[array_ind[midpt]]}')
            return midpt

        # continue search in the half that contains element
        
        if array[array_ind[midpt]] > x:
            return binary_search_sort(array_ind, array, low, midpt - 1, x)
        else:
            return binary_search_sort(array_ind, array, midpt + 1, high, x)

    else:
        return -1

index = binary_search_sort(sort_indices, orig, 0, len(sort_indices) - 1, 15)
print(f'index = {index}, sort_indices[index] = {sort_indices[index]}, orig[sort_indices[index]] = {orig[sort_indices[index]]}')

array[array_ind[1]]: 15
index = 1, sort_indices[index] = 2, orig[sort_indices[index]] = 15


In [4]:
#    0         1          2          3          4         5         6          7         8          9
k = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
print(f'original array: {k}')
k_ind = np.argsort(k)
print(f'indices of sorted array: {k_ind}')
k_sort = [k[k_ind[i]] for i in range(len(k))]
print(f'sorted array: {k_sort}')

for i_sort in range(len(k_ind)):
    print(f'index of sorted array: {i_sort}, index of original array: {k_ind[i_sort]}')

original array: [2e-06, 4.5e-05, -6.3e-05, -9e-06, -5e-05, 4.8e-05, 7e-05, -3.7e-05, 5.6e-05, -8e-06]
indices of sorted array: [2 4 7 3 9 0 1 5 8 6]
sorted array: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
index of sorted array: 0, index of original array: 2
index of sorted array: 1, index of original array: 4
index of sorted array: 2, index of original array: 7
index of sorted array: 3, index of original array: 3
index of sorted array: 4, index of original array: 9
index of sorted array: 5, index of original array: 0
index of sorted array: 6, index of original array: 1
index of sorted array: 7, index of original array: 5
index of sorted array: 8, index of original array: 8
index of sorted array: 9, index of original array: 6


In [3]:
# playing with sorted indices --> original indices

adduct = [20, 10, 15, 30, 25]
print(f'adduct orig: {adduct}')

indices_sort = np.argsort(adduct)
print(f'indices of sorted array: {indices_sort}')

adduct_sort = [adduct[indices_sort[i]] for i in range(len(adduct))]
print(f'adduct_sort: {adduct_sort}')

for i_sort in range(len(indices_sort)):
    print(f'i_sort (new index): {i_sort}, indices_sort (orig index): {indices_sort[i_sort]}')
    i_orig = indices_sort[i_sort]
    print(f'i_orig: {i_orig}')

adduct orig: [20, 10, 15, 30, 25]
indices of sorted array: [1 2 0 4 3]
adduct_sort: [10, 15, 20, 25, 30]
i_sort (new index): 0, indices_sort (orig index): 1
i_orig: 1
i_sort (new index): 1, indices_sort (orig index): 2
i_orig: 2
i_sort (new index): 2, indices_sort (orig index): 0
i_orig: 0
i_sort (new index): 3, indices_sort (orig index): 4
i_orig: 4
i_sort (new index): 4, indices_sort (orig index): 3
i_orig: 3


In [1]:
import numpy as np

def binary_search_sort(array_ind, array, low, high, x):
    if high < low:
        return -1  # element not found
    
    midpt = (high + low) // 2

    curr = array[array_ind[midpt]]

    delta = 1e-7
    if abs(curr - x) < delta:
        # found: stop recursion
        return midpt

    # continue search in the half that contains element

    if curr > x:
        return binary_search_sort(array_ind, array, low, midpt - 1, x)
    else:
        return binary_search_sort(array_ind, array, midpt + 1, high, x)

adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'values of sorted array: {adducts_sorted}')
x = adducts[0]
x = adducts[9]
#x = adducts[4]
print(f'x = {x}: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, x)}')

values of sorted array: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
x = -8e-06: 4


### Modifying process_test function to use binary_search

In [51]:
import numpy as np

def binary_search_sort(array_ind, array, low, high, x):
    if high < low:
        return -1  # element not found
    
    midpt = (high + low) // 2

    curr = array[array_ind[midpt]]

    delta = 1e-7
    if abs(curr - x) < delta:
        # found: stop recursion
        return midpt

    # continue search in the half that contains element

    if curr > x:
        return binary_search_sort(array_ind, array, low, midpt - 1, x)
    else:
        return binary_search_sort(array_ind, array, midpt + 1, high, x)

def process_test_sort(metas, adducts_ind, adducts, signals):
    signal_pairs = []
    for isignal in range(len(signals)):
        for imeta in range(len(metas)):
            diff = signals[isignal] - metas[imeta]
            ind_sort = binary_search_sort(adducts_ind, adducts, 0, len(adducts)-1, diff)
            iadduct = adducts[adducts_ind[ind_sort]]
            signal_pairs.append((imeta, iadduct))
    return signal_pairs

#        0         1         2         3         4         5         6         7         8
metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047]
#          0         1          2          3          4         5         6          7         8          9
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
signal_pairs = process_test_sort(metas, adducts_ind, adducts, signals)
print(f'signal pairs: {signal_pairs}')
print()
print(f'len(signals): {len(signals)}, len(metas): {len(metas)}')
print(f'len(signal pairs: {len(signal_pairs)})')

signal pairs: [(0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 7e-05), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 2e-06), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, -8e-06), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 7e-05), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 7e-05), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, -3.7e-05), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 7e-05), (7, 2e-06), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 7e-05), (7, 7e-05), (8, 7e-05)]

len(signals): 8, len(metas): 9
len(signal pairs: 72)


In [19]:
import numpy as np

def binary_search_sort(array_ind, array, low, high, x):
    if high < low:
        return -1  # element not found
    
    midpt = (high + low) // 2

    curr = array[array_ind[midpt]]

    delta = 1.1e-6
    if abs(curr - x) < delta:
        # found: stop recursion
        return midpt

    # continue search in the half that contains element

    if curr > x:
        return binary_search_sort(array_ind, array, low, midpt - 1, x)
    else:
        return binary_search_sort(array_ind, array, midpt + 1, high, x)

def process_test_sort(metas, adducts_ind, adducts, signals):
    signal_pairs = []
    for isignal in range(len(signals)):
        found_match = False
        for imeta in range(len(metas)):
            diff = signals[isignal] - metas[imeta]
            ind_sort = binary_search_sort(adducts_ind, adducts, 0, len(adducts)-1, diff)
            if ind_sort >= 0:
                iadduct = adducts_ind[ind_sort]
                signal_pairs.append((imeta, iadduct))
                found_match = True
                break  # next signal
        if not found_match:
            signal_pairs.append((None, None))
    return signal_pairs

#        0         1         2         3         4         5         6         7         8
metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047]
#          0         1          2          3          4         5         6          7         8          9
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
signal_pairs = process_test_sort(metas, adducts_ind, adducts, signals)
print(f'signal pairs: {signal_pairs}')

signal pairs: [(4, 0), (5, 5), (2, 0), (0, 5), (None, None), (6, 7), (7, 0), (4, 0)]


In [5]:
# adaptive binary search - process_test_sort never exits while loop

import numpy as np

def binary_search_sort(array_ind, array, low, high, delta, x):
    if high < low:
        return -1  # element not found
    
    midpt = (high + low) // 2

    curr = array[array_ind[midpt]]

    if abs(curr - x) < delta:
        # found: stop recursion
        return midpt

    # continue search in the half that contains element

    if curr > x:
        return binary_search_sort(array_ind, array, low, midpt - 1, delta, x)
    else:
        return binary_search_sort(array_ind, array, midpt + 1, high, delta, x)

def process_test_sort(metas, adducts_ind, adducts, signals):
    delta = 1.1e-6
    signal_pairs = []
    for isignal in range(len(signals)):
        found_match = False
        for imeta in range(len(metas)):
            diff = signals[isignal] - metas[imeta]
            ind_sort = binary_search_sort(adducts_ind, adducts, 0, len(adducts)-1, delta, diff)
            if ind_sort >= 0:
                iadduct = adducts_ind[ind_sort]
                signal_pairs.append((imeta, iadduct))
                found_match = True
                break  # next signal
        if not found_match:
            # signal_pairs.append((None, None))
            factor = 1
            ind_sort = -1
            while ind_sort < 0:
                # factor *= 2
                factor *= 1.01
                ind_sort = binary_search_sort(adducts_ind, adducts, 0, len(adducts)-1, factor*delta, diff)
            iadduct = adducts_ind[ind_sort]
            signal_pairs.append((imeta, iadduct))
    return signal_pairs

#        0         1         2         3         4         5         6         7         8
metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047]
#          0         1          2          3          4         5         6          7         8          9
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
signal_pairs = process_test_sort(metas, adducts_ind, adducts, signals)
print(f'signal pairs: {signal_pairs}')

signal pairs: [(4, 0), (5, 5), (2, 0), (0, 5), (8, 0), (6, 7), (7, 0), (4, 0)]


### Calculating deltas since current binary_search_sort function does not find metabolite, adduct pair for some signals: getting metas, adducts, and signals by reading the file

# Come back to the two cells right below this one

In [2]:
import numpy as np

adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]

For process_test_sort, we already have indices array of sorted adducts as numpy arrays:  
Get metas as numpy array (in my annotate function)

In [4]:
def read_test(input_fname, test_number):
    with open(input_fname) as input_file:
        n_tests = int(input_file.readline().strip())
        if test_number >= n_tests:
            print(f'This file contains only {n_tests} tests. Numbering of test_number starts from 0.')
            return [], [], []
        
        # skip test_number - 1 tests
        for itest in range(test_number + 1):
            int_numbers = input_file.readline().strip()
            nmetas, nadducts, nsignals = [int(x) for x in int_numbers.split()]
            metas = [float(x) for x in input_file.readline().strip().split()]
            adducts = [float(x) for x in input_file.readline().strip().split()]
            signals = [float(x) for x in input_file.readline().strip().split()]
    return metas, adducts, signals

input_fname = '1.txt'
metas, adducts, signals = read_test(input_fname, 1)
print(f'metas: {metas}')
print(f'adducts: {adducts}')
print(f'signals: {signals}')

This file contains only 1 tests. Numbering of test_number starts from 0.
metas: []
adducts: []
signals: []


In [22]:
# using brute force process test function - deltas on file 1.txt

def process_test(metas, adducts, signals):
    """
    metas is database of metabolites
    adducts is database of adducts
    signals is our measured signals
    """
    deltas = []
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = None
        adduct_min = None
        # delta_min = abs(M[meta_min] + K[adduct_min] - N[isignal])
        delta_min = 1e6
        for imeta in range(len(metas)):
            for iadduct in range(len(adducts)):
                sum_curr = metas[imeta] + adducts[iadduct]
                if sum_curr < 0:
                    continue
                delta_curr = abs(sum_curr - signals[isignal])
                if delta_curr < delta_min:
                    delta_min = delta_curr
                    meta_min = imeta
                    adduct_min = iadduct

        deltas.append(delta_min)
        # here we selected pair that gives closest sum to our signal
        if meta_min is None:
            print(f'isignal: {isignal} meta_min is None')
        if adduct_min is None:
            print(f'isignal: {isignal} adduct_min is None')
        list_pairs.append((meta_min, adduct_min))

    return list_pairs, deltas

input_fname = '1.txt'
test_number = 0
metas, adducts, signals = read_test(input_fname, test_number)
signal_pairs, deltas = process_test(metas, adducts, signals)
print(f'deltas: {deltas}')

deltas: [9.999999999999972e-07, 0.0, 0.0, 9.999999999999972e-07, 2.000000000000008e-06, 0.0, 0.0, 9.999999999999972e-07]


In [23]:
%matplotlib qt
import matplotlib.pyplot as plt

# plot for file 1.txt
plt.figure()
plt.hist(deltas, bins=10, histtype='step')
plt.title('Delta for each signal: file 1')
plt.xlabel('Delta')
plt.grid()

In [17]:
# using brute force process test function - deltas on file 2.txt test 1

def process_test(metas, adducts, signals):
    """
    metas is database of metabolites
    adducts is database of adducts
    signals is our measured signals
    """
    deltas = []
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = None
        adduct_min = None
        # delta_min = abs(M[meta_min] + K[adduct_min] - N[isignal])
        delta_min = 1e6
        for imeta in range(len(metas)):
            for iadduct in range(len(adducts)):
                sum_curr = metas[imeta] + adducts[iadduct]
                if sum_curr < 0:
                    continue
                delta_curr = abs(sum_curr - signals[isignal])
                if delta_curr < delta_min:
                    delta_min = delta_curr
                    meta_min = imeta
                    adduct_min = iadduct

        deltas.append(delta_min)
        # here we selected pair that gives closest sum to our signal
        if meta_min is None:
            print(f'isignal: {isignal} meta_min is None')
        if adduct_min is None:
            print(f'isignal: {isignal} adduct_min is None')
        list_pairs.append((meta_min, adduct_min))

    return list_pairs, deltas

input_fname = '2.txt'
test_number = 0
metas, adducts, signals = read_test(input_fname, test_number)
signal_pairs, deltas = process_test(metas, adducts, signals)
print(f'deltas[:10]: {deltas[:10]}')

deltas[:10]: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [18]:
%matplotlib qt
import matplotlib.pyplot as plt

# plot for file 2.txt test 1
plt.figure()
plt.hist(deltas, bins=20, histtype='step')
plt.title('Delta for each signal: file 2 test 1')
plt.xlabel('Delta')
plt.grid()

In [19]:
# using brute force process test function - deltas on file 2.txt test 2

def process_test(metas, adducts, signals):
    """
    metas is database of metabolites
    adducts is database of adducts
    signals is our measured signals
    """
    deltas = []
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = None
        adduct_min = None
        # delta_min = abs(M[meta_min] + K[adduct_min] - N[isignal])
        delta_min = 1e6
        for imeta in range(len(metas)):
            for iadduct in range(len(adducts)):
                sum_curr = metas[imeta] + adducts[iadduct]
                if sum_curr < 0:
                    continue
                delta_curr = abs(sum_curr - signals[isignal])
                if delta_curr < delta_min:
                    delta_min = delta_curr
                    meta_min = imeta
                    adduct_min = iadduct

        deltas.append(delta_min)
        # here we selected pair that gives closest sum to our signal
        if meta_min is None:
            print(f'isignal: {isignal} meta_min is None')
        if adduct_min is None:
            print(f'isignal: {isignal} adduct_min is None')
        list_pairs.append((meta_min, adduct_min))

    return list_pairs, deltas

input_fname = '2.txt'
test_number = 1
metas, adducts, signals = read_test(input_fname, test_number)
signal_pairs, deltas = process_test(metas, adducts, signals)
print(f'deltas[:10]: {deltas[:10]}')

deltas[:10]: [0.004984000000092692, 0.002853000000072825, 0.007287999999988415, 0.004300000000000637, 0.003628999999932603, 0.0045429999999839765, 0.006248999999911575, 0.006723000000079082, 0.001050000000077489, 0.005426999999940563]


In [20]:
# plot for file 2.txt test 2
plt.figure()
plt.hist(deltas, bins=40, histtype='step')
plt.title('Delta for each signal: file 2 test 2')
plt.xlabel('Delta')
plt.grid()

### At this point we know that binary search will not work in the traditional sense.

In [11]:
array = [1, 2, 4, 5, 6]
x = 3
x = 0
x = 10
bisect.bisect_left(array, x, 0, len(array))

5

In [12]:
metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
metas_ind = np.argsort(metas)
adducts_ind = np.argsort(adducts)
signals_ind = np.argsort(signals)
metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'adducts_sort: {adducts_sort}')
signals_sort = [signals[signals_ind[i]] for i in range(len(signals))]
print(f'metas_sort: {metas_sort}')
print(f'signals_sort: {signals_sort}')

adducts_sort: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
metas_sort: [3e-06, 1.2e-05, 4.5e-05, 4.7e-05, 6.8e-05, 7.6e-05, 8.1e-05, 9.2e-05, 9.9e-05]
signals_sort: [5.2e-05, 5.5e-05, 6.4e-05, 7e-05, 7.9e-05, 7.9e-05, 8.4e-05, 9.4e-05]


In [17]:
i = 0
signal_meta = signals_sort[i] - metas_sort[i]
print(f'signal_meta: {signal_meta}')
imin = bisect.bisect_left(adducts_sort, signal_meta)
print(f'imin: {imin}')
if imin == 0:
    print(f'delta = {abs(signal_meta - adducts_sort[imin])}')
elif imin == len(adducts_sort) - 1:
    print(f'delta = {abs(signal_meta - adducts_sort[imin])}')
else:
    delta = min(abs(signal_meta - adducts_sort[imin]), abs(signal_meta - adducts_sort[imin+1]))
    print(f'delta = {delta}')

signal_meta: 4.9e-05
imin: 8
delta = 7.000000000000001e-06


In [31]:
print(f'adducts_sort: {adducts_sort}')
print(f'metas_sort: {metas_sort}')
print(f'signals_sort: {signals_sort}')

i = 0
signal_meta = signals_sort[i] - metas_sort[i]
print(f'signal_meta: {signal_meta}')
ins = bisect.bisect_left(adducts_sort, signal_meta)

# ins = 9  # set by hand for test

print(f'ins: {ins}')
if ins == 0:
    print(f'delta = {abs(signal_meta - adducts_sort[ins])}')
elif ins == len(adducts_sort) - 1:
    print(f'delta = {abs(signal_meta - adducts_sort[ins-1])}')
else:
    print(f'adducts_sort[ins-1] = {adducts_sort[ins-1]}, adducts_sort[ins] = {adducts_sort[ins]}')
    print(f'delta = {min(abs(signal_meta - adducts_sort[ins-1]), abs(signal_meta - adducts_sort[ins]))}')

adducts_sort: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
metas_sort: [3e-06, 1.2e-05, 4.5e-05, 4.7e-05, 6.8e-05, 7.6e-05, 8.1e-05, 9.2e-05, 9.9e-05]
signals_sort: [5.2e-05, 5.5e-05, 6.4e-05, 7e-05, 7.9e-05, 7.9e-05, 8.4e-05, 9.4e-05]
signal_meta: 4.9e-05
ins: 8
adducts_sort[ins-1] = 4.8e-05, adducts_sort[ins] = 5.6e-05
delta = 9.999999999999972e-07


In [73]:
# from sorted indices to original indices
# initializing meta_min, adduct_min, and delta_min inside the loop gives us right indices

import bisect

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
metas_ind = np.argsort(metas)
adducts_ind = np.argsort(adducts)
signals_ind = np.argsort(signals)
metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'adducts_sort: {adducts_sort}, len(adducts_sort): {len(adducts_sort)}')
signals_sort = [signals[signals_ind[i]] for i in range(len(signals))]
print(f'len(metas_sort): {len(metas_sort)}')
print(f'len(signals): {len(signals)}')

list_pairs = []

for isignal in range(len(signals)):
    meta_min = 0
    adduct_min = 0
    delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
    for imeta in range(len(metas)):
        signal_meta = signals[isignal] - metas[imeta]
        # print(f'signal_meta: {signal_meta}')
        ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
        # print(f'ind: {ind}')
        
        adduct_curr = None
        if ind == 0:
            adduct_curr = ind
        elif ind == len(adducts_sort) - 1:
            adduct_curr = ind - 1
        else:
            if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                adduct_curr = ind - 1
            else:
                adduct_curr = ind
        delta = abs(signal_meta - adducts_sort[adduct_curr])
        
        if delta < delta_min:
            delta_min = delta
            meta_min = imeta
            adduct_min = adducts_ind[adduct_curr]
            
    list_pairs.append((meta_min, adduct_min))
    print(f'current signal: {signals[isignal]}')
print(f'list_pairs: {list_pairs}')
print(f'length of list pairs: {len(list_pairs)}')

adducts_sort: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05], len(adducts_sort): 10
len(metas_sort): 9
len(signals): 8
current signal: 7.9e-05
current signal: 9.4e-05
current signal: 8.4e-05
current signal: 5.2e-05
current signal: 6.4e-05
current signal: 5.5e-05
current signal: 7e-05
current signal: 7.9e-05
list_pairs: [(4, 0), (6, 0), (6, 9), (0, 5), (3, 7), (6, 7), (7, 0), (4, 0)]
length of list pairs: 8


In [3]:
# from sorted indices to original indices - works

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            signal_meta = signals[isignal] - metas[imeta]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
            # print(f'ind: {ind}')

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
adducts_ind = np.argsort(adducts)
adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
print(f'list_pairs: {list_pairs}')
print(f'length of list pairs: {len(list_pairs)}')

list_pairs: [(4, 0), (6, 0), (6, 9), (0, 5), (3, 7), (6, 7), (7, 0), (4, 0)]
length of list pairs: 8


In [1]:
# putting the entire solution together (no output file yet)

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            signal_meta = signals[isignal] - metas[imeta]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
            # print(f'ind: {ind}')

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
    
    return list_pairs

input_name = '1.txt'
pairs = annotate(input_name)
print(f'pairs: {pairs}')

pairs: [(4, 0), (6, 0), (6, 9), (0, 5), (3, 7), (6, 7), (7, 0), (4, 0)]


In [2]:
%%time

# solution input file 1.txt

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            signal_meta = signals[isignal] - metas[imeta]
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '1.txt'
output_name = '1_output_optimized.txt'
pairs = annotate(input_name)
# print(f'pairs: {pairs}')

created output file 1_output_optimized.txt


In [1]:
%%time

# solution input file 2.txt

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            signal_meta = signals[isignal] - metas[imeta]
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '2.txt'
output_name = '2_output_optimized.txt'
pairs = annotate(input_name)

created output file 2_output_optimized.txt


In [1]:
%%time

# solution input file 3.txt

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        if isignal % 1000 == 0:
            print(f'processing isignal: {isignal}')
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            signal_meta = signals[isignal] - metas[imeta]
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '3.txt'
output_name = '3_output.txt'
# pairs = annotate(input_name)

In [2]:
%%time

# solution input file 4.txt

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        if isignal % 10 == 0:
            print(f'processing isignal: {isignal}')
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            if imeta % 100000 == 0:
                print(f'    processing imeta: {imeta}')
            signal_meta = signals[isignal] - metas[imeta]
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '4.txt'
output_name = input_name[:-4] + '_output.txt'
# pairs = annotate(input_name)

In [3]:
%%time

# solution input file 5.txt - works perfectly

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        if isignal % 1000 == 0:
            print(f'processing isignal: {isignal}')
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            if imeta % 1000 == 0:
                print(f'    processing imeta: {imeta}')
            signal_meta = signals[isignal] - metas[imeta]
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '5.txt'
output_name = input_name[:-4] + '_output.txt'
# pairs = annotate(input_name)