### With brute force algorithm we see that time complexity of O(N^3) is too big to be practical. Trying new approach: sorting adducts and using binary search on difference between signal and metabolite

In [7]:
import numpy as np

orig = [20, 10, 15, 30, 25]
# ind = [1, 2, 0, 4, 3]

print(f'orig: {orig}')
print()

ind = np.argsort(orig)

for i in range(len(orig)):
    print(f'{orig[ind[i]]}')

print(f'orig: {orig}')

orig: [20, 10, 15, 30, 25]

10
15
20
25
30
orig: [20, 10, 15, 30, 25]


In [16]:
M = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047]
K = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008] 
N = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]

k_indices = np.argsort(K)  # returns indices
print(f'k_indices: {k_indices}')

k_sorted = []
for i in range(len(K)):
    k_sorted.append(K[k_indices[i]])
print(f'k_sorted: {k_sorted}')

print()
m_indices = np.argsort(M)
print(f'm_indices: {m_indices}')

m_sorted = []
for i in range(len(M)):
    m_sorted.append(M[m_indices[i]])
print(f'm_sorted: {m_sorted}')

k_indices: [2 4 7 3 9 0 1 5 8 6]
k_sorted: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]

m_indices: [0 1 5 8 7 4 2 6 3]
m_sorted: [3e-06, 1.2e-05, 4.5e-05, 4.7e-05, 6.8e-05, 7.6e-05, 8.1e-05, 9.2e-05, 9.9e-05]


In [13]:
def binary_search(array, low, high, x):
    if high >= low:
        midpt = (high + low) // 2
        
        if array[midpt] == x:
            # found: stop recursion
            return midpt

        # continue search in the half that contains element
        
        if array[midpt] > x:
            return binary_search(array, low, midpt - 1, x)
        else:
            return binary_search(array, midpt + 1, high, x)

    else:
        return -1

# array = [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
# x = 4.5e-5
array = [ 2, 3, 4, 10, 40, 50]
x = 10
x = 2
x = 50
x = 10
index = binary_search(array, 0, len(array) -1, x)
print(f'found {x} at index {index}')

found 10 at index 3


In [75]:
# added epsilon to account for float type of array elements

def binary_search(array, low, high, x):
    if high >= low:
        midpt = (high + low) // 2
        
        # eps = 1e-12
        eps = 1e-3
        # if array[midpt] == x:  # this does not work with float numbers
        if abs(array[midpt] - x) < eps:
            # found: stop recursion
            return midpt

        # continue search in the half that contains element
        
        if array[midpt] > x:
            return binary_search(array, low, midpt - 1, x)
        else:
            return binary_search(array, midpt + 1, high, x)

    else:
        return -1

# array = [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
# x = 4.5e-5
array = [ 2, 3, 1.333333, 10, 40, 50]
x = 10
x = 2
x = 50
x = 4 / 3
index = binary_search(array, 0, len(array) -1, x)
print(f'found {x} at index {index}')

found 1.3333333333333333 at index 2


In [None]:
# this process_test_sorted uses dictionary

def binary_search(array, low, high, x):
    if high >= low:
        midpt = (high + low) // 2
        
        eps = 1e-7
        if array[midpt] == x:
            # found: stop recursion
            return midpt

        # continue search in the half that contains element
        
        if array[midpt] > x:
            return binary_search(array, low, midpt - 1, x)
        else:
            return binary_search(array, midpt + 1, high, x)

    else:
        return -1

def process_test(M, K, N):
    """
    M is database of metabolites
    K is database of adducts
    N is our measured signals
    """
    list_pairs = []
    for isignal in range(len(N)):
        meta_min = None
        adduct_min = None
        # delta_min = abs(M[meta_min] + K[adduct_min] - N[isignal])
        delta_min = 1e6
        for imeta in range(len(M)):
            for iadduct in range(len(K)):
                sum_curr = M[imeta] + K[iadduct]
                if sum_curr < 0:
                    continue
                delta_curr = abs(sum_curr - N[isignal])
                if delta_curr < delta_min:
                    delta_min = delta_curr
                    meta_min = imeta
                    adduct_min = iadduct

        # here we selected pair that gives closest sum to our signal
        if meta_min is None:
            print(f'isignal: {isignal} meta_min is None')
        if adduct_min is None:
            print(f'isignal: {isignal} adduct_min is None')
        list_pairs.append((meta_min, adduct_min))

    return list_pairs


def process_test_sorted(M, K_sorted, k_indices, N):
    """
    M is database of metabolites
    K_sorted is database of adducts sorted
    k_indices is list of indices returned by np.argsort
    N is our measured signals
    """
    list_pairs = []
    adduct_dict = {}  # key: meta, value: adduct that when added to meta gives signal
    k_ind_dict = {}  # key: sorted indices, value: orig indices
    
    # find imeta in M, iadduct in sorted array
    for isignal in range(len(N)):
        for imeta in range(len(M)):
            diff_curr = N[isignal] - M[imeta]
            if diff_curr not in adduct_dict:
                adduct_dict[M[imeta]] = diff_curr

            adduct_index = binary_search(K_sorted, 0, len(K_sorted) -1, diff_curr)  # indices in sorted array
    
    list_pairs.append(imeta, adduct_dict[iadduct])
    return imeta, iadduct

In [1]:
# testing getting indices of original array from array of sorted indices

import numpy as np

orig = [20, 10, 15, 30, 25]
# ind = [1, 2, 0, 4, 3]

print(f'orig: {orig}')
print()

ind = np.argsort(orig)
print(f'indices: {ind}')

for i in range(len(orig)):
    print(f'{orig[ind[i]]}')

print(f'orig: {orig}')

orig: [20, 10, 15, 30, 25]

indices: [1 2 0 4 3]
10
15
20
25
30
orig: [20, 10, 15, 30, 25]


# Modify Binary Search
Use two arrays: *original* array and _sorted_ indices.  
Toy example.

In [1]:
import numpy as np

orig = [20, 10, 15, 30, 25]
sort_indices = np.argsort(orig)
print(f'orig: {orig}')
print(f'sort_indices: {sort_indices}')

sorted_list = []
dct = {}
for i in range(len(orig)):
    sorted_list.append(orig[sort_indices[i]])
    # dct[sort_indices[i]] = i
    dct[i] = sort_indices[i]  # this way of creating k, v gives us wrong key, value pairs
print(f'sorted_list: {sorted_list}')
print(f'dct: {dct}')

orig: [20, 10, 15, 30, 25]
sort_indices: [1 2 0 4 3]
sorted_list: [10, 15, 20, 25, 30]
dct: {0: 1, 1: 2, 2: 0, 3: 4, 4: 3}


## When we use adduct[indices_sort[i]], we effectively go from sorted array to original without using dictionary

In [45]:
import numpy as np

adduct = [20, 10, 15, 30, 25]
print(f'adduct orig: {adduct}')

# adduct_sort
# i_orig = d[i_sort]

# adduct_sort
# adduct_sort = adduct.sort()  # sorts array/list in place
# print(f'adduct_sort: {adduct_sort}')
# print(f'adduct sorted: {adduct.sort()}')
# print(f'adduct orig: {adduct}')
# print(f'sorted adduct: {adduct}')

# adduct = np.array([20, 10, 15, 30, 25])
# sorting
indices_sort = np.argsort(adduct)
print(f'indices of sorted array: {indices_sort}')

print(f'adduct sort: {[adduct[indices_sort[i]] for i in range(len(adduct))]}')

# i_orig = d[i_sort]
d = {}  # keys need to be indices of sorted array, values - corresponding indices of original array
print(f'first element of adduct: {adduct[indices_sort[0]]}')

for i_sort in range(len(indices_sort)):
    d[i_sort] = adduct[i_sort]
print(f'd using for loop: {d}')

keys = indices_sort
values = [indices_sort[i] for i in range(len(adduct))]
print(f'keys: {keys}, values: {values}')
d = dict(zip(indices_sort, values))
# d = {keys[i]: values[i] for i in range(len(keys))}
d = {indices_sort[i_orig]: adduct[indices_sort[i_orig]] for i_orig in range(len(adduct))}
print(f'dictionary of sorted indices: {d}')

adduct orig: [20, 10, 15, 30, 25]
indices of sorted array: [1 2 0 4 3]
adduct sort: [10, 15, 20, 25, 30]
first element of adduct: 10
d using for loop: {0: 20, 1: 10, 2: 15, 3: 30, 4: 25}
keys: [1 2 0 4 3], values: [1, 2, 0, 4, 3]
dictionary of sorted indices: {1: 10, 2: 15, 0: 20, 4: 25, 3: 30}


In [39]:
adduct = np.array([20, 10, 15, 30, 25])
print(f'adduct orig: {adduct}')
indices_orig = np.array([i for i in range(len(adduct))])
print(f'indices of orig array:   {indices_orig}')
# sorting
indices_sort = np.argsort(adduct)
print(f'indices of sorted array: {indices_sort}')

#
# when we use adduct[indices_sort[i]], we effectively go from sorted array to original without using dictionary
#
print(f'adduct sort: {[adduct[indices_sort[i]] for i in range(len(adduct))]}')
print()
# get list of orig indices from sorted indices
# orig array[sorted indices]
iadduct = [[i for i in range(len(indices_sort))]]
print(f'iadduct: {iadduct}')
print(f'orig array: {[adduct[iadduct] for iadduct in range(len(adduct))]}')

adduct orig: [20 10 15 30 25]
indices of orig array:   [0 1 2 3 4]
indices of sorted array: [1 2 0 4 3]
adduct sort: [10, 15, 20, 25, 30]

iadduct: [[0, 1, 2, 3, 4]]
orig array: [20, 10, 15, 30, 25]


In [44]:
K = np.array([0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008])
print(f'orig array: {K}')
K_indices = np.argsort(K)
print(f'sorted array: {[K[K_indices[i]] for i in range(len(K))]}')
print(f'K_indices: {K_indices}')

keys = K_indices
values = [i for i in range(len(K))]
ind_d = dict(zip(keys, values))
print(f'index dictionary: {ind_d}')

orig array: [ 2.0e-06  4.5e-05 -6.3e-05 -9.0e-06 -5.0e-05  4.8e-05  7.0e-05 -3.7e-05
  5.6e-05 -8.0e-06]
sorted array: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
K_indices: [2 4 7 3 9 0 1 5 8 6]
index dictionary: {2: 0, 4: 1, 7: 2, 3: 3, 9: 4, 0: 5, 1: 6, 5: 7, 8: 8, 6: 9}


In [1]:
# testing how to get original indices from sorted indices

import numpy as np

adduct = [20, 10, 15, 30, 25]
print(f'adduct orig: {adduct}')

indices_sort = np.argsort(adduct)
print(f'indices of sorted array: {indices_sort}')

#
# when we use adduct[indices_sort[i]], we effectively go from sorted array to original without using dictionary
#
adduct_sort = [adduct[indices_sort[i]] for i in range(len(adduct))]
print(f'adduct_sort: {adduct_sort}')

n = 3
print(f'Example: the adduct_sort[{n}] has value {adduct_sort[n]} has orig index {indices_sort[n]}')

n = 4
print(f'Example: the adduct_sort[{n}] with value {adduct_sort[n]} has orig index {indices_sort[n]}')

n = 2
print(f'Example: the adduct_sort[{n}] with value {adduct_sort[n]} has orig index {indices_sort[n]}')

adduct orig: [20, 10, 15, 30, 25]
indices of sorted array: [1 2 0 4 3]
adduct_sort: [10, 15, 20, 25, 30]
Example: the adduct_sort[3] has value 25 has orig index 4
Example: the adduct_sort[4] with value 30 has orig index 3
Example: the adduct_sort[2] with value 20 has orig index 0


In [31]:
def binary_search_sort(array_ind, array, low, high, x):
    if high >= low:
        midpt = (high + low) // 2
        
        if array[array_ind[midpt]] == x:
            # found: stop recursion
            print(f'array[array_ind[{midpt}]]: {array[array_ind[midpt]]}')
            return midpt

        # continue search in the half that contains element
        
        if array[array_ind[midpt]] > x:
            return binary_search_sort(array_ind, array, low, midpt - 1, x)
        else:
            return binary_search_sort(array_ind, array, midpt + 1, high, x)

    else:
        return -1

index = binary_search_sort(sort_indices, orig, 0, len(sort_indices) - 1, 15)
print(f'index = {index}, sort_indices[index] = {sort_indices[index]}, orig[sort_indices[index]] = {orig[sort_indices[index]]}')

array[array_ind[1]]: 15
index = 1, sort_indices[index] = 2, orig[sort_indices[index]] = 15


In [4]:
#    0         1          2          3          4         5         6          7         8          9
k = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
print(f'original array: {k}')
k_ind = np.argsort(k)
print(f'indices of sorted array: {k_ind}')
k_sort = [k[k_ind[i]] for i in range(len(k))]
print(f'sorted array: {k_sort}')

for i_sort in range(len(k_ind)):
    print(f'index of sorted array: {i_sort}, index of original array: {k_ind[i_sort]}')

original array: [2e-06, 4.5e-05, -6.3e-05, -9e-06, -5e-05, 4.8e-05, 7e-05, -3.7e-05, 5.6e-05, -8e-06]
indices of sorted array: [2 4 7 3 9 0 1 5 8 6]
sorted array: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
index of sorted array: 0, index of original array: 2
index of sorted array: 1, index of original array: 4
index of sorted array: 2, index of original array: 7
index of sorted array: 3, index of original array: 3
index of sorted array: 4, index of original array: 9
index of sorted array: 5, index of original array: 0
index of sorted array: 6, index of original array: 1
index of sorted array: 7, index of original array: 5
index of sorted array: 8, index of original array: 8
index of sorted array: 9, index of original array: 6


In [3]:
# playing with sorted indices --> original indices

adduct = [20, 10, 15, 30, 25]
print(f'adduct orig: {adduct}')

indices_sort = np.argsort(adduct)
print(f'indices of sorted array: {indices_sort}')

adduct_sort = [adduct[indices_sort[i]] for i in range(len(adduct))]
print(f'adduct_sort: {adduct_sort}')

for i_sort in range(len(indices_sort)):
    print(f'i_sort (new index): {i_sort}, indices_sort (orig index): {indices_sort[i_sort]}')
    i_orig = indices_sort[i_sort]
    print(f'i_orig: {i_orig}')

adduct orig: [20, 10, 15, 30, 25]
indices of sorted array: [1 2 0 4 3]
adduct_sort: [10, 15, 20, 25, 30]
i_sort (new index): 0, indices_sort (orig index): 1
i_orig: 1
i_sort (new index): 1, indices_sort (orig index): 2
i_orig: 2
i_sort (new index): 2, indices_sort (orig index): 0
i_orig: 0
i_sort (new index): 3, indices_sort (orig index): 4
i_orig: 4
i_sort (new index): 4, indices_sort (orig index): 3
i_orig: 3


### Come back to the cell below it

In [47]:
# indices checked - binary_search_sort returns indices in sorted array but no delta

def binary_search_sort(array_ind, array, low, high, x):
    if high >= low:
        midpt = (high + low) // 2
        
        if array[array_ind[midpt]] == x:
            # found: stop recursion
            print(f'array[array_ind[{midpt}]]: {array[array_ind[midpt]]}')
            return midpt

        # continue search in the half that contains element
        
        if array[array_ind[midpt]] > x:
            return binary_search_sort(array_ind, array, low, midpt - 1, x)
        else:
            return binary_search_sort(array_ind, array, midpt + 1, high, x)

    else:
        return -1

adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'values of sorted array: {adducts_sorted}')
print(f'x = adducts[4]: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, adducts[4])}')
print(f'x = adducts[0]: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, adducts[0])}')
print(f'x = adducts[1]: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, adducts[1])}')
print(f'x = adducts[2]: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, adducts[2])}')
print(f'x = adducts[3]: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, adducts[3])}')
print(f'x = adducts[5]: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, adducts[5])}')
print(f'x = adducts[6]: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, adducts[6])}')
print(f'x = adducts[7]: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, adducts[7])}')
print(f'x = adducts[8]: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, adducts[8])}')
print(f'x = adducts[9]: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, adducts[9])}')

values of sorted array: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
array[array_ind[1]]: -5e-05
x = adducts[4]: 1
array[array_ind[5]]: 2e-06
x = adducts[0]: 5
array[array_ind[6]]: 4.5e-05
x = adducts[1]: 6
array[array_ind[0]]: -6.3e-05
x = adducts[2]: 0
array[array_ind[3]]: -9e-06
x = adducts[3]: 3
array[array_ind[7]]: 4.8e-05
x = adducts[5]: 7
array[array_ind[9]]: 7e-05
x = adducts[6]: 9
array[array_ind[2]]: -3.7e-05
x = adducts[7]: 2
array[array_ind[8]]: 5.6e-05
x = adducts[8]: 8
array[array_ind[4]]: -8e-06
x = adducts[9]: 4


In [6]:
import numpy as np

def binary_search_sort(array_ind, array, low, high, x):
    if high >= low:
        midpt = (high + low) // 2
        
        delta = 1e-7
        if abs(array[array_ind[midpt]] - x) < delta:
            # found: stop recursion
            return midpt

        # continue search in the half that contains element
        
        if array[array_ind[midpt]] > x:
            return binary_search_sort(array_ind, array, low, midpt - 1, x)
        else:
            return binary_search_sort(array_ind, array, midpt + 1, high, x)

    else:
        return -1

adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'values of sorted array: {adducts_sorted}')
x = adducts[0]
x = adducts[9]
x = adducts[4]
print(f'x = {x}: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, x)}')

values of sorted array: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
x = -5e-05: 1


In [1]:
import numpy as np

def binary_search_sort(array_ind, array, low, high, x):
    if high < low:
        return -1  # element not found
    
    midpt = (high + low) // 2

    curr = array[array_ind[midpt]]

    delta = 1e-7
    if abs(curr - x) < delta:
        # found: stop recursion
        return midpt

    # continue search in the half that contains element

    if curr > x:
        return binary_search_sort(array_ind, array, low, midpt - 1, x)
    else:
        return binary_search_sort(array_ind, array, midpt + 1, high, x)

adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'values of sorted array: {adducts_sorted}')
x = adducts[0]
x = adducts[9]
#x = adducts[4]
print(f'x = {x}: {binary_search_sort(adducts_ind, adducts, 0, len(adducts) - 1, x)}')

values of sorted array: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
x = -8e-06: 4


### Modifying process_test function to use binary_search

In [15]:
def process_test_sort(metas, signals):
    signal_meta_list = []
    for isignal in range(len(signals)):
        for imeta in range(len(metas)):
            signal_meta_list.append(signals[isignal] - metas[imeta])
    print(f'signal_meta_list: {signal_meta_list}')
    return signal_meta_list

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047]
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
sig_meta_list = process_test_sort(metas, signals)
print()
print(f'len(signals): {len(signals)}')
print(f'len(metas): {len(metas)}')
print(f'len(sig_meta_list: {len(sig_meta_list)})')

signal_meta_list: [7.599999999999999e-05, 6.7e-05, -2.000000000000008e-06, -1.9999999999999998e-05, 2.9999999999999916e-06, 3.399999999999999e-05, -1.3000000000000004e-05, 1.0999999999999996e-05, 3.2e-05, 9.099999999999999e-05, 8.199999999999999e-05, 1.299999999999999e-05, -4.9999999999999996e-06, 1.799999999999999e-05, 4.899999999999999e-05, 1.9999999999999944e-06, 2.5999999999999995e-05, 4.7e-05, 8.099999999999999e-05, 7.199999999999999e-05, 2.9999999999999916e-06, -1.4999999999999999e-05, 7.999999999999991e-06, 3.899999999999999e-05, -8.000000000000005e-06, 1.5999999999999996e-05, 3.7e-05, 4.9e-05, 3.9999999999999996e-05, -2.9000000000000007e-05, -4.7e-05, -2.4000000000000007e-05, 6.999999999999994e-06, -4e-05, -1.6000000000000003e-05, 4.9999999999999996e-06, 6.1e-05, 5.2e-05, -1.7000000000000007e-05, -3.5e-05, -1.2000000000000007e-05, 1.8999999999999994e-05, -2.8000000000000003e-05, -4.000000000000002e-06, 1.7e-05, 5.2000000000000004e-05, 4.3e-05, -2.6000000000000002e-05, -4.399999

In [51]:
import numpy as np

def binary_search_sort(array_ind, array, low, high, x):
    if high < low:
        return -1  # element not found
    
    midpt = (high + low) // 2

    curr = array[array_ind[midpt]]

    delta = 1e-7
    if abs(curr - x) < delta:
        # found: stop recursion
        return midpt

    # continue search in the half that contains element

    if curr > x:
        return binary_search_sort(array_ind, array, low, midpt - 1, x)
    else:
        return binary_search_sort(array_ind, array, midpt + 1, high, x)

def process_test_sort(metas, adducts_ind, adducts, signals):
    signal_pairs = []
    for isignal in range(len(signals)):
        for imeta in range(len(metas)):
            diff = signals[isignal] - metas[imeta]
            ind_sort = binary_search_sort(adducts_ind, adducts, 0, len(adducts)-1, diff)
            iadduct = adducts[adducts_ind[ind_sort]]
            signal_pairs.append((imeta, iadduct))
    return signal_pairs

#        0         1         2         3         4         5         6         7         8
metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047]
#          0         1          2          3          4         5         6          7         8          9
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
signal_pairs = process_test_sort(metas, adducts_ind, adducts, signals)
print(f'signal pairs: {signal_pairs}')
print()
print(f'len(signals): {len(signals)}, len(metas): {len(metas)}')
print(f'len(signal pairs: {len(signal_pairs)})')

signal pairs: [(0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 7e-05), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 2e-06), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, -8e-06), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 7e-05), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 7e-05), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, -3.7e-05), (7, 7e-05), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 7e-05), (7, 2e-06), (8, 7e-05), (0, 7e-05), (1, 7e-05), (2, 7e-05), (3, 7e-05), (4, 7e-05), (5, 7e-05), (6, 7e-05), (7, 7e-05), (8, 7e-05)]

len(signals): 8, len(metas): 9
len(signal pairs: 72)


In [19]:
import numpy as np

def binary_search_sort(array_ind, array, low, high, x):
    if high < low:
        return -1  # element not found
    
    midpt = (high + low) // 2

    curr = array[array_ind[midpt]]

    delta = 1.1e-6
    if abs(curr - x) < delta:
        # found: stop recursion
        return midpt

    # continue search in the half that contains element

    if curr > x:
        return binary_search_sort(array_ind, array, low, midpt - 1, x)
    else:
        return binary_search_sort(array_ind, array, midpt + 1, high, x)

def process_test_sort(metas, adducts_ind, adducts, signals):
    signal_pairs = []
    for isignal in range(len(signals)):
        found_match = False
        for imeta in range(len(metas)):
            diff = signals[isignal] - metas[imeta]
            ind_sort = binary_search_sort(adducts_ind, adducts, 0, len(adducts)-1, diff)
            if ind_sort >= 0:
                iadduct = adducts_ind[ind_sort]
                signal_pairs.append((imeta, iadduct))
                found_match = True
                break  # next signal
        if not found_match:
            signal_pairs.append((None, None))
    return signal_pairs

#        0         1         2         3         4         5         6         7         8
metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047]
#          0         1          2          3          4         5         6          7         8          9
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
signal_pairs = process_test_sort(metas, adducts_ind, adducts, signals)
print(f'signal pairs: {signal_pairs}')

signal pairs: [(4, 0), (5, 5), (2, 0), (0, 5), (None, None), (6, 7), (7, 0), (4, 0)]


In [5]:
# adaptive binary search - process_test_sort never exits while loop

import numpy as np

def binary_search_sort(array_ind, array, low, high, delta, x):
    if high < low:
        return -1  # element not found
    
    midpt = (high + low) // 2

    curr = array[array_ind[midpt]]

    if abs(curr - x) < delta:
        # found: stop recursion
        return midpt

    # continue search in the half that contains element

    if curr > x:
        return binary_search_sort(array_ind, array, low, midpt - 1, delta, x)
    else:
        return binary_search_sort(array_ind, array, midpt + 1, high, delta, x)

def process_test_sort(metas, adducts_ind, adducts, signals):
    delta = 1.1e-6
    signal_pairs = []
    for isignal in range(len(signals)):
        found_match = False
        for imeta in range(len(metas)):
            diff = signals[isignal] - metas[imeta]
            ind_sort = binary_search_sort(adducts_ind, adducts, 0, len(adducts)-1, delta, diff)
            if ind_sort >= 0:
                iadduct = adducts_ind[ind_sort]
                signal_pairs.append((imeta, iadduct))
                found_match = True
                break  # next signal
        if not found_match:
            # signal_pairs.append((None, None))
            factor = 1
            ind_sort = -1
            while ind_sort < 0:
                # factor *= 2
                factor *= 1.01
                ind_sort = binary_search_sort(adducts_ind, adducts, 0, len(adducts)-1, factor*delta, diff)
            iadduct = adducts_ind[ind_sort]
            signal_pairs.append((imeta, iadduct))
    return signal_pairs

#        0         1         2         3         4         5         6         7         8
metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047]
#          0         1          2          3          4         5         6          7         8          9
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
signal_pairs = process_test_sort(metas, adducts_ind, adducts, signals)
print(f'signal pairs: {signal_pairs}')

signal pairs: [(4, 0), (5, 5), (2, 0), (0, 5), (8, 0), (6, 7), (7, 0), (4, 0)]


### Calculating deltas since current binary_search_sort function does not find metabolite, adduct pair for some signals

In [2]:
import numpy as np

adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
adducts_ind = np.argsort(adducts)
adducts_sorted = [adducts[adducts_ind[i]] for i in range(len(adducts))]

In [3]:
%matplotlib qt

import matplotlib.pyplot as plt

plt.figure()
plt.title(f'adducts_sorted')
plt.plot(adducts_sorted, '.')
plt.grid()

In [None]:
# closure

def b_search_wrap(a_i, a, low, high, x):
    def b_search(a_i, low, high, x):
        pass
        # if a[a_i[midpt]] == x

In [83]:
# getting deltas - file 1.txt

def process_test(metas, adducts, signals):
    """
    metas is database of metabolites
    adducts is database of adducts
    signals is our measured signals
    """
    deltas = []
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = None
        adduct_min = None
        # delta_min = abs(M[meta_min] + K[adduct_min] - N[isignal])
        delta_min = 1e6
        for imeta in range(len(metas)):
            for iadduct in range(len(adducts)):
                sum_curr = metas[imeta] + adducts[iadduct]
                if sum_curr < 0:
                    continue
                delta_curr = abs(sum_curr - signals[isignal])
                if delta_curr < delta_min:
                    delta_min = delta_curr
                    meta_min = imeta
                    adduct_min = iadduct

        deltas.append(delta_min)
        # here we selected pair that gives closest sum to our signal
        if meta_min is None:
            print(f'isignal: {isignal} meta_min is None')
        if adduct_min is None:
            print(f'isignal: {isignal} adduct_min is None')
        list_pairs.append((meta_min, adduct_min))

    return list_pairs, deltas

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
list_pairs, deltas = process_test(metas, adducts, signals)
print(f'deltas: {deltas}')

deltas: [9.999999999999972e-07, 0.0, 0.0, 9.999999999999972e-07, 2.000000000000008e-06, 0.0, 0.0, 9.999999999999972e-07]


In [84]:
# deltas for file 1.txt

list_pairs, deltas = process_test(metas, adducts, signals)
plt.figure()
# plt.hist(deltas, bins=100, histtype='step')
plt.hist(deltas, bins=10, histtype='step')
plt.grid()

In [85]:
# working on delta file name 2.txt - test 1

import numpy as np

def process_test_sort(metas, adducts_ind, adducts, signals):
    deltas = []
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = None
        adduct_min = None
        # delta_min = abs(M[meta_min] + K[adduct_min] - N[isignal])
        delta_min = 1e6
        for imeta in range(len(metas)):
            for iadduct in range(len(adducts)):
                sum_curr = metas[imeta] + adducts[iadduct]
                if sum_curr < 0:
                    continue
                delta_curr = abs(sum_curr - signals[isignal])
                if delta_curr < delta_min:
                    delta_min = delta_curr
                    meta_min = imeta
                    adduct_min = iadduct

        deltas.append(delta_min)
        # here we selected pair that gives closest sum to our signal
        if meta_min is None:
            print(f'isignal: {isignal} meta_min is None')
        if adduct_min is None:
            print(f'isignal: {isignal} adduct_min is None')
        list_pairs.append((meta_min, adduct_min))

    return list_pairs, deltas

metas = "0.000003 0.000012 0.000081 0.000099 0.000076 0.000045 0.000092 0.000068 0.000047 0.000052 0.000073 0.000019 0.000046 0.000026 0.000074 0.000085 0.000032 0.000078 0.000046 0.000079 0.000094 0.000084 0.000052 0.000064 0.000055 0.000070 0.000079 0.000027 0.000031 0.000079 0.000074 0.000008 0.000057 0.000067 0.000049 0.000038 0.000012 0.000032 0.000039 0.000064 0.000087 0.000038 0.000083 0.000068 0.000011 0.000002 0.000078 0.000047 0.000064 0.000036 0.000028 0.000022 0.000073 0.000092 0.000097 0.000098 0.000017 0.000086 0.000013 0.000084 0.000082 0.000058 0.000049 0.000012 0.000040 0.000076 0.000036 0.000072 0.000025 0.000016 0.000058 0.000071 0.000029 0.000046 0.000068 0.000030 0.000046 0.000038 0.000065 0.000080 0.000020 0.000100 0.000033 0.000003 0.000002 0.000019 0.000096 0.000085 0.000070 0.000046 0.000083 0.000098 0.000032 0.000089 0.000047 0.000060 0.000027 0.000027 0.000039 0.000038 0.000090 0.000068 0.000046 0.000060 0.000043 0.000052 0.000065 0.000052 0.000022 0.000023 0.000052 0.000030 0.000072 0.000021 0.000039 0.000072 0.000054 0.000050 0.000066 0.000081 0.000042 0.000042 0.000019 0.000009 0.000003 0.000088 0.000036 0.000092 0.000068 0.000035 0.000007 0.000018 0.000038 0.000021 0.000070 0.000016 0.000080 0.000024 0.000089 0.000066 0.000098 0.000041 0.000075 0.000019 0.000017 0.000046 0.000081 0.000008 0.000076 0.000071 0.000056 0.000062 0.000009 0.000074 0.000075 0.000061 0.000031 0.000025 0.000092 0.000096 0.000091 0.000023 0.000034 0.000086 0.000029 0.000066 0.000017 0.000090 0.000015 0.000062 0.000079 0.000077 0.000064 0.000039 0.000073 0.000004 0.000005 0.000097 0.000092 0.000015 0.000031 0.000055 0.000044 0.000094 0.000064 0.000004 0.000015 0.000025 0.000023 0.000009 0.000026 0.000047 0.000022 0.000022 0.000028 0.000011 0.000012 0.000029 0.000065 0.000006 0.000095 0.000017 0.000060 0.000022 0.000072 0.000007 0.000083 0.000066 0.000090 0.000098 0.000032 0.000042 0.000086 0.000029 0.000049 0.000089 0.000057 0.000091 0.000029 0.000095 0.000058 0.000051 0.000020 0.000036 0.000032 0.000048 0.000033 0.000087 0.000091 0.000083 0.000086 0.000098 0.000067 0.000045 0.000005 0.000011 0.000049 0.000073 0.000055 0.000098 0.000032 0.000042 0.000037 0.000027 0.000086 0.000097 0.000087 0.000021 0.000089 0.000065 0.000013 0.000054 0.000055 0.000098 0.000024 0.000058 0.000035 0.000027 0.000092 0.000002 0.000080 0.000093 0.000004 0.000070 0.000024 0.000088 0.000011 0.000036 0.000021 0.000099 0.000061 0.000075 0.000023 0.000031 0.000055 0.000008 0.000035 0.000062 0.000028 0.000061 0.000037 0.000054 0.000026 0.000042 0.000017 0.000063 0.000017 0.000082 0.000015 0.000056 0.000078 0.000038 0.000043 0.000096 0.000081 0.000032 0.000078 0.000047 0.000002 0.000084 0.000057 0.000094 0.000030 0.000051 0.000082 0.000043 0.000022 0.000081 0.000094 0.000050 0.000020 0.000043 0.000047 0.000021 0.000053 0.000067 0.000042 0.000043 0.000097 0.000030 0.000058 0.000042 0.000043 0.000015 0.000092 0.000068 0.000065 0.000096 0.000058 0.000074 0.000046 0.000067 0.000067 0.000047 0.000036 0.000088 0.000043 0.000021 0.000073 0.000092 0.000020 0.000095 0.000031 0.000026 0.000013 0.000087 0.000056 0.000055 0.000026 0.000032 0.000065 0.000015 0.000022 0.000078 0.000025 0.000007 0.000037 0.000038 0.000072 0.000047 0.000038 0.000044 0.000093 0.000003 0.000010 0.000001 0.000070 0.000005 0.000091 0.000084 0.000074 0.000082 0.000080 0.000056 0.000076 0.000038 0.000065 0.000007 0.000092 0.000052 0.000032 0.000017 0.000100 0.000057 0.000002 0.000051 0.000074 0.000017 0.000099 0.000053 0.000043 0.000045 0.000046 0.000049 0.000094 0.000059 0.000049 0.000058 0.000088 0.000059 0.000078 0.000045 0.000026 0.000088 0.000052 0.000007 0.000053 0.000066 0.000066 0.000007 0.000031 0.000031 0.000012 0.000096 0.000007 0.000099 0.000031 0.000011 0.000041 0.000024 0.000066 0.000071 0.000001 0.000037 0.000061 0.000050 0.000040 0.000022 0.000024 0.000012 0.000054 0.000012 0.000051 0.000086 0.000051 0.000037 0.000009 0.000089 0.000056 0.000084 0.000008 0.000066 0.000055 0.000075 0.000001 0.000077 0.000012 0.000030 0.000013 0.000005 0.000073 0.000003 0.000010 0.000061 0.000026 0.000072 0.000093 0.000097 0.000050 0.000034 0.000003 0.000035 0.000027 0.000052 0.000049 0.000025 0.000084 0.000021 0.000067 0.000042 0.000016 0.000050 0.000048 0.000076 0.000078 0.000067 0.000050 0.000071 0.000087".split()
metas = [float(x) for x in metas]
adducts = "-0.000038 -0.000025 0.000073 0.000068 0.000072 0.000026 -0.000060 -0.000020 -0.000006 0.000055 0.000024 0.000031 0.000024 0.000061 -0.000098 0.000097 0.000084 0.000073 0.000031 -0.000065 0.000064 -0.000074 0.000010 -0.000075 0.000018 0.000036 -0.000060 -0.000098 -0.000059 0.000058 -0.000067 0.000027 -0.000092 0.000013 -0.000092 -0.000080 -0.000066 -0.000012 0.000026 0.000096 -0.000011 0.000001 -0.000072 0.000028 0.000061 0.000013 0.000005 0.000074 -0.000035 -0.000075 0.000076 -0.000022 -0.000042 0.000005 -0.000005 -0.000016 -0.000071 0.000053 0.000021 0.000065 0.000065 0.000017 -0.000084 -0.000068 -0.000007 -0.000095 -0.000013 0.000019 -0.000008 -0.000041 0.000013 -0.000034 0.000043 0.000019 -0.000056 0.000051 0.000029 0.000098 -0.000064 0.000090 0.000038 -0.000051 -0.000043 0.000063 0.000094 -0.000044 -0.000056 -0.000009 -0.000073 0.000040 -0.000002 0.000062 0.000010 0.000084 -0.000093 0.000024 -0.000019 -0.000051 -0.000074 -0.000059 -0.000015 0.000050 0.000010 -0.000048 0.000051 0.000062 0.000047 -0.000061 -0.000058 0.000020 0.000027 0.000058 0.000019 0.000004 0.000017 0.000045 -0.000056 -0.000053 0.000043 0.000067 0.000093 0.000028 0.000070 0.000002 -0.000025 -0.000037 0.000094 -0.000097 -0.000014 -0.000040 -0.000037 0.000013 0.000019 0.000079 0.000021 -0.000061 -0.000090 0.000091 -0.000068 0.000033 -0.000029 -0.000059 0.000034 0.000087 -0.000018 0.000011 -0.000016 -0.000012 -0.000053 0.000100 -0.000022 0.000003 -0.000062 0.000014 0.000085 -0.000089 0.000021 -0.000055 -0.000019 -0.000012 -0.000055 -0.000076 -0.000033 0.000010 0.000046 -0.000013 0.000008 0.000093 -0.000055 0.000006 0.000005 0.000079 -0.000071 0.000027 0.000092 0.000051 0.000003 -0.000095 0.000000 0.000064 -0.000021 -0.000072 -0.000091 -0.000091 0.000008 -0.000026 -0.000073 -0.000097 0.000035 0.000024 0.000087 0.000062 0.000015 -0.000032 -0.000065 0.000006 0.000040 0.000093 -0.000037 0.000099 0.000048 -0.000087 0.000048 -0.000067 -0.000034 -0.000079 -0.000048 0.000070 0.000090 0.000000 -0.000071 -0.000010 -0.000088 0.000002 0.000038 0.000059 0.000064 0.000032 -0.000067 0.000067 0.000023 -0.000077 -0.000063 -0.000052 0.000086 0.000093 0.000075 -0.000060 0.000050 0.000000 0.000021 0.000069 0.000097 -0.000090 0.000019 -0.000030 0.000100 0.000072 0.000066 0.000096 0.000003 -0.000021 0.000051 0.000077 -0.000001 0.000093 -0.000010 0.000060 -0.000047 0.000012 0.000059 -0.000058 -0.000099 -0.000083 -0.000058 0.000037 0.000037 0.000061 -0.000015 0.000043 0.000043 0.000096 0.000046 -0.000075 0.000056 0.000039 -0.000065 -0.000010 0.000049 -0.000059 0.000078 -0.000008 0.000062 0.000021 -0.000072 0.000012 -0.000007 0.000028 0.000078 -0.000019 0.000037 0.000078 0.000033 -0.000075 0.000006 -0.000007 0.000019 -0.000017 -0.000027 0.000072 0.000010 0.000047 0.000068 -0.000085 0.000061 0.000037 -0.000011 -0.000070 -0.000073 -0.000001 -0.000057 0.000041 0.000096 -0.000014 -0.000024 0.000093 0.000026 0.000092 0.000083 -0.000074 -0.000068 -0.000014 -0.000062 -0.000043 -0.000092 -0.000008 0.000066 0.000065 0.000086 -0.000037 0.000040 -0.000012 -0.000026 0.000097 0.000065 0.000046 -0.000047 0.000064 0.000071 -0.000003 0.000063 0.000091 0.000094 -0.000099 -0.000056 -0.000001 0.000066 -0.000076 0.000069 -0.000090 0.000037 0.000052 -0.000054 0.000004 -0.000084 0.000018 0.000052 -0.000023 0.000094 -0.000026 0.000056 0.000068 0.000082 -0.000066 -0.000022 -0.000035 0.000067 0.000091 0.000056 0.000080 -0.000045 0.000039 -0.000016 -0.000026 -0.000085 -0.000048 0.000056 0.000022 -0.000058 0.000008 0.000098 0.000079 0.000057 0.000058 -0.000064 0.000060 0.000008 -0.000007 -0.000011 0.000093 0.000003 -0.000015 -0.000011 -0.000062 -0.000056 0.000046 -0.000062 0.000079 -0.000006 0.000093 0.000005 -0.000079 0.000025 -0.000049 0.000052 0.000045 0.000046 -0.000072 0.000007 -0.000071 0.000079 0.000073 0.000034 -0.000059 -0.000090 0.000041 -0.000006 -0.000076 -0.000062 0.000082 -0.000013 0.000050 0.000023 -0.000092 0.000057 -0.000055 0.000034 0.000073 -0.000037 -0.000039 0.000082 0.000006 -0.000095 0.000094 -0.000082 -0.000023 -0.000001 0.000057 -0.000094 -0.000093 0.000012 0.000047 -0.000032 0.000035 -0.000087 -0.000017 0.000027 -0.000040 0.000016 0.000096 0.000063 -0.000006 0.000003 0.000014 0.000027 -0.000011 0.000020 -0.000018 -0.000014 0.000081 0.000034 0.000016 0.000072 0.000065 -0.000032 0.000032 -0.000088 0.000095 -0.000095 -0.000040 -0.000090 -0.000057 -0.000028 0.000014 0.000075 -0.000079 -0.000062 -0.000012 0.000032 0.000047 -0.000098 -0.000032 -0.000047 -0.000084 -0.000028 -0.000067 -0.000076 0.000001 -0.000024 0.000029 0.000093 -0.000099 -0.000052 0.000009".split()
adducts = [float(x) for x in adducts]
signals = "0.000010 0.000022 0.000062 0.000095 0.000088 0.000072 0.000002 0.000095 0.000085 0.000047 0.000033 0.000073 0.000049 0.000065 0.000002 0.000052 0.000097 0.000065 0.000019 0.000052 0.000084 0.000030 0.000017 0.000076 0.000009 0.000089 0.000006 0.000057 0.000065 0.000058 0.000045 0.000069 0.000083 0.000051 0.000052 0.000015 0.000053 0.000030 0.000007 0.000061 0.000059 0.000029 0.000047 0.000014 0.000083 0.000056 0.000032 0.000042 0.000020 0.000035 0.000036 0.000074 0.000016 0.000038 0.000059 0.000011 0.000022 0.000037 0.000051 0.000090 0.000019 0.000062 0.000099 0.000011 0.000062 0.000088 0.000069 0.000008 0.000068 0.000076 0.000082 0.000016 0.000084 0.000067 0.000029 0.000034 0.000090 0.000066 0.000029 0.000069 0.000057 0.000006 0.000084 0.000011 0.000035 0.000093 0.000022 0.000071 0.000079 0.000011 0.000035 0.000100 0.000002 0.000078 0.000038 0.000064 0.000065 0.000026 0.000019 0.000020 0.000054 0.000020 0.000057 0.000076 0.000050 0.000004 0.000033 0.000008 0.000027 0.000046 0.000024 0.000095 0.000020 0.000008 0.000007 0.000093 0.000049 0.000055 0.000070 0.000010 0.000087 0.000075 0.000007 0.000062 0.000016 0.000083 0.000064 0.000099 0.000028 0.000004 0.000057 0.000099 0.000065 0.000075 0.000055 0.000062 0.000065 0.000030 0.000003 0.000100 0.000003 0.000052 0.000021 0.000030 0.000063 0.000019 0.000069 0.000013 0.000036 0.000035 0.000063 0.000064 0.000007 0.000074 0.000048 0.000044 0.000043 0.000070 0.000027 0.000047 0.000037 0.000020 0.000053 0.000048 0.000015 0.000097 0.000022 0.000054 0.000057 0.000054 0.000003 0.000097 0.000030 0.000008 0.000033 0.000014 0.000079 0.000082 0.000063 0.000085 0.000099 0.000057 0.000041 0.000073 0.000073 0.000077 0.000045 0.000019 0.000086 0.000044 0.000085 0.000046 0.000069 0.000045 0.000075 0.000076 0.000006 0.000078 0.000086 0.000010 0.000018 0.000093 0.000085 0.000037 0.000067 0.000023 0.000065 0.000038 0.000084 0.000059 0.000073 0.000022 0.000041 0.000001 0.000073 0.000020 0.000094 0.000044 0.000047 0.000098 0.000019 0.000022 0.000038 0.000054 0.000047 0.000032 0.000048 0.000061 0.000039 0.000026 0.000019 0.000003 0.000011 0.000049 0.000002 0.000022 0.000044 0.000045 0.000039 0.000021 0.000061 0.000064 0.000067 0.000094 0.000074 0.000061 0.000100 0.000088 0.000002 0.000066 0.000094 0.000023 0.000021 0.000056 0.000021 0.000071 0.000028 0.000022 0.000080 0.000005 0.000091 0.000098 0.000041 0.000081 0.000042 0.000045 0.000032 0.000049 0.000072 0.000012 0.000036 0.000031 0.000039 0.000009 0.000037 0.000088 0.000064 0.000097 0.000054 0.000016 0.000031 0.000019 0.000021 0.000091 0.000092 0.000098 0.000036 0.000057 0.000009 0.000046 0.000057 0.000064 0.000086 0.000099 0.000082 0.000027 0.000032 0.000017 0.000069 0.000079 0.000086 0.000021 0.000064 0.000065 0.000061 0.000071 0.000093 0.000043 0.000097 0.000010 0.000076 0.000077 0.000052 0.000052 0.000056 0.000068 0.000068 0.000027 0.000061 0.000029 0.000099 0.000098 0.000021 0.000038 0.000017 0.000061 0.000081 0.000095 0.000093 0.000043 0.000041 0.000093 0.000008 0.000067 0.000066 0.000093 0.000021 0.000042 0.000036 0.000078 0.000053 0.000029 0.000097 0.000054 0.000097 0.000051 0.000082 0.000083 0.000022 0.000071 0.000057 0.000036 0.000093 0.000008 0.000059 0.000060 0.000072 0.000070 0.000068 0.000023 0.000090 0.000040 0.000055 0.000014 0.000007 0.000078 0.000045 0.000023 0.000039 0.000006 0.000027 0.000018 0.000037 0.000026 0.000071 0.000016 0.000071 0.000058 0.000041 0.000075 0.000010 0.000046 0.000099 0.000068 0.000001 0.000019 0.000050 0.000007 0.000057 0.000017 0.000069 0.000054 0.000077 0.000050 0.000025 0.000065 0.000079 0.000034 0.000027 0.000093 0.000007 0.000084 0.000079 0.000036 0.000033 0.000043 0.000031 0.000052 0.000030 0.000080 0.000073 0.000062 0.000065 0.000028 0.000084 0.000004 0.000090 0.000063 0.000020 0.000008 0.000008 0.000063 0.000052 0.000077 0.000086 0.000066 0.000061 0.000068 0.000069 0.000070 0.000027 0.000043 0.000084 0.000070 0.000005 0.000095 0.000061 0.000091 0.000045 0.000099 0.000083 0.000024 0.000011 0.000072 0.000083 0.000092 0.000073 0.000053 0.000030 0.000093 0.000098 0.000072 0.000011 0.000023 0.000032 0.000040 0.000037 0.000069 0.000023 0.000043".split()
signals = [float(x) for x in signals]
signal_pairs, deltas = process_test_sort(metas, adducts_ind, adducts, signals)
print(f'deltas[:10]: {deltas[:10]}')

deltas[:10]: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [86]:
# deltas for file 2.txt - test 1

# signal_pairs, deltas = process_test_sort(metas, adducts_ind, adducts, signals)
plt.figure()
plt.hist(deltas, bins=100, histtype='step')
plt.grid()

In [65]:
%matplotlib qt

In [67]:
# using brute force process test (process_test_sort doesn't find indices) function - deltas on file 2.txt test 2

def process_test(metas, adducts, signals):
    """
    metas is database of metabolites
    adducts is database of adducts
    signals is our measured signals
    """
    deltas = []
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = None
        adduct_min = None
        # delta_min = abs(M[meta_min] + K[adduct_min] - N[isignal])
        delta_min = 1e6
        for imeta in range(len(metas)):
            for iadduct in range(len(adducts)):
                sum_curr = metas[imeta] + adducts[iadduct]
                if sum_curr < 0:
                    continue
                delta_curr = abs(sum_curr - signals[isignal])
                if delta_curr < delta_min:
                    delta_min = delta_curr
                    meta_min = imeta
                    adduct_min = iadduct

        deltas.append(delta_min)
        # here we selected pair that gives closest sum to our signal
        if meta_min is None:
            print(f'isignal: {isignal} meta_min is None')
        if adduct_min is None:
            print(f'isignal: {isignal} adduct_min is None')
        list_pairs.append((meta_min, adduct_min))

    return list_pairs, deltas

metas = "831.850957 792.988376 455.473233 482.981254 314.339836 881.460907 518.428904 295.952640 816.089750 354.138917 159.726925 491.064996 243.465165 827.522162 437.234965 474.113133 182.626753 372.721530 815.651161 171.193152 4.887743 494.104766 201.083466 116.111115 438.928438 7.145720 763.698730 934.474306 889.237277 915.335570 372.055254 792.441874 389.617463 157.196128 241.927215 629.289680 405.552586 77.753874 128.066205 921.826289 278.286656 946.808713 926.526830 280.916275 388.942957 803.684883 926.845496 877.396990 604.175456 669.415005 289.780032 718.305224 81.970922 32.583794 744.947814 809.694416 981.244194 856.177313 526.820444 906.811635 231.619839 929.675727 830.101319 291.619211 406.154453 715.479147 484.949942 2.724749 146.439523 707.096969 114.367247 772.141750 360.611946 224.521115 528.952803 23.598793 782.158852 227.473246 249.535460 46.282142 555.803711 151.758048 771.877460 222.533568 132.201648 489.722005 822.766233 133.154125 30.955094 843.653863 951.668133 945.823820 949.044353 432.103886 151.806944 417.234045 639.248537 954.042764 395.317565 373.729439 124.292075 581.494720 707.500157 927.815054 295.174515 829.509311 908.066650 523.113913 138.135455 791.666684 863.207026 866.981064 612.073885 250.897971 477.148885 86.447832 638.553171 618.619440 595.445357 849.547018 382.351885 434.587378 440.787214 298.742966 855.162180 712.636856 161.459871 289.305532 755.812656 726.213995 109.355705 638.664895 92.510003 820.109815 975.598317 760.714198 898.764687 479.005888 733.907425 975.144443 515.723469 784.465364 587.363109 401.140145 284.435352 561.198207 344.313682 887.815841 170.626181 327.109781 769.657963 967.193538 296.234029 75.793647 904.510156 260.261068 526.668931 998.496838 502.490801 772.852956 945.871549 794.053223 849.663906 179.283379 194.898755 654.238385 603.805257 100.328728 562.874797 126.232237 803.223512 777.978119 699.841561 959.942255 444.637041 683.531572 137.709554 434.929759 288.906420 288.168861 441.478011 450.643902 426.176904 372.704604 918.798784 975.821266 504.700287 654.717129 976.012974 458.579143 323.272326 890.310861 299.991373 779.192231 317.663376 628.831837 988.562244 115.258823 197.528716 66.009131 449.900549 643.490618 761.286538 839.822717 810.498455 361.179979 997.591386 689.727938 99.207099 827.419201 550.381721 697.246190 812.594799 482.485031 798.549336 83.513549 183.860753 238.127517 674.960066 229.047569 575.577922 736.919304 854.382076 508.132824 764.261354 747.234232 817.940330 367.281215 218.464527 602.158556 721.827727 631.576051 15.949315 95.665696 61.836649 698.570933 192.951150 857.761938 777.459751 6.174073 845.348891 906.884724 990.406533 172.423082 592.440514 907.260578 218.737355 380.379766 370.082941 582.289779 897.174666 881.475905 73.066769 532.418403 739.200230 578.433780 932.462617 34.948690 690.443035 14.166539 438.138109 13.768238 611.946131 799.126661 203.128969 969.737784 686.855152 307.235394 393.422109 148.347023 611.165232 734.517439 591.032160 983.141653 38.865661 471.470729 9.414013 169.870615 116.267633 308.359285 426.617859 671.775476 263.564501 380.699275 720.928445 988.021574 133.718713 82.312197 949.356459 941.897079 85.461979 657.164300 309.599771 199.533629 686.552558 403.181519 38.539896 563.857563 11.900311 113.963198 952.409475 716.198713 812.279026 81.839877 303.370203 702.031140 833.334338 112.866878 725.970264 975.300558 304.262726 840.481887 740.380052 483.445392 168.472227 61.509073 464.687663 940.348927 142.893601 581.741593 899.766237 371.747352 835.592026 879.571969 694.203061 979.640861 408.130568 161.709122 712.934144 219.259007 389.724775 493.549744 719.974059 575.989765 68.355052 931.537376 381.064677 107.419684 248.939687 893.621319 544.167365 7.827741 393.930340 754.142167 313.646613 213.358456 961.489112 943.136913 693.956910 624.685609 920.675182 320.538886 430.838660 359.462830 806.402896 205.034478 100.539302 838.220287 790.072805 189.655263 235.183960 392.169948 597.874793 731.068279 376.536370 532.316317 64.318688 952.190696 517.235529 418.124754 359.821432 962.119211 655.477066 823.370793 967.792863 641.952968 732.004897 336.708804 515.477691 514.445254 403.767323 216.539092 962.009195 193.207982 149.732180 944.065282 670.092375 478.328010 359.108210 619.253993 44.453042 240.358627 851.395698 608.837438 336.487424 917.684092 312.643570 381.635246 78.868840 439.043457 963.079987 494.122952 443.275911 524.713752 932.272715 144.337681 590.472822 714.089844 473.486963 340.296183 274.723430 782.417357 66.651523 186.667491 246.446776 197.792059 737.785168 367.338471 709.381871 712.170671 59.627377 744.857634 884.365569 732.760286 719.924830 534.104783 926.949595 661.617577 158.415661 584.007237 42.426515 617.123447 41.813136 87.166944 564.137117 371.337269 62.665290 695.928101 904.263029 449.185964 250.987380 406.601108 457.049917 208.635879 619.074747 333.583608 242.116213 688.718369 507.401298 692.130447 479.325068 204.614525 507.976128 534.515357 277.824711 723.946983 437.376384 206.535718 151.869012 850.785218 634.682015 316.654050 173.399569 113.412464 857.016058 946.159273 498.793040 845.688461 690.276837 862.244176 982.652780 493.561673 621.854143 384.868479 671.542826".split()
metas = [float(x) for x in metas]
adducts = "-607.576991 -177.735181 407.684542 851.233163 526.905340 -190.960202 549.363020 982.739882 630.732374 463.907388 256.004434 998.146833 494.716434 181.270791 181.799833 -191.506149 395.803370 -596.608908 250.403477 -102.555282 -751.055088 -275.074788 -700.475441 98.300278 9.056838 302.446756 763.659457 -900.061883 -634.007898 -153.547906 406.397174 819.919517 -661.336904 -863.784655 -748.222499 432.770607 -433.089628 -805.151705 -986.728549 449.657319 764.195242 -572.970146 -665.891554 -829.613617 -179.121802 -998.331115 865.814376 998.426464 -493.796421 -305.586158 -540.663482 660.851293 -228.808585 -184.776017 -77.375543 365.217564 -536.219852 -394.757431 272.002388 -188.135624 -715.835044 734.833871 -27.489026 943.050290 -243.366671 66.775799 387.386755 676.202500 -487.264561 940.211814 -11.270888 645.652839 609.393388 -386.405183 -115.629513 488.715164 318.675460 -516.588024 748.466957 306.951379 -275.325020 -791.571203 -801.567493 850.141377 -872.928312 -168.604663 294.712972 -207.194845 13.538245 -913.945556 -83.428667 -175.517316 -390.511892 -122.934133 -78.134816 -190.813897 782.216221 904.963419 277.100992 346.550070 -274.271951 -712.536492 -85.643526 -656.990620 -184.089602 -417.716240 -143.023475 -866.457649 612.691638 826.157203 -975.895566 549.102388 -803.836199 -872.586577 -245.595235 -189.153117 737.134443 223.480284 728.842042 -487.121054 -898.802809 338.701943 843.202834 977.285045 53.689571 -426.651193 -317.176592 878.170337 -109.166141 407.064894 -551.284388 545.216435 -128.992770 -727.933789 -672.384778 429.722522 753.301444 584.790502 -356.225968 26.087501 855.309224 477.486300 -139.117961 -170.984453 470.398443 -955.317948 480.681005 -208.920329 735.872238 110.450740 99.560891 -379.642675 -845.260648 672.476020 -491.455528 -507.464767 479.803406 -332.186492 242.541514 -250.940100 687.079442 -666.962004 -196.893753 -192.932558 -918.063378 484.823035 -511.198476 -79.458979 451.840746 943.960836 402.725511 -952.304401 -221.266440 -880.697637 -576.039148 -510.713769 -250.194958 -443.428273 -890.437976 600.213655 -485.320869 562.688917 346.190715 707.049928 -993.428374 80.170536 -121.449824 -385.914916 -714.641450 119.438317 -100.347618 818.562031 -871.322293 -581.648520 679.365385 -95.918396 704.100256 177.379871 -51.184377 420.814197 435.890344 -324.981615 -139.183824 609.372318 -997.186809 -914.359040 580.491974 -795.681842 -256.463029 715.398843 -172.354620 526.961215 35.244883 93.685888 130.360505 440.631382 523.459253 23.650516 -270.816066 -282.525406 -475.095297 -70.377167 713.116482 328.027211 723.614252 -580.290016 633.248306 -645.342530 -856.127786 652.866935 554.125879 743.873875 482.499746 257.998425 813.898370 -548.935828 371.555869 -599.486643 -353.347690 433.378529 -926.263173 -56.616553 -955.625947 384.659361 38.713888 -433.568505 -819.312293 929.150756 -507.334317 125.831777 -908.133544 16.151881 345.989531 -603.802488 557.622576 -943.100405 -397.332175 302.175523 -969.431238 511.831502 -775.416773 186.298646 -696.749687 -929.838788 -595.771306 -439.940949 -227.020652 -912.600555 -807.509445 -533.329796 -319.344817 87.749535 467.993351 -576.326325 81.023113 -764.711483 -316.879596 48.109344 -839.073159 415.454970 878.937748 859.515706 -493.494407 -123.933472 783.912125 591.396609 -899.323633 -771.658176 138.659791 597.546826 892.923065 -43.415902 457.114202 -437.409943 801.038321 -150.918178 417.749504 216.919422 -614.251848 -285.236966 794.840957 614.689061 292.048160 454.263836 -657.124611 293.429702 -14.052811 1.637255 -380.325808 -609.635949 933.693291 -209.196049 -831.829711 -323.066302 -692.435171 796.450947 28.200083 -398.532989 -535.800097 544.737273 -362.522424 -601.109477 -96.647066 877.327845 -561.911295 391.065893 962.408872 -478.800643 -906.923676 -420.824013 110.869294 -692.681908 196.172074 -114.266494 -382.524685 617.020916 -776.030178 183.784290 -592.691643 98.971154 623.014647 947.475744 -177.175260 890.233870 -660.929016 -830.194447 378.669239 634.805278 -223.012500 -211.851982 985.992955 362.784650 743.399210 -649.555560 -213.194096 -237.715055 58.222320 -939.493359 -887.766632 -314.939858 -352.781735 -890.736519 -789.119669 866.865401 518.270336 112.380486 859.308942 232.550769 -168.299727 623.394171 39.242338 -690.419267 237.600427 601.237532 639.704971 581.643215 169.201549 699.917659 386.940911 -219.213727 838.897073 980.855137 -451.362248 197.032783 -459.903557 220.773908 524.083114 66.713804 317.859908 1.647870 694.528503 -407.004974 -855.747541 -466.229938 53.130823 931.550773 758.975897 -455.747462 -331.706636 572.616013 679.460273 792.453091 -249.910023 -65.128387 -610.937518 890.358966 870.490029 96.586601 -54.963189 -951.558829 -317.573157 400.086360 -240.085182 -394.625109 -228.915766 776.625077 905.487659 -168.758881 752.765867 -819.121782 -130.081707 -174.041549 -323.257480 403.164112 661.064514 701.355960 -465.198324 607.990816 -94.216003 -828.096920 700.015264 -500.004094 785.066805 939.397836 751.083091 31.082562 -583.266148 -799.802643 -293.826373 -350.732148 691.950489 -138.437869 -873.624858 459.297648 587.562752 981.076748 -549.087362 -612.519241 186.542352 -303.024613 711.492143 978.301590 443.689161 479.693250 944.804991 363.218685 235.013614 183.396271 584.682103 -37.779475 990.705533 -236.977004 54.425397 -372.327147 -351.718136 -727.717097 545.921862 44.151885 -63.244620".split()
adducts = [float(x) for x in adducts]
signals = "510.822598 918.634400 147.121794 506.438709 656.369414 59.713199 868.159701 894.628582 598.505886 278.998278 912.519797 822.889575 660.892162 462.574204 275.954692 458.954369 383.506804 758.282068 187.141227 453.898638 515.483715 420.073155 236.488348 698.126399 241.681749 403.419099 922.679563 290.676541 765.991268 944.666521 162.646623 159.118088 129.412201 723.903992 403.330598 107.798703 202.982134 235.169698 530.133218 961.949988 224.414470 521.107000 489.041540 316.390566 693.659226 715.441733 375.154188 0.548382 934.151963 990.125705 671.831224 646.561485 770.651139 71.977699 895.635188 647.097828 85.076036 643.615313 229.722486 366.669714 928.266158 800.663176 160.031555 493.299035 595.420259 355.808962 30.224049 233.513602 501.580573 989.973783 422.554174 140.206106 88.797068 342.700909 159.561039 435.333712 461.033943 352.708993 427.204378 806.372344 506.722000 706.485513 362.150529 836.786864 111.911876 482.946500 688.556251 610.726028 780.027270 897.271411 490.042393 732.949784 76.758501 179.285790 279.814743 528.788436 867.076685 181.063418 955.974667 955.983770 167.040798 360.725147 985.730529 366.176742 565.363318 118.767515 770.180468 302.752321 865.590497 534.761619 577.476977 338.518100 669.536687 577.448311 394.287918 10.144794 543.187555 570.389535 310.057985 438.140172 194.595031 340.672359 600.856660 595.253662 193.509300 747.500529 448.503348 864.810704 558.805714 514.564270 374.435219 242.616147 641.764826 868.914033 950.620551 540.650916 138.561423 61.435145 958.411418 784.089572 742.747959 231.671689 156.689659 297.527147 434.349600 497.888446 172.978643 659.665460 22.772679 685.507868 446.434152 759.181993 318.644906 633.893130 630.180017 484.512168 585.296869 70.546766 581.412841 508.726957 758.963415 277.718619 193.072831 806.405690 984.773582 879.371330 65.693988 920.002926 769.423918 706.774615 80.479024 605.218436 23.601495 819.734881 280.898191 647.226530 193.404134 50.567656 132.512881 457.972034 810.905308 317.801623 531.564828 480.420159 273.060252 488.880209 975.572367 18.287831 142.507903 185.646637 485.524309 712.053173 914.364280 193.602798 445.504942 413.573161 199.874395 836.204687 101.723526 371.728606 140.508280 391.767837 786.065666 145.163550 84.445889 953.024848 456.815902 213.634382 122.210978 545.417361 179.063972 588.217283 24.430079 440.823062 222.404316 411.177895 607.545488 676.602040 739.176473 56.601184 432.463481 755.230837 1.887775 110.276289 84.851022 58.854053 842.374162 13.362748 141.226429 858.619491 185.605631 90.429500 934.586481 333.772271 979.918149 737.081237 621.021384 813.411694 511.353513 309.781501 516.920551 926.364068 276.773964 325.232948 741.494354 955.646454 344.467574 18.091287 61.829180 374.321332 651.571167 326.471533 165.121823 228.178221 492.277760 514.398618 287.886645 450.749788 832.827685 704.595010 516.538032 39.827003 579.257394 526.075600 62.610398 270.993875 835.970549 268.095138 176.924723 609.909078 822.704427 63.305260 663.880139 492.970229 292.266163 541.372414 781.878061 99.321795 311.011713 518.123188 753.966097 340.608019 771.728863 764.404473 402.632339 118.173704 291.237599 996.452491 737.991507 146.844125 136.426102 277.291920 843.303680 912.327093 967.899479 33.762814 853.461267 882.815923 227.714636 679.688363 852.793108 530.738934 368.625927 433.851238 945.263862 855.465143 133.564844 366.757448 658.211571 638.074483 323.319964 605.476828 426.507025 125.528086 585.050527 333.391032 53.129253 272.835231 44.484008 310.230799 75.854094 299.013593 184.924543 484.408702 444.282892 403.599070 174.584014 55.711938 708.257070 829.977678 817.910696 917.641642 802.488862 344.723986 667.656362 597.985404 420.744874 22.415745 631.385162 260.993785 566.993053 671.976846 165.794042 736.013568 633.635309 100.625941 985.311670 245.042165 222.347706 917.133349 0.431379 623.407964 4.835547 858.709792 101.877079 904.708473 559.080489 39.514453 775.538898 65.159603 516.724078 567.853551 243.185552 432.770856 510.663659 963.136696 503.158182 125.869335 59.599039 988.851449 5.734264 646.663697 514.341216 362.973755 794.697752 463.131719 704.217934 932.186465 787.259753 86.660762 164.915990 553.008931 643.833772 288.290365 520.153081 849.631774 982.739300 927.778704 429.582641 247.138194 506.722667 424.422829 363.897817 322.863960 586.372826 444.429428 99.103847 361.731148 593.921150 824.893341 349.315036 569.227202 717.842929 989.094261 344.006164 31.677254 683.034646 937.215587 19.979683 861.590406 901.034513 56.734512 656.883480 81.100782 698.816781 156.781223 815.995624 88.852292 421.101511 911.603664 489.465024 40.805310 427.219620 722.743391 460.792680 428.746193 550.718989 733.808017 18.269111 437.664754 422.842812 86.409048 814.757718 958.562918 619.977547 174.145405 946.075379 967.075085 678.047173 252.195631 889.119985 821.130481 111.500848 823.987831 215.247929 917.705121 564.348860 515.379053 930.505572 900.881192 338.165279 856.531675 539.565696 819.371373 283.350917 109.706376 235.816408 403.863547 720.758293 209.014635 627.134037 34.030407 370.482376 260.584157 64.640761 675.500613 174.444475 774.873133".split()
signals = [float(x) for x in signals]
signal_pairs, deltas = process_test(metas, adducts, signals)
print(f'deltas[:10]: {deltas[:10]}')

deltas[:10]: [97.47560799999991, 310.3361940000001, 461.1764119999999, 101.85949699999992, 48.07120800000007, 548.5850069999999, 259.8614950000001, 286.3303760000001, 9.792319999999904, 329.2999279999999]


In [69]:
# signal_pairs, deltas = process_test_sort(metas, adducts_ind, adducts, signals)
plt.figure()
plt.hist(deltas, bins=20, histtype='step')
plt.grid()

For process_test_sort, we already have indices array of sorted adducts as numpy arrays:  
Get metas as numpy array (in my annotate function)

In [35]:
import numpy as np

metas_str = '0.000003 0.000012 0.000081 0.000099 0.000076 0.000045 0.000092 0.000068 0.000047'
metas_floats = np.array([float(x) for x in metas_str.split()])
metas_floats

array([3.0e-06, 1.2e-05, 8.1e-05, 9.9e-05, 7.6e-05, 4.5e-05, 9.2e-05,
       6.8e-05, 4.7e-05])

In [37]:
#            0        1        2        3        4        5        6        7        8
metas_str = '0.000003 0.000012 0.000081 0.000099 0.000076 0.000045 0.000092 0.000068 0.000047'
metas_list = [float(x) for x in metas_str.split()]
metas_ind = np.argsort(metas_list)
print(f'metas_ind: {metas_ind}')

metas_ind: [0 1 5 8 7 4 2 6 3]


### Getting metas, adducts, and signals for file 2.txt by reading the file

In [79]:
def read_test(input_fname, test_number):
    with open(input_fname) as input_file:
        n_tests = int(input_file.readline().strip())
        if test_number >= n_tests:
            print(f'This file contains only {n_tests} tests. Numbering of test_number starts from 0.')
            return [], [], []
        
        # skip test_number - 1 tests
        for itest in range(test_number + 1):
            int_numbers = input_file.readline().strip()
            nmetas, nadducts, nsignals = [int(x) for x in int_numbers.split()]
            metas = [float(x) for x in input_file.readline().strip().split()]
            adducts = [float(x) for x in input_file.readline().strip().split()]
            signals = [float(x) for x in input_file.readline().strip().split()]
    return metas, adducts, signals

input_fname = '1.txt'
metas, adducts, signals = read_test(input_fname, 1)
print(f'metas: {metas}')
print(f'adducts: {adducts}')
print(f'signals: {signals}')

This file contains only 1 tests. Numbering of test_number starts from 0.
metas: []
adducts: []
signals: []


In [80]:
# using brute force process test (process_test_sort doesn't find indices) function - deltas on file 2.txt test 2

def process_test(metas, adducts, signals):
    """
    metas is database of metabolites
    adducts is database of adducts
    signals is our measured signals
    """
    deltas = []
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = None
        adduct_min = None
        # delta_min = abs(M[meta_min] + K[adduct_min] - N[isignal])
        delta_min = 1e6
        for imeta in range(len(metas)):
            for iadduct in range(len(adducts)):
                sum_curr = metas[imeta] + adducts[iadduct]
                if sum_curr < 0:
                    continue
                delta_curr = abs(sum_curr - signals[isignal])
                if delta_curr < delta_min:
                    delta_min = delta_curr
                    meta_min = imeta
                    adduct_min = iadduct

        deltas.append(delta_min)
        # here we selected pair that gives closest sum to our signal
        if meta_min is None:
            print(f'isignal: {isignal} meta_min is None')
        if adduct_min is None:
            print(f'isignal: {isignal} adduct_min is None')
        list_pairs.append((meta_min, adduct_min))

    return list_pairs, deltas

input_fname = '2.txt'
test_number = 1
metas, adducts, signals = read_test(input_fname, test_number)
signal_pairs, deltas = process_test(metas, adducts, signals)
print(f'deltas[:10]: {deltas[:10]}')

deltas[:10]: [0.004984000000092692, 0.002853000000072825, 0.007287999999988415, 0.004300000000000637, 0.003628999999932603, 0.0045429999999839765, 0.006248999999911575, 0.006723000000079082, 0.001050000000077489, 0.005426999999940563]


In [81]:
plt.figure()
plt.hist(deltas, bins=40, histtype='step')
plt.title('Delta for each signal')
plt.xlabel('Delta')
plt.grid()

Invalid limit will be ignored.
  app.exec_()


In [77]:
plt.figure()
plt.hist(signals, bins=40, histtype='step')
plt.title('Signals')
plt.xlabel('Signal')
plt.grid()

In [92]:
metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
print([x * 1e6 for x in sorted(metas)])
print(x * 1e6 for x in sorted(adducts))
print(sorted(signals))

[3e-06, 1.2e-05, 4.5e-05, 4.7e-05, 6.8e-05, 7.6e-05, 8.1e-05, 9.2e-05, 9.9e-05]
[-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
[5.2e-05, 5.5e-05, 6.4e-05, 7e-05, 7.9e-05, 7.9e-05, 8.4e-05, 9.4e-05]


In [5]:
metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]

delta = 1e6
l = 0  # leftmost index of metas
r = len(adducts) - 1
for i in range(len(signals)):
    while l == 0:
        # sum of current meta + current adduct is closer to signal than previous diff
        if abs(metas[l] + adducts[r] - signals[i]) < delta:
            diff = abs(metas[l] + adducts[r] - signals[i])
            imeta = l
            iadduct = r
        # current signal is bigger than sum of current meta and current adduct
        if metas[l] + adducts[r] < signals[i]:
            l += 1
        else:
            r -= 1
    print(f'imeta: {imeta}, iadduct: {iadduct}')

imeta: 0, iadduct: 9
imeta: 0, iadduct: 9
imeta: 0, iadduct: 9
imeta: 0, iadduct: 9
imeta: 0, iadduct: 9
imeta: 0, iadduct: 9
imeta: 0, iadduct: 9
imeta: 0, iadduct: 9


In [7]:
metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]

delta = 1e6
l = 0  # leftmost index of metas
r = len(adducts) - 1
for i in range(len(signals)):
    while l < len(metas) and r >= 0:
        # sum of current meta + current adduct is closer to signal than previous diff
        if abs(metas[l] + adducts[r] - signals[i]) < delta:
            delta = abs(metas[l] + adducts[r] - signals[i])
            imeta = l
            iadduct = r
        # current signal is bigger than sum of current meta and current adduct
        if metas[l] + adducts[r] < signals[i]:
            l += 1
        else:
            r -= 1
    print(f'imeta: {imeta}, iadduct: {iadduct}')

imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9


In [2]:
# trying to account for min delta

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]


#####################################################from brute force process_test funciton
# for isignal in range(len(N)):
#     meta_min = None
#     adduct_min = None
#     # delta_min = abs(M[meta_min] + K[adduct_min] - N[isignal])
#     delta_min = 1e6
#     for imeta in range(len(M)):
#         for iadduct in range(len(K)):
#             sum_curr = M[imeta] + K[iadduct]
#             if sum_curr < 0:
#                 continue
#             delta_curr = abs(sum_curr - N[isignal])
#             if delta_curr < delta_min:
#                 delta_min = delta_curr
#                  meta_min = imeta
#                 adduct_min = iadduct
##################################################

delta = 1e6
l = 0  # leftmost index of metas
r = len(adducts) - 1
for i in range(len(signals)):
    while l < len(metas) and r >= 0:
        # sum of current meta + current adduct is closer to signal than previous diff
        if abs(metas[l] + adducts[r] - signals[i]) < delta:
            delta = abs(metas[l] + adducts[r] - signals[i])
            imeta = l
            iadduct = r
#         if abs(metas[l] + adducts[r] - signals[i]) < delta:
#             delta_curr = abs(metas[l] + adducts[r] - signals[i])
#             if delta_curr < delta:
#                 delta = delta_curr
#             imeta = l
#             iadduct = r
        # current signal is bigger than sum of current meta and current adduct
        if metas[l] + adducts[r] < signals[i]:
            l += 1
        else:
            r -= 1
    print(f'imeta: {imeta}, iadduct: {iadduct}')

imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9
imeta: 2, iadduct: 9


In [8]:
# sorted arrays, going from sorted indices to original indices

import numpy as np

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
metas_sort = np.argsort(metas)
adducts_sort = np.argsort(adducts)

delta = 1e6
l = 0  # leftmost index of metas
r = len(adducts) - 1
for i in range(len(signals)):
    while l < len(metas_sort) and r >= 0:
        # sum of current meta + current adduct is closer to signal than previous diff
        if abs(metas_sort[l] + adducts_sort[r] - signals[i]) < delta:
            delta = abs(metas_sort[l] + adducts_sort[r] - signals[i])
            imeta = l
            iadduct = r

        # sum of current meta and current adduct is bigger than current signal
        if metas_sort[l] + adducts_sort[r] > signals[i]:
            r -= 1
        else:
            l += 1
        meta_ind = metas_sort[imeta]
        adduct_ind = adducts_sort[iadduct]
        print(f'meta_ind: {meta_ind}, adduct_ind: {adduct_ind}')

meta_ind: 0, adduct_ind: 6
meta_ind: 0, adduct_ind: 6
meta_ind: 0, adduct_ind: 5
meta_ind: 0, adduct_ind: 1
meta_ind: 0, adduct_ind: 0
meta_ind: 0, adduct_ind: 0
meta_ind: 0, adduct_ind: 0
meta_ind: 0, adduct_ind: 0
meta_ind: 0, adduct_ind: 0
meta_ind: 0, adduct_ind: 0
meta_ind: 0, adduct_ind: 0


In [5]:
# using data from Geeks for Geeks

metas = [1, 4, 5, 7]
adducts = [10, 20, 30, 40]
# m = len(ar1)
# n = len(ar2)
x = 38
l = 0  # leftmost index of metas
r = len(adducts) - 1
delta = 1e6
while l < len(metas) and r >= 0:
    # sum of current meta + current adduct is closer to signal than previous diff
    if abs(metas[l] + adducts[r] - x) < delta:
        delta = abs(metas[l] + adducts[r] - x)
        imeta = l
        iadduct = r

    # sum of current meta and current adduct is bigger than current signal
    if metas[l] + adducts[r] > x:
        r -= 1
    else:
        l += 1
print(f'The closest pair is {metas[imeta]}, {adducts[iadduct]}')


The closest pair is 7, 30


In [4]:
# sorted arrays, getting indices of sorted arrays

import numpy as np

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
metas_ind = np.argsort(metas)
adducts_ind = np.argsort(adducts)
metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]

delta = 1e6
l = 0  # leftmost index of metas
r = len(adducts) - 1
for i in range(len(signals)):
    while l < len(metas_sort) and r >= 0:
        # sum of current meta + current adduct is closer to signal than previous diff
        print(f'metas_sort[l]: {metas_sort[l]}, adducts_sort[r]: {adducts_sort[r]}, signals[i]: {signals[i]}, delta: {delta}')
        if abs(metas_sort[l] + adducts_sort[r] - signals[i]) < delta:
            delta = abs(metas_sort[l] + adducts_sort[r] - signals[i])
            imeta = l
            iadduct = r

        # sum of current meta and current adduct is bigger than current signal
        if metas_sort[l] + adducts_sort[r] > signals[i]:
            r -= 1
        else:
            l += 1
        meta_ind = l
        adduct_ind = r
        print(f'meta_ind: {meta_ind}, adduct_ind: {adduct_ind}')

metas_sort[l]: 3e-06, adducts_sort[r]: 7e-05, signals[i]: 7.9e-05, delta: 1000000.0
meta_ind: 1, adduct_ind: 9
metas_sort[l]: 1.2e-05, adducts_sort[r]: 7e-05, signals[i]: 7.9e-05, delta: 5.999999999999997e-06
meta_ind: 1, adduct_ind: 8
metas_sort[l]: 1.2e-05, adducts_sort[r]: 5.6e-05, signals[i]: 7.9e-05, delta: 2.9999999999999916e-06
meta_ind: 2, adduct_ind: 8
metas_sort[l]: 4.5e-05, adducts_sort[r]: 5.6e-05, signals[i]: 7.9e-05, delta: 2.9999999999999916e-06
meta_ind: 2, adduct_ind: 7
metas_sort[l]: 4.5e-05, adducts_sort[r]: 4.8e-05, signals[i]: 7.9e-05, delta: 2.9999999999999916e-06
meta_ind: 2, adduct_ind: 6
metas_sort[l]: 4.5e-05, adducts_sort[r]: 4.5e-05, signals[i]: 7.9e-05, delta: 2.9999999999999916e-06
meta_ind: 2, adduct_ind: 5
metas_sort[l]: 4.5e-05, adducts_sort[r]: 2e-06, signals[i]: 7.9e-05, delta: 2.9999999999999916e-06
meta_ind: 3, adduct_ind: 5
metas_sort[l]: 4.7e-05, adducts_sort[r]: 2e-06, signals[i]: 7.9e-05, delta: 2.9999999999999916e-06
meta_ind: 4, adduct_ind: 5


In [3]:
import numpy as np

def binary_search_sort(array_ind, array, low, high, x):
    if high < low:
        return -1  # element not found
    
    midpt = (high + low) // 2

    curr = array[array_ind[midpt]]

    delta = 1.1e-6
    if abs(curr - x) < delta:
        # found: stop recursion
        return midpt

    # continue search in the half that contains element

    if curr > x:
        return binary_search_sort(array_ind, array, low, midpt - 1, x)
    else:
        return binary_search_sort(array_ind, array, midpt + 1, high, x)

def binary_search(array, low, high, x):
    if high >= low:
        midpt = (high + low) // 2
        print(f'midpt: {midpt}')

        if array[midpt] == x:
            return midpt

        # continue search in the half that contains element
        
        if array[midpt] > x:
            print(f'in recursion array[midpt] > x: low = {low}, high = {high}')
            return binary_search(array, low, midpt - 1, x)
        else:
            print(f'in recursion array[midpt] < x: low = {low}, high = {high}')
            return binary_search(array, midpt + 1, high, x)

    else:
        return -1

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
metas_ind = np.argsort(metas)
adducts_ind = np.argsort(adducts)
signals_ind = np.argsort(signals)
metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'adducts_sort: {adducts_sort}, len(adducts_sort): {len(adducts_sort)}')
signals_sort = [signals[signals_ind[i]] for i in range(len(signals))]
print(f'len(metas_sort): {len(metas_sort)}')
binary_search(adducts_sort, 0, len(adducts_sort)-1, signals[0])
#print(f'low: {low}, high: {high}')

adducts_sort: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05], len(adducts_sort): 10
len(metas_sort): 9
midpt: 4
in recursion array[midpt] < x: low = 0, high = 9
midpt: 7
in recursion array[midpt] < x: low = 5, high = 9
midpt: 8
in recursion array[midpt] < x: low = 8, high = 9
midpt: 9
in recursion array[midpt] < x: low = 9, high = 9


-1

In [11]:
array = [1, 2, 4, 5, 6]
x = 3
x = 0
x = 10
bisect.bisect_left(array, x, 0, len(array))

5

In [12]:
metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
metas_ind = np.argsort(metas)
adducts_ind = np.argsort(adducts)
signals_ind = np.argsort(signals)
metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'adducts_sort: {adducts_sort}')
signals_sort = [signals[signals_ind[i]] for i in range(len(signals))]
print(f'metas_sort: {metas_sort}')
print(f'signals_sort: {signals_sort}')

adducts_sort: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
metas_sort: [3e-06, 1.2e-05, 4.5e-05, 4.7e-05, 6.8e-05, 7.6e-05, 8.1e-05, 9.2e-05, 9.9e-05]
signals_sort: [5.2e-05, 5.5e-05, 6.4e-05, 7e-05, 7.9e-05, 7.9e-05, 8.4e-05, 9.4e-05]


In [17]:
i = 0
signal_meta = signals_sort[i] - metas_sort[i]
print(f'signal_meta: {signal_meta}')
imin = bisect.bisect_left(adducts_sort, signal_meta)
print(f'imin: {imin}')
if imin == 0:
    print(f'delta = {abs(signal_meta - adducts_sort[imin])}')
elif imin == len(adducts_sort) - 1:
    print(f'delta = {abs(signal_meta - adducts_sort[imin])}')
else:
    delta = min(abs(signal_meta - adducts_sort[imin]), abs(signal_meta - adducts_sort[imin+1]))
    print(f'delta = {delta}')

signal_meta: 4.9e-05
imin: 8
delta = 7.000000000000001e-06


In [24]:
print(f'adducts_sort: {adducts_sort}')
print(f'metas_sort: {metas_sort}')
print(f'signals_sort: {signals_sort}')

i = 0
signal_meta = signals_sort[i] - metas_sort[i]
print(f'signal_meta: {signal_meta}')
imin = bisect.bisect_left(adducts_sort, signal_meta)

# imin = 9  # set by hand for test

print(f'imin: {imin}')
if imin == 0:
    print(f'delta = {abs(signal_meta - adducts_sort[imin])}')
elif imin == len(adducts_sort) - 1:
    print(f'delta = {abs(signal_meta - adducts_sort[imin-1])}')
else:
    print(f'adducts_sort[imin] = {adducts_sort[imin]}, adducts_sort[imin+1] = {adducts_sort[imin+1]}')
    print(f'delta = {min(abs(signal_meta - adducts_sort[imin]), abs(signal_meta - adducts_sort[imin+1]))}')

adducts_sort: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
metas_sort: [3e-06, 1.2e-05, 4.5e-05, 4.7e-05, 6.8e-05, 7.6e-05, 8.1e-05, 9.2e-05, 9.9e-05]
signals_sort: [5.2e-05, 5.5e-05, 6.4e-05, 7e-05, 7.9e-05, 7.9e-05, 8.4e-05, 9.4e-05]
signal_meta: 4.9e-05
imin: 8
adducts_sort[imin] = 5.6e-05, adducts_sort[imin+1] = 7e-05
delta = 7.000000000000001e-06


In [31]:
print(f'adducts_sort: {adducts_sort}')
print(f'metas_sort: {metas_sort}')
print(f'signals_sort: {signals_sort}')

i = 0
signal_meta = signals_sort[i] - metas_sort[i]
print(f'signal_meta: {signal_meta}')
ins = bisect.bisect_left(adducts_sort, signal_meta)

# ins = 9  # set by hand for test

print(f'ins: {ins}')
if ins == 0:
    print(f'delta = {abs(signal_meta - adducts_sort[ins])}')
elif ins == len(adducts_sort) - 1:
    print(f'delta = {abs(signal_meta - adducts_sort[ins-1])}')
else:
    print(f'adducts_sort[ins-1] = {adducts_sort[ins-1]}, adducts_sort[ins] = {adducts_sort[ins]}')
    print(f'delta = {min(abs(signal_meta - adducts_sort[ins-1]), abs(signal_meta - adducts_sort[ins]))}')

adducts_sort: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05]
metas_sort: [3e-06, 1.2e-05, 4.5e-05, 4.7e-05, 6.8e-05, 7.6e-05, 8.1e-05, 9.2e-05, 9.9e-05]
signals_sort: [5.2e-05, 5.5e-05, 6.4e-05, 7e-05, 7.9e-05, 7.9e-05, 8.4e-05, 9.4e-05]
signal_meta: 4.9e-05
ins: 8
adducts_sort[ins-1] = 4.8e-05, adducts_sort[ins] = 5.6e-05
delta = 9.999999999999972e-07


In [35]:
import bisect

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
metas_ind = np.argsort(metas)
adducts_ind = np.argsort(adducts)
signals_ind = np.argsort(signals)
metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'adducts_sort: {adducts_sort}, len(adducts_sort): {len(adducts_sort)}')
signals_sort = [signals[signals_ind[i]] for i in range(len(signals))]
print(f'len(metas_sort): {len(metas_sort)}')

delta_min = 1e6
meta_min = None
adduct_min = None
list_pairs = []

for isignal in range(len(signals)):
    for imeta in range(len(metas_sort)):
        signal_meta = signals[isignal] - metas_sort[imeta]
        # print(f'signal_meta: {signal_meta}')
        ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
        #print(f'ind: {ind}')
        
        if ind == 0:
            delta = abs(signal_meta - adducts_sort[ind])
        elif ind == len(adducts_sort) - 1:
            delta = abs(signal_meta - adducts_sort[ind-1])
        else:
            # print(f'adducts_sort[ind-1] = {adducts_sort[ind-1]}, adducts_sort[ind] = {adducts_sort[ind]}')
            delta = min(abs(signal_meta - adducts_sort[ind-1]), abs(signal_meta - adducts_sort[ind]))
        
        if delta < delta_min:
            delta_min = delta
            meta_min = metas_sort[imeta]
            adduct_min = adducts_sort[ind]
    list_pairs.append((meta_min, adduct_min))
    print(f'list_pairs: {list_pairs}')

adducts_sort: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05], len(adducts_sort): 10
len(metas_sort): 9
list_pairs: [(7.6e-05, 4.5e-05)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -3.7e-05)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -3.7e-05), (9.2e-05, -3.7e-05)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -3.7e-05), (9.2e-05, -3.7e-05), (9.2e-05, -3.7e-05)]


In [57]:
# from sorted indices to original indices 

import bisect

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
metas_ind = np.argsort(metas)
adducts_ind = np.argsort(adducts)
signals_ind = np.argsort(signals)
metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'adducts_sort: {adducts_sort}, len(adducts_sort): {len(adducts_sort)}')
signals_sort = [signals[signals_ind[i]] for i in range(len(signals))]
print(f'len(metas_sort): {len(metas_sort)}')
print(f'len(signals): {len(signals)}')

delta_min = 1e6
meta_min = None
adduct_min = None
list_pairs = []

for isignal in range(len(signals)):
    for imeta in range(len(metas_sort)):
        signal_meta = signals[isignal] - metas_sort[imeta]
        # print(f'signal_meta: {signal_meta}')
        ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
        # print(f'ind: {ind}')
        
        if ind == 0:
            delta = abs(signal_meta - adducts_sort[ind])
        elif ind == len(adducts_sort) - 1:
            delta = abs(signal_meta - adducts_sort[ind-1])
        else:
            # print(f'adducts_sort[ind-1] = {adducts_sort[ind-1]}, adducts_sort[ind] = {adducts_sort[ind]}')
            delta = min(abs(signal_meta - adducts_sort[ind-1]), abs(signal_meta - adducts_sort[ind]))
        
        if delta < delta_min:
            delta_min = delta
            meta_min = metas_sort[imeta]
            adduct_min = adducts_sort[ind]
    list_pairs.append((meta_min, adduct_min))
    print(f'list_pairs: {list_pairs}')
print(f'length of list pairs: {len(list_pairs)}')

adducts_sort: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05], len(adducts_sort): 10
len(metas_sort): 9
len(signals): 8
list_pairs: [(7.6e-05, 4.5e-05)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -3.7e-05)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -3.7e-05), (9.2e-05, -3.7e-05)]
list_pairs: [(7.6e-05, 4.5e-05), (9.2e-05, 2e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -8e-06), (9.2e-05, -3.7e-05), (9.2e-05, -3.7e-05), (9.2e-05, -3.7e-05)]
length of list pairs: 8


In [71]:
# from sorted indices to original indices 

import bisect

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
metas_ind = np.argsort(metas)
adducts_ind = np.argsort(adducts)
signals_ind = np.argsort(signals)
metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'adducts_sort: {adducts_sort}, len(adducts_sort): {len(adducts_sort)}')
signals_sort = [signals[signals_ind[i]] for i in range(len(signals))]
print(f'len(metas_sort): {len(metas_sort)}')
print(f'len(signals): {len(signals)}')

list_pairs = []

for isignal in range(len(signals)):
    meta_min = 0
    adduct_min = 0
    delta_min = abs(signals[isignal] - (metas_sort[meta_min] + adducts_sort[adduct_min]))
    for imeta in range(len(metas_sort)):
        signal_meta = signals[isignal] - metas_sort[imeta]
        # print(f'signal_meta: {signal_meta}')
        ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
        # print(f'ind: {ind}')
        
        adduct_curr = None
        if ind == 0:
            adduct_curr = ind
        elif ind == len(adducts_sort) - 1:
            adduct_curr = ind - 1
        else:
            if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                adduct_curr = ind - 1
            else:
                adduct_curr = ind
        delta = abs(signal_meta - adducts_sort[adduct_curr])
        
        if delta < delta_min:
            delta_min = delta
            meta_min = metas_sort[imeta]
            adduct_min = adducts_ind[adduct_curr]
            
    list_pairs.append((meta_min, adduct_min))
    print(f'current signal: {signals[isignal]}')
print(f'list_pairs: {list_pairs}')
print(f'length of list pairs: {len(list_pairs)}')

adducts_sort: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05], len(adducts_sort): 10
len(metas_sort): 9
len(signals): 8
current signal: 7.9e-05
current signal: 9.4e-05
current signal: 8.4e-05
current signal: 5.2e-05
current signal: 6.4e-05
current signal: 5.5e-05
current signal: 7e-05
current signal: 7.9e-05
list_pairs: [(7.6e-05, 0), (9.2e-05, 0), (9.2e-05, 9), (3e-06, 5), (9.9e-05, 7), (9.2e-05, 7), (6.8e-05, 0), (7.6e-05, 0)]
length of list pairs: 8


In [73]:
# from sorted indices to original indices 

import bisect

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
metas_ind = np.argsort(metas)
adducts_ind = np.argsort(adducts)
signals_ind = np.argsort(signals)
metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
print(f'adducts_sort: {adducts_sort}, len(adducts_sort): {len(adducts_sort)}')
signals_sort = [signals[signals_ind[i]] for i in range(len(signals))]
print(f'len(metas_sort): {len(metas_sort)}')
print(f'len(signals): {len(signals)}')

list_pairs = []

for isignal in range(len(signals)):
    meta_min = 0
    adduct_min = 0
    delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
    for imeta in range(len(metas)):
        signal_meta = signals[isignal] - metas[imeta]
        # print(f'signal_meta: {signal_meta}')
        ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
        # print(f'ind: {ind}')
        
        adduct_curr = None
        if ind == 0:
            adduct_curr = ind
        elif ind == len(adducts_sort) - 1:
            adduct_curr = ind - 1
        else:
            if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                adduct_curr = ind - 1
            else:
                adduct_curr = ind
        delta = abs(signal_meta - adducts_sort[adduct_curr])
        
        if delta < delta_min:
            delta_min = delta
            meta_min = imeta
            adduct_min = adducts_ind[adduct_curr]
            
    list_pairs.append((meta_min, adduct_min))
    print(f'current signal: {signals[isignal]}')
print(f'list_pairs: {list_pairs}')
print(f'length of list pairs: {len(list_pairs)}')

adducts_sort: [-6.3e-05, -5e-05, -3.7e-05, -9e-06, -8e-06, 2e-06, 4.5e-05, 4.8e-05, 5.6e-05, 7e-05], len(adducts_sort): 10
len(metas_sort): 9
len(signals): 8
current signal: 7.9e-05
current signal: 9.4e-05
current signal: 8.4e-05
current signal: 5.2e-05
current signal: 6.4e-05
current signal: 5.5e-05
current signal: 7e-05
current signal: 7.9e-05
list_pairs: [(4, 0), (6, 0), (6, 9), (0, 5), (3, 7), (6, 7), (7, 0), (4, 0)]
length of list pairs: 8


In [3]:
# from sorted indices to original indices - works

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            signal_meta = signals[isignal] - metas[imeta]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
            # print(f'ind: {ind}')

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

metas = [0.000003, 0.000012, 0.000081, 0.000099, 0.000076, 0.000045, 0.000092, 0.000068, 0.000047] 
adducts = [0.000002, 0.000045, -0.000063, -0.000009, -0.000050, 0.000048, 0.000070, -0.000037, 0.000056, -0.000008]
signals = [0.000079, 0.000094, 0.000084, 0.000052, 0.000064, 0.000055, 0.000070, 0.000079]
adducts_ind = np.argsort(adducts)
adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
print(f'list_pairs: {list_pairs}')
print(f'length of list pairs: {len(list_pairs)}')

list_pairs: [(4, 0), (6, 0), (6, 9), (0, 5), (3, 7), (6, 7), (7, 0), (4, 0)]
length of list pairs: 8


In [1]:
# putting the entire solution together (no output file yet)

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            signal_meta = signals[isignal] - metas[imeta]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
            # print(f'ind: {ind}')

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
    
    return list_pairs

input_name = '1.txt'
pairs = annotate(input_name)
print(f'pairs: {pairs}')

pairs: [(4, 0), (6, 0), (6, 9), (0, 5), (3, 7), (6, 7), (7, 0), (4, 0)]


In [2]:
# putting the entire solution together input file 1.txt

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            signal_meta = signals[isignal] - metas[imeta]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
            # print(f'ind: {ind}')

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '1.txt'
output_name = '1_output_optimized.txt'
pairs = annotate(input_name)
# print(f'pairs: {pairs}')

created output file 1_output_optimized.txt


In [1]:
%timeit

# putting the entire solution together input file 2.txt

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            signal_meta = signals[isignal] - metas[imeta]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
            # print(f'ind: {ind}')

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '2.txt'
output_name = '2_output_optimized.txt'
pairs = annotate(input_name)

created output file 2_output_optimized.txt


In [1]:
%timeit

# putting the entire solution together input file 3.txt

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        if isignal % 1000 == 0:
            print(f'processing isignal: {isignal}')
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            signal_meta = signals[isignal] - metas[imeta]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
            # print(f'ind: {ind}')

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '3.txt'
output_name = '3_output.txt'
# pairs = annotate(input_name)

In [2]:
%timeit

# the entire solution together input file 4.txt

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        if isignal % 10 == 0:
            print(f'processing isignal: {isignal}')
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            if imeta % 100000 == 0:
                print(f'    processing imeta: {imeta}')
            signal_meta = signals[isignal] - metas[imeta]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
            # print(f'ind: {ind}')

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '4.txt'
output_name = input_name[:-4] + '_output.txt'
# pairs = annotate(input_name)

In [3]:
%timeit

# the entire solution together input file 5.txt - works perfectly

import numpy as np
import bisect

def find_pairs(metas, adducts_sort, adducts_ind, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        if isignal % 1000 == 0:
            print(f'processing isignal: {isignal}')
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas[meta_min] + adducts_sort[adduct_min]))
        for imeta in range(len(metas)):
            if imeta % 1000 == 0:
                print(f'    processing imeta: {imeta}')
            signal_meta = signals[isignal] - metas[imeta]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(adducts_sort, signal_meta, 0, len(adducts_sort)-1)
            # print(f'ind: {ind}')

            adduct_curr = None
            if ind == 0:
                adduct_curr = ind
            elif ind == len(adducts_sort) - 1:
                adduct_curr = ind - 1
            else:
                if abs(signal_meta - adducts_sort[ind-1]) < abs(signal_meta - adducts_sort[ind]):
                    adduct_curr = ind - 1
                else:
                    adduct_curr = ind
            delta = abs(signal_meta - adducts_sort[adduct_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = imeta
                adduct_min = adducts_ind[adduct_curr]

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        adducts_ind = np.argsort(adducts)
        adducts_sort = [adducts[adducts_ind[i]] for i in range(len(adducts))]
        
        list_pairs = find_pairs(metas, adducts_sort, adducts_ind, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '5.txt'
output_name = input_name[:-4] + '_output.txt'
# pairs = annotate(input_name)

In [5]:
s = '1987.txt'
print(s[:-4])
output_fname = s[:-4] + '_output.txt'
print(output_fname)

1987
1987_output.txt


### Sorting metas and searching signal-adduct difference in metas

In [4]:
%timeit

# gives wrong answer for some signals but works quickly
# sorting metas and searching for signal adducts difference in sorted metas
# the entire solution together input file 4.txt

import numpy as np
import bisect

def find_pairs(metas_sort, metas_ind, adducts, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        if isignal % 10 == 0:
            print(f'processing isignal: {isignal}')
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas_sort[meta_min] + adducts[adduct_min]))
        for iadduct in range(len(adducts)):
            # if imeta % 100000 == 0:
            if iadduct % 100 == 0:
                print(f'    processing iadduct: {iadduct}')
            signal_adduct = signals[isignal] - adducts[iadduct]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(metas_sort, signal_adduct, 0, len(metas_sort)-1)
            # print(f'ind: {ind}')

            meta_curr = None
            if ind == 0:
                meta_curr = ind
            elif ind == len(metas_sort) - 1:
                meta_curr = ind - 1
            else:
                if abs(signal_adduct - metas_sort[ind-1]) < abs(signal_adduct - metas_sort[ind]):
                    meta_curr = ind - 1
                else:
                    meta_curr = ind
            delta = abs(signal_adduct - metas_sort[meta_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = metas_ind[meta_curr]
                adduct_min = iadduct

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        metas_ind = np.argsort(metas)
        metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
        
        list_pairs = find_pairs(metas_sort, metas_ind, adducts, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '4.txt'
output_name = input_name[:-4] + '_output_sorted metas.txt'
# pairs = annotate(input_name)

In [5]:
import numpy as np
import bisect

def find_pairs_sorted_metas(metas_sort, metas_ind, adducts, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        if isignal % 10 == 0:
            print(f'processing isignal: {isignal}')
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas_sort[meta_min] + adducts[adduct_min]))
        for iadduct in range(len(adducts)):
            # if imeta % 100000 == 0:
            if iadduct % 100 == 0:
                print(f'    processing iadduct: {iadduct}')
            signal_adduct = signals[isignal] - adducts[iadduct]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(metas_sort, signal_adduct, 0, len(metas_sort)-1)

            meta_curr = None
            if ind == 0:
                meta_curr = ind
            elif ind == len(metas_sort) - 1:
                meta_curr = ind - 1
            else:
                if abs(signal_adduct - metas_sort[ind-1]) < abs(signal_adduct - metas_sort[ind]):
                    meta_curr = ind - 1
                else:
                    meta_curr = ind
            delta = abs(signal_adduct - metas_sort[meta_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = metas_ind[meta_curr]
                adduct_min = iadduct

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        metas_ind = np.argsort(metas)
        metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
        
        list_pairs = find_pairs_sorted_metas(metas_sort, metas_ind, adducts, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '1.txt'
output_name = input_name[:-4] + '_output_sorted metas.txt'
pairs = annotate(input_name)

processing isignal: 0
    processing iadduct: 0
    processing iadduct: 0
    processing iadduct: 0
    processing iadduct: 0
    processing iadduct: 0
    processing iadduct: 0
    processing iadduct: 0
    processing iadduct: 0
created output file 1_output_sorted metas.txt


In [5]:
import numpy as np
import bisect

def find_pairs_sorted_metas(metas_sort, metas_ind, adducts, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        if isignal % 10 == 0:
            print(f'processing isignal: {isignal}')
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas_sort[meta_min] + adducts[adduct_min]))
        for iadduct in range(len(adducts)):
            # if imeta % 100000 == 0:
            if iadduct % 100 == 0:
                print(f'    processing iadduct: {iadduct}')
            signal_adduct = signals[isignal] - adducts[iadduct]
            # print(f'signal_meta: {signal_meta}')
            ind = bisect.bisect_left(metas_sort, signal_adduct, 0, len(metas_sort)-1)

            meta_curr = None
            if ind == 0:
                meta_curr = ind
            elif ind == len(metas_sort) - 1:
                meta_curr = ind - 1
            else:
                if abs(signal_adduct - metas_sort[ind-1]) < abs(signal_adduct - metas_sort[ind]):
                    meta_curr = ind - 1
                else:
                    meta_curr = ind
            delta = abs(signal_adduct - metas_sort[meta_curr])

            if delta < delta_min:
                delta_min = delta
                meta_min = metas_ind[meta_curr]
                adduct_min = iadduct

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        metas_ind = np.argsort(metas)
        metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
        
        list_pairs = find_pairs_sorted_metas(metas_sort, metas_ind, adducts, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '2.txt'
output_name = input_name[:-4] + '_output_sorted metas.txt'
# pairs = annotate(input_name)

In [None]:
import numpy as np
import bisect

def find_pairs_sorted_metas(metas_sort, metas_ind, adducts, signals):
    list_pairs = []
    for isignal in range(len(signals)):
        if isignal % 10 == 0:
            print(f'processing isignal: {isignal}')
        meta_min = 0
        adduct_min = 0
        delta_min = abs(signals[isignal] - (metas_sort[meta_min] + adducts[adduct_min]))
        for iadduct in range(len(adducts)):
            # if imeta % 100000 == 0:
            if iadduct % 100 == 0:
                print(f'    processing iadduct: {iadduct}')
            signal_adduct = signals[isignal] - adducts[iadduct]
            ind = bisect.bisect_left(metas_sort, signal_adduct, 0, len(metas_sort)-1)

            meta_curr = None
            if ind == 0:
                meta_curr = ind
            elif ind == len(metas_sort) - 1:
                meta_curr = ind - 1
            else:
                if abs(signal_adduct - metas_sort[ind-1]) < abs(signal_adduct - metas_sort[ind]):
                    meta_curr = ind - 1
                else:
                    meta_curr = ind
            delta = abs(signal_adduct - metas_sort[meta_curr])
            print(f'meta_curr: {meta_curr}')

            if delta < delta_min:
                delta_min = delta
                meta_min = metas_ind[meta_curr]
                adduct_min = iadduct

        list_pairs.append((meta_min, adduct_min))
    
    return list_pairs

def annotate(input_name):
    input_file = open(input_name)
    output_file = open(output_name, 'w')
    
    n_tests = int(input_file.readline().strip())
    for itest in range(n_tests):

        int_numbers = input_file.readline().strip()
        nmeta, nadduct, nsignals = [int(x) for x in int_numbers.split()]

        metas = [float(x) for x in input_file.readline().strip().split()]
        adducts = [float(x) for x in input_file.readline().strip().split()]
        signals = [float(x) for x in input_file.readline().strip().split()]
        
        metas_ind = np.argsort(metas)
        metas_sort = [metas[metas_ind[i]] for i in range(len(metas))]
        
        list_pairs = find_pairs_sorted_metas(metas_sort, metas_ind, adducts, signals)
        
        for x in list_pairs:
            output_file.write(f'{x[0] + 1} {x[1] + 1}\n')
    
    print(f'created output file {output_name}')
    input_file.close()
    output_file.close()

input_name = '4.txt'
output_name = input_name[:-4] + '_output_sorted metas.txt'
# pairs = annotate(input_name)