# Search a list of target features in a feature table

This is the simplest method to match features between a list of targets and a dataset: a target matches a feature when both m/z and retention time fall within defined tolerance windows.

The only trick is to index the dataset so that the search isn't too slow.

This notebook was run using the jupyter/scipy-notebook Docker image, e.g.:

docker run -v /home/shuzhao/play:/home/jovyan -p 8888:8888 jupyter/scipy-notebook

Shuzhao Li, 2021-06-23

In [1]:
# Load the list of labeled compounds from a tab-delimited text file.
# The first line is a header; name at col 0, m/z at col 1 and rt col 2.
targets_file = "labeled_targets.txt"
with open(targets_file) as infile:    # context manager closes the file handle
    targets = infile.readlines()
print(targets[:10], '\n')

# wanted holds one [name, m/z, rt] record per target
wanted = []
for line in targets[1:]:
    if not line.strip():
        continue    # tolerate blank lines, e.g. at the end of the file
    a = line.split('\t')
    wanted.append([a[0], float(a[1]), float(a[2])])

print(wanted)

['name\tm/z(M+H)\tRT\n', 'betaine_D11\t129.15410400000002\t0\n', 'citrulline_D4\t180.1275884\t0\n', 'creatine_D3\t135.0951996\t0\n', 'homocysteine_D5\t141.0734744\t0\n', 'ornithine_D6\t139.1341256\t0\n', 'phenylacetylglutamine_D5\t270.1490786\t0\n', 'taurine_D4\t130.0465648\t0\n', 'test_pos\t267.0048\t164\n'] 

[['betaine_D11', 129.15410400000002, 0.0], ['citrulline_D4', 180.1275884, 0.0], ['creatine_D3', 135.0951996, 0.0], ['homocysteine_D5', 141.0734744, 0.0], ['ornithine_D6', 139.1341256, 0.0], ['phenylacetylglutamine_D5', 270.1490786, 0.0], ['taurine_D4', 130.0465648, 0.0], ['test_pos', 267.0048, 164.0]]


One can compute the adducts if needed:

PROTON = 1.00727646677

SODIUM = 21.9820 + PROTON

H2O = 18.0106

In [2]:
feature_table = "input_data/ave_log2_modc_ae_2012.txt"

# In the feature table, m/z is at col 0 and rt at col 1 (0-based).
# The first line (presumably a header) is skipped.
with open(feature_table) as infile:    # context manager closes the file handle
    feature_lines = infile.readlines()[1:]

# Each feature is [row id, m/z, rt, original line]
features = []
for row, line in enumerate(feature_lines, start=1):
    if not line.strip():
        continue    # tolerate blank lines; row ids still follow file line order
    a = line.split('\t')
    features.append(['row' + str(row), float(a[0]), float(a[1]), line])

print (features[:3])

all_mzs = [x[1] for x in features]
min_mz, max_mz = min(all_mzs), max(all_mzs)
print("min_mz, max_mz = ", min_mz, max_mz)

# building feature index so that the search isn't too slow.
# Features are binned by the integer part of m/z; every integer bin within
# the observed m/z range exists, even when it holds no feature.
tree = {ii: [] for ii in range(int(min_mz), int(max_mz) + 1)}
for f in features:
    tree[int(f[1])].append(f)


[['row1', 85.02783, 59.0, '85.02783\t59\t17.23\t17.18\t17.44\t15.67\t15.57\t17.26\t16.94\t16.98\t16.56\n'], ['row2', 85.04717, 124.0, '85.04717\t124\t16.53\t15.8\t16.62\t14.37\t14.85\t15.63\t16.71\t15.65\t16.27\n'], ['row3', 85.06532, 68.0, '85.06532\t68\t10.8\t10.61\t11.36\t14.87\t14.89\t12.53\t14.42\t14.03\t10.81\n']]
min_mz, max_mz =  85.02783 1956.091


In [3]:
# 0.000010 is 10 ppm
PPM_tolerance = 0.000010
# use a large number to include anything in RTime, small number to be specific
RTime_tolerance = 5000       # seconds in retention time, usually a small number
                            # more lenient for diff instruments
                            # and possible diff void volume


def match2(F1, F2, ppm_tolerance=None, rt_tolerance=None):
    """Return True if two features agree in both m/z and retention time.

    F1 and F2 are (m/z, rt) pairs. The m/z error is measured relative to
    F1's m/z, so F1 should be the reference (e.g. the target).

    ppm_tolerance: maximum relative m/z error (0.000010 is 10 ppm);
        defaults to the current value of PPM_tolerance.
    rt_tolerance: maximum retention time difference;
        defaults to the current value of RTime_tolerance.
    Both comparisons are strict (a difference equal to the tolerance fails).
    """
    # Resolve defaults at call time so that editing the settings above
    # takes effect without redefining this function.
    if ppm_tolerance is None:
        ppm_tolerance = PPM_tolerance
    if rt_tolerance is None:
        rt_tolerance = RTime_tolerance
    mz_error = abs(F1[0] - F2[0]) / F1[0]
    return mz_error < ppm_tolerance and abs(F1[1] - F2[1]) < rt_tolerance

# test
match2((129.1541, 55), (129.1533, 144))

True

In [4]:
# Path of the tab-delimited report that find_targets writes when matches are found
result_file = "search_result.tsv"

def find_targets(targetList, featuresTree, result_file, match=None):
    """Search targets against an m/z-binned feature index and save the matches.

    targetList: list of targets; name at col 0, m/z at col 1 and rt col 2.
    featuresTree: dict mapping integer m/z bin -> list of features, where a
        feature is [row_number, m/z, rt, original line].
    result_file: path of the tab-delimited report; written only when at
        least one match is found.
    match: optional predicate called as match((target m/z, target rt),
        [feature m/z, feature rt]); defaults to match2.

    Returns the list of matches, each being the target's fields followed by
    the matched feature's fields.
    """
    if match is None:
        match = match2
    matched = []
    if featuresTree:
        min_bin, max_bin = min(featuresTree), max(featuresTree)
        for target in targetList:
            mz = target[1]
            # mz -0.1 to + 0.1 as search range
            low_bin, high_bin = int(mz - 0.1), int(mz + 0.1)
            # Bins are keyed by int(m/z), so compare bins rather than raw m/z:
            # a target inside the highest bin is still within range.
            if high_bin < min_bin or low_bin > max_bin:
                print("out of m/z range: ", target)
                continue
            for ii in sorted({low_bin, high_bin}):
                # .get: the search window may reach one bin beyond the index
                for F in featuresTree.get(ii, ()):
                    if match((mz, target[2]), F[1:3]):
                        matched.append(list(target) + list(F))

    if matched:
        rows = ["#Matched result"]
        for record in matched:
            # The original feature line keeps its line ending; strip it so the
            # report has exactly one line per match.
            rows.append('\t'.join(str(field).rstrip('\r\n') for field in record))
        with open(result_file, 'w') as file:
            file.write('\n'.join(rows) + '\n')
        print("Found %d matches, result written to %s" %(len(matched), result_file))
    else:
        print("No match found")

    return matched

# Search the parsed targets against the feature index; matches are also written to result_file
matched = find_targets( wanted, tree, result_file )

Found 1 matches, result written to search_result.tsv


In [5]:
# Can do more with matched here
# Each record is the target's fields followed by the matched feature's fields.
print(matched[:3])

[['test_pos', 267.0048, 164.0, 'row5819', 267.0048, 164.0, '267.0048\t164\t15.57\t14.45\t14.78\t12.09\t14.34\t14.38\t14.97\t14.97\t14.47\n']]
