In [2]:
import os
import numpy as np
from itertools import islice
import os

In [3]:
from rdkit import Chem

In [17]:
# Root folder that holds every input and output of this notebook.
datapath = "G:\\Dev\\Data"

# GNPS library: raw .mgf export, per-spectrum .ms files, binned and filtered spectra.
all_gnps_path = os.path.join(datapath, "GNPSLibraries_allSMILES.mgf")
all_gnps_fragments_path = os.path.join(datapath, "GNPS 35000 Spectrum Fragments")
binned_datapath = os.path.join(datapath, "GNPS 35000", "GNPS Python Binned")
filtered_datapath = os.path.join(datapath, "GNPS 35000", "GNPS Python Filtered")

# NIST library: raw .msp export, per-spectrum .ms files, binned and filtered spectra.
nist_path = os.path.join(datapath, "MSMS-NIST-Curated-Pos-MfKit.msp")
nist_fragments_path = os.path.join(datapath, "MSMS-NIST Fragments")
binned_nist_datapath = os.path.join(datapath, "MSMS-NIST", "Python Binned")
filtered_nist_datapath = os.path.join(datapath, "MSMS-NIST", "Python Filtered")

# Tab-separated tables mapping spectrum names to SMILES strings.
spectrum_smiles_path = os.path.join(datapath, "Spectrum-Smiles.tsv")
nist_spectrum_smiles_path = os.path.join(datapath, "NIST Spectrum-Smiles.tsv")

# Amino-acid related locations.
amino_acid_path = os.path.join(datapath, "Amino Acids")
nist_amino_acid_path = os.path.join(datapath, "NIST Amino Acids")

amino_acid_shifts_path = os.path.join(datapath, "Fragment Masses.txt")

In [5]:
# SMARTS pattern for each amino acid, keyed by the amino acid's name.
# Patterns are taken from
# http://www.daylight.com/dayhtml_tutorials/languages/smarts/smarts_examples.html
# No glutamine pattern was found there, so its entry is an empty string.
amino_acid_smarts = {
    'Alanine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH3X4])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Arginine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CH2X4][CH2X4][NHX3][CH0X3](=[NH2X3+,NHX2+0])[NH2X3])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Asparagine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CX3](=[OX1])[NX3H2])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Aspartic Acid': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CX3](=[OX1])[OH0-,OH])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Cysteine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][SX2H,SX1H0-])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Glutamic Acid': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CH2X4][CX3](=[OX1])[OH0-,OH])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Glutamine': '',
    'Glycine': '[$([$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H2][CX3](=[OX1])[OX2H,OX1-,N])]',
    'Histidine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][#6X3]1:[$([#7X3H+,#7X2H0+0]:[#6X3H]:[#7X3H]),$([#7X3H])]:[#6X3H]:[$([#7X3H+,#7X2H0+0]:[#6X3H]:[#7X3H]),$([#7X3H])]:[#6X3H]1)[CX3](=[OX1])[OX2H,OX1-,N]',
    'Isoleucine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CHX4]([CH3X4])[CH2X4][CH3X4])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Leucine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CHX4]([CH3X4])[CH3X4])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Lysine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CH2X4][CH2X4][CH2X4][NX4+,NX3+0])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Methionine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CH2X4][SX2][CH3X4])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Phenylalanine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][cX3]1[cX3H][cX3H][cX3H][cX3H][cX3H]1)[CX3](=[OX1])[OX2H,OX1-,N]',
    'Proline': '[$([NX3H,NX4H2+]),$([NX3](C)(C)(C))]1[CX4H]([CH2][CH2][CH2]1)[CX3](=[OX1])[OX2H,OX1-,N]',
    'Serine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][OX2H])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Threonine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CHX4]([CH3X4])[OX2H])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Tryptophan': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][cX3]1[cX3H][nX3H][cX3]2[cX3H][cX3H][cX3H][cX3H][cX3]12)[CX3](=[OX1])[OX2H,OX1-,N]',
    'Tyrosine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][cX3]1[cX3H][cX3H][cX3]([OHX2,OH0X1-])[cX3H][cX3H]1)[CX3](=[OX1])[OX2H,OX1-,N]',
    'Valine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CHX4]([CH3X4])[CH3X4])[CX3](=[OX1])[OX2H,OX1-,N]',
}


In [7]:
def save_spectrum_smiles(path, spectrum_smiles):
    """Write (name, SMILES) pairs to a tab-separated text file.

    Args:
        path: Destination file; it is created or overwritten.
        spectrum_smiles: Iterable of ``(mol_name, smiles)`` string pairs.
            One ``mol_name<TAB>smiles`` line is written per pair, in order.
    """
    with open(path, 'w') as out:
        out.writelines(
            "\t".join((name, smi)) + "\n" for name, smi in spectrum_smiles
        )

In [8]:
# Read the whole GNPS library file into memory, one list entry per line (newlines kept).
with open(all_gnps_path, mode='r') as f:
    gnps_content = list(f)

In [25]:
# Layout of the .ms file this cell writes for each spectrum. The values shown are
# illustrative; the code below writes N/A for formula, ionization and InChI.
# >compound Kanamycin A M+H
# >formula C18H36N4O11
# >parentmass 485.0
# >ionization [M + H]+
# >InChI InChI=1S/C18H36N4O11/c19-2-6-10(25)12(27)13(28)18(30-6)33-16-5(21)1-4(20)15(14(16)29)32-17-11(26)8(22)9(24)7(3-23)31-17/h4-18,23-29H,1-3,19-22H2/t4-,5+,6-,7-,8+,9-,10-,11-,12+,13-,14-,15+,16-,17-,18-/m1/s1
# >InChIKey N/A
# >smiles O
#
# >ms2peaks

count = 1  # Running number used in output file names and compound names
spectrum_smiles = []  # (mol_name, smiles) tuple for every spectrum that gets written

# Every record is located through its "INCHIKEY=" line; the other fields are read at
# fixed line offsets from it. NOTE(review): this presumably relies on a fixed field
# order in the input file - confirm against the .mgf export.
for index, line in enumerate(gnps_content):
    if line.startswith("INCHIKEY="):
        inchi_key_index = index
        loop_index = inchi_key_index + 1  # First line after INCHIKEY=; peak lines are read from here
        smiles_index = inchi_key_index - 1  # Line directly above; presumably "SMILES=..." (7 chars are sliced off below)
        parent_mass_index = smiles_index - 6
        if gnps_content[parent_mass_index].startswith("MSLEVEL"):
            # When that slot holds the MSLEVEL line, the parent mass is taken from three lines further up
            parent_mass_index -= 3

        starting_index = parent_mass_index - 2

        # Walk backwards to the "BEGIN IONS" line of this record.
        # NOTE(review): starting_index is never read again in this cell, so this scan
        # looks like dead code - confirm before removing.
        while not gnps_content[starting_index].startswith("BEGIN IONS"):
            starting_index -= 1

        filepath = os.path.join(all_gnps_fragments_path, "GNPS_ALL_" + str(count) + ".ms")

        # Drop the 9-character "INCHIKEY=" prefix and the final character of the line
        inchi_key = line[9:-1]

        # Only records with a non-empty InChIKey and at least one line before "END IONS" are written
        if inchi_key != "" and gnps_content[loop_index] != "END IONS\n":
            mol_name = "GNPS_ALL_" + str(count)
            spectrum_smiles.append((mol_name, gnps_content[smiles_index][7:-1]))

            with open(filepath, 'w') as f:
                f.write(">compound " + mol_name + "\n")
                f.write(">formula N/A\n")
                # [8:] drops the first 8 characters of the line (presumably a "PEPMASS=" prefix - TODO confirm);
                # the line's own newline is kept
                f.write(">parentmass " + gnps_content[parent_mass_index][8:])
                f.write(">ionization N/A\n")
                f.write(">InChI N/A\n")
                f.write(">InChIKey " + inchi_key + "\n")
                f.write(">smiles " + gnps_content[smiles_index][7:])
                f.write("\n")
                f.write(">ms2peaks\n")
                # Copy peak lines until "END IONS"; each line must split into exactly two
                # whitespace-separated tokens, otherwise the unpacking raises ValueError
                while gnps_content[loop_index] != "END IONS\n":
                    mass, intensity = gnps_content[loop_index].split()
                    if float(mass) <= 1000:  # Peaks above 1000 are dropped
                        f.write(mass + " " + intensity + "\n")
                    loop_index += 1
                count += 1  # Advanced only when a file was written, so numbering has no gaps

print("Done")

Done


In [26]:
# Persist the GNPS name -> SMILES table collected by the extraction cell.
save_spectrum_smiles(path=spectrum_smiles_path, spectrum_smiles=spectrum_smiles)

In [27]:
# Read the whole NIST library file into memory, one list entry per line (newlines kept).
with open(nist_path, mode='r') as f:
    nist_content = list(f)

In [28]:
# Layout of the .ms file this cell writes for each NIST record (values are illustrative):
# >compound Kanamycin A M+H
# >formula C18H36N4O11
# >parentmass 485.0
# >ionization [M + H]+
# >InChI InChI=1S/C18H36N4O11/c19-2-6-10(25)12(27)13(28)18(30-6)33-16-5(21)1-4(20)15(14(16)29)32-17-11(26)8(22)9(24)7(3-23)31-17/h4-18,23-29H,1-3,19-22H2/t4-,5+,6-,7-,8+,9-,10-,11-,12+,13-,14-,15+,16-,17-,18-/m1/s1
# >InChIKey N/A
# >smiles O
#
# >ms2peaks

count = 1  # Running number used in output file names and in the SMILES table
spectrum_smiles = []  # (mol_name, smiles) tuple for every record that gets written

# Every record starts at its "NAME:" line; the other fields are read at fixed line
# offsets from it. NOTE(review): this presumably relies on a fixed field order in the
# .msp export - confirm against the input file.
for index, line in enumerate(nist_content):
    if line.startswith("NAME:"):
        loop_index = index + 18  # First peak line of the record
        smiles_index = index + 7
        parent_mass_index = index + 1
        formula_index = index + 11
        ionization_index = index + 2
        inchi_key_index = index + 9

        if loop_index >= len(nist_content):
            # Truncated trailing record: there is no peak list to read.
            continue

        # Strip the trailing newline; without this a blank InChIKey field is "\n" and the
        # emptiness check below could never reject a record.
        inchi_key = nist_content[inchi_key_index][10:].strip()

        # Remove the "<LABEL>:" prefix from the formula line, as is done for every other
        # field, so the header reads ">formula C18H36N4O11".
        formula = nist_content[formula_index].split(":", 1)[-1].strip()

        # NOTE(review): the "GNPS_ALL_" prefix is also used for NIST records; it is kept
        # because later steps may rely on these file names.
        filepath = os.path.join(nist_fragments_path, "GNPS_ALL_" + str(count) + ".ms")

        # NOTE(review): the "END IONS" comparison looks like a leftover from the GNPS
        # cell; it is kept unchanged.
        if inchi_key != "" and nist_content[loop_index] != "END IONS\n":
            mol_name = "GNPS_ALL_" + str(count)
            spectrum_smiles.append((mol_name, nist_content[smiles_index][8:-1]))

            with open(filepath, 'w') as f:
                f.write(">compound " + nist_content[index][6:])
                f.write(">formula " + formula + "\n")
                f.write(">parentmass " + nist_content[parent_mass_index][13:])
                f.write(">ionization " + nist_content[ionization_index][15:])
                f.write(">InChI N/A\n")
                f.write(">InChIKey " + inchi_key + "\n")
                f.write(">smiles " + nist_content[smiles_index][8:])
                f.write("\n")
                f.write(">ms2peaks\n")
                # Copy peak lines until the blank separator line or the end of the file.
                while loop_index < len(nist_content) and nist_content[loop_index].strip():
                    mass, intensity = nist_content[loop_index].split()
                    if float(mass) <= 1000:  # Peaks above 1000 are dropped
                        f.write(mass + " " + intensity + "\n")
                    loop_index += 1
                count += 1  # Advanced only when a file was written

print("Done")

Done


In [29]:
# Persist the NIST name -> SMILES table collected by the extraction cell.
save_spectrum_smiles(path=nist_spectrum_smiles_path, spectrum_smiles=spectrum_smiles)

In [7]:
MAX_MASS = 1000  # Maximum fragment size in Daltons

def write_binned_files(path, binned_path, bin_size=1):
    """Sum the peak intensities of every spectrum file in ``path`` into mass bins.

    The first 9 lines of each input file are skipped as header. Every remaining
    line that contains a space is read as ``mass intensity``. Masses are rounded
    to the nearest integer and values above ``MAX_MASS`` are ignored. Bin ``k``
    collects rounded masses ``k*bin_size + 1`` to ``(k+1)*bin_size``; masses
    below 1 fall into the first bin.

    One output file ``"<name> Binned.txt"`` is written per input file, where
    ``<name>`` is the input file name up to its first dot. It holds one
    ``"<lowest mass of bin>  <summed intensity>"`` line per bin.

    Args:
        path: Directory containing the spectrum files.
        binned_path: Directory that receives the binned files.
        bin_size: Integer bin width in Da (default 1).

    Raises:
        ValueError: If ``bin_size`` is not between 1 and ``MAX_MASS``.
    """
    if bin_size < 1 or bin_size > MAX_MASS:
        raise ValueError("bin_size must be between 1 and MAX_MASS")

    num_bins = MAX_MASS // bin_size  # Calculate number of bins
    for file in os.listdir(path):
        filepath = os.path.join(path, file)
        if not os.path.isfile(filepath):
            continue  # Sub-directories cannot be opened as spectra

        binned_values = np.zeros(num_bins, dtype=float)
        with open(filepath, 'r') as f:
            for line in islice(f, 9, None):  # Skip the 9 header lines
                if ' ' not in line:
                    continue  # Only mass/intensity lines contain a space
                split_line = line.split()
                if len(split_line) < 2:
                    continue  # Whitespace-only or incomplete line
                mass = round(float(split_line[0]))  # Mass of fragment, to nearest Da
                if mass > MAX_MASS:
                    continue  # Fragment is too big
                # Bin labels start at mass 1, so shift by one BEFORE dividing. The previous
                # (mass // bin_size) - 1 was only equivalent for bin_size == 1.
                mass_bin = (mass - 1) // bin_size
                # Clamp: masses below 1 go to the first bin; when MAX_MASS is not a
                # multiple of bin_size the topmost masses go to the last bin.
                mass_bin = min(max(mass_bin, 0), num_bins - 1)
                binned_values[mass_bin] += float(split_line[1])  # Sum intensities for bin

        binned_filename = file.split(".")[0] + " Binned.txt"
        binned_filepath = os.path.join(binned_path, binned_filename)
        with open(binned_filepath, 'w') as f:  # Write bins and intensities to new file.
            for index, intensity in enumerate(binned_values):
                mass = index * bin_size
                f.write(str(mass + 1) + "  " + str(intensity) + "\n")


# Bin the extracted GNPS spectra first, then the NIST spectra (default bin size).
write_binned_files(path=all_gnps_fragments_path, binned_path=binned_datapath)
write_binned_files(path=nist_fragments_path, binned_path=binned_nist_datapath)
print("Done")

Done


In [9]:
def normalize_and_filter(spectrum, min_percent=0.005):
    """Scale a spectrum to its maximum and zero out weak intensities.

    Args:
        spectrum: Array of intensities.
        min_percent: Threshold as a fraction of the maximum intensity; values
            strictly below it are set to 0.

    Returns:
        A new array with the largest value scaled to 1. An empty input yields an
        empty array. A spectrum whose maximum is 0 yields all zeros, rather than
        the NaNs a division by zero would produce.
    """
    spectrum = np.asarray(spectrum)
    if spectrum.size == 0:
        return spectrum.astype(float)  # np.amax would raise on an empty array

    max_intensity = np.amax(spectrum)  # Find max intensity
    if max_intensity == 0:
        # Nothing to normalize; dividing would fill the result with NaN.
        return np.zeros(spectrum.shape, dtype=float)

    spectrum = spectrum / max_intensity  # Normalize to 0-1
    return np.where(spectrum < min_percent, 0, spectrum)  # Set values below threshold to 0.

def top_six_filter(spectrum, top_n=6, half_window=50):
    """Keep only values that rank among the ``top_n`` highest in their local window.

    For bin ``i`` the window spans indices ``max(0, i - half_window)`` up to, but
    not including, ``i + half_window``. A value is kept when fewer than ``top_n``
    values in that window are strictly greater than it, so ties are all kept.
    Every other bin is set to 0.

    Args:
        spectrum: 1-D array of intensities, one per mass bin.
        top_n: Number of ranks to keep per window (default 6).
        half_window: Number of bins looked at on each side (default 50).

    Returns:
        A float array of the same shape as ``spectrum``.
    """
    spectrum = np.asarray(spectrum)
    filtered_spectrum = np.zeros(spectrum.shape, float)
    for i in range(len(spectrum)):  # For each mass bin
        window_start = max(0, i - half_window)  # Never reach back past index 0
        window = spectrum[window_start:i + half_window]
        stronger = np.count_nonzero(window > spectrum[i])
        # A value with rank r has r - 1 stronger neighbours, so "among the top_n"
        # means fewer than top_n stronger values. The old "< 7" kept the top seven.
        if stronger < top_n:
            filtered_spectrum[i] = spectrum[i]
    return filtered_spectrum

def filter_binned(path, filtered_path):
    """Filter every binned spectrum in ``path`` and save the result to ``filtered_path``.

    Each file is loaded as a two-column float32 table; column 1 holds the
    intensities. That column is passed through ``normalize_and_filter`` and then
    ``top_six_filter``. The table is saved with format ``"%d %f"`` under the input
    name with everything from ``"Binned"`` onwards replaced by ``"Filtered.txt"``.
    Each file name is printed as it is processed.

    Args:
        path: Directory containing the binned spectrum files.
        filtered_path: Directory that receives the filtered files.
    """
    for binned_name in os.listdir(path):
        print(binned_name)
        table = np.loadtxt(os.path.join(path, binned_name), np.float32)

        # Step 1: normalize to the maximum and zero everything below the threshold.
        table[:, 1] = normalize_and_filter(table[:, 1])

        # Step 2: keep only the locally strongest values.
        table[:, 1] = top_six_filter(table[:, 1])

        stem = binned_name.split("Binned")[0]
        target = os.path.join(filtered_path, stem + "Filtered.txt")
        np.savetxt(target, table, fmt="%d %f")
        
    

In [1]:
# Filter the binned GNPS spectra first, then the binned NIST spectra.
filter_binned(path=binned_datapath, filtered_path=filtered_datapath)
filter_binned(path=binned_nist_datapath, filtered_path=filtered_nist_datapath)
print("Done")