In [1]:
import os
import pandas as pd

In [2]:
# Root folder that holds every input and output file used by this notebook.
datapath = "G:\\Dev\\Data"

# Full GNPS library export, plus the two derived files written by later cells:
# the MIBiG-linked subset ("filtered") and its cleaned-up version ("processed").
all_gnps_path = os.path.join(datapath, "ALL_GNPS_20181012.mgf")
all_gnps_filtered_path = os.path.join(datapath, "ALL_GNPS_20181012_filtered.mgf")
all_gnps_processed_path = os.path.join(datapath, "ALL_GNPS_20181012_processed.mgf")

# CSV read with pandas further down; its "gnps_id" column is used as the index.
mibig_gnps_linked_file_path = os.path.join(datapath, "mibig_gnps_links_q3_loose.csv")

# Sub-folders of the data root.
# NOTE(review): presumably these hold per-spectrum .ms files - confirm.
gnps_dir = os.path.join(datapath, "GNPS For Family")
gnps_5770_dir = os.path.join(datapath, "GNPS")

Convert the .mgf library into individual .ms files (one file per spectrum).
A sample .ms file looks as follows:

```
>compound Kanamycin A M+H
>formula C18H36N4O11
>parentmass 485.0
>ionization [M + H]+
>InChI InChI=1S/C18H36N4O11/c19-2-6-10(25)12(27)13(28)18(30-6)33-16-5(21)1-4(20)15(14(16)29)32-17-11(26)8(22)9(24)7(3-23)31-17/h4-18,23-29H,1-3,19-22H2/t4-,5+,6-,7-,8+,9-,10-,11-,12+,13-,14-,15+,16-,17-,18-/m1/s1
>InChIKey N/A
>smiles O

>ms2peaks
```

In [None]:
# Names (without extension) of the files already present in gnps_5770_dir.
# Spectra whose ID matches one of these names are skipped below.
gnps_5770_filenames = []

for file in os.listdir(gnps_5770_dir):
    gnps_5770_filenames.append(os.path.splitext(file)[0])

# Set copy of the list above so the per-record membership test is O(1).
already_converted = set(gnps_5770_filenames)

with open(all_gnps_path, 'r') as f:
    content = f.readlines()

# Every field is located by a fixed line offset from the record's SPECTRUMID
# line, so this only works for files with exactly that header layout.
# enumerate() supplies the position of the line being visited; looking it up
# with content.index(line) would rescan the whole file for every record and
# would return the first record if two lines were identical.
for spectrum_id_index, line in enumerate(content):
    if not line.startswith("SPECTRUMID="):
        continue

    filename = line[11:-1]  # text after "SPECTRUMID=", newline dropped
    if filename in already_converted:
        continue

    print(filename)
    # NOTE(review): the file is written into datapath itself, not into
    # gnps_dir or gnps_5770_dir - confirm that this is the intended folder.
    filepath = os.path.join(datapath, filename + ".ms")

    loop_index = spectrum_id_index + 2      # first peak line (SPECTRUMID + 2)
    inchi_index = spectrum_id_index - 5
    smiles_index = spectrum_id_index - 6
    name_index = spectrum_id_index - 9
    ionmode_index = spectrum_id_index - 11
    pepmass_index = spectrum_id_index - 17

    # Skip records without any peak or without a usable SMILES value.
    smiles_value = content[smiles_index][7:-1]
    if content[loop_index] == "END IONS\n" or smiles_value in ("N/A", "", " "):
        continue

    # Slice from column 7, as the original did: one character past "INCHI=".
    # NOTE(review): presumably the value is wrapped in double quotes, which is
    # also why the last two characters are cut off further down - confirm.
    inchi_value = content[inchi_index][7:]

    with open(filepath, 'w') as ms_file:
        ms_file.write(">compound " + content[name_index][5:])
        if inchi_value.startswith("InChI"):
            # The molecular formula is the second "/"-separated InChI layer.
            ms_file.write(">formula " + inchi_value.split("/")[1] + "\n")
        else:
            ms_file.write(">formula N/A\n")
        ms_file.write(">parentmass " + content[pepmass_index][8:])
        ms_file.write(">ionization " + content[ionmode_index][8:])
        ms_file.write(">InChI " + content[inchi_index][7:-2] + "\n")
        ms_file.write(">InChIKey N/A\n")
        # Some SMILES values carry one extra leading space; drop it.
        if content[smiles_index][7:].startswith(" "):
            ms_file.write(">smiles " + content[smiles_index][8:])
        else:
            ms_file.write(">smiles " + content[smiles_index][7:])
        ms_file.write("\n")
        ms_file.write(">ms2peaks\n")
        # Peak lines are "mass<TAB>intensity"; the .ms file uses a space instead.
        while content[loop_index] != "END IONS\n":
            mass, intensity = content[loop_index].split("\t")
            ms_file.write(mass + " " + intensity)
            loop_index += 1

Keep only the spectra that are linked to MIBiG (350 of them). These spectra will then be run through the SIRIUS CSI:FingerID tool.

In [None]:
# Spectrum IDs listed in the "gnps_id" column of the MIBiG link table.
# Only records carrying one of these IDs are kept.
mibig_gnps_df = pd.read_csv(mibig_gnps_linked_file_path, sep=",")
mibig_gnps_df = mibig_gnps_df.set_index("gnps_id")
gnps_ids = set(mibig_gnps_df.index)

with open(all_gnps_path, 'r') as f:
    content = f.readlines()

# Inclusive (first_line, last_line) index ranges of the records to drop.
to_be_deleted = []
last_index = len(content) - 1

for index, line in enumerate(content):
    if not line.startswith("SPECTRUMID="):
        continue

    spectrum_id = line.split("=")[1][:-1]
    if spectrum_id in gnps_ids:
        continue

    # Walk back to the record's own BEGIN IONS line. Starting at the
    # SPECTRUMID line (instead of a fixed 18 lines above it) can neither
    # wrap around to the end of the list nor land in the previous record.
    starting_index = index
    while starting_index > 0 and not content[starting_index].startswith("BEGIN IONS"):
        starting_index -= 1

    # Walk forward to END IONS without running past the end of the file.
    loop_index = index
    while loop_index < last_index and content[loop_index].rstrip("\n") != "END IONS":
        loop_index += 1

    # Also drop the blank separator after END IONS, but only if it exists:
    # the file may stop at END IONS, or the next record may follow directly.
    if loop_index < last_index and content[loop_index + 1].strip() == "":
        loop_index += 1

    to_be_deleted.append((starting_index, loop_index))

# Expand the ranges into a set of line indexes and keep everything else.
deleted_line_indexes = set()
for start, end in to_be_deleted:
    deleted_line_indexes.update(range(start, end + 1))

filtered_content = [
    line for line_index, line in enumerate(content)
    if line_index not in deleted_line_indexes
]

with open(all_gnps_filtered_path, 'w') as f:
    f.writelines(filtered_content)

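Filter the noise out of those 350 spectra: drop peaks with m/z above 1000 and peaks whose intensity is below 0.5% of the most intense remaining peak.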

In [None]:
# Reset the accumulator name; the rest of this cell does not append to it.
to_be_deleted = []

# Read the MIBiG-linked subset back in, one list entry per line with the
# newline terminators kept.
with open(all_gnps_filtered_path, 'r') as f:
    content = list(f)

def filter_and_normalise(content, peak_intensities, max_mz=1000, min_relative_intensity=0.005):
    """Mark unwanted peak lines of one spectrum for deletion, in place.

    A peak line is overwritten with the sentinel "TO BE DELETED\\n" when
    its m/z is above ``max_mz``, or when its intensity is below
    ``min_relative_intensity`` times the highest intensity found among the
    peaks that are not above ``max_mz``.

    Despite the name, no intensity value is rescaled; lines are only marked.

    Args:
        content: list of file lines; modified in place.
        peak_intensities: iterable of ``(line_index, mz, intensity)`` tuples,
            where ``line_index`` is the position of the peak line in
            ``content``.
        max_mz: peaks with an m/z strictly above this value are marked.
        min_relative_intensity: fraction of the highest retained intensity
            below which a peak is marked.

    Returns:
        None.
    """
    # Highest intensity among the peaks inside the m/z window. It starts at 0,
    # so with no retained peak the cutoff below is 0 as well.
    strongest = 0

    for line_index, mz, intensity in peak_intensities:
        if mz > max_mz:
            content[line_index] = "TO BE DELETED\n"
        elif intensity > strongest:
            strongest = intensity

    cutoff = strongest * min_relative_intensity

    for line_index, _, intensity in peak_intensities:
        if intensity < cutoff:
            content[line_index] = "TO BE DELETED\n"

# Clean every record of the filtered file: force the charge to 1, replace the
# NAME value with the spectrum ID, and mark noisy peak lines for deletion.
for index, line in enumerate(content):
    if not line.startswith("CHARGE="):
        continue

    charge_index = index
    pep_mass_index = index - 1  # the line directly above CHARGE is read as PEPMASS

    charge = int(content[charge_index][:-1].split("=")[1])
    pep_mass = float(content[pep_mass_index][:-1].split("=")[1])

    if charge == 0:
        content[charge_index] = "CHARGE=1\n"
    elif charge > 1:
        print("check")
        # Keep the "PEPMASS=" key when rewriting the line; a bare number
        # would no longer be a valid header field of the record.
        # NOTE(review): the conversion treats the proton mass as exactly 1 -
        # confirm that this precision is sufficient.
        content[pep_mass_index] = "PEPMASS=" + str((pep_mass * charge) - charge + 1) + "\n"
        content[charge_index] = "CHARGE=1\n"

    # The SPECTRUMID line sits directly above SCANS; peak lines start below it.
    cursor_index = index
    while not content[cursor_index].startswith("SCANS"):
        cursor_index += 1

    spectrum_id_index = cursor_index - 1
    spectrum_id = content[spectrum_id_index][:-1].split("=")[1]
    loop_index = cursor_index + 1

    # Replace the NAME value with the spectrum ID.
    cursor_index = index
    while not content[cursor_index].startswith("NAME"):
        cursor_index += 1

    content[cursor_index] = "NAME=" + spectrum_id + "\n"

    # Collect (line index, m/z, intensity) for every peak line of this record.
    peak_intensities = []
    while content[loop_index] != "END IONS\n":
        peak, intensity = content[loop_index][:-1].split("\t")
        peak_intensities.append((loop_index, float(peak), float(intensity)))
        loop_index += 1

    filter_and_normalise(content, peak_intensities)

# Drop every line that was marked with the sentinel.
filtered_content = [line for line in content if line != "TO BE DELETED\n"]
print(len(filtered_content))

with open(all_gnps_processed_path, 'w') as f:
    f.writelines(filtered_content)