In [None]:
import os
import numpy as np
from itertools import islice
import os

In [None]:
from rdkit import Chem
from sklearn.metrics import roc_auc_score

In [None]:
# Root directory that every other path in this notebook is derived from.
datapath = "G:\\Dev\\Data"

# GNPS inputs and derived directories.
all_gnps_path = os.path.join(datapath, "GNPSLibraries_allSMILES.mgf")
all_gnps_fragments_path = os.path.join(datapath, "GNPS 35000 Spectrum Fragments")
binned_datapath = os.path.join(datapath, "GNPS 35000", "GNPS Python Binned")
filtered_datapath = os.path.join(datapath, "GNPS 35000", "GNPS Python Filtered")

# NIST inputs and derived directories.
nist_path = os.path.join(datapath, "MSMS-NIST-Curated-Pos-MfKit.msp")
nist_fragments_path = os.path.join(datapath, "MSMS-NIST Fragments")
binned_nist_datapath = os.path.join(datapath, "MSMS-NIST", "Python Binned")
filtered_nist_datapath = os.path.join(datapath, "MSMS-NIST", "Python Filtered")

# Tab-separated "<name>\t<SMILES>" tables, one per data set.
spectrum_smiles_path = os.path.join(datapath, "Spectrum-Smiles.tsv")
nist_spectrum_smiles_path = os.path.join(datapath, "NIST Spectrum-Smiles.tsv")

# Directories that receive the per-amino-acid result files.
amino_acid_path = os.path.join(datapath, "Amino Acids")
nist_amino_acid_path = os.path.join(datapath, "NIST Amino Acids")

# Text file with one "<name>, <mass>" entry per line.
amino_acid_shifts_path = os.path.join(datapath, "Fragment Masses.txt")

In [None]:
# SMARTS pattern for each amino acid, keyed by the amino acid's name.
# The patterns come from the Daylight SMARTS examples page:
# http://www.daylight.com/dayhtml_tutorials/languages/smarts/smarts_examples.html
# No glutamine pattern was found there, so 'Glutamine' maps to an empty string.
amino_acid_smarts = {
    'Alanine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH3X4])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Arginine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CH2X4][CH2X4][NHX3][CH0X3](=[NH2X3+,NHX2+0])[NH2X3])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Asparagine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CX3](=[OX1])[NX3H2])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Aspartic Acid': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CX3](=[OX1])[OH0-,OH])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Cysteine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][SX2H,SX1H0-])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Glutamic Acid': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CH2X4][CX3](=[OX1])[OH0-,OH])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Glutamine': '',
    'Glycine': '[$([$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H2][CX3](=[OX1])[OX2H,OX1-,N])]',
    'Histidine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][#6X3]1:[$([#7X3H+,#7X2H0+0]:[#6X3H]:[#7X3H]),$([#7X3H])]:[#6X3H]:[$([#7X3H+,#7X2H0+0]:[#6X3H]:[#7X3H]),$([#7X3H])]:[#6X3H]1)[CX3](=[OX1])[OX2H,OX1-,N]',
    'Isoleucine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CHX4]([CH3X4])[CH2X4][CH3X4])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Leucine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CHX4]([CH3X4])[CH3X4])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Lysine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CH2X4][CH2X4][CH2X4][NX4+,NX3+0])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Methionine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][CH2X4][SX2][CH3X4])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Phenylalanine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][cX3]1[cX3H][cX3H][cX3H][cX3H][cX3H]1)[CX3](=[OX1])[OX2H,OX1-,N]',
    'Proline': '[$([NX3H,NX4H2+]),$([NX3](C)(C)(C))]1[CX4H]([CH2][CH2][CH2]1)[CX3](=[OX1])[OX2H,OX1-,N]',
    'Serine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][OX2H])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Threonine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CHX4]([CH3X4])[OX2H])[CX3](=[OX1])[OX2H,OX1-,N]',
    'Tryptophan': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][cX3]1[cX3H][nX3H][cX3]2[cX3H][cX3H][cX3H][cX3H][cX3]12)[CX3](=[OX1])[OX2H,OX1-,N]',
    'Tyrosine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CH2X4][cX3]1[cX3H][cX3H][cX3]([OHX2,OH0X1-])[cX3H][cX3H]1)[CX3](=[OX1])[OX2H,OX1-,N]',
    'Valine': '[$([NX3H2,NX4H3+]),$([NX3H](C)(C))][CX4H]([CHX4]([CH3X4])[CH3X4])[CX3](=[OX1])[OX2H,OX1-,N]',
}


In [None]:
# Build a list of (name, SMARTS pattern, integer shift) triples from the
# fragment-mass file. Each non-blank line is expected to read "<name>, <mass>".
mol_shifts = []
# Alternative input kept from the original notebook (uncomment to use it):
#amino_acid_shifts_path = datapath + os.sep + "Fragment Masses Shuffled.txt"

with open(amino_acid_shifts_path, 'r') as f:
    for line in f:
        # Strip surrounding whitespace instead of slicing off the last
        # character: the final line of a file may not end with a newline, and
        # blind slicing would then drop a digit of the mass.
        line = line.strip()
        if not line:
            # Tolerate blank lines, e.g. an empty line at the end of the file.
            continue
        mol_name, shift = line.split(", ")
        # float() first so masses written with a decimal part parse; int()
        # then truncates to the whole-number row offset used when pairing peaks.
        mol_shifts.append((mol_name, amino_acid_smarts[mol_name], int(float(shift))))

In [None]:
def save_averaged_peaks_intensities(amino_acid, path, filtered_path, shift, average_dict,
                                    num_bins=1000, verbose=True):
    """Score every spectrum file by its strongest pair of peaks ``shift`` rows apart.

    Each file in ``filtered_path`` is loaded with ``np.loadtxt`` and column 1 is
    used as the intensity. Row ``i`` is paired with row ``i + shift`` for every
    ``i`` such that both rows lie within the first ``num_bins`` rows. A pair
    scores the mean of its two intensities when both are non-zero, and 0
    otherwise. The score of a file is the largest pair score, never below 0.0.

    Scores are written to ``path`` (one per line, in ``os.listdir`` order) and
    stored in ``average_dict[amino_acid]``, keyed by the part of the file name
    (extension removed) that precedes the first space.

    Parameters
    ----------
    amino_acid : str
        Key under which the per-file scores are stored in ``average_dict``.
    path : str
        Output text file; it is overwritten.
    filtered_path : str
        Directory whose entries are all loaded as spectrum tables.
    shift : int
        Non-negative row offset between the two peaks of a pair.
    average_dict : dict
        Receives ``average_dict[amino_acid] = {name: score}``; any previous
        entry for ``amino_acid`` is replaced.
    num_bins : int, optional
        Number of leading rows of each table that are considered. The default
        of 1000 reproduces the previously hard-coded limit.
    verbose : bool, optional
        When true (the default) the name of each processed file is printed.

    Raises
    ------
    ValueError
        If ``shift`` is negative.
    IndexError
        If pairs have to be formed and a file holds fewer than ``num_bins`` rows.
    """
    if shift < 0:
        raise ValueError("shift must be non-negative, got %r" % (shift,))

    average_dict[amino_acid] = {}
    # Number of (i, i + shift) pairs that fit inside the first num_bins rows.
    pair_count = num_bins - shift

    with open(path, 'w') as f:
        for file in os.listdir(filtered_path):
            filename = os.path.splitext(file)[0]
            if verbose:
                print(filename)
            filepath = os.path.join(filtered_path, file)
            data = np.loadtxt(filepath, np.float32)
            max_avg = 0.0

            if pair_count > 0:
                # Promote to float64 before averaging so results match plain
                # Python float arithmetic on the loaded float32 values.
                intensities = np.atleast_2d(data)[:, 1].astype(np.float64)
                if intensities.shape[0] < num_bins:
                    raise IndexError(
                        "%s has %d rows, expected at least %d"
                        % (filepath, intensities.shape[0], num_bins)
                    )

                first = intensities[:pair_count]
                second = intensities[shift:num_bins]
                both_present = (first != 0) & (second != 0)
                averages = np.where(both_present, (first + second) / 2.0, 0.0)

                # Only strictly positive averages can raise the score above its
                # 0.0 starting value; NaN compares false and is ignored.
                positive = averages[averages > 0.0]
                if positive.size:
                    max_avg = float(positive.max())

            mol_name = filename.split(" ")[0]
            average_dict[amino_acid][mol_name] = max_avg
            f.write(str(max_avg) + "\n")

In [None]:
# Load both name/SMILES tables fully into memory, one list entry per line
# (line endings are kept).
with open(spectrum_smiles_path, 'r') as gnps_file:
    gnps_content = list(gnps_file)

with open(nist_spectrum_smiles_path, 'r') as nist_file:
    nist_content = list(nist_file)

In [None]:
def has_substruct_match_smiles_smarts(amino_acid, path, content, smarts, has_sub_dict):
    """Flag, for every molecule in ``content``, whether it matches ``smarts``.

    Parameters
    ----------
    amino_acid : str
        Key under which the per-molecule flags are stored in ``has_sub_dict``.
    path : str
        Output text file; it is overwritten with one ``0``/``1`` per molecule.
    content : iterable of str
        Lines of the form ``"<name>\\t<SMILES>"``; blank lines are skipped.
    smarts : str
        SMARTS pattern to look for.
    has_sub_dict : dict
        Receives ``has_sub_dict[amino_acid] = {name: 0 or 1}``; any previous
        entry for ``amino_acid`` is replaced.

    Molecules whose SMILES cannot be parsed are recorded as ``0``; the number of
    such molecules is printed at the end.

    Raises
    ------
    ValueError
        If ``smarts`` cannot be parsed into a pattern.
    """
    # The pattern does not depend on the molecule, so compile it once up front
    # (and fail before the output file is truncated if it is invalid).
    patt = Chem.MolFromSmarts(smarts)
    if patt is None:
        raise ValueError("Invalid SMARTS for %s: %r" % (amino_acid, smarts))

    count = 0
    has_sub_dict[amino_acid] = {}

    with open(path, 'w') as f:
        for line in content:
            # Remove only the line terminator; slicing off the last character
            # would corrupt the SMILES of a final line that lacks a newline.
            line = line.rstrip("\r\n")
            if not line:
                continue
            mol_name, smiles = line.split("\t")
            m = Chem.MolFromSmiles(smiles)
            if m is not None:
                matched = int(m.HasSubstructMatch(patt))
            else:
                count += 1
                matched = 0
            has_sub_dict[amino_acid][mol_name] = matched
            f.write(str(matched) + "\n")

    print(count)

In [None]:
def calculate_auc_score(amino_acid, peaks_dict, has_substructure_dict, auc_dict=None):
    """Compute the ROC AUC of the peak scores against the substructure labels.

    Parameters
    ----------
    amino_acid : str
        Key selecting the per-molecule entries in both input dictionaries.
    peaks_dict : dict
        ``peaks_dict[amino_acid]`` maps molecule name -> score.
    has_substructure_dict : dict
        ``has_substructure_dict[amino_acid]`` maps molecule name -> 0/1 label.
        Every molecule present in ``peaks_dict[amino_acid]`` must appear here,
        otherwise ``KeyError`` is raised.
    auc_dict : dict, optional
        When given, the score is stored in ``auc_dict[amino_acid]`` and nothing
        else is modified.

    Returns
    -------
    The value returned by ``roc_auc_score``.

    Notes
    -----
    When ``auc_dict`` is omitted the score is written to BOTH module-level
    dictionaries ``amino_acid_baseline_aucs`` and ``nist_amino_acid_baseline_aucs``
    (the historical behaviour, kept for existing callers). Calling the function
    that way for two data sets makes the second call overwrite the first result
    in both dictionaries, so pass ``auc_dict`` or use the return value instead.
    """
    y_values = []
    y_true = []

    for mol_name, average in peaks_dict[amino_acid].items():
        y_values.append(average)
        y_true.append(has_substructure_dict[amino_acid][mol_name])

    score = roc_auc_score(np.array(y_true), np.array(y_values))

    if auc_dict is not None:
        auc_dict[amino_acid] = score
    else:
        # Legacy path: same score into both global result dictionaries.
        amino_acid_baseline_aucs[amino_acid] = score
        nist_amino_acid_baseline_aucs[amino_acid] = score

    return score

In [None]:
# Baseline AUC per amino acid, one dictionary per data set.
amino_acid_baseline_aucs = {}
nist_amino_acid_baseline_aucs = {}

# The result files are opened for writing below, so their directories must exist.
os.makedirs(amino_acid_path, exist_ok=True)
os.makedirs(nist_amino_acid_path, exist_ok=True)

for mol_name, smarts, shift in mol_shifts:

    # Amino acids without a SMARTS pattern cannot be labelled; skip them.
    if smarts != '':
        spectrum_max_average_dict = {}
        nist_spectrum_max_average_dict = {}
        has_substructure_dict = {}
        nist_has_substructure_dict = {}

        # Peak-pair scores for every spectrum of each data set.
        new_path = amino_acid_path + os.sep + mol_name + " Peaks Shift Averaged.txt"
        new_nist_path = nist_amino_acid_path + os.sep + ("NIST %s Peaks Shift Averaged.txt" % mol_name)
        save_averaged_peaks_intensities(mol_name, new_path, filtered_datapath, shift, spectrum_max_average_dict)
        save_averaged_peaks_intensities(mol_name, new_nist_path, filtered_nist_datapath, shift, nist_spectrum_max_average_dict)

        # 0/1 substructure labels for every molecule of each data set.
        new_path = amino_acid_path + os.sep + mol_name + " Has Substructure.txt"
        new_nist_path = nist_amino_acid_path + os.sep + ("NIST %s Has Substructure.txt" % mol_name)
        has_substruct_match_smiles_smarts(mol_name, new_path, gnps_content, smarts, has_substructure_dict)
        has_substruct_match_smiles_smarts(mol_name, new_nist_path, nist_content, smarts, nist_has_substructure_dict)

        # calculate_auc_score, called with three arguments, stores its result
        # in BOTH baseline dictionaries. Read each score back immediately after
        # its own call and assign both at the end, so the NIST call cannot
        # overwrite the GNPS result.
        calculate_auc_score(mol_name, spectrum_max_average_dict, has_substructure_dict)
        gnps_auc = amino_acid_baseline_aucs[mol_name]
        calculate_auc_score(mol_name, nist_spectrum_max_average_dict, nist_has_substructure_dict)
        nist_auc = nist_amino_acid_baseline_aucs[mol_name]

        amino_acid_baseline_aucs[mol_name] = gnps_auc
        nist_amino_acid_baseline_aucs[mol_name] = nist_auc


In [None]:
# Write the baseline AUCs as "<amino acid>,<score>" lines, one file per data set.
baseline_auc_path = amino_acid_path + os.sep + "Baseline AUC.txt"
nist_baseline_auc_path = nist_amino_acid_path + os.sep + "NIST Baseline AUC.txt"

with open(baseline_auc_path, 'w') as f:
    for amino_acid, score in amino_acid_baseline_aucs.items():
        # The score is a number; %s converts it, whereas str + float raises TypeError.
        f.write("%s,%s\n" % (amino_acid, score))

with open(nist_baseline_auc_path, 'w') as f:
    for amino_acid, score in nist_amino_acid_baseline_aucs.items():
        f.write("%s,%s\n" % (amino_acid, score))

In [None]:
# Spot check on the Valine result files: how often does "has a non-zero peak
# score" agree with "contains the substructure"? The two files are compared
# line by line, by position.
peak_average_intensities_path = amino_acid_path + os.sep + "Valine Peaks Shift Averaged.txt"
has_sub_path = amino_acid_path + os.sep + "Valine Has Substructure.txt"

has_substructure = np.loadtxt(has_sub_path, np.int32)
peak_average_intensities = np.loadtxt(peak_average_intensities_path, np.float32)

# Index tuples of the positive labels and of the non-zero scores.
all_ones = np.nonzero(has_substructure == 1)
non_zeroes = np.nonzero(peak_average_intensities > 0.0)

print(all_ones[0].size)
print(non_zeroes[0].size)

# A position agrees when label 1 comes with a positive score, or label 0 with
# a score of exactly zero.
count = sum(
    1
    for index, max_average_intensity in enumerate(peak_average_intensities)
    if (
        max_average_intensity > 0
        if has_substructure[index] == 1
        else (has_substructure[index] == 0 and max_average_intensity == 0)
    )
)

print(count/len(peak_average_intensities))