In [104]:
import os
import json
import numpy as np
from rdkit import Chem

from matplotlib import pyplot as plt
from itertools import islice
import pandas as pd
import re
from sklearn import metrics

In [50]:
gnps_dir = "G:\\Dev\\Data\\GNPS For Family"

path = "G:\\Dev\\CSIFingerID\\"
sample_fingerprints_legend_path = path + "fingerprints.csv"
output_file_dir = path + "all_sirius_output_minibatches"

json_file_dir = "G:\\Dev\\trees_gnps"
fragments_occurences = "G:\\Dev\\fragment_occurences.csv"
losses_occurences = "G:\\Dev\\losses_occurences.csv"

fingerprints_path = "G:\\Dev\\Data\\1000\\GNPS Python Master\\Final Fingerprints.txt"

In [36]:
spectrum_smiles = {}
spectrum_formula = {}

for file in os.listdir(gnps_dir):
    filepath = os.path.join(gnps_dir, file)
    with open(filepath, 'r') as f:
        content = f.readlines()
        smiles = content[6].split(" ")[1][:-1]
        formula = content[1].split(" ")[1][:-1]
        mol_name = file[:-3]
        spectrum_smiles[mol_name] = smiles
        spectrum_formula[mol_name] = formula

In [37]:
def file_has_fingerprint(file_content):
    return "fingerprints.csv" in file_content

def file_incomplete(file_content):
    return len(file_content) <= 2

def retrieve_fingerprint_file(formula, folder):
    content = os.listdir(folder)
    return next((filename for filename in content if formula in filename), None)

In [38]:
def load_fingerprints(mol_ids, fingerprint_smarts):
    fingerprints = pd.DataFrame(0, index=mol_ids, columns=range(154), dtype=int)
    for index in fingerprints.index:
        smiles = spectrum_smiles[index]
        m = Chem.MolFromSmiles(smiles)
        for i, smarts in enumerate(fingerprint_smarts):
            patt = Chem.MolFromSmarts(smarts)
            fingerprints.at[index, i] = int(m.HasSubstructMatch(patt))
            
    return fingerprints

def load_csi_fingerprints(output_file_dir, mol_folder_file, gnps_mol_ids):
    mol_ids = [entry[1] for entry in mol_folder_file if entry[1] in gnps_mol_ids]
    fingerprints = pd.DataFrame(0, index=mol_ids, columns=range(154), dtype=float)
    for directory, mol, formula in mol_folder_file:
        if mol in gnps_mol_ids:
            file_dir = os.path.join(output_file_dir, directory)
            folder = "1_{0}_{1}".format(directory, mol)
            fingerprints_file_dir = os.path.join(os.path.join(file_dir, folder), "fingerprints")
            fingerprint_file = os.listdir(fingerprints_file_dir)[1]
            full_fingerprint_filepath = os.path.join(fingerprints_file_dir, fingerprint_file)
            with open(full_fingerprint_filepath, 'r') as f:
                content = f.readlines()
                rel_content = content[:154]
                for index, prediction in enumerate(rel_content):
                    fingerprints.at[mol, index] = float(prediction[:-1])

    return fingerprints

In [39]:
desc_list = []

with open(sample_fingerprints_legend_path, 'r') as f:
    for line in f:
        rel_index, abs_index, desc = line.split("\t")
        if desc[:-1] != "description":
            desc_list.append(desc[:-1])

desc_list = desc_list[:154]
fingerprint_smarts = [entry.split(" ")[0] for entry in desc_list]

In [42]:
correct_folder_file = []
wrong_folder_file = []

for directory in os.listdir(output_file_dir):
    file_dir = os.path.join(output_file_dir, directory)
    files_in_output_dir = os.listdir(file_dir)
    if file_has_fingerprint(files_in_output_dir):
        mol = files_in_output_dir[1].split("_")[-1]
        folder = "1_{0}_{1}".format(directory, mol)
        fingerprints_file_dir = os.path.join(os.path.join(file_dir, folder), "fingerprints")
        if mol in spectrum_formula:
            formula = spectrum_formula[mol]
            if retrieve_fingerprint_file(formula, fingerprints_file_dir):
                correct_folder_file.append((directory, mol, formula))
            else:
                wrong_folder_file.append((directory, mol, formula))
                
print(len(correct_folder_file))
print(len(wrong_folder_file))

25
56


In [43]:
wrong_mol_names = [stuff[1] for stuff in wrong_folder_file]
correct_mol_names = [stuff[1] for stuff in correct_folder_file]

In [44]:
print(wrong_mol_names)

['CCMSLIB00000001565', 'CCMSLIB00000072257', 'CCMSLIB00000072565', 'CCMSLIB00000075015', 'CCMSLIB00000075306', 'CCMSLIB00000075314', 'CCMSLIB00000001607', 'CCMSLIB00000078898', 'CCMSLIB00000081221', 'CCMSLIB00000081265', 'CCMSLIB00000081267', 'CCMSLIB00000081287', 'CCMSLIB00000204740', 'CCMSLIB00000223873', 'CCMSLIB00000223874', 'CCMSLIB00000001616', 'CCMSLIB00000223877', 'CCMSLIB00000223956', 'CCMSLIB00000223959', 'CCMSLIB00000424857', 'CCMSLIB00000424858', 'CCMSLIB00000424859', 'CCMSLIB00000424860', 'CCMSLIB00000424862', 'CCMSLIB00000424864', 'CCMSLIB00000424865', 'CCMSLIB00000424872', 'CCMSLIB00000424873', 'CCMSLIB00000424908', 'CCMSLIB00000424916', 'CCMSLIB00000424917', 'CCMSLIB00000001624', 'CCMSLIB00000427306', 'CCMSLIB00000478582', 'CCMSLIB00000478591', 'CCMSLIB00000478592', 'CCMSLIB00000001635', 'CCMSLIB00000478595', 'CCMSLIB00000004459', 'CCMSLIB00000004694', 'CCMSLIB00000004912', 'CCMSLIB00000006289', 'CCMSLIB00000078118', 'CCMSLIB00000077082', 'CCMSLIB00000001653', 'CCMSLIB0

In [45]:
print(correct_mol_names)

['CCMSLIB00000072234', 'CCMSLIB00000072237', 'CCMSLIB00000075308', 'CCMSLIB00000001606', 'CCMSLIB00000223870', 'CCMSLIB00000223876', 'CCMSLIB00000223902', 'CCMSLIB00000001621', 'CCMSLIB00000424790', 'CCMSLIB00000001623', 'CCMSLIB00000424918', 'CCMSLIB00000424925', 'CCMSLIB00000427300', 'CCMSLIB00000427303', 'CCMSLIB00000001625', 'CCMSLIB00000478584', 'CCMSLIB00000478593', 'CCMSLIB00000478652', 'CCMSLIB00000531504', 'CCMSLIB00000539139', 'CCMSLIB00000001645', 'CCMSLIB00000006292', 'CCMSLIB00000077108', 'CCMSLIB00000077134', 'CCMSLIB00000072062']


Train our final model

In [200]:
component_set = set()

for formula in combined_index:
    pure_formula = formula.split('_loss')
    components = re.findall('[A-Z][^A-Z\d]*', pure_formula[0])
    for component in components:
        component_set.add(component)

mol_mass = {'C': 12.00000000000, 'H': 1.00782503214, 'O': 15.99491462210, 'N': 14.00307400524,
            'P': 30.97376151200, 'S': 31.97207069000, 'Cl': 34.96885271000, 'I': 126.904468, 'Br': 78.9183376,
            'Si': 27.9769265327, 'F': 18.99840320500, 'D': 2.01410177800}

ion_mass = {'H': 1.007276, 'Na': 22.989218, 'K': 38.963158, 'Cl37Na': 59.958168968 }

def get_number(string):
    num = 1
    num_in_string = re.findall("[\d]+", string)
    if len(num_in_string) > 0:
        num = int(num_in_string[0])
    return num

def get_string(string):
    return re.findall("[^\d]+", string)[0]  

def get_molecular_mass(formula):
    mass = 0.0
    segments = re.findall('[A-Z][a-z]*[0-9]*', formula)
    for segment in segments:
        mass += mol_mass[get_string(segment)] * get_number(segment)
    
    return mass

fragment_mass_to_formula_dict = {}
loss_mass_to_formula_dict = {}

for formula in fragments_df.index:
    mass = get_molecular_mass(formula)
    mass_int, decimals = str(mass).split(".")
    if mass_int not in fragment_mass_to_formula_dict:
        fragment_mass_to_formula_dict[int(mass_int)] = {}
    fragment_mass_to_formula_dict[int(mass_int)][float(mass)] = formula
    
for formula in losses_df.index:
    pure_formula = formula.split('_loss')
    mass = get_molecular_mass(pure_formula[0])
    mass_int, decimals = str(mass).split(".")
    if mass_int not in loss_mass_to_formula_dict:
        loss_mass_to_formula_dict[int(mass_int)] = {}
    loss_mass_to_formula_dict[int(mass_int)][float(mass)] = pure_formula[0]

In [201]:
def subtract_formula(a, b):
    string = ""
    a_segments = re.findall('[A-Z][a-z]*[0-9]*', a)
    b_segments = re.findall('[A-Z][a-z]*[0-9]*', b)

    a_segment_count = [(get_string(a_segment), get_number(a_segment)) for a_segment in a_segments]

    for b_segment in b_segments:
        for index, a_segment in enumerate(a_segment_count):
            if a_segment[0] == get_string(b_segment):
                a_segment_count[index] = (a_segment[0], a_segment[1] - get_number(b_segment))

    for component, amount in a_segment_count:
        if amount != 0:
            if amount == 1:
                amount = ""
            string += component + str(amount)

    return string

In [62]:
def assign_between_bins(value, bins):
    if len(bins) == 2:
        bin1, bin2 = bins
    elif len(bins) > 2:
        differences_in_bins = [(mass_bin, abs(value-mass_bin)) for mass_bin in bins]
        differences_in_bins.sort(key = lambda t: t[1])
        bin1, bin2 = differences_in_bins[:2]
        
    biggest_bin = max(bin1, bin2)
    differences = [(bin1, abs(value - bin1)), (bin2, abs(value - bin2))]
    smallest_diff = min(differences, key = lambda t: t[1])
    biggest_diff = max(differences, key = lambda t: t[1])
    nearest_bin_allocation = (smallest_diff[0], 1-(smallest_diff[1]/float(biggest_bin)))
    furthest_bin_allocation = (biggest_diff[0], smallest_diff[1]/float(biggest_bin))
    
    return nearest_bin_allocation, furthest_bin_allocation

In [203]:
def convert_lines_to_list(lines):
    mass_intensity_list = []
    for line in lines:
        if ' ' in line:  # Only lines with mass and intensity values have a space. Ignores label/blank lines
            split_line = line.split()
            mass = float(split_line[0])
            intensity = float(split_line[1])
            mass_intensity_list.append((mass, intensity))
            
    return mass_intensity_list

def count_intensities_in_vector(mass_intensities, ionization):
    total_intensity = 0
    assigned_mass = set()
    
    for mass, intensity in mass_intensities:
        non_ionized_mass = mass - ionization
        mass_int, decimals = str(non_ionized_mass).split(".")
        if int(mass_int) in fragment_mass_to_formula_dict:
            for full_mass, formula in fragment_mass_to_formula_dict[int(mass_int)].items():
                if full_mass-0.025 <= non_ionized_mass <= full_mass+0.025: # within 0.05 width
                    total_intensity += intensity
                    
    return total_intensity

In [204]:
def assign_to_tree(mol_ids, folder_files):
    tree_intensities = pd.DataFrame(0.0, index=mol_ids, columns=combined_index, dtype=float)
        
    for folder_file in folder_files:
        directory, mol, formula = folder_file
        spec_path = output_file_dir + "\\" + directory + "\\1_{0}_{1}\\".format(directory, mol) + "spectrum.ms"
        split_count = 0
        assigned_formula_intensity = []
        with open(spec_path, 'r') as f:
            content = f.readlines()
            cursor = 0
            while not content[cursor].startswith(">ms2peaks"):
                cursor += 1
                
            ionization_entries_count = []
            unsplit_lines = list(islice(f, cursor+1, None))
            mass_intensity_list = convert_lines_to_list(unsplit_lines)
            for ion in ion_mass.keys():
                ionization_entries_count.append((ion, count_intensities_in_vector(mass_intensity_list, ion_mass[ion])))
            ion_with_most_entries = max(ionization_entries_count, key=lambda x:x[1])[0]
            ionization = ion_mass[ion_with_most_entries]
            print(ionization_entries_count)
            
            for mass, intensity in mass_intensity_list:
                non_ionized_mass = mass - ionization
                mass_int, decimals = str(non_ionized_mass).split(".")
                if int(mass_int) in fragment_mass_to_formula_dict:
                    chosen_bins = []
                    for full_mass, formula in fragment_mass_to_formula_dict[int(mass_int)].items():
                        if full_mass-0.025 <= non_ionized_mass <= full_mass+0.025: # within 0.05 width
                            chosen_bins.append(full_mass)
                    
                    if len(chosen_bins) > 1:
                        nearest_bin_allocation, furthest_bin_allocation = assign_between_bins(chosen_bins)
                        tree_intensities.at[mol, fragment_mass_to_formula_dict[int(mass_int)][nearest_bin_allocation[0]]] += float(intensity) * nearest_bin_allocation[1]
                        tree_intensities.at[mol, fragment_mass_to_formula_dict[int(mass_int)][furthest_bin_allocation[0]]] += float(intensity) * furthest_bin_allocation[1]
                        assigned_formula_intensity.append((fragment_mass_to_formula_dict[int(mass_int)][nearest_bin_allocation[0]], 
                                                           nearest_bin_allocation[0], intensity*nearest_bin_allocation[1]))
                        assigned_formula_intensity.append((fragment_mass_to_formula_dict[int(mass_int)][furthest_bin_allocation[0]], 
                                                           furthest_bin_allocation[0], intensity*furthest_bin_allocation[1]))
                        split_count += 1
                        
                    elif len(chosen_bins) == 1:
                        tree_intensities.at[mol, fragment_mass_to_formula_dict[int(mass_int)][chosen_bins[0]]] += float(intensity)
                        assigned_formula_intensity.append((fragment_mass_to_formula_dict[int(mass_int)][chosen_bins[0]], chosen_bins[0], intensity))
                    
                    else:
                        continue
            
            print(directory, mol, split_count)
            assigned_formula_intensity.sort(key = lambda t: t[1])
            assigned_formula_intensity.reverse() # highest to smallest

            peak_differences = [(subtract_formula(formula_intensity_i[0], formula_intensity_j[0]) + "_loss", 
                                 (formula_intensity_i[2] + formula_intensity_j[2]) / 2.0 ) 
                                for i, formula_intensity_i in enumerate(assigned_formula_intensity)
                                for j, formula_intensity_j in enumerate(assigned_formula_intensity) 
                                if i != j 
                                and j > i
                                and len(re.findall('[A-Z][a-z]*[0-9]*', formula_intensity_i[0])) >= len(re.findall('[A-Z][a-z]*[0-9]*', formula_intensity_j[0]))
                                and subtract_formula(formula_intensity_i[0], formula_intensity_j[0]) + "_loss" in losses_df.index]

            for formula, intensity in peak_differences:
                tree_intensities.at[mol, formula] += float(intensity)
            
            if np.amax(tree_intensities.loc[mol]) > 0:
                tree_intensities.loc[mol] = tree_intensities.loc[mol].div(np.amax(tree_intensities.loc[mol].values)).mul(999)
                            
    return tree_intensities

In [195]:
assigned_tree_path = "G:\\Dev\\Data\\assigned_tree_final.pkl"

In [187]:
assigned_tree = pd.read_pickle(assigned_tree_path)
print(assigned_tree)

                    BrH_loss  Br_loss       C10  C10H10  C10H10ClN  \
CCMSLIB00000001548       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001549       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001550       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001555       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001563       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001565       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001566       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001568       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001569       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001570       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001572       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001574       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001576       0.0      0.0  0.000000     0.0        0.0   
CCMSLIB00000001581  

In [188]:
fingerprints = load_fingerprints(assigned_tree.index, fingerprint_smarts)

In [189]:
assigned_tree.sort_index(inplace=True)
fingerprints.sort_index(inplace=True)

In [126]:
from keras.layers import Input, Dense
from keras.models import Model,Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from keras.optimizers import SGD

def baseline_model(x_train_formula, x_train_fingerprints):
    class_model = Sequential()
    class_model.add(Dense(1000, input_dim=x_train_formula.shape[1], kernel_initializer='normal', activation='relu'))
    class_model.add(Dense(500,kernel_initializer='normal',activation = 'relu'))
    class_model.add(Dense(x_train_fingerprints.shape[1],kernel_initializer='normal',activation = 'sigmoid'))
    class_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return class_model

In [127]:
epochs = 100

mod = baseline_model(assigned_tree, fingerprints)
preprocessed_assigned_tree = np.log(assigned_tree.values+1)

history = mod.fit(preprocessed_assigned_tree, fingerprints.values, epochs=epochs, validation_split=0.2, verbose=0)

In [205]:
correct_assigned_trees = assign_to_tree(correct_mol_names, correct_folder_file)
print(correct_assigned_trees.shape)

[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch110 CCMSLIB00000072234 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch111 CCMSLIB00000072237 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch172 CCMSLIB00000075308 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch19 CCMSLIB00000001606 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch236 CCMSLIB00000223870 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch239 CCMSLIB00000223876 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch246 CCMSLIB00000223902 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch25 CCMSLIB00000001621 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch254 CCMSLIB00000424790 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch27 CCMSLIB00000001623 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch278 CCMSLIB00000424918 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch279 CCMSLIB00000424925 0
[('H', 0), ('Na', 0), ('K', 0), ('Cl37Na', 0)]
batch286 CCMSLIB00000427300 0
[(

In [131]:
correct_fingerprints = load_fingerprints(correct_assigned_trees.index, fingerprint_smarts)

In [132]:
correct_assigned_trees.sort_index(inplace=True)
correct_fingerprints.sort_index(inplace=True)

In [133]:
predicted = mod.predict(np.log(correct_assigned_trees.values + 1))
print(predicted)

[[3.3995173e-05 8.2369279e-06 9.9929059e-01 ... 4.9363379e-03
  9.8101258e-01 9.9994814e-01]
 [3.3995173e-05 8.2369279e-06 9.9929059e-01 ... 4.9363379e-03
  9.8101258e-01 9.9994814e-01]
 [3.3995173e-05 8.2369279e-06 9.9929059e-01 ... 4.9363379e-03
  9.8101258e-01 9.9994814e-01]
 ...
 [3.3995173e-05 8.2369279e-06 9.9929059e-01 ... 4.9363379e-03
  9.8101258e-01 9.9994814e-01]
 [3.3995173e-05 8.2369279e-06 9.9929059e-01 ... 4.9363379e-03
  9.8101258e-01 9.9994814e-01]
 [3.3995173e-05 8.2369279e-06 9.9929059e-01 ... 4.9363379e-03
  9.8101258e-01 9.9994814e-01]]


In [208]:
def compute_auc(indexes, true, pred):
    auc_scores = []
    
    for i, index in enumerate(indexes):
        nonzero_vals = np.count_nonzero(true[:, i]) # Count number of nonzero values
        if nonzero_vals > 0 and nonzero_vals < true[:, i].size: # If there are no 1s or no 0s, can't compute.
            fp_true = true[:, i]
            fp_pred = pred[:, i]
            score = metrics.roc_auc_score(fp_true, fp_pred)
            auc_scores.append((index, score))
        else:
            auc_scores.append((index, 0.0))
            
    print("Compute AUC done")
    return auc_scores

def compute_f1(indexes, true, pred):
    f1_scores = []
    
    for i, index in enumerate(indexes):
        nonzero_vals = np.count_nonzero(true[:, i]) # Count number of nonzero values
        if nonzero_vals > 0 and nonzero_vals < true[:, i].size: # If there are no 1s or no 0s, can't compute.
            fp_true = true[:, i]
            fp_pred = pred[:, i]
            score = metrics.f1_score(fp_true, fp_pred, average='micro')
            f1_scores.append((index, score))
        else:
            f1_scores.append((index, 0.0))
            
    print("Compute F1 done")
    return f1_scores

In [209]:
compute_auc(correct_fingerprints.columns.tolist(), correct_fingerprints.values, predicted)

Compute AUC done


[(0, 0.5),
 (1, 0.0),
 (2, 0.5),
 (3, 0.0),
 (4, 0.5),
 (5, 0.5),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.5),
 (10, 0.5),
 (11, 0.5),
 (12, 0.5),
 (13, 0.5),
 (14, 0.5),
 (15, 0.5),
 (16, 0.0),
 (17, 0.5),
 (18, 0.5),
 (19, 0.5),
 (20, 0.0),
 (21, 0.5),
 (22, 0.5),
 (23, 0.0),
 (24, 0.5),
 (25, 0.5),
 (26, 0.5),
 (27, 0.5),
 (28, 0.5),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.5),
 (34, 0.5),
 (35, 0.5),
 (36, 0.5),
 (37, 0.5),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.5),
 (42, 0.5),
 (43, 0.5),
 (44, 0.5),
 (45, 0.0),
 (46, 0.0),
 (47, 0.5),
 (48, 0.5),
 (49, 0.5),
 (50, 0.5),
 (51, 0.0),
 (52, 0.0),
 (53, 0.0),
 (54, 0.0),
 (55, 0.0),
 (56, 0.0),
 (57, 0.0),
 (58, 0.0),
 (59, 0.0),
 (60, 0.0),
 (61, 0.5),
 (62, 0.5),
 (63, 0.5),
 (64, 0.5),
 (65, 0.5),
 (66, 0.5),
 (67, 0.0),
 (68, 0.0),
 (69, 0.0),
 (70, 0.0),
 (71, 0.5),
 (72, 0.5),
 (73, 0.5),
 (74, 0.5),
 (75, 0.5),
 (76, 0.0),
 (77, 0.5),
 (78, 0.0),
 (79, 0.5),
 (80, 0.5),
 (81, 0.0),
 (82, 0.5),
 (83, 0.5),
 (