In [1]:
import os
import json
import numpy as np

import pandas as pd

In [2]:
# Input locations, written as raw strings so the Windows backslashes need no
# escaping. These are absolute paths on the author's machine.
gnps_dir = r"G:\Dev\Data\GNPS"  # directory of per-molecule spectrum files
json_file_dir = r"G:\Dev\trees_gnps"  # directory of "<name>.json" tree files
fragments_occurences = r"G:\Dev\fragment_occurences.csv"
losses_occurences = r"G:\Dev\losses_occurences.csv"

In [3]:
# Each CSV is read with explicitly supplied column names and then keyed by
# its "formula" column.
fragments_df = pd.read_csv(
    fragments_occurences, names=["formula", "occurences"]
).set_index("formula")
losses_df = pd.read_csv(
    losses_occurences, names=["formula", "occurences"]
).set_index("formula")


In [5]:
# Tag every loss formula with a "_loss" suffix so loss labels are
# distinguishable from fragment labels once both indexes are merged.
# The suffix is added only when it is missing: this cell mutates losses_df in
# place, and without the guard every re-run would stack another "_loss"
# ("CO_loss_loss", ...) and silently break later membership tests.
losses_df.index = [
    index if index.endswith("_loss") else index + "_loss"
    for index in losses_df.index
]
print(losses_df.index)
# Union of fragment formulas and suffixed loss labels; load_tree uses it as
# the column set of the intensity matrix.
combined_index = fragments_df.index.union(losses_df.index)
print(combined_index)

Index(['CO_loss', 'H2O_loss', 'C2H2_loss', 'CH3_loss', 'CHN_loss', 'CH4_loss',
       'H3N_loss', 'H2_loss', 'C2H4_loss', 'C2H2O_loss',
       ...
       'C3H8S_loss', 'OP_loss', 'CH2N3_loss', 'C7H7NO3_loss', 'O2P_loss',
       'C2H5NOS_loss', 'C10H4N3_loss', 'C9H17O3_loss', 'C3H7N4_loss',
       'C4H6NO2_loss'],
      dtype='object', length=892)
Index(['BrH_loss', 'Br_loss', 'C10', 'C10H10', 'C10H10ClN', 'C10H10ClNO',
       'C10H10ClNO2', 'C10H10F2O2', 'C10H10FN', 'C10H10N',
       ...
       'O2_loss', 'O3S_loss', 'O3_loss', 'O4_loss', 'O5_loss', 'OP_loss',
       'OS_loss', 'O_loss', 'S2_loss', 'S_loss'],
      dtype='object', length=5508)


In [4]:
# Map each entry of gnps_dir (name minus its last three characters) to the
# second space-separated token found on the seventh line of that file.
spectrum_smiles = {}

for file in os.listdir(gnps_dir):
    filepath = os.path.join(gnps_dir, file)
    with open(filepath, 'r') as f:
        content = f.readlines()
    # readlines() keeps the line terminator, so without strip() a SMILES that
    # is the last field on its line would be stored as "...\n".
    # NOTE(review): assumes line 7 has the form "<tag> <smiles>" — confirm
    # against the actual spectrum file format.
    smiles = content[6].split(" ")[1].strip()
    mol_name = file[:-3]
    spectrum_smiles[mol_name] = smiles

In [6]:
def read_tree(filepath, fragment_formula, losses_formula):
    """Extract fragment intensities and loss scores from one tree JSON file.

    Parameters
    ----------
    filepath
        Path of a JSON file whose top level has "fragments" and "losses" lists.
    fragment_formula
        Container of accepted fragment formulas (only tested with ``in``).
    losses_formula
        Container of accepted loss labels of the form "<formula>_loss".

    Returns
    -------
    tuple of (dict, dict)
        The first dict maps molecular formula -> float intensity for every
        accepted fragment that carries an "intensity" entry. The second maps
        "<formula>_loss" -> mean of the source and target fragment intensities,
        for accepted losses whose source and target were both kept as
        fragments. A later entry with the same key overwrites an earlier one.
    """
    with open(filepath, 'r') as handle:
        tree = json.load(handle)

    kept_fragments = {}
    for node in tree['fragments']:
        formula = node['molecularFormula']
        if formula in fragment_formula and "intensity" in node:
            kept_fragments[formula] = float(node['intensity'])

    kept_losses = {}
    for edge in tree['losses']:
        label = edge['molecularFormula'] + "_loss"
        # Checks run in this order so that "target" is never looked up when
        # the source fragment was not kept.
        if edge["source"] not in kept_fragments:
            continue
        if edge["target"] not in kept_fragments:
            continue
        if label not in losses_formula:
            continue
        total = kept_fragments[edge["source"]] + kept_fragments[edge["target"]]
        kept_losses[label] = total / 2

    return kept_fragments, kept_losses

In [7]:
# Sanity check: show the loss labels and confirm "CO_loss" is among them.
print(losses_df.index, "CO_loss" in losses_df.index, sep="\n")

Index(['CO_loss', 'H2O_loss', 'C2H2_loss', 'CH3_loss', 'CHN_loss', 'CH4_loss',
       'H3N_loss', 'H2_loss', 'C2H4_loss', 'C2H2O_loss',
       ...
       'C3H8S_loss', 'OP_loss', 'CH2N3_loss', 'C7H7NO3_loss', 'O2P_loss',
       'C2H5NOS_loss', 'C10H4N3_loss', 'C9H17O3_loss', 'C3H7N4_loss',
       'C4H6NO2_loss'],
      dtype='object', length=892)
True


In [8]:
# Smoke test: parse a single tree file and show what read_tree extracts.
filepath = os.path.join(json_file_dir, "CCMSLIB00000001563.json")
fragments, losses = read_tree(
    filepath,
    fragment_formula=fragments_df.index,
    losses_formula=losses_df.index,
)
print(fragments, losses, sep="\n")

{'C16H24N2O4': 6160.996094, 'C15H24N2O3': 16870.810547, 'C14H18N2O3': 2070.579102, 'C14H16N2O2': 809.072083, 'C11H17NO3': 21815.289062, 'C11H15NO2': 64301.03125, 'C10H17NO2': 14258.55957, 'C10H15NO': 6297.509766, 'C6H13N': 15535.5, 'C8H9NO2': 1387.92395, 'C8H8NO2': 1246.405029, 'C7H8NO': 845.15387, 'C6H7NO2': 2094.37793, 'C13H15NO2': 2861.543945, 'C13H16N2O': 1143.875977, 'C13H13NO': 1225.858032, 'C12H13N': 3435.073975, 'C8H9N': 6533.36377, 'C15H22N2O': 1184.531006}
{'CO_loss': 2330.4660034999997, 'C2H6O_loss': 4115.787598, 'H2O_loss': 2043.7009885, 'C3HN_loss': 11942.934082, 'C4H4O2_loss': 14897.029784999999, 'C3H6_loss': 32844.4776, 'C3H7_loss': 32773.7181395, 'C2H2_loss': 1741.15094, 'CHN_loss': 1835.308014, 'C4H4_loss': 4984.2188725}


In [18]:
def dict_is_empty(sample_dict):
    """Return True when ``sample_dict`` has no keys, False otherwise."""
    key_count = len(sample_dict.keys())
    return key_count == 0

def get_mol_names(path_dir):
    """List the entries of ``path_dir`` with their last three characters removed.

    The cut is positional, not extension-aware: exactly three characters are
    dropped from every name (presumably a three-character suffix such as
    ".ms" — TODO confirm against the data directory).
    """
    names = []
    for entry in os.listdir(path_dir):
        names.append(entry[:-3])
    return names

def load_tree(path_dir):
    """Build the molecule-by-label intensity matrix for the entries of ``path_dir``.

    A zero-filled float DataFrame is created with one row per entry of
    ``path_dir`` (labelled by the entry name minus its last three characters)
    and one column per label in the module-level ``combined_index``. For each
    entry, the file ``<json_file_dir><sep><name>.json`` is parsed with
    ``read_tree``:

    * if it yields no fragments or no losses, the row is dropped;
    * otherwise the fragment and loss values are written into the row, and the
      row is divided by its maximum and multiplied by 999.

    Relies on the module-level names ``combined_index``, ``json_file_dir``,
    ``fragments_df`` and ``losses_df``.

    Returns
    -------
    pandas.DataFrame
        The filled and rescaled matrix, without the dropped rows.
    """
    row_labels = get_mol_names(path_dir)
    intensities = pd.DataFrame(0.0, index=row_labels, columns=combined_index, dtype=float)

    for entry in os.listdir(path_dir):
        mol_name = entry[:-3]
        tree_path = os.sep.join((json_file_dir, mol_name + ".json"))
        fragments, losses = read_tree(tree_path, fragments_df.index, losses_df.index)

        # Molecules without both fragments and losses are removed entirely.
        if dict_is_empty(fragments) or dict_is_empty(losses):
            intensities.drop([mol_name], inplace=True)
            continue

        # Fragments are written first, then losses.
        for values in (fragments, losses):
            for label, value in values.items():
                intensities.at[mol_name, label] = float(value)

        # Rescale the row relative to its own maximum.
        row = intensities.loc[mol_name]
        peak = np.amax(row.values)
        intensities.loc[mol_name] = row.div(peak).mul(999)

    return intensities

In [None]:
# Build the intensity matrix from every entry in gnps_dir.
intensities = load_tree(gnps_dir)
# NOTE(review): the stored output of this cell (repeated "999.0" lines) cannot
# come from the load_tree shown in this notebook, which contains no print call;
# it presumably dates from an earlier version of the function. Re-run to refresh.
print(intensities)

999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.

999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.

999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.

999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.0
999.

In [155]:
def fragments_shifts_model(input_to_network):
    """Build and compile a fully connected binary classifier.

    The network has a 2500-unit ReLU input layer sized from
    ``input_to_network.shape[1]``, ReLU hidden layers of 1000, 500, 100 and 50
    units, and a single sigmoid output unit. It is compiled with binary
    cross-entropy loss, the "adam" optimizer and an accuracy metric.

    ``Sequential`` and ``Dense`` are not imported in the visible cells
    (presumably Keras — TODO confirm and import them at the top of the notebook).

    Returns
    -------
    The compiled model.
    """
    hidden_widths = (1000, 500, 100, 50)

    class_model = Sequential()
    class_model.add(
        Dense(
            2500,
            input_dim=input_to_network.shape[1],
            kernel_initializer='normal',
            activation='relu',
        )
    )
    for width in hidden_widths:
        class_model.add(Dense(width, kernel_initializer='normal', activation='relu'))
    class_model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

    class_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return class_model

In [None]:
tree = np.log(intensities.values+1)
#train without the 27 for correct
fingerprints = 