In [1]:
import os
import json
import numpy as np
from rdkit import Chem

from matplotlib import pyplot as plt
from sklearn import metrics
from itertools import islice
import pandas as pd
import re

In [2]:
gnps_dir = "G:\\Dev\\Data\\GNPS"
json_file_dir = "G:\\Dev\\trees_gnps"
fragments_occurences = "G:\\Dev\\fragment_occurences.csv"
losses_occurences = "G:\\Dev\\losses_occurences.csv"

path = "G:\\Dev\\CSIFingerID\\"
sample_fingerprints_legend_path = path + "fingerprints.csv"
output_file_dir = path + "all_sirius_output_minibatches"

In [3]:
fragments_df = pd.read_csv(fragments_occurences, names=["formula", "occurences"])
fragments_df = fragments_df.set_index("formula")
losses_df = pd.read_csv(losses_occurences, names=["formula", "occurences"])
losses_df = losses_df.set_index("formula")


In [4]:
losses_df.index = [index + "_loss" for index in losses_df.index]
print(losses_df.index)
combined_index = fragments_df.index.union(losses_df.index)
print(combined_index)

Index(['CO_loss', 'H2O_loss', 'C2H2_loss', 'CH3_loss', 'CHN_loss', 'CH4_loss',
       'H3N_loss', 'H2_loss', 'C2H4_loss', 'C2H2O_loss',
       ...
       'C3H8S_loss', 'OP_loss', 'CH2N3_loss', 'C7H7NO3_loss', 'O2P_loss',
       'C2H5NOS_loss', 'C10H4N3_loss', 'C9H17O3_loss', 'C3H7N4_loss',
       'C4H6NO2_loss'],
      dtype='object', length=892)
Index(['BrH_loss', 'Br_loss', 'C10', 'C10H10', 'C10H10ClN', 'C10H10ClNO',
       'C10H10ClNO2', 'C10H10F2O2', 'C10H10FN', 'C10H10N',
       ...
       'O2_loss', 'O3S_loss', 'O3_loss', 'O4_loss', 'O5_loss', 'OP_loss',
       'OS_loss', 'O_loss', 'S2_loss', 'S_loss'],
      dtype='object', length=5508)


In [5]:
def read_tree(filepath, fragment_formula, losses_formula):
    fragments = {}
    losses = {}
    all_fragments = {}
    
    with open(filepath, 'r') as f:
        data = json.load(f)

    for fragment in data['fragments']:
        if "intensity" in fragment:
            all_fragments[(fragment['molecularFormula'])] = float(fragment['intensity'])
            if fragment['molecularFormula'] in fragment_formula:
                fragments[(fragment['molecularFormula'])] = float(fragment['intensity'])

    for loss in data['losses']:
        loss_index = loss['molecularFormula'] + "_loss"
        if loss_index in losses_formula and loss["source"] in all_fragments and loss["target"] in all_fragments:
            losses[loss_index] = (all_fragments[loss["source"]] + all_fragments[loss["target"]]) /2 
                
    return fragments, losses

# Load a master file containing CDK fingerprints for all molecules.
# Each molecules CDK bit set is added as a 320 element array to a Pandas dataframe.
def load_fingerprints_master(path, number_of_rows=0):
    BITS = 307  # Total number of bits in fingerprint

    fp_all = np.loadtxt(path, dtype="U25", skiprows=number_of_rows) # Get master file as numpy array of Strings
    fp_ids = np.unique(fp_all[:, 0]) # Trim duplicate filename rows, store unique filenames

    # Construct empty Pandas dataframe of correct size.
    # Number of rows is equal to the number of unique molecules (found in fp_ids).
    fingerprints = pd.DataFrame(0, index = fp_ids, columns=range(BITS), dtype=int)

    # Populate the dataframe using each molecule's filename to place data in the correct row.
    for row in fp_all:
        fingerprints.at[row[0], int(row[1])] = int(row[2])

    # Convert populated dataframe into a numpy array for use as output by neural networks.
    return fingerprints

# Load the names of all substructures included in the CDK fingerprint in the correct order
# This is used for boxplots, when performance metrics for individual substructures are calculated.
def load_fingerprint_legend():
    fingerprint_legend = []
    # Open file containing substructure names.
    with open(fingerprints_names_path, 'r') as f:
        # Add each name to the list of substructure names.
        lines = list(islice(f, 0, None))
        for line in lines:
            fingerprint_legend.append(line[:-1])
    return fingerprint_legend

In [6]:
def dict_is_empty(sample_dict):
    return len(sample_dict.keys()) == 0

def get_mol_names(path_dir):
    return [file[:-3] for file in os.listdir(path_dir)]

Assigning .ms mass intensities to tree

In [7]:
component_set = set()

for formula in combined_index:
    pure_formula = formula.split('_loss')
    components = re.findall('[A-Z][^A-Z\d]*', pure_formula[0])
    for component in components:
        component_set.add(component)

mol_mass = {'C': 12.00000000000, 'H': 1.00782503214, 'O': 15.99491462210, 'N': 14.00307400524,
            'P': 30.97376151200, 'S': 31.97207069000, 'Cl': 34.96885271000, 'I': 126.904468, 'Br': 78.9183376,
            'Si': 27.9769265327, 'F': 18.99840320500, 'D': 2.01410177800}

ion_mass = {'H': 1.007276, 'Na': 22.989218, 'K': 38.963158}

def get_number(string):
    num = 1
    num_in_string = re.findall("[\d]+", string)
    if len(num_in_string) > 0:
        num = int(num_in_string[0])
    return num

def get_string(string):
    return re.findall("[^\d]+", string)[0]  

def get_molecular_mass(formula):
    mass = 0
    segments = re.findall('[A-Z][a-z]*[0-9]*', formula)
    for segment in segments:
        mass += mol_mass[get_string(segment)] * get_number(segment)
    
    return mass

fragment_mass_to_formula_dict = {}
loss_mass_to_formula_dict = {}

for formula in fragments_df.index:
    mass = get_molecular_mass(formula)
    mass_int, decimals = str(mass).split(".")
    if mass_int not in fragment_mass_to_formula_dict:
        fragment_mass_to_formula_dict[int(mass_int)] = {}
    fragment_mass_to_formula_dict[int(mass_int)][round(mass, 6)] = formula
    
for formula in losses_df.index:
    pure_formula = formula.split('_loss')
    mass = get_molecular_mass(pure_formula[0])
    mass_int, decimals = str(mass).split(".")
    if mass_int not in loss_mass_to_formula_dict:
        loss_mass_to_formula_dict[int(mass_int)] = {}
    loss_mass_to_formula_dict[int(mass_int)][round(mass, 6)] = pure_formula[0]

In [8]:
def subtract_formula(a, b):
    string = ""
    a_segments = re.findall('[A-Z][a-z]*[0-9]*', a)
    b_segments = re.findall('[A-Z][a-z]*[0-9]*', b)

    a_segment_count = [(get_string(a_segment), get_number(a_segment)) for a_segment in a_segments]

    for b_segment in b_segments:
        for index, a_segment in enumerate(a_segment_count):
            if a_segment[0] == get_string(b_segment):
                a_segment_count[index] = (a_segment[0], a_segment[1] - get_number(b_segment))

    for component, amount in a_segment_count:
        if amount != 0:
            if amount == 1:
                amount = ""
            string += component + str(amount)

    return string

In [9]:
def convert_lines_to_list(lines):
    mass_intensity_list = []
    for line in lines:
        if ' ' in line:  # Only lines with mass and intensity values have a space. Ignores label/blank lines
            split_line = line.split()
            mass = float(split_line[0])
            intensity = float(split_line[1])
            mass_intensity_list.append((mass, intensity))
            
    return mass_intensity_list

def count_in_vector(mass_intensities, ionization):
    count = 0
    for mass, intensity in mass_intensities:
        non_ionized_mass = mass - ionization
        mass_int, decimals = str(non_ionized_mass).split(".")
        if int(mass_int) in fragment_mass_to_formula_dict:
            for full_mass, formula in fragment_mass_to_formula_dict[int(mass_int)].items():
                if full_mass-0.0001*full_mass <= non_ionized_mass <= full_mass+0.0001*full_mass:
                    count += 1
    return count

def assign_to_tree(gnps_dir):
    mol_ids = get_mol_names(gnps_dir)
    tree_intensities = pd.DataFrame(0.0, index=mol_ids, columns=combined_index, dtype=float)
    
    for file in os.listdir(gnps_dir):
        assigned_formula_intensity = []
        filepath = os.path.join(gnps_dir, file)
        print(file)
        with open(filepath, 'r') as f:
            ionization_entries_count = []
            unsplit_lines = list(islice(f, 9, None))
            mass_intensity_list = convert_lines_to_list(unsplit_lines)
            for ion in ion_mass.keys():
                ionization_entries_count.append((ion, count_in_vector(mass_intensity_list, ion_mass[ion])))
            ion_with_most_entries = max(ionization_entries_count, key=lambda x:x[1])[0]
            ionization = ion_mass[ion_with_most_entries]

            for mass, intensity in mass_intensity_list:
                non_ionized_mass = mass - ionization
                mass_int, decimals = str(non_ionized_mass).split(".")
                if int(mass_int) in fragment_mass_to_formula_dict:
                    for full_mass, formula in fragment_mass_to_formula_dict[int(mass_int)].items():
                        if full_mass-0.0001*full_mass <= non_ionized_mass <= full_mass+0.0001*full_mass:
                            tree_intensities.at[file[:-3], fragment_mass_to_formula_dict[int(mass_int)][full_mass]] = float(intensity)
                            assigned_formula_intensity.append((fragment_mass_to_formula_dict[int(mass_int)][full_mass], intensity))

            assigned_formula_intensity.reverse() # highest to smallest

            peak_differences = [(subtract_formula(formula_intensity_i[0], formula_intensity_j[0]) + "_loss", 
                                 (formula_intensity_i[1] + formula_intensity_i[1]) / 2.0 ) 
                                for i, formula_intensity_i in enumerate(assigned_formula_intensity)
                                for j, formula_intensity_j in enumerate(assigned_formula_intensity) 
                                if i != j 
                                and j > i
                                and len(re.findall('[A-Z][a-z]*[0-9]*', formula_intensity_i[0])) >= len(re.findall('[A-Z][a-z]*[0-9]*', formula_intensity_j[0]))
                                and subtract_formula(formula_intensity_i[0], formula_intensity_j[0]) + "_loss" in losses_df.index]

            for formula, intensity in peak_differences:
                tree_intensities.at[file[:-3], formula] = float(intensity)
            
            if np.amax(tree_intensities.loc[file[:-3]]) > 0:
                tree_intensities.loc[file[:-3]] = tree_intensities.loc[file[:-3]].div(np.amax(tree_intensities.loc[file[:-3]].values)).mul(999)
            else:
                tree_intensities.drop([file[:-3]], inplace=True)
                
    return tree_intensities

assigned_tree = assign_to_tree(gnps_dir)

CCMSLIB00000001548.ms
CCMSLIB00000001549.ms
CCMSLIB00000001550.ms
CCMSLIB00000001555.ms
CCMSLIB00000001563.ms
CCMSLIB00000001565.ms
CCMSLIB00000001566.ms
CCMSLIB00000001568.ms
CCMSLIB00000001569.ms
CCMSLIB00000001570.ms
CCMSLIB00000001572.ms
CCMSLIB00000001574.ms
CCMSLIB00000001576.ms
CCMSLIB00000001581.ms
CCMSLIB00000001590.ms
CCMSLIB00000001598.ms
CCMSLIB00000001600.ms
CCMSLIB00000001601.ms
CCMSLIB00000001602.ms
CCMSLIB00000001603.ms
CCMSLIB00000001604.ms
CCMSLIB00000001606.ms
CCMSLIB00000001607.ms
CCMSLIB00000001608.ms
CCMSLIB00000001609.ms
CCMSLIB00000001615.ms
CCMSLIB00000001616.ms
CCMSLIB00000001617.ms
CCMSLIB00000001621.ms
CCMSLIB00000001622.ms
CCMSLIB00000001623.ms
CCMSLIB00000001624.ms
CCMSLIB00000001625.ms
CCMSLIB00000001631.ms
CCMSLIB00000001633.ms
CCMSLIB00000001634.ms
CCMSLIB00000001635.ms
CCMSLIB00000001637.ms
CCMSLIB00000001638.ms
CCMSLIB00000001641.ms
CCMSLIB00000001642.ms
CCMSLIB00000001643.ms
CCMSLIB00000001645.ms
CCMSLIB00000001646.ms
CCMSLIB00000001650.ms
CCMSLIB000

CCMSLIB00000005996.ms
CCMSLIB00000005999.ms
CCMSLIB00000006008.ms
CCMSLIB00000006011.ms
CCMSLIB00000006014.ms
CCMSLIB00000006017.ms
CCMSLIB00000006020.ms
CCMSLIB00000006029.ms
CCMSLIB00000006035.ms
CCMSLIB00000006038.ms
CCMSLIB00000006041.ms
CCMSLIB00000006047.ms
CCMSLIB00000006050.ms
CCMSLIB00000006059.ms
CCMSLIB00000006062.ms
CCMSLIB00000006068.ms
CCMSLIB00000006071.ms
CCMSLIB00000006077.ms
CCMSLIB00000006080.ms
CCMSLIB00000006094.ms
CCMSLIB00000006097.ms
CCMSLIB00000006100.ms
CCMSLIB00000006115.ms
CCMSLIB00000006121.ms
CCMSLIB00000006130.ms
CCMSLIB00000006142.ms
CCMSLIB00000006145.ms
CCMSLIB00000006148.ms
CCMSLIB00000006154.ms
CCMSLIB00000006157.ms
CCMSLIB00000006160.ms
CCMSLIB00000006166.ms
CCMSLIB00000006169.ms
CCMSLIB00000006178.ms
CCMSLIB00000006181.ms
CCMSLIB00000006184.ms
CCMSLIB00000006187.ms
CCMSLIB00000006190.ms
CCMSLIB00000006202.ms
CCMSLIB00000006205.ms
CCMSLIB00000006208.ms
CCMSLIB00000006211.ms
CCMSLIB00000006214.ms
CCMSLIB00000006217.ms
CCMSLIB00000006232.ms
CCMSLIB000

CCMSLIB00000077069.ms
CCMSLIB00000077071.ms
CCMSLIB00000077072.ms
CCMSLIB00000077073.ms
CCMSLIB00000077074.ms
CCMSLIB00000077075.ms
CCMSLIB00000077076.ms
CCMSLIB00000077077.ms
CCMSLIB00000077078.ms
CCMSLIB00000077079.ms
CCMSLIB00000077080.ms
CCMSLIB00000077082.ms
CCMSLIB00000077086.ms
CCMSLIB00000077088.ms
CCMSLIB00000077089.ms
CCMSLIB00000077092.ms
CCMSLIB00000077097.ms
CCMSLIB00000077098.ms
CCMSLIB00000077099.ms
CCMSLIB00000077100.ms
CCMSLIB00000077101.ms
CCMSLIB00000077102.ms
CCMSLIB00000077106.ms
CCMSLIB00000077107.ms
CCMSLIB00000077108.ms
CCMSLIB00000077111.ms
CCMSLIB00000077112.ms
CCMSLIB00000077113.ms
CCMSLIB00000077114.ms
CCMSLIB00000077115.ms
CCMSLIB00000077117.ms
CCMSLIB00000077118.ms
CCMSLIB00000077121.ms
CCMSLIB00000077123.ms
CCMSLIB00000077124.ms
CCMSLIB00000077125.ms
CCMSLIB00000077126.ms
CCMSLIB00000077129.ms
CCMSLIB00000077130.ms
CCMSLIB00000077131.ms
CCMSLIB00000077132.ms
CCMSLIB00000077134.ms
CCMSLIB00000077135.ms
CCMSLIB00000077136.ms
CCMSLIB00000077137.ms
CCMSLIB000

CCMSLIB00000078337.ms
CCMSLIB00000078338.ms
CCMSLIB00000078339.ms
CCMSLIB00000078341.ms
CCMSLIB00000078342.ms
CCMSLIB00000078343.ms
CCMSLIB00000078344.ms
CCMSLIB00000078345.ms
CCMSLIB00000078346.ms
CCMSLIB00000078347.ms
CCMSLIB00000078348.ms
CCMSLIB00000078351.ms
CCMSLIB00000078352.ms
CCMSLIB00000078353.ms
CCMSLIB00000078354.ms
CCMSLIB00000078355.ms
CCMSLIB00000078356.ms
CCMSLIB00000078358.ms
CCMSLIB00000078359.ms
CCMSLIB00000078360.ms
CCMSLIB00000078361.ms
CCMSLIB00000078362.ms
CCMSLIB00000078363.ms
CCMSLIB00000078364.ms
CCMSLIB00000078365.ms
CCMSLIB00000078367.ms
CCMSLIB00000078368.ms
CCMSLIB00000078369.ms
CCMSLIB00000078371.ms
CCMSLIB00000078372.ms
CCMSLIB00000078373.ms
CCMSLIB00000078374.ms
CCMSLIB00000078375.ms
CCMSLIB00000078377.ms
CCMSLIB00000078378.ms
CCMSLIB00000078379.ms
CCMSLIB00000078381.ms
CCMSLIB00000078383.ms
CCMSLIB00000078384.ms
CCMSLIB00000078385.ms
CCMSLIB00000078386.ms
CCMSLIB00000078387.ms
CCMSLIB00000078388.ms
CCMSLIB00000078390.ms
CCMSLIB00000078391.ms
CCMSLIB000

CCMSLIB00000078789.ms
CCMSLIB00000078790.ms
CCMSLIB00000078791.ms
CCMSLIB00000078792.ms
CCMSLIB00000078793.ms
CCMSLIB00000078794.ms
CCMSLIB00000078795.ms
CCMSLIB00000078796.ms
CCMSLIB00000078797.ms
CCMSLIB00000078798.ms
CCMSLIB00000078799.ms
CCMSLIB00000078801.ms
CCMSLIB00000078802.ms
CCMSLIB00000078803.ms
CCMSLIB00000078804.ms
CCMSLIB00000078805.ms
CCMSLIB00000078806.ms
CCMSLIB00000078807.ms
CCMSLIB00000078808.ms
CCMSLIB00000078809.ms
CCMSLIB00000078810.ms
CCMSLIB00000078811.ms
CCMSLIB00000078812.ms
CCMSLIB00000078813.ms
CCMSLIB00000078814.ms
CCMSLIB00000078815.ms
CCMSLIB00000078816.ms
CCMSLIB00000078817.ms
CCMSLIB00000078818.ms
CCMSLIB00000078819.ms
CCMSLIB00000078820.ms
CCMSLIB00000078821.ms
CCMSLIB00000078822.ms
CCMSLIB00000078845.ms
CCMSLIB00000078850.ms
CCMSLIB00000078851.ms
CCMSLIB00000078852.ms
CCMSLIB00000078855.ms
CCMSLIB00000078856.ms
CCMSLIB00000078857.ms
CCMSLIB00000078858.ms
CCMSLIB00000078859.ms
CCMSLIB00000078861.ms
CCMSLIB00000078862.ms
CCMSLIB00000078866.ms
CCMSLIB000

CCMSLIB00000079266.ms
CCMSLIB00000079267.ms
CCMSLIB00000079268.ms
CCMSLIB00000079271.ms
CCMSLIB00000079272.ms
CCMSLIB00000079273.ms
CCMSLIB00000079274.ms
CCMSLIB00000079275.ms
CCMSLIB00000079350.ms
CCMSLIB00000079351.ms
CCMSLIB00000079352.ms
CCMSLIB00000079353.ms
CCMSLIB00000079354.ms
CCMSLIB00000079355.ms
CCMSLIB00000079356.ms
CCMSLIB00000079357.ms
CCMSLIB00000079358.ms
CCMSLIB00000079359.ms
CCMSLIB00000079360.ms
CCMSLIB00000079361.ms
CCMSLIB00000079362.ms
CCMSLIB00000079363.ms
CCMSLIB00000079364.ms
CCMSLIB00000079365.ms
CCMSLIB00000079366.ms
CCMSLIB00000079367.ms
CCMSLIB00000079368.ms
CCMSLIB00000079369.ms
CCMSLIB00000079370.ms
CCMSLIB00000079371.ms
CCMSLIB00000079372.ms
CCMSLIB00000079373.ms
CCMSLIB00000079374.ms
CCMSLIB00000079375.ms
CCMSLIB00000079376.ms
CCMSLIB00000079377.ms
CCMSLIB00000079378.ms
CCMSLIB00000079380.ms
CCMSLIB00000079381.ms
CCMSLIB00000079382.ms
CCMSLIB00000079383.ms
CCMSLIB00000079384.ms
CCMSLIB00000079385.ms
CCMSLIB00000079386.ms
CCMSLIB00000079387.ms
CCMSLIB000

CCMSLIB00000079721.ms
CCMSLIB00000079722.ms
CCMSLIB00000079723.ms
CCMSLIB00000079724.ms
CCMSLIB00000079725.ms
CCMSLIB00000079726.ms
CCMSLIB00000079727.ms
CCMSLIB00000079728.ms
CCMSLIB00000079729.ms
CCMSLIB00000079730.ms
CCMSLIB00000079731.ms
CCMSLIB00000079732.ms
CCMSLIB00000079733.ms
CCMSLIB00000079734.ms
CCMSLIB00000079735.ms
CCMSLIB00000079736.ms
CCMSLIB00000079737.ms
CCMSLIB00000079738.ms
CCMSLIB00000079739.ms
CCMSLIB00000079740.ms
CCMSLIB00000079741.ms
CCMSLIB00000079742.ms
CCMSLIB00000079743.ms
CCMSLIB00000079744.ms
CCMSLIB00000079745.ms
CCMSLIB00000079746.ms
CCMSLIB00000079747.ms
CCMSLIB00000079749.ms
CCMSLIB00000079750.ms
CCMSLIB00000079751.ms
CCMSLIB00000079752.ms
CCMSLIB00000079753.ms
CCMSLIB00000079754.ms
CCMSLIB00000079755.ms
CCMSLIB00000079756.ms
CCMSLIB00000079757.ms
CCMSLIB00000079758.ms
CCMSLIB00000079759.ms
CCMSLIB00000079760.ms
CCMSLIB00000079761.ms
CCMSLIB00000079762.ms
CCMSLIB00000079763.ms
CCMSLIB00000079764.ms
CCMSLIB00000079765.ms
CCMSLIB00000079766.ms
CCMSLIB000

CCMSLIB00000080105.ms
CCMSLIB00000080106.ms
CCMSLIB00000080107.ms
CCMSLIB00000080108.ms
CCMSLIB00000080109.ms
CCMSLIB00000080110.ms
CCMSLIB00000080111.ms
CCMSLIB00000080112.ms
CCMSLIB00000080113.ms
CCMSLIB00000080114.ms
CCMSLIB00000080115.ms
CCMSLIB00000080116.ms
CCMSLIB00000080117.ms
CCMSLIB00000080118.ms
CCMSLIB00000080119.ms
CCMSLIB00000080120.ms
CCMSLIB00000080121.ms
CCMSLIB00000080122.ms
CCMSLIB00000080123.ms
CCMSLIB00000080124.ms
CCMSLIB00000080125.ms
CCMSLIB00000080126.ms
CCMSLIB00000080127.ms
CCMSLIB00000080128.ms
CCMSLIB00000080129.ms
CCMSLIB00000080130.ms
CCMSLIB00000080131.ms
CCMSLIB00000080132.ms
CCMSLIB00000080133.ms
CCMSLIB00000080134.ms
CCMSLIB00000080135.ms
CCMSLIB00000080136.ms
CCMSLIB00000080137.ms
CCMSLIB00000080138.ms
CCMSLIB00000080139.ms
CCMSLIB00000080140.ms
CCMSLIB00000080141.ms
CCMSLIB00000080142.ms
CCMSLIB00000080143.ms
CCMSLIB00000080144.ms
CCMSLIB00000080145.ms
CCMSLIB00000080146.ms
CCMSLIB00000080147.ms
CCMSLIB00000080148.ms
CCMSLIB00000080149.ms
CCMSLIB000

CCMSLIB00000080484.ms
CCMSLIB00000080485.ms
CCMSLIB00000080486.ms
CCMSLIB00000080487.ms
CCMSLIB00000080488.ms
CCMSLIB00000080489.ms
CCMSLIB00000080490.ms
CCMSLIB00000080491.ms
CCMSLIB00000080492.ms
CCMSLIB00000080493.ms
CCMSLIB00000080494.ms
CCMSLIB00000080495.ms
CCMSLIB00000080496.ms
CCMSLIB00000080497.ms
CCMSLIB00000080498.ms
CCMSLIB00000080499.ms
CCMSLIB00000080500.ms
CCMSLIB00000080502.ms
CCMSLIB00000080503.ms
CCMSLIB00000080504.ms
CCMSLIB00000080505.ms
CCMSLIB00000080506.ms
CCMSLIB00000080507.ms
CCMSLIB00000080508.ms
CCMSLIB00000080509.ms
CCMSLIB00000080510.ms
CCMSLIB00000080511.ms
CCMSLIB00000080512.ms
CCMSLIB00000080513.ms
CCMSLIB00000080514.ms
CCMSLIB00000080515.ms
CCMSLIB00000080516.ms
CCMSLIB00000080517.ms
CCMSLIB00000080518.ms
CCMSLIB00000080519.ms
CCMSLIB00000080520.ms
CCMSLIB00000080521.ms
CCMSLIB00000080522.ms
CCMSLIB00000080523.ms
CCMSLIB00000080524.ms
CCMSLIB00000080525.ms
CCMSLIB00000080526.ms
CCMSLIB00000080527.ms
CCMSLIB00000080528.ms
CCMSLIB00000080529.ms
CCMSLIB000

CCMSLIB00000084849.ms
CCMSLIB00000084850.ms
CCMSLIB00000084851.ms
CCMSLIB00000084852.ms
CCMSLIB00000084853.ms
CCMSLIB00000084854.ms
CCMSLIB00000084855.ms
CCMSLIB00000084856.ms
CCMSLIB00000084857.ms
CCMSLIB00000084858.ms
CCMSLIB00000084859.ms
CCMSLIB00000084860.ms
CCMSLIB00000084861.ms
CCMSLIB00000084862.ms
CCMSLIB00000084863.ms
CCMSLIB00000084864.ms
CCMSLIB00000084865.ms
CCMSLIB00000084866.ms
CCMSLIB00000084867.ms
CCMSLIB00000084868.ms
CCMSLIB00000084869.ms
CCMSLIB00000084870.ms
CCMSLIB00000084871.ms
CCMSLIB00000084872.ms
CCMSLIB00000084874.ms
CCMSLIB00000084875.ms
CCMSLIB00000084876.ms
CCMSLIB00000084877.ms
CCMSLIB00000084879.ms
CCMSLIB00000084880.ms
CCMSLIB00000084881.ms
CCMSLIB00000084882.ms
CCMSLIB00000084884.ms
CCMSLIB00000084885.ms
CCMSLIB00000084886.ms
CCMSLIB00000084887.ms
CCMSLIB00000084889.ms
CCMSLIB00000084890.ms
CCMSLIB00000084891.ms
CCMSLIB00000084892.ms
CCMSLIB00000084893.ms
CCMSLIB00000084894.ms
CCMSLIB00000084895.ms
CCMSLIB00000084896.ms
CCMSLIB00000084897.ms
CCMSLIB000

CCMSLIB00000085288.ms
CCMSLIB00000085289.ms
CCMSLIB00000085290.ms
CCMSLIB00000085291.ms
CCMSLIB00000085292.ms
CCMSLIB00000085293.ms
CCMSLIB00000085294.ms
CCMSLIB00000085295.ms
CCMSLIB00000085296.ms
CCMSLIB00000085297.ms
CCMSLIB00000085298.ms
CCMSLIB00000085299.ms
CCMSLIB00000085301.ms
CCMSLIB00000085302.ms
CCMSLIB00000085303.ms
CCMSLIB00000085305.ms
CCMSLIB00000085306.ms
CCMSLIB00000085307.ms
CCMSLIB00000085309.ms
CCMSLIB00000085310.ms
CCMSLIB00000085312.ms
CCMSLIB00000085313.ms
CCMSLIB00000085314.ms
CCMSLIB00000085315.ms
CCMSLIB00000085317.ms
CCMSLIB00000085318.ms
CCMSLIB00000085319.ms
CCMSLIB00000085320.ms
CCMSLIB00000085321.ms
CCMSLIB00000085322.ms
CCMSLIB00000085323.ms
CCMSLIB00000085324.ms
CCMSLIB00000085325.ms
CCMSLIB00000085326.ms
CCMSLIB00000085327.ms
CCMSLIB00000085328.ms
CCMSLIB00000085329.ms
CCMSLIB00000085330.ms
CCMSLIB00000085331.ms
CCMSLIB00000085332.ms
CCMSLIB00000085333.ms
CCMSLIB00000085334.ms
CCMSLIB00000085335.ms
CCMSLIB00000085336.ms
CCMSLIB00000085337.ms
CCMSLIB000

CCMSLIB00000085690.ms
CCMSLIB00000085691.ms
CCMSLIB00000085692.ms
CCMSLIB00000085693.ms
CCMSLIB00000085695.ms
CCMSLIB00000085696.ms
CCMSLIB00000085697.ms
CCMSLIB00000085698.ms
CCMSLIB00000085699.ms
CCMSLIB00000085700.ms
CCMSLIB00000085701.ms
CCMSLIB00000085702.ms
CCMSLIB00000085703.ms
CCMSLIB00000085704.ms
CCMSLIB00000085705.ms
CCMSLIB00000085706.ms
CCMSLIB00000085707.ms
CCMSLIB00000085710.ms
CCMSLIB00000085711.ms
CCMSLIB00000085713.ms
CCMSLIB00000085714.ms
CCMSLIB00000085715.ms
CCMSLIB00000085716.ms
CCMSLIB00000085717.ms
CCMSLIB00000085718.ms
CCMSLIB00000085719.ms
CCMSLIB00000085720.ms
CCMSLIB00000085721.ms
CCMSLIB00000085722.ms
CCMSLIB00000085723.ms
CCMSLIB00000085724.ms
CCMSLIB00000085725.ms
CCMSLIB00000085726.ms
CCMSLIB00000085727.ms
CCMSLIB00000085728.ms
CCMSLIB00000085729.ms
CCMSLIB00000085731.ms
CCMSLIB00000085732.ms
CCMSLIB00000085733.ms
CCMSLIB00000085734.ms
CCMSLIB00000085735.ms
CCMSLIB00000085736.ms
CCMSLIB00000085737.ms
CCMSLIB00000085738.ms
CCMSLIB00000085739.ms
CCMSLIB000

CCMSLIB00000086094.ms
CCMSLIB00000086095.ms
CCMSLIB00000086096.ms
CCMSLIB00000086097.ms
CCMSLIB00000086098.ms
CCMSLIB00000086099.ms
CCMSLIB00000086101.ms
CCMSLIB00000086102.ms
CCMSLIB00000086103.ms
CCMSLIB00000086104.ms
CCMSLIB00000086105.ms
CCMSLIB00000086106.ms
CCMSLIB00000086107.ms
CCMSLIB00000086108.ms
CCMSLIB00000086109.ms
CCMSLIB00000086110.ms
CCMSLIB00000086111.ms
CCMSLIB00000086112.ms
CCMSLIB00000086113.ms
CCMSLIB00000086114.ms
CCMSLIB00000086115.ms
CCMSLIB00000086116.ms
CCMSLIB00000086117.ms
CCMSLIB00000086118.ms
CCMSLIB00000086119.ms
CCMSLIB00000086120.ms
CCMSLIB00000086121.ms
CCMSLIB00000086122.ms
CCMSLIB00000086124.ms
CCMSLIB00000086125.ms
CCMSLIB00000086126.ms
CCMSLIB00000086127.ms
CCMSLIB00000086128.ms
CCMSLIB00000086129.ms
CCMSLIB00000086130.ms
CCMSLIB00000086131.ms
CCMSLIB00000086132.ms
CCMSLIB00000086133.ms
CCMSLIB00000086134.ms
CCMSLIB00000086135.ms
CCMSLIB00000086136.ms
CCMSLIB00000086137.ms
CCMSLIB00000086138.ms
CCMSLIB00000086139.ms
CCMSLIB00000086140.ms
CCMSLIB000

CCMSLIB00000478085.ms
CCMSLIB00000478087.ms
CCMSLIB00000478088.ms
CCMSLIB00000478090.ms
CCMSLIB00000478091.ms
CCMSLIB00000478092.ms
CCMSLIB00000478093.ms
CCMSLIB00000478094.ms
CCMSLIB00000478096.ms
CCMSLIB00000478098.ms
CCMSLIB00000478099.ms
CCMSLIB00000478100.ms
CCMSLIB00000478101.ms
CCMSLIB00000478102.ms
CCMSLIB00000478104.ms
CCMSLIB00000478105.ms
CCMSLIB00000478112.ms
CCMSLIB00000478116.ms
CCMSLIB00000478118.ms
CCMSLIB00000478119.ms
CCMSLIB00000478120.ms
CCMSLIB00000478121.ms
CCMSLIB00000478122.ms
CCMSLIB00000478123.ms
CCMSLIB00000478414.ms
CCMSLIB00000478415.ms
CCMSLIB00000478416.ms
CCMSLIB00000478417.ms
CCMSLIB00000478418.ms
CCMSLIB00000478419.ms
CCMSLIB00000478420.ms
CCMSLIB00000478421.ms
CCMSLIB00000478422.ms
CCMSLIB00000478423.ms
CCMSLIB00000478424.ms
CCMSLIB00000478425.ms
CCMSLIB00000478426.ms
CCMSLIB00000478427.ms
CCMSLIB00000478428.ms
CCMSLIB00000478429.ms
CCMSLIB00000478432.ms
CCMSLIB00000478433.ms
CCMSLIB00000478435.ms
CCMSLIB00000478436.ms
CCMSLIB00000478437.ms
CCMSLIB000

CCMSLIB00000577680.ms
CCMSLIB00000577681.ms
CCMSLIB00000577682.ms
CCMSLIB00000577683.ms
CCMSLIB00000577684.ms
CCMSLIB00000577685.ms
CCMSLIB00000577688.ms
CCMSLIB00000577691.ms
CCMSLIB00000577692.ms
CCMSLIB00000577693.ms
CCMSLIB00000577695.ms
CCMSLIB00000577696.ms
CCMSLIB00000577697.ms
CCMSLIB00000577700.ms
CCMSLIB00000577701.ms
CCMSLIB00000577703.ms
CCMSLIB00000577704.ms
CCMSLIB00000577705.ms
CCMSLIB00000577706.ms
CCMSLIB00000577708.ms
CCMSLIB00000577709.ms
CCMSLIB00000577712.ms
CCMSLIB00000577713.ms
CCMSLIB00000577715.ms
CCMSLIB00000577717.ms
CCMSLIB00000577718.ms
CCMSLIB00000577719.ms
CCMSLIB00000577720.ms
CCMSLIB00000577721.ms
CCMSLIB00000577723.ms
CCMSLIB00000577727.ms
CCMSLIB00000577728.ms
CCMSLIB00000577730.ms
CCMSLIB00000577731.ms
CCMSLIB00000577734.ms
CCMSLIB00000577737.ms
CCMSLIB00000577740.ms
CCMSLIB00000577741.ms
CCMSLIB00000577742.ms
CCMSLIB00000577743.ms
CCMSLIB00000577744.ms
CCMSLIB00000577745.ms
CCMSLIB00000577746.ms
CCMSLIB00000577749.ms
CCMSLIB00000577750.ms
CCMSLIB000

CCMSLIB00000579779.ms
CCMSLIB00000579781.ms
CCMSLIB00000579782.ms
CCMSLIB00000579783.ms
CCMSLIB00000579784.ms
CCMSLIB00000579785.ms
CCMSLIB00000579786.ms
CCMSLIB00000579787.ms
CCMSLIB00000579788.ms
CCMSLIB00000579789.ms
CCMSLIB00000579790.ms
CCMSLIB00000579791.ms
CCMSLIB00000579792.ms
CCMSLIB00000579793.ms
CCMSLIB00000579794.ms
CCMSLIB00000579795.ms
CCMSLIB00000579796.ms
CCMSLIB00000579797.ms
CCMSLIB00000579798.ms
CCMSLIB00000579799.ms
CCMSLIB00000579800.ms
CCMSLIB00000579801.ms
CCMSLIB00000579802.ms
CCMSLIB00000579803.ms
CCMSLIB00000579805.ms
CCMSLIB00000579806.ms
CCMSLIB00000579807.ms
CCMSLIB00000579808.ms
CCMSLIB00000579809.ms
CCMSLIB00000579810.ms
CCMSLIB00000579811.ms
CCMSLIB00000579812.ms
CCMSLIB00000579813.ms
CCMSLIB00000579814.ms
CCMSLIB00000579815.ms
CCMSLIB00000579816.ms
CCMSLIB00000579817.ms
CCMSLIB00000579818.ms
CCMSLIB00000579819.ms
CCMSLIB00000579820.ms
CCMSLIB00000579822.ms
CCMSLIB00000579823.ms
CCMSLIB00000579825.ms
CCMSLIB00000579826.ms
CCMSLIB00000579827.ms
CCMSLIB000

In [13]:
assigned_tree_path = "G:\\Dev\\Data\\assigned_tree_final.pkl"

In [14]:
assigned_tree.to_pickle(assigned_tree_path)

In [15]:
assigned_tree = pd.read_pickle(assigned_tree_path)
print(assigned_tree)

                    BrH_loss  Br_loss  C10  C10H10  C10H10ClN  C10H10ClNO  \
CCMSLIB00000001548       0.0      0.0  0.0     0.0        0.0    0.000000   
CCMSLIB00000001549       0.0      0.0  0.0     0.0        0.0  385.558788   
CCMSLIB00000001550       0.0      0.0  0.0     0.0        0.0    0.000000   
CCMSLIB00000001555       0.0      0.0  0.0     0.0        0.0    0.000000   
CCMSLIB00000001563       0.0      0.0  0.0     0.0        0.0    0.000000   
CCMSLIB00000001565       0.0      0.0  0.0     0.0        0.0    0.000000   
CCMSLIB00000001566       0.0      0.0  0.0     0.0        0.0    0.000000   
CCMSLIB00000001568       0.0      0.0  0.0     0.0        0.0    0.000000   
CCMSLIB00000001569       0.0      0.0  0.0     0.0        0.0    0.000000   
CCMSLIB00000001570       0.0      0.0  0.0     0.0        0.0    0.000000   
CCMSLIB00000001572       0.0      0.0  0.0     0.0        0.0    0.000000   
CCMSLIB00000001574       0.0      0.0  0.0     0.0        0.0    0.000000   

In [16]:
from keras.layers import Input, Dense
from keras.models import Model,Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from keras.optimizers import SGD

def baseline_model(x_train_formula, x_train_fingerprints):
    class_model = Sequential()
    class_model.add(Dense(2500, input_dim=x_train_formula.shape[1], kernel_initializer='normal', activation='relu'))
    class_model.add(Dense(1200,kernel_initializer='normal',activation = 'relu'))
    class_model.add(Dense(600,kernel_initializer='normal',activation = 'relu'))
    class_model.add(Dense(x_train_fingerprints.shape[1],kernel_initializer='normal',activation = 'sigmoid'))
    class_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return class_model

Using TensorFlow backend.


In [17]:
fingerprints_path = "G:\\Dev\\Data\\1000\\GNPS Python Master\\Final Fingerprints.txt"

In [18]:
fingerprints = load_fingerprints_master(fingerprints_path)

for idx, value in fingerprints.astype(bool).sum(axis=0).iteritems():
    if value < 10:
        fingerprints.drop(columns=[idx], inplace=True)
        
fingerprints.sort_index(inplace=True)

print(fingerprints)

                    0    1    2    3    4    5    7    8    11   12  ...   \
CCMSLIB00000001548    1    1    1    0    0    0    0    0    1    0 ...    
CCMSLIB00000001549    1    1    0    0    1    0    0    0    1    0 ...    
CCMSLIB00000001550    0    0    0    0    1    0    0    0    0    0 ...    
CCMSLIB00000001555    1    1    1    0    0    0    1    0    1    0 ...    
CCMSLIB00000001563    1    1    1    0    0    0    0    0    0    0 ...    
CCMSLIB00000001565    1    1    1    0    1    1    0    0    0    0 ...    
CCMSLIB00000001566    1    1    1    0    0    0    0    0    0    0 ...    
CCMSLIB00000001568    1    1    1    0    0    0    0    0    1    1 ...    
CCMSLIB00000001569    1    1    1    0    1    1    0    0    0    0 ...    
CCMSLIB00000001570    1    1    1    0    0    0    0    0    1    1 ...    
CCMSLIB00000001572    1    1    1    0    0    0    0    0    1    1 ...    
CCMSLIB00000001574    1    1    0    0    0    0    0    0    0    0 ...    

In [19]:
def compute_auc(indexes, true, pred):
    auc_scores = []
    
    for i, index in enumerate(indexes):
        nonzero_vals = np.count_nonzero(true[:, i]) # Count number of nonzero values
        if nonzero_vals > 0 and nonzero_vals < true[:, i].size: # If there are no 1s or no 0s, can't compute.
            fp_true = true[:, i]
            fp_pred = pred[:, i]
            score = metrics.roc_auc_score(fp_true, fp_pred)
            auc_scores.append((index, score))
        else:
            auc_scores.append((index, 0.0))
            
    print("Compute AUC done")
    return auc_scores

def compute_f1(indexes, true, pred):
    f1_scores = []
    
    for i, index in enumerate(indexes):
        nonzero_vals = np.count_nonzero(true[:, i]) # Count number of nonzero values
        if nonzero_vals > 0 and nonzero_vals < true[:, i].size: # If there are no 1s or no 0s, can't compute.
            fp_true = true[:, i]
            fp_pred = pred[:, i]
            score = metrics.f1_score(fp_true, fp_pred, average='micro')
            f1_scores.append((index, score))
        else:
            f1_scores.append((index, 0.0))
            
    print("Compute F1 done")
    return f1_scores

In [20]:
def get_tree_loss_training(fingerprints, train_tree):
    train_tree.sort_index(inplace=True)
    x_train_formula = np.log(train_tree.values+1)
    
    train_fingerprints = fingerprints[fingerprints.index.isin(train_tree.index)]
    train_fingerprints.sort_index(inplace=True)
    x_train_fingerprints = train_fingerprints.values
    
    return x_train_formula, x_train_fingerprints

def get_tree_loss_validation(fingerprints, validate_tree):
    validate_tree.sort_index(inplace=True)
    x_validate_formula = np.log(validate_tree.values+1)
    
    validate_fingerprints = fingerprints[fingerprints.index.isin(validate_tree.index)]
    validate_fingerprints.sort_index(inplace=True)
    x_validate_fingerprints = validate_fingerprints.values
    
    return x_validate_formula, x_validate_fingerprints

def get_tree_loss_test(fingerprints, test_tree):
    test_tree.sort_index(inplace=True)
    x_test_formula = np.log(test_tree.values+1)
    
    test_fingerprints = fingerprints[fingerprints.index.isin(test_tree.index)]
    test_fingerprints.sort_index(inplace=True)
    x_test_fingerprints = test_fingerprints.values
    
    return x_test_formula, x_test_fingerprints

def run_experiments(fingerprints, tree_with_losses):
    epochs = 100

    train, validate, test = np.split(tree_with_losses.sample(frac=1), [int(.6*len(tree_with_losses)), int(.8*len(tree_with_losses))])
    
    x_train_formula_with_loss, x_train_fingerprints_with_loss = get_tree_loss_training(fingerprints, train)
    x_validate_formula_with_loss, x_validate_fingerprints_with_loss = get_tree_loss_validation(fingerprints, validate)
    x_test_formula_with_loss, x_test_fingerprints_with_loss = get_tree_loss_test(fingerprints, test)
    
    mod = baseline_model(x_train_formula_with_loss, x_train_fingerprints_with_loss)
    history = mod.fit(x_train_formula_with_loss, x_train_fingerprints_with_loss, epochs=epochs, 
                        validation_data=(x_validate_formula_with_loss,x_validate_fingerprints_with_loss), verbose=0)
    
    predicted = mod.predict(x_test_formula_with_loss)
    stats = compute_auc(fingerprints.columns.tolist(), x_test_fingerprints_with_loss, predicted)
    
    prediction = np.zeros((predicted.shape))
    prediction[predicted > 0.5] = 1
    f1_stats = compute_f1(fingerprints.columns.tolist(), x_test_fingerprints_with_loss, prediction)
    
    return stats, f1_stats


In [21]:
# experiment_auc_path = "G://Dev//Data//tree_vs_tree_with_shifts_experiment//"

for i in range(1):
    print(i)
    auc_stats, f1_stats = run_experiments(fingerprints, assigned_tree)
    print(auc_stats)
    print()
    print(f1_stats)
    
#     base_exp_aucs_path = experiment_auc_path + "experiment_{}_aucs.csv".format(i)
#     f1_scores_path = experiment_auc_path + "f1_scores_experiment_{}.csv".format(i)
    
#     with open(base_exp_aucs_path, 'w') as f:
#         for i, auc_score in enumerate(base_stats):
#             fingerprint_index, auc = auc_score
#             f.write(str(fingerprint_index) + "," + str(auc) + "," + str(comparison_stats[i][1]) + "\n")
    
#     with open(f1_scores_path, 'w') as f:
#         for i, f1_score in enumerate(base_f1_stats):
#             fingerprint_index, f1 = f1_score
#             f.write(str(fingerprint_index) + "," + str(f1) + "," + str(comparison_f1_stats[i][1]) + "\n")

0
Compute AUC done
Compute F1 done
[(0, 0.779719562243502), (1, 0.8078255347400629), (2, 0.8180961785635913), (3, 0.8362604060172337), (4, 0.7718704135370802), (5, 0.6908488063660477), (7, 0.6634994766084226), (8, 0.8772646536412076), (11, 0.7753380146871869), (12, 0.6387273566401293), (13, 0.7494522179490677), (14, 0.7973311132254995), (15, 0.7704833443299667), (16, 0.6780106571936056), (17, 0.7977557734157222), (18, 0.8202453987730061), (19, 0.6683610867659947), (20, 0.9005706760316067), (22, 0.7971984834937862), (23, 0.7036568213783403), (24, 0.7287599497276916), (25, 0.8054706331941913), (26, 0.7253193960511034), (27, 0.7876432486297957), (28, 0.8535309503051439), (29, 0.6223000583771162), (31, 0.4335222319093287), (32, 0.8183443841671689), (34, 0.8787481804949054), (36, 0.6691629955947136), (37, 0.6397896581945662), (38, 0.7881429816913688), (39, 0.8209050963789731), (40, 0.6860448873160737), (47, 0.8455178416013925), (48, 0.7439664939664941), (51, 0.7931034482758621), (54, 0.5224

In [None]:
experiment_auc_path = "G://Dev//Data//tree_vs_tree_with_shifts_experiment//"

exp_aucs_path = experiment_auc_path + "experiment_{}_aucs.csv".format(i)
f1_scores_path = experiment_auc_path + "f1_scores_experiment_{}.csv".format(i)

with open(base_exp_aucs_path, 'w') as f:
    for fingerprint_index, auc_score in stats:
        fingerprint_index, auc = auc_score
        f.write(str(fingerprint_index) + "," + str(auc) + "," + str(comparison_stats[i][1]) + "\n")

with open(f1_scores_path, 'w') as f:
    for i, f1_score in enumerate(base_f1_stats):
        fingerprint_index, f1 = f1_score
        f.write(str(fingerprint_index) + "," + str(f1) + "," + str(comparison_f1_stats[i][1]) + "\n")