In [1]:
import numpy as np # v. 1.22.1
from matplotlib import pyplot as plt # v. 3.7.5
import os
import pandas as pd # v. 1.5.3
import seaborn as sns # v. 0.12.2
import pykrev as pk # v. 1.2.4
import datetime

import sys
sys.path.append('../mchem_functions')
import ms_functions_and_defs as msf

In [2]:
# --- User configuration ------------------------------------------------------
# Folder containing the input CSV files; a later cell changes the working
# directory to it, so it is resolved relative to the notebook's start directory.
csv_dir = '../../Data/MS_data/CSV_files'
# Falsy (0) drops every file whose name contains "SRFA" (case-insensitive); truthy keeps them.
include_srfa = 1
# Minimum number of technical replicates in which a formula must have a value
# to be kept (used as the `thresh` of DataFrame.dropna further down).
formula_filter = 2 # 2 or 3, or however many technical replicates of the single biological replicate you have. If 3, 2 is a "soft" filter, 3 a "hard" filter
# Dates to restrict the run to; [] or [''] disables the date filter.
chosen_date = [''] # list of dates in the format dd/mm/yyyy ('/' is converted to '-' in the next cell)

In [3]:
# Move into csv_dir so that every later (relative) path is resolved inside it.
try:
    os.chdir(csv_dir)
except FileNotFoundError:
    # csv_dir is relative, so it no longer resolves once a previous run of this
    # cell has already moved into it. Keep going (best effort), but say where we
    # are so that a genuinely wrong csv_dir does not go unnoticed.
    print(f'Could not change directory to {csv_dir!r}; continuing in {os.getcwd()!r}')

# Accept a single date string as well as a list of dates; iterating over a bare
# string would otherwise split it into single characters.
if isinstance(chosen_date, str):
    chosen_date = [chosen_date]

# The user enters dates as dd/mm/yyyy; the file-name comparison and the output
# file names below use '-' as the separator instead.
chosen_date = [x.replace('/','-') for x in chosen_date]

In [4]:
# Folder (inside the current working directory) that receives the processed tables.
processed_csv_data_dir = 'processed_csv_data_dir'
# exist_ok=True makes the cell safe to re-run and removes the gap between
# "check that it exists" and "create it".
os.makedirs(processed_csv_data_dir, exist_ok=True)

In [5]:
# Collect the CSV files to process from the current working directory.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# run reproducible across machines and file systems.
files_list = sorted(os.listdir())
date_filter_active = chosen_date not in [[],['']]

csv_list = []
for f in files_list:
    if not f.endswith('.csv'):
        continue

    if date_filter_active:
        # The date is looked up in the second-to-last '_'-separated field of the
        # file name. Names with fewer than two fields cannot match, so they are
        # skipped (indexing [-2] on them would raise IndexError).
        name_fields = f.split('_')
        if len(name_fields) < 2 or name_fields[-2] not in chosen_date:
            continue

    csv_list.append(f)

# Optionally drop the SRFA files (matched case-insensitively on the file name).
if not include_srfa:
    csv_list = [csv for csv in csv_list if 'SRFA' not in csv.upper()]
# csv_list

In [6]:
def season(date:str,sep='-'):
    """Translate a month into the season label used in the short sample names.

    Parameters
    ----------
    date : str
        Either a bare month number (e.g. ``'11'``) or a date string whose
        second ``sep``-separated field is the month (e.g. ``'05-11-2024'``).
    sep : str
        Separator between the fields of ``date``.

    Returns
    -------
    str or None
        ``'Aut'`` (months 9-11), ``'Win1'`` (12 and 1), ``'Win2'`` (2),
        ``'Spr'`` (3-5) or ``'Sum'`` (6-8). Any other month number yields
        ``None``.
    """
    # A full date carries the month in its second field; otherwise the whole
    # string is taken to be the month.
    month_field = date.split(sep)[1] if sep in date else date
    month = int(month_field)

    label_by_month = {
        9: 'Aut', 10: 'Aut', 11: 'Aut',
        12: 'Win1', 1: 'Win1',
        2: 'Win2',
        3: 'Spr', 4: 'Spr', 5: 'Spr',
        6: 'Sum', 7: 'Sum', 8: 'Sum',
    }
    return label_by_month.get(month)

def csv_date_path(path:str):
    """Build an output CSV path, tagged with the selected date(s) if any.

    Reads the notebook-level ``chosen_date`` setting.

    Parameters
    ----------
    path : str
        Target path, with or without a trailing ``.csv``.

    Returns
    -------
    str
        ``<path>.csv`` when no date is selected (``chosen_date`` is ``[]``,
        ``['']`` or ``''``); otherwise ``<path>_<date>[_<date>...].csv``.
    """
    suffix = '.csv'

    # Strings are immutable, so the stripped value has to be assigned back;
    # without this a path that already ends in '.csv' comes out as '....csv.csv'.
    if path.endswith(suffix):
        path = path[:-len(suffix)]

    if chosen_date not in [[],[''],'']:
        if isinstance(chosen_date, str):
            path += f'_{chosen_date}'
        else:
            # join() puts exactly one '_' between dates, also when a date repeats
            path += '_' + '_'.join(chosen_date)

    return path + suffix

def short_name(longname:str):
    """Condense a long sample name into the short label used in the tables.

    ``'MP-'`` and ``'-OM'`` are stripped from ``longname`` and the rest is
    split on ``'-'``. The label is then assembled from those pieces:

    * a piece equal to ``'PL'`` is present: pieces 0 and 1 plus the season of
      piece 3;
    * the first piece contains ``'SRFA'`` (case-insensitive): all pieces are
      joined back unchanged;
    * otherwise: pieces 0 and 2 plus the season of piece 4.

    Parameters
    ----------
    longname : str
        Sample name as taken from the CSV file name.

    Returns
    -------
    str
        The ``'-'``-joined short label.
    """
    parts = longname.replace('MP-','').replace('-OM','').split('-')

    if 'PL' in parts:
        selected = [parts[0], parts[1], season(parts[3])]
    elif 'SRFA' in parts[0].upper():
        selected = parts
    else:
        selected = [parts[0], parts[2], season(parts[4])]

    return '-'.join(selected)

In [7]:
# Derive one sample name per CSV file from the '_'-separated fields of the file name.
sample_names = []
for csv in csv_list:
    name_fields = csv.split('_')

    if 'SRFA' not in csv.upper():
        # regular samples: the first field is the sample name
        sample_names.append(name_fields[0])
    else:
        # SRFA files: first two fields, joined with '_' and upper-cased
        sample_names.append('_'.join([name_fields[0], name_fields[1]]).upper())

# np.unique returns the distinct names as a sorted array
sample_names_unique = np.unique(np.asarray(sample_names))
# sample_names_unique

In [8]:
# Short label for every unique sample, in the same order as sample_names_unique.
short_names = list(map(short_name, sample_names_unique))
# short_names

In [9]:
# For every unique sample: load its replicate CSV files, keep the formulae that
# pass the replicate filter, and store per-replicate and averaged intensities and
# m/z values.
#
# samples_formulae_dict[short name] ends up with the keys
#   'formulae'             list of retained formulae (the filtered columns)
#   'assigned_intensities' DataFrame, one row per replicate, one column per formula
#   'avg_rel_intensities'  NaN-ignoring column means of the row-normalised intensities
#   'assigned_m/z'         DataFrame of m/z values with the same row/column labels
#   'avg_m/z'              NaN-ignoring column means of 'assigned_m/z'
# general_approved_formulae[short name] holds the same list as 'formulae'.
samples_formulae_dict = {}
general_approved_formulae = pk.msTupleDict()

for name in sample_names_unique:
    # NOTE(review): this is a substring match, so a sample name that is contained
    # in another sample's file name would pick up those files too. SRFA names were
    # upper-cased when sample_names was built, so they only match upper-case file
    # names - confirm the naming scheme rules both cases out.
    replicate_csvs = [csv for csv in csv_list if name in csv]
    replicate_formulae_list = []  # not used anywhere in this cell

    # mz_TupleDict = pk.msTupleDict()
    # one pykrev tuple per replicate of this sample
    sample_dataDict = pk.msTupleDict()

    if len(replicate_csvs) > 1:
        
        for csv in replicate_csvs:
            csv_df = pd.read_csv(csv)
            pk_df = pk.read_corems(csv_df)
            csv_split = csv.split('_')

            # key = second '_'-separated field of the file name, converted by
            # msf.roman_to_integer and stored as a string
            # (presumably the replicate numeral - TODO confirm)
            sample_dataDict[str(msf.roman_to_integer(csv_split[1]))] = pk_df

        # put the dataDict into an ordination df, so that we can drop certain formulae
        # NOTE(review): `ord` shadows the Python built-in of the same name for the
        # rest of the notebook session.
        ord = sample_dataDict.to_OrdinationMatrix()
        
        # keep only the columns that have at least `formula_filter` non-NaN values
        formula_filtered = ord.dropna(thresh=formula_filter, axis=1)

    else:
        # Single file for this sample: nothing to filter, build a one-row table of
        # intensities divided by their sum.
        # NOTE(review): the key / row label here is the integer 1, whereas the
        # multi-replicate branch uses string keys; the later cell that looks
        # replicates up with .loc[<string>] would not find this row - confirm.
        # NOTE(review): replicate_csvs[0] raises IndexError when no file matched.
        csv = replicate_csvs[0]
        csv_df = pd.read_csv(csv)
        pk_df = pk.read_corems(csv_df)
        sample_dataDict[1] = pk_df
        formula_filtered = pd.DataFrame(pk_df.intensity/np.sum(pk_df.intensity),index=pk_df.formula,columns=[1]).T

    # from here on `name` is the short label, not the name taken from the file
    name = short_name(name)

    samples_formulae_dict[name] = {}
    samples_formulae_dict[name]['formulae'] = list(formula_filtered.columns)
    samples_formulae_dict[name]['assigned_intensities'] = formula_filtered

    # Re-normalise every row so that its retained values sum to 1 (NaNs ignored).
    # NOTE(review): to_numpy() may return a view of formula_filtered; if it does,
    # the table stored under 'assigned_intensities' is re-normalised as well -
    # confirm whether that is intended.
    formula_filtered_arr = formula_filtered.to_numpy()
    for i in range(len(formula_filtered_arr[:,0])):
        formula_filtered_arr[i,:] = formula_filtered_arr[i,:] / np.nansum(formula_filtered_arr[i,:])

    # per-formula mean over the rows, ignoring NaNs
    samples_formulae_dict[name]['avg_rel_intensities'] = np.nanmean(formula_filtered_arr,axis=0)

    general_approved_formulae[name] = list(formula_filtered.columns)

    # For every row label of formula_filtered, collect the m/z of each retained
    # formula in column order (NaN where that replicate does not list the formula).
    mzs_dict = pk.msTupleDict()

    for f in formula_filtered.columns:
        for idx in formula_filtered.index:
            # while handling the first formula, start an empty list per row label
            if f == list(formula_filtered.columns)[0]:
                mzs_dict[idx] = []

            if f in sample_dataDict[idx].formula:
                # m/z of the first entry whose formula equals f
                mzs_dict[idx].append(sample_dataDict[idx].mz[np.where(np.array(sample_dataDict[idx].formula) == f)][0])

            else: mzs_dict[idx].append(np.nan)

    mzs_df = pd.DataFrame(mzs_dict.values(),index=mzs_dict.keys(),columns=formula_filtered.columns)

    samples_formulae_dict[name]['assigned_m/z'] = mzs_df
    # per-formula mean m/z over the rows, ignoring NaNs
    samples_formulae_dict[name]['avg_m/z'] = np.nanmean(mzs_df.to_numpy(),axis=0)

In [10]:
# One msTuple per sample holding the retained formulae and their averaged
# relative intensities; the m/z field is filled with ones as a placeholder.
general_df = pk.msTupleDict()

for name, sample_entry in samples_formulae_dict.items():
    formulae = general_approved_formulae[name]
    placeholder_mz = np.ones(len(formulae))

    general_df[name] = pk.msTuple(
        formula=formulae,
        intensity=sample_entry['avg_rel_intensities'],
        mz=placeholder_mz,
    )

# table form of the per-sample tuples
general_ord_df = general_df.to_OrdinationMatrix()
# general_ord_df

In [11]:
# Per-file labels: "<short sample name>_<second '_'-separated field of the file name>".
short_replicate_names = []
for csv in csv_list:
    # keep `csv` bound to the file name instead of overwriting it with the split list
    name_fields = csv.split('_')
    short_replicate_names.append('_'.join([short_name(name_fields[0]), name_fields[1]]))

# Sample-level and per-file labels together, without duplicates.
# np.unique already returns its result sorted, so no extra sort() is needed.
all_short_names = np.unique(short_names + short_replicate_names)
# all_short_names

In [12]:
# Build two parallel msTupleDicts with one entry per label in all_short_names:
# one carrying intensities, one carrying m/z values. `replicate` records, in the
# same order, whether the entry is a sample average (0) or a single replicate (1).
overall_intensity_df = pk.msTupleDict()
overall_mz_df = pk.msTupleDict()
replicate = []

for name in all_short_names:
    is_sample_average = name in general_approved_formulae.keys()

    if not is_sample_average:
        # replicate label: "<short sample name>_<numeral>"; the numeral is
        # converted by msf.roman_to_integer and used, as a string, as row label
        name_split = name.split('_')
        roman_no = str(msf.roman_to_integer(name_split[1]))

        formulae = general_approved_formulae[name_split[0]]
        sample_entry = samples_formulae_dict[name_split[0]]
        intensities = sample_entry['assigned_intensities'].loc[roman_no].to_numpy()
        mz = sample_entry['assigned_m/z'].loc[roman_no].to_numpy()

        replicate.append(1)

    else:
        # the label is a sample: use its averaged values
        formulae = general_approved_formulae[name]
        sample_entry = samples_formulae_dict[name]
        intensities = sample_entry['avg_rel_intensities']
        mz = sample_entry['avg_m/z']

        replicate.append(0)

    # The m/z field is a placeholder of ones in both tuples; in overall_mz_df the
    # m/z values are carried in the `intensity` field.
    placeholder_mz = np.ones(len(formulae))
    overall_intensity_df[name] = pk.msTuple(formula=formulae, intensity=intensities, mz=placeholder_mz)
    overall_mz_df[name] = pk.msTuple(formula=formulae, intensity=mz, mz=np.ones(len(formulae)))

In [13]:
# Quick visual check of the collected results: one column per sample,
# one row per stored field.
pd.DataFrame(samples_formulae_dict)

Unnamed: 0,AM-A-Aut,AM-B-Win1,AM-C-Aut,AM-C-Win1,AM-C-Win2,AM-D-Win1,AM-E-Win1,AM-E-Win2,HM-PL-Aut,HM-PL-Win2,...,HM-H-Aut,HM-H-Win2,SRFA_04-02-2025,SRFA_06-12-2024,SRFA_08-11-2024,SRFA_11-04-2025,SRFA_14-03-2025,SRFA_20-03-2025,SRFA_27-03-2025,SRFA_28-02-2025
formulae,"[C26H34O9, C38H74O7, C18H18O6, C34H67N1O3, C13...","[C23H30O12, C26H34O9, C21H26O7, C18H18O6, C21H...","[C18H18O6, C21H44O5, C22H44O7, C13H14O8, C17H1...","[C21H26O7, C18H18O6, C23H26O10, C22H44O7, C20H...","[C20H20O15, C27H32O11, C23H26O10, C33H28O1S1, ...","[C23H30O12, C21H26O7, C17H14O5, C18H18O6, C21H...","[C13H26O2, C13H16O4, C42H32N2O1, C13H12O12, C1...","[C18H18O6, C13H26O2, C20H34O2, C13H16O4, C19H3...","[C18H18O6, C13H26O2, C15H8O8, C13H16O4, C13H12...","[C21H18O17, C26H34O9, C18H18O6, C26H28O17, C21...",...,"[C13H12O12, C18H16O8, C19H26O7, C25H10N2O7S1, ...","[C26H34O9, C31H42O12, C23H22O16, C18H18O6, C21...","[C28H52N2O24, C61H36S1, C34H40O22, C59H28O2S1,...","[C60H32O1S1, C59H28O2S1, C36H32O21, C35H28O22,...","[C29H56N2O23, C61H36S1, C60H32O1S1, C59H28O2S1...","[C60H32O1S1, C34H40O22, C37H36O20, C36H32O21, ...","[C37H36O20, C24H36N2O28, C36H32O21, C26H28N2O2...","[C39H44O18, C38H40O19, C37H36O20, C36H32O21, C...","[C60H32O1S1, C40H32O18, C36H32O21, C39H28O19, ...","[C29H56N2O23, C61H36S1, C60H32O1S1, C56H32O4S1..."
assigned_intensities,C26H34O9 C38H74O7 C18H18O6 C34H67N1O3 C1...,C23H30O12 C26H34O9 C21H26O7 C18H18O6 C21H...,C18H18O6 C21H44O5 C22H44O7 C13H14O8 C17...,C21H26O7 C18H18O6 C23H26O10 C22H44O7 C20...,C20H20O15 C27H32O11 C23H26O10 C33H28O1S1 C2...,C23H30O12 C21H26O7 C17H14O5 C18H18O6 C21H...,C13H26O2 C13H16O4 C42H32N2O1 C13H12O12 C14...,C18H18O6 C13H26O2 C20H34O2 C13H16O4 C19...,C18H18O6 C13H26O2 C15H8O8 C13H16O4 C13H...,C21H18O17 C26H34O9 C18H18O6 C26H28O17 C21H...,...,C13H12O12 C18H16O8 C19H26O7 C25H10N2O7S1 C...,C26H34O9 C31H42O12 C23H22O16 C18H18O6 C21H...,C28H52N2O24 C61H36S1 C34H40O22 C59H28O2S...,C60H32O1S1 C59H28O2S1 C36H32O21 C35H28O2...,C29H56N2O23 C61H36S1 C60H32O1S1 C59H28O2...,C60H32O1S1 C34H40O22 C37H36O20 C36H32O21...,C37H36O20 C24H36N2O28 C36H32O21 C26H28N2...,C39H44O18 C38H40O19 C37H36O20 C36H32O21 ...,C60H32O1S1 C40H32O18 C36H32O21 C39H28O19...,C29H56N2O23 C61H36S1 C60H32O1S1 C56H32O4...
avg_rel_intensities,"[0.0003626972108306729, 0.0008012163684758479,...","[0.00041097500151961316, 0.0012894830563309064...","[0.0006700403449576709, 0.0008344927069082711,...","[0.000949944512360248, 0.0003724152285465057, ...","[0.0005517433938589105, 0.0008488515173678924,...","[0.00029029495040004384, 0.0019990978502561077...","[0.0012547670146184438, 0.0007956999558677236,...","[0.0008663763298855323, 0.002120038368016807, ...","[0.0006390643814488137, 0.0010417449679008559,...","[0.00012765035651089943, 0.0009837526183441398...",...,"[0.0010870657893427835, 0.0008380170613573285,...","[0.0011080823406139646, 0.0003102463510833743,...","[0.00012915764099853894, 0.0001527093632889803...","[0.00022213238888118617, 0.0002447956052841891...","[9.322805371615167e-05, 0.00017527246730187265...","[0.0003269641879426724, 0.00019458311771139374...","[0.00045960760266008645, 0.0004155610874327466...","[0.00020616016461979852, 0.0002055427786124408...","[0.00030807872370783713, 5.512191504048127e-05...","[0.00010595195130630199, 0.0002344496890470388..."
assigned_m/z,C26H34O9 C38H74O7 C18H18O6 C34H67N...,C23H30O12 C26H34O9 C21H26O7 C18H1...,C18H18O6 C21H44O5 C22H44O7 C13H1...,C21H26O7 C18H18O6 C23H26O10 C22H4...,C20H20O15 C27H32O11 C23H26O10 C33H28O...,C23H30O12 C21H26O7 C17H14O5 C18H1...,C13H26O2 C13H16O4 C42H32N2O1 C13H12...,C18H18O6 C13H26O2 C20H34O2 C13H1...,C18H18O6 C13H26O2 C15H8O8 C13H1...,C21H18O17 C26H34O9 C18H18O6 C26H28...,...,C13H12O12 C18H16O8 C19H26O7 C25H10N...,C26H34O9 C31H42O12 C23H22O16 C18H1...,C28H52N2O24 C61H36S1 C34H40O22 C59H28...,C60H32O1S1 C59H28O2S1 C36H32O21 C35H28...,C29H56N2O23 C61H36S1 C60H32O1S1 C59H28...,C60H32O1S1 C34H40O22 C37H36O20 C36H32...,C37H36O20 C24H36N2O28 C36H32O21 C26H28N...,C39H44O18 C38H40O19 C37H36O20 C36H32...,C60H32O1S1 C40H32O18 C36H32O21 C39H28...,C29H56N2O23 C61H36S1 C60H32O1S1 C56H32...
avg_m/z,"[489.2130735008832, 641.5361451944, 329.103023...","[497.1665377437625, 489.21316305620854, 389.16...","[329.1030225334693, 375.3115058369095, 419.301...","[389.1607410881416, 329.1029300553382, 461.145...","[499.0728188130681, 531.187408429008, 461.1450...","[497.16667633219646, 389.16073444888326, 297.0...","[213.18603489126684, 235.09765284433342, 579.2...","[329.102928741356, 213.18601606491666, 305.248...","[329.1030023830783, 213.18601343383986, 315.01...","[541.0473484245043, 489.21315878858906, 329.10...",...,"[359.0256004130435, 359.0773590319633, 365.160...","[489.21308099478046, 605.2601930817575, 553.08...","[799.2837348006915, 799.2466948946739, 799.194...","[799.2100805546951, 799.1737637888112, 799.136...","[799.3199750224944, 799.246893594214, 799.2104...","[799.2097798751946, 799.1939048767306, 799.172...","[799.1725939571834, 799.1382480326479, 799.136...","[799.2457385305241, 799.2092874022393, 799.172...","[799.2097935825499, 799.1515618114809, 799.136...","[799.3200686115426, 799.2461871442326, 799.209..."


In [14]:
# Write the intensity table and the m/z table to CSV files in the output folder.
overall_dfs_list = [
    [overall_intensity_df, 'overall_intensity_ordination_table'],
    [overall_mz_df, 'overall_mz_ordination_table'],
]

for d in overall_dfs_list:
    tuple_dict, table_name = d

    ord_df = tuple_dict.to_OrdinationMatrix()
    # first column: the 0/1 values collected in `replicate`
    ord_df.insert(0, 'replicate_y/n', replicate)

    # csv_date_path appends the selected date(s), if any, and the .csv extension
    ord_df_path = '/'.join([processed_csv_data_dir, table_name])
    ord_df.to_csv(csv_date_path(ord_df_path))

In [15]:
# Local alias for the area definitions in msf; passed to msf.molecclass in the
# next cell (presumably regions of the van Krevelen diagram - TODO confirm).
vk_areas = msf.vk_areas

In [16]:
# Add information about mean element counts, element ratios, and compound class counts.
# Every container below gains one value per sample, in the order of short_names.
avg_elements = {}       # 'avg_<element>' -> list of mean element counts
avg_ratios = {}         # 'avg_<element>/C' -> list of (mean element count / mean C count)
molec_class_no = {}     # '<class>_tot' / '<class>_%' -> list of counts / percentages
avg_dbe = []            # mean double bond equivalent
avg_ai = []             # mean aromaticity index (index_type 'rAImod')
avg_nosc = []           # mean nominal oxidation state
# elem_classes_list = ['CHO','CHNO','CHNOS','CHOS','CHS',]
elem_classes = {}       # '<elemental class>_tot' / '<elemental class>_%' -> list
avg_gfe = [] # LaRowe et al., 2011

# Labels for the aromaticity-index bins. The loop below creates
# len(msf.ai_boundaries)+1 bins and indexes ai_keys with the bin number, so
# three labels imply two boundaries.
ai_keys = ['non_aromatics','aromatics','condensed_aromatics']
ai_classes = {}         # '<ai key>_tot' / '<ai key>_%' -> list of counts / percentages

for i in range(len(short_names)):

    name = short_names[i]
    msTuple = general_df[name]

    elementdf = pd.DataFrame(pk.element_counts(msTuple))
    
    # Mean count of each element over the sample's formulae.
    # The lists are created while processing the first sample only.
    # NOTE(review): assumes every later sample yields the same element columns.
    for element in elementdf.columns:
        if i == 0:
            avg_elements[f'avg_{element}'] = []
        
        avg_elements[f'avg_{element}'].append(np.mean(elementdf[element]))

    for element in elementdf.columns:
        if element != 'C':
            if i == 0:
                avg_ratios[f'avg_{element}/C'] = []
            
            # ratio of the two means (not the mean of the per-formula ratios)
            avg_ratios[f'avg_{element}/C'].append(avg_elements[f'avg_{element}'][i] / avg_elements['avg_C'][i])

            # per-formula ratio, added as an extra column before the table is
            # handed to msf.molecclass
            elementdf[f'{element}/C'] = elementdf[f'{element}'] / elementdf['C']
        
    #---

    tot_formulae = len(elementdf)
    # msf.molecclass groups the formulae by class; iterating the result yields the
    # class names and indexing it gives the members of that class
    vk_sorted = msf.molecclass(elementdf,vk_areas)

    for molec_class in vk_sorted:
        if i == 0:
            molec_class_no[f'{molec_class}_tot'] = []
            molec_class_no[f'{molec_class}_%'] = []
        
        molec_class_no[f'{molec_class}_tot'].append(len(vk_sorted[molec_class]))
        molec_class_no[f'{molec_class}_%'].append(100*len(vk_sorted[molec_class])/tot_formulae)
    
    ai_values = pk.aromaticity_index(msTuple, index_type='rAImod')

    avg_dbe.append(np.mean(pk.double_bond_equivalent(msTuple)))
    avg_ai.append(np.mean(ai_values))


    # Count the formulae falling into each aromaticity-index bin.
    for j in range(len(msf.ai_boundaries)+1):
        # if first: everything up to and including the first boundary
        if j == 0:
            ai_selection = ai_values[np.where(ai_values <= msf.ai_boundaries[j])]

        # if last: everything from the last boundary upwards (inclusive)
        elif j == len(msf.ai_boundaries):
            ai_selection = ai_values[np.where(ai_values >= msf.ai_boundaries[-1])]

        # if second to last: open interval, because the last boundary itself
        # belongs to the last bin
        elif msf.ai_boundaries[j] == msf.ai_boundaries[-1]:
            ai_selection = ai_values[np.where((ai_values > msf.ai_boundaries[j-1])&(ai_values < msf.ai_boundaries[j]))]

        # all other values: lower boundary excluded, upper boundary included
        else:
            ai_selection = ai_values[np.where((ai_values > msf.ai_boundaries[j-1])&(ai_values <= msf.ai_boundaries[j]))]

        if i == 0:
            ai_classes[f'{ai_keys[j]}_tot'] = []
            ai_classes[f'{ai_keys[j]}_%'] = []

        ai_classes[f'{ai_keys[j]}_tot'].append(len(ai_selection))
        ai_classes[f'{ai_keys[j]}_%'].append(100 * len(ai_selection) / len(ai_values))

    nosc = np.mean(pk.nominal_oxidation_state(msTuple))
    avg_nosc.append(nosc)
    # linear function of the mean NOSC (see the reference noted where avg_gfe is declared)
    avg_gfe.append((60.3 - (28.5 * nosc)))

    # per-formula elemental class labels and a class -> count mapping
    ecompounds, ecounts = pk.compound_class(msTuple, method='ELEM')

    ecompounds_unique = np.unique(ecompounds)
    for e_class in ecompounds_unique:
        if i == 0:
            elem_classes[f'{e_class}_tot'] = []
            elem_classes[f'{e_class}_%'] = []

        # add 0's for those classes which were not present in the previous samples
        elif [x for x in elem_classes.keys() if f'{e_class}_' in x] == []:
            elem_classes[f'{e_class}_tot'] = [0] * i
            elem_classes[f'{e_class}_%'] = [0] * i

        elem_classes[f'{e_class}_tot'].append(ecounts[e_class])
        elem_classes[f'{e_class}_%'].append(100*ecounts[e_class]/tot_formulae)

    # Check that all the items in the dict have the same length; if not, it is
    # because that elemental class was not present in ecompounds for this sample
    # and must therefore be set to 0.
    # NOTE(review): elem_classes is filled here but is not written to the
    # meta-data table in the following cell - confirm whether that is intended.
    for j in elem_classes:
        if len(elem_classes[j]) != i+1:
            elem_classes[j].append(0)

In [17]:
# Assemble one row of summary statistics per sample and save the table as CSV.
meta_data_dict = {}
columns_list = []
index_list = []

for i, name in enumerate(short_names):
    index_list.append(name)

    # (column label, per-sample values) pairs, in the order the columns are written:
    # element means, element/C ratios, DBE, AI, NOSC, GFE, AI classes, molecular classes
    labelled_values = [(label, avg_elements[label]) for label in avg_elements]
    labelled_values += [(label, avg_ratios[label]) for label in avg_ratios]
    labelled_values += [
        ('avg_dbe', avg_dbe),
        ('avg_ai', avg_ai),
        ('avg_nosc', avg_nosc),
        ('avg_gfe', avg_gfe),
    ]
    labelled_values += [(label, ai_classes[label]) for label in ai_classes]
    # molecular-class labels are lower-cased and have their spaces replaced by '_'
    labelled_values += [
        (label.lower().replace(' ','_'), molec_class_no[label])
        for label in molec_class_no
    ]

    # the labels are the same for every sample, so they are recorded once only
    if i == 0:
        columns_list.extend(label for label, values in labelled_values)

    # this sample's value from every per-sample list
    meta_data_dict[name] = [values[i] for label, values in labelled_values]

meta_data_df = pd.DataFrame(
    data=meta_data_dict.values(),
    index=meta_data_dict.keys(),
    columns=columns_list,
)

# csv_date_path appends the selected date(s), if any, and the .csv extension
meta_data_path = '/'.join([processed_csv_data_dir, 'meta_data'])

meta_data_df.to_csv(csv_date_path(meta_data_path))

# Resolving power

In [9]:
# Resolving power near m/z 400: for every file keep the peak(s) whose m/z is
# closest to the reference value, then summarise over all files.
reference_mz = 400

closest_peaks = []
for csv in csv_list:
    df = pd.read_csv(csv)
    distance = (reference_mz - df['m/z']).abs()
    # boolean mask keeps every row tied for the smallest distance, with its original index
    closest_peaks.append(df.loc[distance == distance.min(), ['m/z','Resolving Power']])

if not closest_peaks:
    raise ValueError('csv_list is empty: there are no files to read the resolving power from')

# concatenate once instead of growing the frame inside the loop
df_concat = pd.concat(closest_peaks)

min_df = df_concat[df_concat['Resolving Power']==np.min(df_concat['Resolving Power'])]
max_df = df_concat[df_concat['Resolving Power']==np.max(df_concat['Resolving Power'])]

print(f"minimum resolving power: {min_df}\nmaximum resolving power: {max_df}\naverage resolving power: {np.mean(df_concat['Resolving Power'])}\ninterval: {np.min(df_concat['m/z'])}-{np.max(df_concat['m/z'])}")

minimum resolving power:              m/z  Resolving Power
1299  400.305866    294234.723752
maximum resolving power:             m/z  Resolving Power
189  399.166269    688508.355304
average resolving power: 469077.95888507017
interval: 399.1297560893096-400.30700329495806
