In [1]:
import numpy as np # v. 1.22.1
from matplotlib import pyplot as plt # v. 3.7.5
import os
import pandas as pd # v. 1.5.3
import seaborn as sns # v. 0.12.2
import pykrev as pk # v. 1.2.4
import datetime

import sys
sys.path.append('../mchem_functions')
import ms_functions_and_defs as msf

In [None]:
# --- User-tunable settings for this notebook ---
# Directory holding the input CSV files; a later cell tries to os.chdir() into it.
csv_dir = '../../Data/MS_data/CSV_files'
# Truthy: keep files whose name contains 'SRFA' (case-insensitive). Falsy: drop them from csv_list.
include_srfa = 1
# Passed as `thresh` to DataFrame.dropna on the replicate ordination matrix: a formula is kept only if it
# has a non-NaN value in at least this many replicates.
formula_filter = 2 # out of 3, or however many technical replicates of the single biological replicate you have. With 3 replicates, 2 is a "soft" filter and 3 a "hard" filter
# Only CSV files whose second-to-last '_'-separated field is in this list are loaded; the dates are also
# appended to the names of the output files. [] or [''] disables the date filter.
chosen_date = [''] # dates in the format dd-mm-yyyy

In [13]:
# Make csv_dir the working directory so the relative paths used below resolve against it.
# A FileNotFoundError is swallowed and the working directory is left unchanged
# (presumably so the cell can be re-run after the move already happened - confirm).
try:
    os.chdir(csv_dir)
except FileNotFoundError:
    pass

In [14]:
# Folder (relative to the current working directory) that receives the processed tables;
# it is created on first use.
processed_csv_data_dir = 'processed_csv_data_dir'
if not os.path.exists(processed_csv_data_dir):
    os.mkdir(processed_csv_data_dir)

In [15]:
# Collect the names of the CSV files to process from the current working directory.
files_list = os.listdir()

# The date filter is only active when chosen_date holds something other than [] or [''].
date_filter_active = chosen_date not in [[],['']]

csv_list = []
for f in files_list:
    if not f.endswith('.csv'):
        continue
    # The acquisition date is read from the second-to-last '_'-separated field of the file name.
    if date_filter_active and f.split('_')[-2] not in chosen_date:
        continue
    csv_list.append(f)

# Optionally discard the SRFA files (matched case-insensitively on the file name).
if not include_srfa:
    csv_list = [csv for csv in csv_list if 'SRFA' not in csv.upper()]
# csv_list

In [16]:
def season(date:str,sep='-'):
    """Return the three-letter season code for a date or a bare month number.

    date: either a string containing `sep`, in which case the month is the second
          `sep`-separated field (e.g. '15-10-2024'), or a string holding just the month.
    sep:  field separator used inside `date`.

    Returns 'Aut' (months 9-11), 'Win' (12, 1, 2), 'Spr' (3-5) or 'Sum' (6-8);
    None when the month number is outside 1-12.
    """
    month_field = date.split(sep)[1] if sep in date else date
    month = int(month_field)

    season_by_month = {
        9: 'Aut', 10: 'Aut', 11: 'Aut',
        12: 'Win', 1: 'Win', 2: 'Win',
        3: 'Spr', 4: 'Spr', 5: 'Spr',
        6: 'Sum', 7: 'Sum', 8: 'Sum',
    }
    return season_by_month.get(month)

def csv_date_path(path:str, dates=None):
    """Build an output CSV path, optionally tagged with the selected date(s).

    path:  output path, with or without a trailing '.csv'.
    dates: a date string or a sequence of date strings to append to the file name.
           When omitted, the notebook-level `chosen_date` setting is used.
           [], [''] or '' mean "no date tag".

    Returns `path` without its '.csv' suffix, followed by '_' and the dates joined
    with '_' (if any), followed by '.csv'.
    """
    suffix = '.csv'

    # Strings are immutable: the stripped path has to be re-assigned, otherwise a path that
    # already ends in '.csv' would come out as '...csv.csv'. Only the trailing suffix is removed.
    if path.endswith(suffix):
        path = path[:-len(suffix)]

    if dates is None:
        dates = chosen_date

    if dates not in [[],[''],'']:
        # '_'.join places a separator between every pair of dates, also when one of the
        # earlier entries happens to be equal to the last one.
        tag = dates if isinstance(dates, str) else '_'.join(dates)
        path += f'_{tag}'

    return path + suffix

def short_name(longname:str):
    """Condense a long sample name into a short label.

    The 'MP-' and '-OM' markers are removed and the rest is split on '-'. Then:
      * if one of the fields is exactly 'PL': fields 0 and 1 plus the season of field 3;
      * else if the first field contains 'SRFA' (case-insensitive): all fields, unchanged;
      * otherwise: fields 0 and 2 plus the season of field 4.
    The selected parts are joined back together with '-'.
    """
    parts = longname.replace('MP-','').replace('-OM','').split('-')

    if 'PL' in parts:
        kept = [parts[0],parts[1],season(parts[3])]
    elif 'SRFA' in parts[0].upper():
        kept = parts
    else:
        kept = [parts[0],parts[2],season(parts[4])]

    return '-'.join(kept)

In [17]:
# Derive the sample name of every CSV file from the leading '_'-separated field(s) of its name.
sample_names = []
for csv in csv_list:
    fields = csv.split('_')
    if 'SRFA' in csv.upper():
        # SRFA files: the name is made of the first two fields, upper-cased.
        sample_label = '_'.join([fields[0],fields[1]]).upper()
    else:
        # Every other file: the first field alone is the sample name.
        sample_label = fields[0]
    sample_names.append(sample_label)

# Sorted array holding each sample name once.
sample_names_unique = np.unique(np.array(sample_names))
sample_names_unique

array(['MP-HM-PW-A-15-10-2024-OM', 'MP-HM-PW-D-15-10-2024-OM',
       'MP-HM-PW-G-17-10-2024-OM', 'SRFA_08-11-2024'], dtype='<U24')

In [18]:
# Short label for every unique sample, in the same order as sample_names_unique.
short_names = list(map(short_name, sample_names_unique))
# short_names

In [19]:
# sample_names_unique = sample_names_unique[sample_names_unique=='SRFA_08-11-2024']

In [20]:
# For every sample: load its replicate CSV file(s), keep the formulae that pass the replicate
# filter, and store the per-replicate and averaged intensities and m/z values under the
# sample's short name.
samples_formulae_dict = {}
general_approved_formulae = pk.msTupleDict()

for name in sample_names_unique:
    # Case-insensitive match: SRFA sample names were upper-cased when they were built,
    # so a plain substring test would miss a file whose name is not upper-case.
    replicate_csvs = [csv for csv in csv_list if name.upper() in csv.upper()]
    if not replicate_csvs:
        raise ValueError(f'No CSV file in csv_list matches sample name {name!r}')
    replicate_formulae_list = []

    # mz_TupleDict = pk.msTupleDict()
    sample_dataDict = pk.msTupleDict()

    if len(replicate_csvs) > 1:

        for csv in replicate_csvs:
            csv_df = pd.read_csv(csv)
            pk_df = pk.read_corems(csv_df)
            csv_split = csv.split('_')

            # the replicate number (second field of the file name) becomes the key, as a string
            sample_dataDict[str(msf.roman_to_integer(csv_split[1]))] = pk_df

        # put the dataDict into an ordination df, so that we can drop certain formulae
        # NOTE(review): `ord` shadows the built-in of the same name; the name is kept in case other cells use it.
        ord = sample_dataDict.to_OrdinationMatrix()

        # filter by the number of non-NAN values wanted
        formula_filtered = ord.dropna(thresh=formula_filter, axis=1)

    else:
        # Single file for this sample: read THAT file. Relying on a `csv` variable left over
        # from an earlier loop would load the data of a different sample.
        csv = replicate_csvs[0]
        csv_df = pd.read_csv(csv)
        pk_df = pk.read_corems(csv_df)
        # NOTE(review): this key is the integer 1 while the multi-replicate keys are strings - confirm
        # that nothing looks a single-file sample up with '1'.
        sample_dataDict[1] = pk_df
        formula_filtered = pd.DataFrame(pk_df.intensity/np.sum(pk_df.intensity),index=pk_df.formula,columns=[1]).T

    name = short_name(name)

    samples_formulae_dict[name] = {}
    samples_formulae_dict[name]['formulae'] = list(formula_filtered.columns)
    samples_formulae_dict[name]['assigned_intensities'] = formula_filtered

    # Rescale every replicate (row) so that its retained intensities sum to 1.
    # NOTE(review): to_numpy() may hand back a view of the DataFrame's data, in which case this loop also
    # rescales `formula_filtered` (and therefore 'assigned_intensities') in place - confirm that is intended.
    formula_filtered_arr = formula_filtered.to_numpy()
    for i in range(len(formula_filtered_arr[:,0])):
        formula_filtered_arr[i,:] = formula_filtered_arr[i,:] / np.nansum(formula_filtered_arr[i,:])

    samples_formulae_dict[name]['avg_rel_intensities'] = np.nanmean(formula_filtered_arr,axis=0)

    general_approved_formulae[name] = list(formula_filtered.columns)

    # m/z of every retained formula in every replicate (NaN where the replicate lacks the formula).
    mzs_dict = pk.msTupleDict()

    for idx in formula_filtered.index:
        replicate_tuple = sample_dataDict[idx]

        # One pass over the replicate builds a formula -> m/z map; setdefault keeps the FIRST
        # occurrence of a formula, the same value the previous np.where(...)[0] lookup returned.
        mz_by_formula = {}
        for formula, mz_value in zip(replicate_tuple.formula, replicate_tuple.mz):
            mz_by_formula.setdefault(formula, mz_value)

        mzs_dict[idx] = [mz_by_formula.get(f, np.nan) for f in formula_filtered.columns]

    mzs_df = pd.DataFrame(mzs_dict.values(),index=mzs_dict.keys(),columns=formula_filtered.columns)

    samples_formulae_dict[name]['assigned_m/z'] = mzs_df
    samples_formulae_dict[name]['avg_m/z'] = np.nanmean(mzs_df.to_numpy(),axis=0)

In [21]:
# One msTuple per sample: retained formulae with their replicate-averaged relative intensities.
# The mz field is filled with ones as a placeholder.
general_df = pk.msTupleDict()

for name, sample_entry in samples_formulae_dict.items():
    formulae = general_approved_formulae[name]
    placeholder_mz = np.ones(len(formulae))

    general_df[name] = pk.msTuple(
        formula=formulae,
        intensity=sample_entry['avg_rel_intensities'],
        mz=placeholder_mz,
    )

general_ord_df = general_df.to_OrdinationMatrix()
# general_ord_df

In [22]:
# Label every CSV file as '<short sample name>_<second field of the file name>'.
short_replicate_names = []
for csv in csv_list:
    csv = csv.split('_')
    short_replicate_names.append(f'{short_name(csv[0])}_{csv[1]}')

# Sample-level and replicate-level labels together, de-duplicated and sorted.
all_short_names = np.unique(short_names + short_replicate_names)
all_short_names.sort()
# all_short_names

In [23]:
# Collect intensities and m/z values for every sample average and every single replicate.
# `replicate` records, in the same order, 0 for a sample-level entry and 1 for a replicate.
overall_intensity_df = pk.msTupleDict()
overall_mz_df = pk.msTupleDict()
replicate = []

for name in all_short_names:

    if name not in general_approved_formulae.keys():
        # Replicate entry: '<sample short name>_<replicate numeral>'.
        name_split = name.split('_')
        roman_no = str(msf.roman_to_integer(name_split[1]))

        formulae = general_approved_formulae[name_split[0]]
        sample_entry = samples_formulae_dict[name_split[0]]
        intensities = sample_entry['assigned_intensities'].loc[roman_no].to_numpy()
        mz = sample_entry['assigned_m/z'].loc[roman_no].to_numpy()

        replicate.append(1)

    else:
        # Sample-level entry: use the values averaged over the replicates.
        formulae = general_approved_formulae[name]
        sample_entry = samples_formulae_dict[name]
        intensities = sample_entry['avg_rel_intensities']
        mz = sample_entry['avg_m/z']

        replicate.append(0)

    # Both tables go through msTuple; in the m/z table the m/z values sit in the `intensity`
    # field, and the `mz` field is a placeholder of ones in both cases.
    overall_intensity_df[name] = pk.msTuple(formula=formulae,intensity=intensities,mz=np.ones(len(formulae)))
    overall_mz_df[name] = pk.msTuple(formula=formulae,intensity=mz,mz=np.ones(len(formulae)))

In [24]:
# Write the intensity and m/z ordination tables to the processed-data folder.
overall_dfs_list = [
    [overall_intensity_df,'overall_intensity_ordination_table'],
    [overall_mz_df,'overall_mz_ordination_table'],
]

for d in overall_dfs_list:
    tuple_dict, table_name = d

    ord_df = tuple_dict.to_OrdinationMatrix()
    # first column: 1 for a single replicate, 0 for a sample-level row
    ord_df.insert(0, 'replicate_y/n', replicate)

    ord_df_path = f'{processed_csv_data_dir}/{table_name}'
    ord_df.to_csv(csv_date_path(ord_df_path))

In [25]:
# Notebook-level alias for the `vk_areas` object defined in ms_functions_and_defs;
# it is passed to msf.molecclass() further down.
vk_areas = msf.vk_areas

In [26]:
# Per-sample summary metrics: mean element counts, element ratios, molecular-class and
# elemental-class counts, plus mean DBE, aromaticity index, NOSC and the NOSC-derived free energy.
# Every list below ends up with one entry per sample, in the order of short_names.
avg_elements = {}
avg_ratios = {'O/C': [],'H/C': [],'N/C': [],}
molec_class_no = {}
avg_dbe = []
avg_ai = []
avg_nosc = []
# elem_classes_list = ['CHO','CHNO','CHNOS','CHOS','CHS',]
elem_classes = {}
avg_gfe = [] # LaRowe et al., 2011

for i, name in enumerate(short_names):

    msTuple = general_df[name]

    elementdf = pd.DataFrame(pk.element_counts(msTuple))

    # mean count of every element; the per-element lists are created on the first sample
    for element in elementdf.columns:
        if i == 0:
            avg_elements[element] = []

        avg_elements[element].append(np.mean(elementdf[element]))

    # ratios of the mean element counts (not means of the per-formula ratios)
    for ratio_key, numerator in (('O/C','O'),('H/C','H'),('N/C','N')):
        avg_ratios[ratio_key].append(avg_elements[numerator][i] / avg_elements['C'][i])

    #---

    # per-formula ratios, added as columns before the classification call below
    for ratio_key, numerator in (('O/C','O'),('H/C','H'),('N/C','N')):
        elementdf[ratio_key] = elementdf[numerator] / elementdf['C']

    tot_formulae = len(elementdf)
    vk_sorted = msf.molecclass(elementdf,vk_areas)

    # number and percentage of formulae in every molecular class
    for molec_class in vk_sorted:
        tot_key = f'{molec_class}_tot'
        pct_key = f'{molec_class}_%'

        if i == 0:
            molec_class_no[tot_key] = []
            molec_class_no[pct_key] = []

        class_size = len(vk_sorted[molec_class])
        molec_class_no[tot_key].append(class_size)
        molec_class_no[pct_key].append(100*class_size/tot_formulae)

    avg_dbe.append(np.mean(pk.double_bond_equivalent(msTuple)))
    avg_ai.append(np.mean(pk.aromaticity_index(msTuple, index_type='rAImod')))

    nosc = np.mean(pk.nominal_oxidation_state(msTuple))
    avg_nosc.append(nosc)
    # free-energy estimate computed from the mean NOSC (see the reference noted next to avg_gfe)
    avg_gfe.append((60.3 - (28.5 * nosc)))

    ecompounds, ecounts = pk.compound_class(msTuple, method='ELEM')

    ecompounds_unique = np.unique(ecompounds)
    for e_class in ecompounds_unique:
        tot_key = f'{e_class}_tot'
        pct_key = f'{e_class}_%'

        if i == 0:
            elem_classes[tot_key] = []
            elem_classes[pct_key] = []

        # add 0's for those classes which were not present in the previous samples
        elif not any(f'{e_class}_' in x for x in elem_classes.keys()):
            elem_classes[tot_key] = [0] * i
            elem_classes[pct_key] = [0] * i

        elem_classes[tot_key].append(ecounts[e_class])
        elem_classes[pct_key].append(100*ecounts[e_class]/tot_formulae)

    # check that all the items in the dict have the same length; if not, it's because that elemental class was not present in ecompounds and must therefore be set to 0
    for j in elem_classes:
        if len(elem_classes[j]) != i+1:
            elem_classes[j].append(0)

In [27]:
# Assemble one row of summary metrics per sample and save the table as a CSV file.
meta_data_dict = {}
columns_list = []
index_list = []

for i, name in enumerate(short_names):
    index_list.append(name)

    # Row layout: mean element counts, element ratios, the four scalar averages, molecular-class
    # counts/percentages. The column names below are built in exactly the same order.
    row = [avg_elements[e][i] for e in avg_elements]
    row += [avg_ratios[ratio][i] for ratio in avg_ratios]
    row += [avg_dbe[i], avg_ai[i], avg_nosc[i], avg_gfe[i]]
    row += [molec_class_no[x][i] for x in molec_class_no]
    meta_data_dict[name] = row

    # the column names are collected once, on the first sample
    if i == 0:
        columns_list += list(avg_elements)
        columns_list += list(avg_ratios)
        columns_list += ['avg_dbe','avg_ai','avg_nosc','avg_gfe']
        columns_list += [x.lower().replace(' ','_') for x in molec_class_no]

meta_data_df = pd.DataFrame(data=meta_data_dict.values(),index=meta_data_dict.keys(),columns=columns_list)

meta_data_path = f'{processed_csv_data_dir}/meta_data'
meta_data_df.to_csv(csv_date_path(meta_data_path))