## This notebook creates a barplot showing how many methods detect each compound

In [5]:
from pathlib import Path
from definitions import ROOT_DIR
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

- Inputs

In [6]:
p_root_dir = Path(ROOT_DIR).parents[0]
p_data = p_root_dir / "5_data_analysis"
p_out = p_root_dir / "6_plots/q2_plots/barplots"

# Compounds name and ID information
p_compounds = p_data / "compounds_ids.csv"

# Metrics and Catboost predictions for all ions in their target wells
p_predictions = p_data / "all_predictions_curated_11-Dec-2021.csv"

# Dataset info (lab, matrix, polarity, m/z range, ids, etc.)
p_datasets = p_data / "datasets.csv"

- Load and merge predictions with dataset metadata and compound classification

In [7]:
predictions = pd.read_csv(p_predictions, index_col=0)
predictions.neutral_loss.fillna('', inplace=True)

compounds = pd.read_csv(p_compounds, index_col='internal_id')

# Get a subset of most relevant information from Datasets file
datasets = pd.read_csv(p_datasets)
datasets_info = datasets.groupby('Clone ID').first()[['Polarity', 'Matrix short', 'Matrix long', 'Slide code']] # 'Participant lab', 'Technology'
datasets_info['sample_name'] = datasets_info['Matrix short']
# datasets_info['sample_name'] = datasets_info['Matrix short'] + '_' + datasets_info['Slide code']

# Merge with predictions and classification
df = pd.merge(predictions, datasets_info, left_on='dataset_id', right_on='Clone ID', how='left')

In [16]:
def filter_neutral_losses(df, filter_on=True, neutral_losses_to_keep=['']):
    '''
    Filter out entries for ions with neutral losses that are not in the list provided
    '''
    if filter_on:
        df = df[df.neutral_loss.isin(neutral_losses_to_keep)]
    return df

def calculate_detected_intensities(df):
    '''
    Make a column with background corrected intensities for detected compounds, and 0s for not detected compounds
    Change any negative values to zero
    '''
    intensities_for_twostate_spots = (df.pred_twostate == 1) * df.spot_intensity_bgr_corrected
    df['val_twostate'] = np.clip(intensities_for_twostate_spots, 0, None)
    intensities_for_threestate_spots = (df.pred_threestate == 2) * df.spot_intensity_bgr_corrected
    df['val_threestate'] = np.clip(intensities_for_threestate_spots, 0, None)
    return df

def filter_polarity(df, polarity):
    '''
    Filter out entries based on polarity pol ['pos', 'neg']
    '''
    return df[df['Polarity'] == polarity]

def group_by_molecule(df, intensity_col_name, prediction_col_name):
    '''
    Aggregate intensity and detection values per class
    '''
    
    if intensity_col_name == 'val_threestate':
        intensity_aggregation_func = lambda x: (x==2).any()
    else: intensity_aggregation_func = lambda x: (x==1).any()
            
    
    data = df.pivot_table(index=['name_short'],
                          columns=['sample_name'],
                          values=[intensity_col_name, prediction_col_name],
                          aggfunc = {
                                intensity_col_name : lambda x: np.log10(sum(x)+1),
                                prediction_col_name : intensity_aggregation_func
                          },
                          fill_value=0,
                          sort=False)
    data = data.stack(level=1, dropna=False).reset_index()
    return data

def prep_molecule_data(data, polarity,  intensity_col_name, prediction_col_name, nl_filter_on=None, neutral_losses_to_keep=None):
    '''
    '''
    data = filter_neutral_losses(data, nl_filter_on, neutral_losses_to_keep=[''])
    data = calculate_detected_intensities(data)
    data = filter_polarity(data, polarity)
    data = group_by_molecule(data, intensity_col_name, prediction_col_name)
    
    return data

In [25]:
data=df
filter_neutral_losses(data, filter_on=True, neutral_losses_to_keep=[''])
# data = calculate_detected_intensities(data)
# data = filter_polarity(data, 'pos')

Unnamed: 0,dataset_id,formula,adduct,neutral_loss,well,name_short,score,filename,occupancy_ratio,on_off_ratio,...,intensity_vs_far_bg_ratio,intensity_vs_other_spots_ratio,pred_val,pred_twostate,pred_threestate,Polarity,Matrix short,Matrix long,Slide code,sample_name
0,2021-06-18_10h37m54s,C4H6O5,-H,,59,Malic acid,,C4H6O5_-H__59_2021-06-18_10h37m54s.png,68.115942,2.373789e+08,...,2.373789e+08,1.015940e+04,0.999635,1,2,neg,9AA,9-aminoacridine,6J,9AA
8,2021-06-18_10h37m54s,C4H6O5,+Cl,,59,Malic acid,,C4H6O5_+Cl__59_2021-06-18_10h37m54s.png,0.000000,0.000000e+00,...,0.000000e+00,0.000000e+00,0.006168,0,0,neg,9AA,9-aminoacridine,6J,9AA
15,2021-06-18_10h37m54s,C4H6O5,[M]-,,59,Malic acid,,C4H6O5_[M]-__59_2021-06-18_10h37m54s.png,0.000000,0.000000e+00,...,0.000000e+00,0.000000e+00,0.006168,0,0,neg,9AA,9-aminoacridine,6J,9AA
24,2021-06-18_10h41m59s,C4H6O5,-H,,59,Malic acid,,C4H6O5_-H__59_2021-06-18_10h41m59s.png,68.918344,1.042197e+03,...,3.517809e+05,1.485349e+07,0.999663,1,2,neg,CHCA,α-Cyano-4-hydroxycinnamic acid,6G,CHCA
35,2021-06-18_10h41m59s,C4H6O5,[M]-,,59,Malic acid,,C4H6O5_[M]-__59_2021-06-18_10h41m59s.png,0.000000,0.000000e+00,...,0.000000e+00,0.000000e+00,0.006168,0,0,neg,CHCA,α-Cyano-4-hydroxycinnamic acid,6G,CHCA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109243,2021-07-10_00h13m11s,C44H78O2,+Na,,13,Cholesteryl ester 17:0,,C44H78O2_+Na__13_2021-07-10_00h13m11s.png,46.376812,1.694470e+06,...,1.694470e+06,1.694470e+06,0.999206,1,2,pos,CMBT,5-Chloro-2-mercaptobenzothiazole,3C,CMBT
109247,2021-07-10_00h13m11s,C44H78O2,+K,,13,Cholesteryl ester 17:0,,C44H78O2_+K__13_2021-07-10_00h13m11s.png,20.289855,6.028806e+04,...,6.028806e+04,2.845140e+02,0.980557,1,2,pos,CMBT,5-Chloro-2-mercaptobenzothiazole,3C,CMBT
109249,2021-07-10_00h13m11s,C44H78O2,+H,,13,Cholesteryl ester 17:0,,C44H78O2_+H__13_2021-07-10_00h13m11s.png,0.000000,0.000000e+00,...,0.000000e+00,0.000000e+00,0.006168,0,0,pos,CMBT,5-Chloro-2-mercaptobenzothiazole,3C,CMBT
109261,2021-12-07_17h42m06s,C44H78O2,+Na,,13,Cholesteryl ester 17:0,,C44H78O2_+Na__13_2021-12-07_17h42m06s.png,47.826087,6.579733e+05,...,6.579733e+05,6.579733e+05,0.998979,1,2,,,,,


In [15]:
prep_molecule_data(data=df, 
                   polarity='pos',  
                   nl_filter_on=True, 
                   neutral_losses_to_keep=[''], 
                   intensity_col_name='val_threestate',
                   prediction_col_name = 'pred_threestate')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['val_twostate'] = np.clip(intensities_for_twostate_spots, 0, None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['val_threestate'] = np.clip(intensities_for_threestate_spots, 0, None)


Unnamed: 0,name_short,sample_name,pred_threestate,val_threestate
0,Malic acid,9AA,False,0.000000
1,Malic acid,CHCA,True,3.465996
2,Malic acid,CMBT,True,2.292008
3,Malic acid,ClCCA,True,2.686327
4,Malic acid,DAN,True,3.013238
...,...,...,...,...
1887,Cholesteryl ester 17:0,DHB,True,3.558614
1888,Cholesteryl ester 17:0,MAPS,0,0.000000
1889,Cholesteryl ester 17:0,NEDC,False,0.000000
1890,Cholesteryl ester 17:0,NOR,True,1.578031


- Prepare data in two ways:
    1. Sum intensities of all ion forms (adducts and neutral losses)
    2. Sum intensities only of common adducts, but do not include any neutral losses

In [6]:
# Make columns with intensities for only ion images classified as 'good'
# For the plots below we are going to use background-corrected intensity, ie. (mean_on_spot - mean_bgr)
df['val_twostate'] = (df.pred_twostate == 1) * df.spot_intensity_bgr_corrected
df['val_threestate'] = (df.pred_threestate == 2) * df.spot_intensity_bgr_corrected

In [7]:
# Group full data by compound
grouped_data = df.groupby(['Matrix short', 'Polarity', 'name_short'])

agg_data = grouped_data.agg({
#             'coarse_class' : 'first',
#             'fine_class' : 'first',
#             'morgan_class':'first', 
#             'hmdb_func_group_class':'first', 
#             'property_class':'first',
#             'coarse_path' : 'first',
#             'fine_path' : 'first',
            'pred_twostate' : max,
            'pred_threestate' : lambda x: 1 if max(x)==2 else 0,
})
# agg_data.pred
agg_data.reset_index(inplace=True)

In [8]:
# Group filtered data
nl_list = ['']
df_slim = df[df.neutral_loss.isin(nl_list)]

grouped_data_slim = df_slim.groupby(['Matrix short', 'Polarity', 'name_short'])

agg_data_slim = grouped_data_slim.agg({
#             'coarse_class' : 'first',
#             'fine_class' : 'first',
#             'morgan_class':'first', 
#             'hmdb_func_group_class':'first', 
#             'property_class':'first',
#             'coarse_path' : 'first',
#             'fine_path' : 'first',
            'pred_twostate' : max,
            'pred_threestate' : lambda x: 1 if max(x)==2 else 0,
})
agg_data_slim.reset_index(inplace=True)

In [17]:
result

Unnamed: 0,name_short,pred_twostate,pred_threestate
0,2-Oxoglutaric acid,12,6
1,3-Hydroxyanthranilic acid,9,8
2,3-Hydroxymethylglutaric acid,7,6
3,3-Phosphoglyceric acid,10,10
4,"4,5-Dihydroorotic acid",12,10
...,...,...,...
167,Xanthine,12,10
168,alpha-tocopherol,2,0
169,cis-Aconitic acid,12,5
170,gamma-Aminobutyric acid,9,9


In [20]:
# Choose full or filtered data
for data, data_label in zip([agg_data, agg_data_slim], ['with n.l.', 'no n.l']):
    
    # choose ionisation mode
    for mode in ['pos', 'neg']:
        data_filtered = data[data['Polarity'] == mode]
               
        # Choose catboost classification confidence level
        for catboost in ['twostate', 'threestate']:
            
            # Assemble data
            result = data_filtered.groupby('name_short').agg(sum).reset_index()
            result.sort_values(f'pred_{catboost}', ascending=False, inplace=True)
            
            fig, ax = plt.subplots(figsize=(40,15))
            sns.barplot(x='name_short', y=f'pred_{catboost}', data=result, ax=ax, color='k')
            ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
            ax.set(xlabel=None, ylabel='Number of methods detecting copmpound') # hide axes labels
            ax.set_title(f'How many methods detect each compound {data} {mode} {catboost}')
            fig.savefig(p_out / f"How many methods detect each compound {data_label} {mode} {catboost}.png")
            plt.close()

In [14]:
p_out / f"barplots/How many methods detect each compound {data_label} {mode} {catboost}.png"

WindowsPath('d:/saharuka/spotting/20_matrices_git/spotting/analysis/20_matrices/6_plots/q2_plots/barplots/barplots/How many methods detect each compound with n.l. pos twostate.png')

In [None]:
agg_data_filtered = agg_data[agg_data['Polarity'] == 'pos']
result = agg_data_filtered.groupby('name_short').agg(sum).reset_index()

result = result.sort_values('pred_threestate', ascending = False)

In [None]:
# Choose full or filtered data
for data, data_label in zip([agg_data, agg_data_slim], ['with n.l.', 'no n.l']):
    
    # Define a colour palette for chemical classes
    lut = dict(zip(data.coarse_class.unique(), sns.hls_palette(data.coarse_class.nunique(), h=.5)))
    my_classes = pd.Series(data.coarse_class.values, index=data.name_short).to_dict()
       
    # choose ionisation mode
    for mode in ['pos', 'neg']:
        data_filtered = data[data['Polarity'] == mode]
               
        # Choose catboost classification confidence level
        for catboost in ['twostate', 'threestate']:
            
            # Assemble data
            result = data_filtered.pivot(index='name_short',
                              columns=['Matrix short'],
                              values=f"val_{catboost}")
            result = result.reindex(data_filtered.name_short.drop_duplicates())
            result = result.fillna(0)
            result.to_csv(p_out / "hierarchical clustering modes separate" / f"data_for_seriation_{data_label}_{mode}_{catboost}.csv")
            
            # Make a colour bar corresponding to compound classes
            row_colors = result.index.map(my_classes).map(lut)
                        
            # Make a plot
            g = sns.clustermap(result, metric="euclidean", 
                               yticklabels=False, 
                               row_colors=row_colors,
                               row_cluster=True, 
                               cmap="Blues",
                               vmax=5.5) # set max value of the colour bar
                     
            # Beautify the plot
            ax = g.ax_heatmap # define axes
            ax.set(xlabel=None, ylabel=None) # hide axes labels
            g.ax_row_dendrogram.set_visible(False) # hiderow dendrogram
            
                # set title and give it space above the plot
            g.fig.subplots_adjust(top=0.9, right=0.95)
            g.fig.suptitle(f"log10 intensities {data_label} {mode} {catboost}") # set title
            
                # adjust colour bar size and location            
            g.ax_cbar.set_position((0.96, 0.075, 0.03, 0.65)) #x, y, w, l
            
                # construct legend
            handles = [Patch(facecolor=lut[name]) for name in lut]
            plt.legend(handles, lut, title='Chemical class',
                        bbox_to_anchor=(0.17, 0.5), bbox_transform=plt.gcf().transFigure, loc='upper right')

            # Save the lot
            g.savefig(p_out / "hierarchical clustering modes separate" / f"log10 intensities {data_label} {mode} {catboost}.png")
            plt.close()            