## This notebook creates a barplot showing how many methods detect each compound

In [1]:
from pathlib import Path
from definitions import ROOT_DIR
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

- Inputs

In [2]:
p_root_dir = Path(ROOT_DIR).parents[0]
p_data = p_root_dir / "5_data_analysis"
p_out = p_root_dir / "6_plots/q2_plots/barplots"

# Compounds name and ID information
p_compounds = p_data / "compounds_ids.csv"

# Metrics and Catboost predictions for all ions in their target wells
p_predictions = p_data / "all_predictions_12-Jul-2021.csv"

# Dataset info (lab, matrix, polarity, m/z range, ids, etc.)
p_datasets = p_data / "datasets.csv"

# Classification
# p_chem_class = p_data / "custom_classification.csv"
# p_alt_class = p_data / "alternative_classification.csv"
# p_pathways = p_data / "pathways.csv"

- Load and merge predictions with dataset metadata and compound classification

In [5]:
predictions = pd.read_csv(p_predictions, index_col=0)
predictions.neutral_loss.fillna('', inplace=True)

compounds = pd.read_csv(p_compounds, index_col='internal_id')
# chem_class = pd.read_csv(p_chem_class, index_col='internal_id')
# alt_class = pd.read_csv(p_alt_class, index_col='internal_id')
# pathways = pd.read_csv(p_pathways, index_col='internal_id')

# Get a subset of most relevant information from Datasets file
datasets = pd.read_csv(p_datasets)
datasets_info = datasets.groupby('Clone ID').first()[['Polarity', 'Matrix short', 'Matrix long']] # 'Participant lab', 'Technology'

# Merge with predictions and classification
df = pd.merge(predictions, datasets_info, left_on='dataset_id', right_on='Clone ID', how='left')
# df = pd.merge(df, chem_class, how='left', on='name_short')
# df = pd.merge(df, alt_class, how='left', on='name_short')
# df = pd.merge(df, pathways, how='left', on='name_short')

- Prepare data in two ways:
    1. Sum intensities of all ion forms (adducts and neutral losses)
    2. Sum intensities only of common adducts, but do not include any neutral losses

In [6]:
# Make columns with intensities for only ion images classified as 'good'
# For the plots below we are going to use background-corrected intensity, ie. (mean_on_spot - mean_bgr)
df['val_twostate'] = (df.pred_twostate == 1) * df.spot_intensity_bgr_corrected
df['val_threestate'] = (df.pred_threestate == 2) * df.spot_intensity_bgr_corrected

In [7]:
# Group full data by compound
grouped_data = df.groupby(['Matrix short', 'Polarity', 'name_short'])

agg_data = grouped_data.agg({
#             'coarse_class' : 'first',
#             'fine_class' : 'first',
#             'morgan_class':'first', 
#             'hmdb_func_group_class':'first', 
#             'property_class':'first',
#             'coarse_path' : 'first',
#             'fine_path' : 'first',
            'pred_twostate' : max,
            'pred_threestate' : lambda x: 1 if max(x)==2 else 0,
})
# agg_data.pred
agg_data.reset_index(inplace=True)

In [8]:
# Group filtered data
nl_list = ['']
df_slim = df[df.neutral_loss.isin(nl_list)]

grouped_data_slim = df_slim.groupby(['Matrix short', 'Polarity', 'name_short'])

agg_data_slim = grouped_data_slim.agg({
#             'coarse_class' : 'first',
#             'fine_class' : 'first',
#             'morgan_class':'first', 
#             'hmdb_func_group_class':'first', 
#             'property_class':'first',
#             'coarse_path' : 'first',
#             'fine_path' : 'first',
            'pred_twostate' : max,
            'pred_threestate' : lambda x: 1 if max(x)==2 else 0,
})
agg_data_slim.reset_index(inplace=True)

In [17]:
result

Unnamed: 0,name_short,pred_twostate,pred_threestate
0,2-Oxoglutaric acid,12,6
1,3-Hydroxyanthranilic acid,9,8
2,3-Hydroxymethylglutaric acid,7,6
3,3-Phosphoglyceric acid,10,10
4,"4,5-Dihydroorotic acid",12,10
...,...,...,...
167,Xanthine,12,10
168,alpha-tocopherol,2,0
169,cis-Aconitic acid,12,5
170,gamma-Aminobutyric acid,9,9


In [20]:
# Choose full or filtered data
for data, data_label in zip([agg_data, agg_data_slim], ['with n.l.', 'no n.l']):
    
    # choose ionisation mode
    for mode in ['pos', 'neg']:
        data_filtered = data[data['Polarity'] == mode]
               
        # Choose catboost classification confidence level
        for catboost in ['twostate', 'threestate']:
            
            # Assemble data
            result = data_filtered.groupby('name_short').agg(sum).reset_index()
            result.sort_values(f'pred_{catboost}', ascending=False, inplace=True)
            
            fig, ax = plt.subplots(figsize=(40,15))
            sns.barplot(x='name_short', y=f'pred_{catboost}', data=result, ax=ax, color='k')
            ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
            ax.set(xlabel=None, ylabel='Number of methods detecting copmpound') # hide axes labels
            ax.set_title(f'How many methods detect each compound {data} {mode} {catboost}')
            fig.savefig(p_out / f"How many methods detect each compound {data_label} {mode} {catboost}.png")
            plt.close()

In [14]:
p_out / f"barplots/How many methods detect each compound {data_label} {mode} {catboost}.png"

WindowsPath('d:/saharuka/spotting/20_matrices_git/spotting/analysis/20_matrices/6_plots/q2_plots/barplots/barplots/How many methods detect each compound with n.l. pos twostate.png')

In [None]:
agg_data_filtered = agg_data[agg_data['Polarity'] == 'pos']
result = agg_data_filtered.groupby('name_short').agg(sum).reset_index()

result = result.sort_values('pred_threestate', ascending = False)

In [None]:
# Choose full or filtered data
for data, data_label in zip([agg_data, agg_data_slim], ['with n.l.', 'no n.l']):
    
    # Define a colour palette for chemical classes
    lut = dict(zip(data.coarse_class.unique(), sns.hls_palette(data.coarse_class.nunique(), h=.5)))
    my_classes = pd.Series(data.coarse_class.values, index=data.name_short).to_dict()
       
    # choose ionisation mode
    for mode in ['pos', 'neg']:
        data_filtered = data[data['Polarity'] == mode]
               
        # Choose catboost classification confidence level
        for catboost in ['twostate', 'threestate']:
            
            # Assemble data
            result = data_filtered.pivot(index='name_short',
                              columns=['Matrix short'],
                              values=f"val_{catboost}")
            result = result.reindex(data_filtered.name_short.drop_duplicates())
            result = result.fillna(0)
            result.to_csv(p_out / "hierarchical clustering modes separate" / f"data_for_seriation_{data_label}_{mode}_{catboost}.csv")
            
            # Make a colour bar corresponding to compound classes
            row_colors = result.index.map(my_classes).map(lut)
                        
            # Make a plot
            g = sns.clustermap(result, metric="euclidean", 
                               yticklabels=False, 
                               row_colors=row_colors,
                               row_cluster=True, 
                               cmap="Blues",
                               vmax=5.5) # set max value of the colour bar
                     
            # Beautify the plot
            ax = g.ax_heatmap # define axes
            ax.set(xlabel=None, ylabel=None) # hide axes labels
            g.ax_row_dendrogram.set_visible(False) # hiderow dendrogram
            
                # set title and give it space above the plot
            g.fig.subplots_adjust(top=0.9, right=0.95)
            g.fig.suptitle(f"log10 intensities {data_label} {mode} {catboost}") # set title
            
                # adjust colour bar size and location            
            g.ax_cbar.set_position((0.96, 0.075, 0.03, 0.65)) #x, y, w, l
            
                # construct legend
            handles = [Patch(facecolor=lut[name]) for name in lut]
            plt.legend(handles, lut, title='Chemical class',
                        bbox_to_anchor=(0.17, 0.5), bbox_transform=plt.gcf().transFigure, loc='upper right')

            # Save the lot
            g.savefig(p_out / "hierarchical clustering modes separate" / f"log10 intensities {data_label} {mode} {catboost}.png")
            plt.close()            