In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook

import pandas as pd
import numpy as np
from pathlib import Path
from definitions import ROOT_DIR
import matplotlib.pyplot as plt
from plotnine import *
import re
from pylab import *

In [None]:
#Central registry of hard-coded paths (to avoid repetition in multiple function definitions)
def paths():
    """Return a dict mapping short labels to the ``pathlib.Path`` locations used in this notebook.

    Every path is built relative to the parent directory of ``ROOT_DIR``.

    Keys:
        p_root_dir    -- parent directory of ``ROOT_DIR``
        p_data        -- ``matrix_comparison/5_data_analysis`` folder
        p_out         -- output folder for the plots
        p_compounds   -- compound name and ID information
        p_wellmap     -- compound mapping to wells
        p_datasets    -- dataset info (lab, matrix, polarity, m/z range, ids, etc.)
        p_chem_class  -- chemical classification table
        p_pathways    -- pathway classification table
        p_predictions -- interlaboratory predictions table
    """
    root = Path(ROOT_DIR).parents[0]
    data_dir = root / "matrix_comparison/5_data_analysis"

    return {
        'p_root_dir': root,
        'p_data': data_dir,
        'p_out': root / "interlaboratory_survey/6_plots/Interlab",
        'p_compounds': data_dir / "compounds_ids.csv",
        'p_wellmap': data_dir / "wellmap.csv",
        'p_datasets': data_dir / "Datasets_14Jul2022.csv",
        'p_chem_class': data_dir / "custom_classification_v2.csv",
        'p_pathways': data_dir / "pathways_v2.csv",
        'p_predictions': root / "interlaboratory_survey/5_data_analysis/2022-08-16_All_Interlab_Predictions.csv",
    }

#Format ion formulas to publication standard
def pretty_ion_formulas(adnl, pol=""):
    """Turn raw adduct/neutral-loss strings into bracketed, subscripted ion labels.

    For each string: the "[M]-" and "[M]+" markers are removed, doubled signs
    ("++", "--") are collapsed to a single sign, every individual digit is wrapped
    as ``$_d$`` (math-text subscript), and the result is enclosed as ``[M...]``.

    Parameters:
        adnl -- iterable of strings (adduct concatenated with neutral loss)
        pol  -- 'Pos' appends ``$^+$``, 'Neg' appends ``$^-$``; anything else appends nothing

    Returns:
        list of formatted strings, in input order
    """
    if pol == 'Pos':
        charge_suffix = '$^+$'
    elif pol == 'Neg':
        charge_suffix = '$^-$'
    else:
        charge_suffix = ''

    # Applied in this order to every string
    substitutions = (("[M]-", ""), ("[M]+", ""), ("++", "+"), ("--", "-"))

    formatted = []
    for ion in adnl:
        for old, new in substitutions:
            ion = ion.replace(old, new)
        ion = re.sub('([0-9])', '$_\\1$', ion)
        formatted.append('[M' + ion + ']' + charge_suffix)
    return formatted

#Calculate and store peaks normalized to sum of peaks per molecule
def calculate_signal_composition(df):
    """Add per-molecule total intensity and each row's share of it.

    Rows are grouped by ('sample_name', 'name_short'). For every row two columns
    are added:
        molsum -- sum of 'spot_intensity_bgr_corrected' over the row's group
                  (missing intensities are skipped, pandas' default for sums)
        frac   -- the row's 'spot_intensity_bgr_corrected' divided by 'molsum'

    Parameters:
        df -- DataFrame with columns 'sample_name', 'name_short' and
              'spot_intensity_bgr_corrected'

    Returns:
        A new DataFrame with the same rows in the same order, a fresh 0..n-1 index
        and the two extra columns. The input frame is not modified.
    """
    # reset_index returns a new object, so the caller's frame stays untouched
    newdf = df.reset_index(drop=True)

    # An empty frame has nothing to group; return it with empty result columns
    if newdf.empty:
        newdf['molsum'] = pd.Series(dtype='float64')
        newdf['frac'] = pd.Series(dtype='float64')
        return newdf

    # One group-wise pass instead of re-filtering the whole frame per (sample, molecule) pair.
    # Rows with a missing sample or molecule name fall into no group and get NaN here.
    newdf['molsum'] = (
        newdf.groupby(['sample_name', 'name_short'])['spot_intensity_bgr_corrected']
        .transform('sum')
    )

    #Calculate the signal contribution of each row as intensity divided by total intensity
    newdf['frac'] = newdf['spot_intensity_bgr_corrected'] / newdf['molsum']
    return newdf

#Load csv files and organize dataframe
def prepare_df(paths):
    """Load predictions and metadata, merge them, and return the filtered analysis frame.

    Parameters:
        paths -- mapping with at least the keys 'p_predictions', 'p_chem_class'
                 and 'p_datasets', each pointing to a CSV file

    Returns:
        DataFrame of prediction rows joined with per-dataset info and the coarse
        chemical class, restricted to rows where 'Interlab' is true and
        'pred_threestate' equals 2, with the 'molsum' and 'frac' columns added by
        calculate_signal_composition.
    """
    #Load predictions
    predictions = pd.read_csv(paths['p_predictions'], index_col=0)
    # Assign the result rather than calling fillna(inplace=True) on the attribute:
    # the chained in-place form does not update the frame under pandas Copy-on-Write.
    predictions['neutral_loss'] = predictions['neutral_loss'].fillna('')

    #Load metadata files (only the tables that are actually used below)
    chem_class = pd.read_csv(paths['p_chem_class'], index_col='internal_id')
    datasets = pd.read_csv(paths['p_datasets'])

    #Load class data. WARNING: risk of duplication
    # (a molecule listed with more than one coarse class would multiply its rows in the merge below)
    main_chem_class = chem_class[['name_short', 'main_coarse_class']].drop_duplicates()

    # Get a subset of most relevant information from Datasets file and add a unique sample name
    # to each merged dataset (full mass range, single polarity)
    info_columns = ['Polarity', 'Participant lab', 'Slide code', 'All', 'EMBL', 'Interlab', 'Technology', 'Matrix short']
    datasets_info = datasets.groupby('Dataset ID').first()[info_columns]
    datasets_info['sample_name'] = datasets_info['Slide code'] + ': ' + datasets_info['Technology'] + ': ' + datasets_info['Matrix short']

    # Merge with predictions ('Dataset ID' is the index of datasets_info after the groupby)
    df = pd.merge(predictions, datasets_info, left_on='dataset_id', right_on='Dataset ID', how='left')
    df = df.sort_values(by=['adduct', 'neutral_loss'])

    #Format adduct/neutral loss to output-ready format
    # Neutral-loss strings of 7 or more characters are relabelled '+Matrix'
    # NOTE(review): presumably only matrix-derived losses are that long -- confirm
    df['neutral_loss'] = df['neutral_loss'].apply(lambda x: x if len(x) < 7 else '+Matrix')
    df['adduct_and_nl'] = pretty_ion_formulas(df.adduct + df.neutral_loss)
    # Anything that is not exactly 'positive' (including missing values) is labelled 'Neg'
    df['Polarity'] = ['Pos' if x == 'positive' else 'Neg' for x in df['Polarity']]

    #Remove duplicates (step currently disabled, code kept for reference)
#    df['problems'] = df['sample_name'] + "  " + df['Polarity'] + ", molecule: " + df['name_short'] + df['adduct_and_nl']
#    df.sort_values(by='pred_val', ascending=False)
#    df['dupemask']=df.duplicated(subset='problems', keep='first')
#    df = df[~df.dupemask].drop(['dupemask','problems'], axis='columns')
#    df = df.sort_index()

    #Merge in metadata, apply filters
    df = df.merge(main_chem_class, on='name_short', how='left')
    df = df[df['Interlab']]
    df = df[df['pred_threestate'] == 2]
    df = calculate_signal_composition(df)

    return df

In [None]:
#Build the main dataframe once (takes a minute or so).
#The plotting cells below all read this module-level `df`, so run this cell before them.
df = prepare_df(paths())

In [None]:
#Signal dilution by class, one plot per technology, 5% threshold
#
# For every polarity and technology, each chemical class gets one stacked bar showing which
# ions (adduct + neutral loss) make up its signal. Ions below the threshold share are pooled
# into a single unlabeled (NaN, drawn white) segment.
# This cell also defines `posnames` / `negnames`, which later cells reuse for consistent colors.

barchart = []  # not used in this cell; kept so the cell still defines the same names

# Ions contributing less than this share of a class's summed fraction are pooled
threshold_fraction = 0.05

# Group data by relevant columns
grouped_data = df.groupby(['Polarity', 'adduct_and_nl', 'sample_name', 'name_short', 'Technology', 'main_coarse_class'])

# Aggregate composition fraction and intensity
agg_data = grouped_data.agg({
        'spot_intensity_bgr_corrected' : 'sum',
        'frac' : 'sum'
}).reset_index()

molchart = []

#Generate ordered list of all colors in chosen colormap (default tab20)
cmap = plt.get_cmap('tab20')
colorlist = [matplotlib.colors.rgb2hex(cmap(i)) for i in range(cmap.N)]
posnames = []
negnames = []

# Sorted so that the page order of the PDF is the same on every run
technologies = sorted(set(agg_data['Technology']))

for pol in ['Pos', 'Neg']:

    #Polarity sign appended to every ion formula
    charge = '$^+$' if pol == 'Pos' else '$^-$'

    #First pass: per-technology class composition, plus the ordered list of ions used for
    #coloring bar charts (other plots are expected to show a subset of these ions)
    results_by_tech = {}
    namelist = []
    for tech in technologies:

        #Filter out only datasets of the relevant technology and polarity
        filtered_agg_data = agg_data[(agg_data['Technology'] == tech) & (agg_data['Polarity'] == pol)]
        if filtered_agg_data.empty:
            continue  # nothing measured for this technology in this polarity
        filtered_agg_data = filtered_agg_data.assign(adduct_and_nl=filtered_agg_data['adduct_and_nl'] + charge)

        #Group by relevant columns
        result = filtered_agg_data.groupby(['main_coarse_class', 'Polarity', 'adduct_and_nl']).agg({
                'spot_intensity_bgr_corrected' : 'sum',
                'frac' : 'sum'
        }).reset_index()

        #Normalize to number of total samples (datasets)
        result['frac'] = result['frac'] / filtered_agg_data['sample_name'].nunique()
        results_by_tech[tech] = result

        #Collect every ion that reaches the threshold share in at least one class
        for cclass, rows in result.groupby('main_coarse_class'):
            is_minor = rows['frac'] < rows['frac'].sum() * threshold_fraction
            namelist.extend(rows.loc[~is_minor, 'adduct_and_nl'])

    #Shortest names first; ties broken alphabetically so the color assignment is reproducible
    namelist = sorted(set(namelist), key=lambda name: (len(name), name))

    #Save for later
    if pol == 'Pos':
        posnames = namelist
    else:
        negnames = namelist

    #Save a dictionary of ion names and associated colors
    # NOTE(review): zip stops at the shorter input, so ions beyond the colormap's size get no color
    colvals = dict(zip(namelist, colorlist))

    #Second pass: generate actual output
    for tech, result in results_by_tech.items():

        #Prepare report: one record per displayed bar segment
        records = []
        for cclass, rows in result.groupby('main_coarse_class'):
            rows = rows.sort_values(by='frac', ascending=False)
            rowsum = rows['frac'].sum()
            is_minor = rows['frac'] < rowsum * threshold_fraction

            #Ions at or above the threshold are stored individually, largest first
            for _, row in rows[~is_minor].iterrows():
                records.append({'Class': cclass, 'sum': rowsum,
                                'adduct_and_nl': row['adduct_and_nl'],
                                'frac': row['frac'] / rowsum})

            #All ions below the threshold are summed and stored as "nan".
            #This includes the first sub-threshold ion, which the cumulative-sum version left out.
            if is_minor.any():
                records.append({'Class': cclass, 'sum': rowsum,
                                'adduct_and_nl': np.nan,
                                'frac': rows.loc[is_minor, 'frac'].sum() / rowsum})

        report = pd.DataFrame(records, columns=['Class', 'sum', 'adduct_and_nl', 'frac'])

        colorfill = scale_fill_manual(values=colvals, na_value='white')

        #Output split barchart
        molchart = molchart + [(ggplot(report)
        + aes(fill='adduct_and_nl', y='frac', x='Class')
        + geom_bar(position="fill", stat="identity", linetype='solid', color='black')
        + coord_flip()
        + theme_classic()
        + colorfill
        + theme(
                    aspect_ratio=1,
                    text=element_text(family = 'sans-serif', size=16),
                    title=element_text(family = 'sans-serif', size=18),
                    #legend_position=(0.3, -0.15),
                    )
        + xlab("Class")
        + ylab("Fraction of total intensity")
        + labs(fill = "Adduct + neutral loss", title = f"Signal composition ({tech}, {pol})")
        )]

save_as_pdf_pages(molchart, paths()['p_out'] / "Barchart_Dilution_Class_And_Technology_5percent.pdf")

In [None]:
#Signal dilution by class, one plot per technology, 10% threshold
#
# Same chart as the 5% version, with a coarser cut-off. Colors come from `posnames` /
# `negnames`, which must already have been defined by the 5% cell.

barchart = []  # not used in this cell; kept so the cell still defines the same names

# Ions contributing less than this share of a class's summed fraction are pooled
threshold_fraction = 0.1

# Group data by relevant columns
grouped_data = df.groupby(['Polarity', 'adduct_and_nl', 'sample_name', 'name_short', 'Technology', 'main_coarse_class'])

# Aggregate composition fraction and intensity
agg_data = grouped_data.agg({
        'spot_intensity_bgr_corrected' : 'sum',
        'frac' : 'sum'
}).reset_index()

molchart = []

#Generate ordered list of all colors in chosen colormap (default tab20)
cmap = plt.get_cmap('tab20')
colorlist = [matplotlib.colors.rgb2hex(cmap(i)) for i in range(cmap.N)]

for pol in ['Pos', 'Neg']:

    #Polarity sign appended to every ion formula, and the matching ordered ion list
    if pol == 'Pos':
        charge = '$^+$'
        namelist = posnames
    else:
        charge = '$^-$'
        namelist = negnames

    #Save a dictionary of ion names and associated colors
    colvals = dict(zip(namelist, colorlist))

    #Generate actual output (sorted so the page order of the PDF is the same on every run)
    for tech in sorted(set(agg_data['Technology'])):

        #Filter out only datasets of the relevant technology and polarity
        filtered_agg_data = agg_data[(agg_data['Technology'] == tech) & (agg_data['Polarity'] == pol)]
        if filtered_agg_data.empty:
            continue  # nothing measured for this technology in this polarity
        filtered_agg_data = filtered_agg_data.assign(adduct_and_nl=filtered_agg_data['adduct_and_nl'] + charge)

        #Group by relevant columns
        result = filtered_agg_data.groupby(['main_coarse_class', 'Polarity', 'adduct_and_nl']).agg({
                'spot_intensity_bgr_corrected' : 'sum',
                'frac' : 'sum'
        }).reset_index()

        #Normalize to number of total samples (datasets)
        result['frac'] = result['frac'] / filtered_agg_data['sample_name'].nunique()

        #Prepare report: one record per displayed bar segment
        records = []
        for cclass, rows in result.groupby('main_coarse_class'):
            rows = rows.sort_values(by='frac', ascending=False)
            rowsum = rows['frac'].sum()
            is_minor = rows['frac'] < rowsum * threshold_fraction

            #Ions at or above the threshold are stored individually, largest first
            for _, row in rows[~is_minor].iterrows():
                records.append({'Class': cclass, 'sum': rowsum,
                                'adduct_and_nl': row['adduct_and_nl'],
                                'frac': row['frac'] / rowsum})

            #All ions below the threshold are summed and stored as "nan".
            #This includes the first sub-threshold ion, which the cumulative-sum version left out.
            if is_minor.any():
                records.append({'Class': cclass, 'sum': rowsum,
                                'adduct_and_nl': np.nan,
                                'frac': rows.loc[is_minor, 'frac'].sum() / rowsum})

        report = pd.DataFrame(records, columns=['Class', 'sum', 'adduct_and_nl', 'frac'])

        colorfill = scale_fill_manual(values=colvals, na_value='white')

        #Output split barchart
        molchart = molchart + [(ggplot(report)
        + aes(fill='adduct_and_nl', y='frac', x='Class')
        + geom_bar(position="fill", stat="identity", linetype='solid', color='black')
        + coord_flip()
        + theme_classic()
        + colorfill
        + theme(
                    aspect_ratio=1,
                    text=element_text(family = 'sans-serif', size=16),
                    title=element_text(family = 'sans-serif', size=18),
                    #legend_position=(0.3, -0.15),
                    )
        + xlab("Class")
        + ylab("Fraction of total intensity")
        + labs(fill = "Adduct + neutral loss", title = f"Signal composition ({tech}, {pol})")
        )]

save_as_pdf_pages(molchart, paths()['p_out'] / "Barchart_Dilution_Class_And_Technology_10percent.pdf")

In [None]:
#Signal dilution by coarse class, pooled over all datasets, one plot per polarity, 5% threshold
#
# Colors come from `posnames` / `negnames`, which must already have been defined by the
# per-technology 5% cell.
# NOTE(review): an ion that reaches the threshold here but in no per-technology chart has no
# entry in those lists and therefore no assigned color -- confirm this cannot happen.

# Ions contributing less than this share of a class's summed fraction are pooled
threshold_fraction = 0.05

# Group data by polarity, ion and coarse class
grouped_data = df.groupby(['Polarity', 'adduct_and_nl', 'main_coarse_class'])

# Aggregate prediction boolean and composition fraction
agg_data = grouped_data.agg({
        'pred_threestate' : 'any',
        'frac' : 'sum'
}).reset_index()

molchart = []

#Generate ordered list of all colors in chosen colormap (default tab20)
cmap = plt.get_cmap('tab20')
colorlist = [matplotlib.colors.rgb2hex(cmap(i)) for i in range(cmap.N)]

for pol in ['Pos', 'Neg']:

    #Polarity sign appended to every ion formula, and the matching ion -> color mapping
    if pol == 'Pos':
        charge = '$^+$'
        polstr = "Positive"
        colvals = dict(zip(posnames, colorlist))
    else:
        charge = '$^-$'
        polstr = "Negative"
        colvals = dict(zip(negnames, colorlist))

    #Keep the groups of this polarity that contain at least one detection
    filtered_agg_data = agg_data[(agg_data['pred_threestate'] == True) & (agg_data['Polarity'] == pol)]
    if filtered_agg_data.empty:
        continue  # nothing detected in this polarity
    filtered_agg_data = filtered_agg_data.assign(adduct_and_nl=filtered_agg_data['adduct_and_nl'] + charge)

    result = filtered_agg_data.groupby(['main_coarse_class', 'Polarity', 'adduct_and_nl']).agg({
            'frac' : 'sum'
    }).reset_index()

    #Prepare report: one record per displayed bar segment
    records = []
    for coarse_class, rows in result.groupby('main_coarse_class'):
        rows = rows.sort_values(by='frac', ascending=False)

        #Sum of all fractions in the class and the resulting threshold level
        rowsum = rows['frac'].sum()
        is_minor = rows['frac'] < rowsum * threshold_fraction

        #Ions at or above the threshold are stored individually, largest first
        for _, row in rows[~is_minor].iterrows():
            records.append({'main_coarse_class': coarse_class, 'sum': rowsum,
                            'adduct_and_nl': row['adduct_and_nl'],
                            'frac': row['frac'] / rowsum})

        #All ions below the threshold are summed and stored as "nan".
        #This includes the first sub-threshold ion, which the cumulative-sum version left out.
        if is_minor.any():
            records.append({'main_coarse_class': coarse_class, 'sum': rowsum,
                            'adduct_and_nl': np.nan,
                            'frac': rows.loc[is_minor, 'frac'].sum() / rowsum})

    report = pd.DataFrame(records, columns=['main_coarse_class', 'sum', 'adduct_and_nl', 'frac'])

    colorfill = scale_fill_manual(values=colvals, na_value='white')

    molchart = molchart + [(ggplot(report)
    + aes(fill='adduct_and_nl', y='frac', x='main_coarse_class')
    + geom_bar(position="fill", stat="identity", linetype='solid', color='black')
    + coord_flip()
    + theme_classic()
    + colorfill
    + theme(
                aspect_ratio=1,
                text=element_text(family = 'sans-serif', size=16),
                title=element_text(family = 'sans-serif', size=18),
                #legend_position=(0.3, -0.15),
                )
    + xlab("Class")
    + ylab("Fraction of total intensity")
    + labs(fill = "Adduct + neutral loss", title = f"Signal composition ({pol})")
    )]

save_as_pdf_pages(molchart, paths()['p_out'] / "Barchart_Coarse_Class_Dilution.pdf")