In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook

import pandas as pd
import numpy as np
from pathlib import Path
from definitions import ROOT_DIR
import matplotlib.pyplot as plt
from plotnine import *
import re
from pylab import *
import seaborn as sns

# Both project folders live next to each other, one level above ROOT_DIR.
p_root_dir = Path(ROOT_DIR).parents[0]
p_matrix = p_root_dir / "matrix_comparison"
p_interlab = p_root_dir / "interlaboratory_survey"

# Lookup tables are read from the matrix-comparison analysis folder.
p_data = p_matrix / "5_data_analysis"
p_compounds = p_data / "compounds_ids.csv"  # Compounds name and ID information
p_chem_class = p_data / "custom_classification_v2.csv"
p_pathways = p_data / "pathways_v2.csv"
p_wellmap = p_data / "wellmap.csv"

# Tissue annotations (with FDR) come from the interlaboratory survey.
p_annotations = p_interlab / "5_data_analysis/tissue_predictions_11-Aug-2022_with_fdr.csv"

# Every figure produced by this notebook is written here.
p_out = p_interlab / "6_plots" / 'Tissue'

In [None]:
#Format ion formulas to publication standard
def pretty_ion_formulas(adnl, pol=""):
    """Build bracketed ion labels from raw adduct/neutral-loss strings.

    For every string the bare-molecule markers "[M]-" and "[M]+" are removed,
    doubled signs ("++", "--") are collapsed to a single sign, each digit is
    wrapped as a ``$_d$`` subscript, and the result is enclosed as ``[M...]``.

    Parameters
    ----------
    adnl : iterable of str
        Concatenated adduct + neutral-loss strings, e.g. "+H-H2O".
    pol : str, optional
        'Pos' appends ``$^+$`` and 'Neg' appends ``$^-$`` to every label;
        any other value appends nothing.

    Returns
    -------
    list of str
        One formatted label per input string, in input order.
    """
    if pol == 'Pos':
        charge = '$^+$'
    elif pol == 'Neg':
        charge = '$^-$'
    else:
        charge = ''

    labels = []
    for raw in adnl:
        # Replacement order matters: strip the [M] markers before collapsing signs.
        ion = raw.replace("[M]-","").replace("[M]+","").replace("++","+").replace("--","-")
        # Digits are subscripted one at a time ("10" becomes "$_1$$_0$").
        ion = re.sub('([0-9])', '$_\\1$', ion)
        labels.append('[M' + ion + ']' + charge)
    return labels

#Calculate and store peaks normalized to sum of peaks per molecule
def calculate_signal_composition(df):
    """Add per-molecule total intensity and each row's share of that total.

    Rows are grouped by (``ds_id``, ``formula``). ``molsum`` is the sum of
    ``sample_intensity`` over the group and ``frac`` is the row's
    ``sample_intensity`` divided by ``molsum``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ds_id``, ``formula`` and
        ``sample_intensity``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns followed by ``molsum`` and
        ``frac``, rows in input order on a fresh 0..n-1 index. An empty input
        yields an empty frame that still carries both new columns.
    """
    keys = ['ds_id', 'formula']

    if len(df) == 0:
        # Nothing to aggregate; still return the documented schema.
        return df.reset_index(drop=True).assign(
            molsum=pd.Series(dtype=float),
            frac=pd.Series(dtype=float),
        )

    # One pass over the data instead of one boolean mask per (dataset, molecule).
    stats = df.groupby(keys, sort=False)['sample_intensity'].agg(['sum', 'count', 'size'])

    # groupby-sum skips NaN; a group containing a missing intensity has an
    # undefined total, so blank it out (count < size means a NaN was present).
    molsum = stats['sum'].where(stats['count'] == stats['size']).rename('molsum')

    #Add sum intensity information to main dataframe
    newdf = pd.merge(df, molsum.reset_index(), on=keys, how='left')

    #Calculate the signal contribution of each row as intensity divided by total intensity
    newdf['frac'] = newdf['sample_intensity']/newdf['molsum']
    return newdf


In [None]:
#Prepare dataframe

# Lookup tables, all indexed by the internal compound id.
# NOTE(review): `compounds` and `pathways` are not used by the cells shown
# here; they are kept in case other cells rely on them.
compounds = pd.read_csv(p_compounds, index_col='internal_id')
chem_class = pd.read_csv(p_chem_class, index_col='internal_id')
pathways = pd.read_csv(p_pathways, index_col='internal_id')
wellmap = pd.read_csv(p_wellmap, index_col='internal_id')
main_chem_class = chem_class[['name_short', 'main_coarse_class', 'fine_class']].drop_duplicates()

df = pd.read_csv(p_annotations, index_col=0)
# Plain assignment: a chained `df.col.fillna(..., inplace=True)` is not
# guaranteed to write back into `df` (it is a no-op under copy-on-write).
# An empty string is required so that adduct + neutral_loss concatenates below.
df['neutral_loss'] = df['neutral_loss'].fillna('')

#Filter to only keep on-tissue predictions, and calculate signal composition among these
# .copy() makes the filtered frame independent so new columns can be assigned safely.
df = df[df.prediction == 1].copy()
df['ds_id'] = df.new_id
df = calculate_signal_composition(df)
df = df[df['fdr'] <= 0.1].copy() #10% FDR filter


#Polarity is missing from input, so hack it in
# Any adduct outside this list (including a missing one) is labelled negative.
df['pol'] = np.where(df['adduct'].isin(['[M]+','+H','+Na','+K']), "Pos", "Neg")
df['adduct_and_nl'] = pretty_ion_formulas(df.adduct+df.neutral_loss)

# NOTE(review): both left merges assume the right-hand table has one row per
# key; duplicated keys would duplicate annotation rows -- confirm.
df = pd.merge(df, wellmap, on='formula', how='left')
df = pd.merge(df, main_chem_class, on='name_short', how='left')

In [None]:
#Tissue signal dilution by class (adducts only)

# Total intensity and summed per-molecule signal fraction of every ion type,
# split by coarse chemical class and polarity.
grouped_data = df.groupby(['adduct', 'neutral_loss', 'adduct_and_nl', 'main_coarse_class', 'pol'])

agg_data = grouped_data.agg({
        'sample_intensity' : 'sum',
        'frac' : 'sum'
}).reset_index()

molchart = []

# Hex codes of the tab20 palette. Not consumed by the plots in this cell
# (the colour scale below is commented out).
cmap = plt.get_cmap('tab20')
colorlist = [matplotlib.colors.rgb2hex(cmap(i)) for i in range(cmap.N)]

for pol in ['Pos','Neg']:
    records = []      # one dict per bar segment (class x ion type)
    swarm_parts = []  # raw per-annotation intensities, one frame per class
    result = agg_data[(agg_data['pol'] == pol)]

    for cclass in set(agg_data['main_coarse_class']):

        # Adduct-only ions (no neutral loss) of this class, largest share first.
        rows = result[(result['main_coarse_class'] == cclass)&(result['neutral_loss'] == '')].sort_values(by='frac', ascending=False)
        if rows.empty:
            continue

        rowsum = rows['frac'].sum()
        totint = rows['sample_intensity'].sum()

        # Ion types whose share is below the threshold are pooled into a
        # single unnamed segment. A factor of 0 disables pooling.
        threshold = rowsum*0
        is_minor = rows['frac'] < threshold

        base = {'Class':cclass+' (total intensity: '+str(round(totint))+')', 'sum':rowsum, 'totint':totint}
        for idx, row in rows[~is_minor].iterrows():
            records.append({**base, 'adduct_and_nl': row['adduct_and_nl'], 'frac': row.frac/rowsum})
        if is_minor.any():
            # The pooled segment holds the combined share of all minor ion types.
            records.append({**base, 'adduct_and_nl': np.nan, 'frac': rows.loc[is_minor, 'frac'].sum()/rowsum})

        # Raw intensities are collected once per class so that every
        # annotation appears exactly once in the strip plot.
        filtered_df = df[(df['main_coarse_class']==cclass)&(df['neutral_loss']=='')&(df['pol']==pol)]
        swarm_parts.append(filtered_df[['main_coarse_class', 'sample_intensity']])

    if not records:
        print(f"No classified adduct-only annotations for polarity {pol}; skipping its plots")
        continue

    # Building the frame once avoids growing a DataFrame row by row.
    report = pd.DataFrame(records)

    #Export strip plot (may be informative)
    swarm = pd.concat(swarm_parts, ignore_index=True)
    theorder = sorted(set(swarm['main_coarse_class']), reverse=True)
    sns.set_theme(style="whitegrid")
    fig, ax = plt.subplots()
    sns.stripplot(y=swarm["main_coarse_class"], x=np.log10(swarm["sample_intensity"]), order=theorder, ax=ax)
    ax.set_xlabel("log10(sample_intensity)")
    ax.tick_params(axis='x', labelrotation=90)
    fig.tight_layout()
    fig.savefig(p_out / f"{pol}_StripTest.pdf")

    #Export main barchart
    molchart.append(ggplot(report)
            + aes(fill='adduct_and_nl', y='frac', x='Class')
            + geom_bar(position="fill", stat="identity", linetype='solid', color='black')
            + coord_flip()
            + theme_classic()
    #        + colorfill
            + theme(
                        aspect_ratio=1,
                        text=element_text(family = 'sans-serif', size=16),
                        title=element_text(family = 'sans-serif', size=18),
                        #legend_position=(0.3, -0.15),
                        )
            + xlab("Class")
            + ylab("Fraction of total intensity")
            + labs(fill = "adduct_and_nl", title = f"Signal composition ({pol})")
            )

save_as_pdf_pages(molchart, p_out / "Tissue Class Distribution Adduct.pdf")

#Meeting comment: Violin plots by adduct?

In [None]:
#Signal dilution by dataset

# Total intensity and summed per-molecule signal fraction of every adduct,
# split by dataset and polarity.
grouped_data = df.groupby(['adduct', 'ds_id', 'pol'])

agg_data = grouped_data.agg({
        'sample_intensity' : 'sum',
        'frac' : 'sum'
}).reset_index()

molchart = []

# Hex codes of the tab20 palette. Not consumed by the plot in this cell
# (the colour scale below is commented out).
cmap = plt.get_cmap('tab20')
colorlist = [matplotlib.colors.rgb2hex(cmap(i)) for i in range(cmap.N)]

for pol in ['Pos','Neg']:
    records = []  # one dict per bar segment (dataset x adduct)
    result = agg_data[(agg_data['pol'] == pol)]

    for dsid in set(agg_data['ds_id']):
        # Adducts of this dataset, largest share first.
        rows = result[result['ds_id'] == dsid].sort_values(by='frac', ascending=False)
        if rows.empty:
            continue

        rowsum = rows['frac'].sum()

        # Adducts contributing less than 5% of the dataset's signal are pooled
        # into a single unnamed segment.
        threshold = rowsum*0.05
        is_minor = rows['frac'] < threshold

        base = {'Dataset':dsid, 'sum':rowsum }
        for idx, row in rows[~is_minor].iterrows():
            records.append({**base, 'adduct': row['adduct'], 'frac': row.frac/rowsum})
        if is_minor.any():
            # The pooled segment holds the combined share of every minor adduct.
            records.append({**base, 'adduct': np.nan, 'frac': rows.loc[is_minor, 'frac'].sum()/rowsum})

    if not records:
        print(f"No annotations for polarity {pol}; skipping its plot")
        continue

    # Building the frame once avoids growing a DataFrame row by row.
    report = pd.DataFrame(records)

    molchart.append(ggplot(report)
            + aes(fill='adduct', y='frac', x='Dataset')
            + geom_bar(position="fill", stat="identity", linetype='solid', color='black')
            + coord_flip()
            + theme_classic()
    #        + colorfill
            + theme(
                        aspect_ratio=1,
                        text=element_text(family = 'sans-serif', size=16),
                        title=element_text(family = 'sans-serif', size=18),
                        #legend_position=(0.3, -0.15),
                        )
            + xlab("Dataset")
            + ylab("Fraction of total intensity")
            + labs(fill = "adduct", title = f"Signal composition ({pol})")
            )

save_as_pdf_pages(molchart, p_out / "Tissue Dataset Distribution Adduct.pdf")