In [None]:
from pathlib import Path
from definitions import ROOT_DIR
import pandas as pd
import numpy as np

import scanpy as sc
from anndata import AnnData
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc, rc_context

# Global matplotlib font settings (sans-serif Arial, size 16) for the figures drawn below
rc('font',**{'family':'sans-serif',
             'sans-serif':['Arial'],
             'size':16})

In [None]:
def get_class_size(metadata, class_column):
    '''
    Add a 'class_size' column with the number of rows that share each row's class.

    metadata     : DataFrame with one row per entry; the column is added in place
    class_column : name of the column that holds the class label

    Returns the same DataFrame object. Rows whose class label is missing get NaN
    as class size instead of raising a KeyError.
    '''
    sizes = metadata[class_column].value_counts()
    # value_counts() skips missing labels, so look the sizes up with reindex():
    # unknown / missing labels become NaN rather than failing like sizes[label] would
    metadata['class_size'] = sizes.reindex(metadata[class_column]).to_numpy()
    return metadata


def filter_neutral_losses(df, neutral_losses=['']):
    '''
    Keep only entries whose neutral loss is in the list provided.

    neutral_losses : list-like of neutral losses to keep ('' stands for "no neutral loss"),
                     a single neutral loss given as a string,
                     "only_nl" to keep only ions that have a neutral loss,
                     or None to skip the filter.
    '''
    if neutral_losses is None:
        return df
    if isinstance(neutral_losses, str):
        if neutral_losses == "only_nl":
            return df[df.neutral_loss != ""]
        # isin() rejects bare strings, so treat a single neutral loss as a one-element list
        neutral_losses = [neutral_losses]
    # Only list-likes reach this point, so arrays / Series are never compared with `==`,
    # which would give an element-wise result that cannot be used in an `if`
    return df[df.neutral_loss.isin(neutral_losses)]


def filter_adducts(df, adducts=['']):
    '''
    Keep only entries whose adduct is in the list provided.

    adducts : list-like of adducts to keep, a single adduct given as a string,
              or None to skip the filter.
    '''
    # Identity check: `adducts != None` is element-wise for arrays / Series and breaks the `if`
    if adducts is None:
        return df
    if isinstance(adducts, str):
        # isin() rejects bare strings, so treat a single adduct as a one-element list
        adducts = [adducts]
    return df[df.adduct.isin(adducts)]


def filter_polarity(df, polarity=None):
    '''
    Keep only entries whose 'Polarity' column equals the given polarity.

    polarity : value to keep (e.g. 'positive' or 'negative'), or None to skip the filter.
    '''
    if polarity is None:
        return df
    return df[df.Polarity == polarity]


def filter_data(data, polarity=None, adducts=None, neutral_losses=None):
    '''
    Run the polarity, adduct and neutral-loss filters one after another
    and return the filtered data.
    '''
    filter_steps = (
        (filter_polarity, polarity),
        (filter_adducts, adducts),
        (filter_neutral_losses, neutral_losses),
    )
    for apply_filter, criterion in filter_steps:
        data = apply_filter(data, criterion)
    return data


def group_by_molecule(df, groupby_columns, sum_columns=None):
    '''
    Aggregate intensity and detection values per groupby columns

    df              : ion-level DataFrame that contains a 'detectability' column
    groupby_columns : columns that identify one molecule (e.g. dataset metadata + molecule name)
    sum_columns     : columns to sum across the ions of a molecule. By default every column
                      named 'property1', 'property2', 'property3' or 'my_property' that is
                      present in df is summed, so the function works no matter which of
                      these property columns were added to df beforehand.

    Returns one row per combination of groupby_columns.
    '''
    if sum_columns is None:
        known_properties = ('property1', 'property2', 'property3', 'my_property')
        sum_columns = [col for col in known_properties if col in df.columns]

    # detectability of a metabolite is set to 1 if any of its ions was detected
    aggregations = {'detectability': 'max'}
    # each additional property is summed across all ions of the metabolite
    aggregations.update({col: 'sum' for col in sum_columns})

    return df.groupby(groupby_columns).agg(aggregations).reset_index()


def summarise_per_class(df, groupby_columns):
    '''
    Summarise molecule-level data per group of groupby_columns.

    For every group the result holds the mean of 'my_property' and
    'fraction_detected', i.e. the summed 'detectability' divided by the
    first 'class_size' value of the group.
    '''
    per_class = (
        df.groupby(groupby_columns)
          .agg({
              'detectability': 'sum',   # number of detected metabolites in the group
              'my_property': 'mean',    # mean of the property over the metabolites in the group
              'class_size': 'first',
          })
          .reset_index()
    )
    per_class['fraction_detected'] = per_class['detectability'] / per_class['class_size']
    # the raw count and the class size are only needed to compute the fraction
    return per_class.drop(columns=['detectability', 'class_size'])


def assemble_adata(pca):
    '''
    Wrap a pivoted table into an AnnData object.

    The index levels of pca are turned into categorical columns and passed as obs,
    the column levels are passed as var and the table values as the data matrix.
    '''
    obs_frame = pca.index.to_frame(index=False)
    obs_frame = obs_frame.astype({level: 'category' for level in obs_frame.columns})
    var_frame = pca.columns.to_frame(index=False)
    return AnnData(pca.values, obs=obs_frame, var=var_frame)

- Inputs

In [None]:
# Both sub-project folders sit in the parent directory of ROOT_DIR
p_root_dir = Path(ROOT_DIR).parents[0]
p_interlab = p_root_dir.joinpath("interlaboratory_survey")
p_matrix = p_root_dir.joinpath("matrix_comparison")

# Analysis tables of each sub-project and the output folder for the figures
p_i_data = p_interlab.joinpath("5_data_analysis")
p_m_data = p_matrix.joinpath("5_data_analysis")
p_out = p_interlab.joinpath("6_plots", "Interlab")

# Metrics and Catboost predictions for all ions in their target wells
p_predictions = p_i_data.joinpath("2022-08-16_All_Interlab_Predictions.csv")

# Dataset info (lab, matrix, polarity, m/z range, ids, etc.)
p_datasets = p_m_data.joinpath("Datasets_14Jul2022.csv")

# Classification
p_chem_class = p_m_data.joinpath("custom_classification_v2.csv")

- Merge relevant info into one dataframe

In [None]:
# Load predictions (first CSV column is used as index) and list the available columns
predictions = pd.read_csv(p_predictions, index_col=0)
print(predictions.columns)

In [None]:
# Format neutral loss column, keep other columns that you need
predictions = pd.read_csv(p_predictions, index_col=0)

# Missing neutral losses are replaced by '' because the filters use '' for "no neutral loss".
# The result is assigned back to the column: an in-place fillna on `predictions.neutral_loss`
# is a chained call and leaves `predictions` unchanged when pandas copy-on-write is active.
predictions['neutral_loss'] = predictions['neutral_loss'].fillna('')

# Add dataset metadata 
datasets = pd.read_csv(p_datasets)
metadata_columns = ['Dataset name', 
                    'Participant lab', 
                    'Technology', 
                    'Original technology', 
                    'Ionisation source',
                    'Mass analyser', 
                    'Source pressure',
                    'Matrix short',
                    'Polarity', 
                    'Slide code',
                    'Interlab', 
                    'All'
                   ]
# One metadata row per dataset (first non-missing value of every column)
datasets_info = datasets.groupby('Dataset ID').first()[metadata_columns]

# Merge with predictions
df = pd.merge(predictions, datasets_info, left_on='dataset_id', right_on='Dataset ID', how='left')

# Filter to keep only interlab datasets + EMBL datasets with 10ppm
# .eq(True) also drops rows whose dataset has no metadata (NaN after the left merge),
# which would otherwise make the boolean mask fail.
# .copy() so that the new column below is added to an independent frame, not to a slice.
df = df[df['Interlab'].eq(True)].copy()

# only consider data of detected ions
threshold = 0.8  # minimum predicted value (pred_val) for an ion to count as detected
df['detectability'] = df.pred_val >= threshold
data = df[df.detectability].copy()

## Option 1: Build PCA based on the intensities and detectability of all metabolites
- Each metabolite is aggregated over its detected ions (detectability: maximum, properties: sum)

In [None]:
# Choose polarity, filter adducts and neutral losses
# (.copy() gives an independent frame, so the columns added below are not written to a slice of `data`)
filtered_data = filter_data(data,
                            polarity='negative', 
                            neutral_losses=['']
                           ).copy()

# Add property that you want to use for PCA
filtered_data['property1'] = filtered_data['on_off_ratio'] * 1 # this is just an example that does nothing to the column, but you can do whatever
filtered_data['property2'] = filtered_data['spot_intensity_bgr_corrected_tic_norm'] * 1
filtered_data['property3'] = filtered_data['bg_intensity_tic'] * 1

# Summarise data per metabolite and dataset
molecule_data = group_by_molecule(filtered_data, groupby_columns=metadata_columns + ['name_short'])

# Reshape: one row per combination of metadata columns, one column per metabolite
pca = molecule_data.pivot_table(values=['detectability'], # you can choose one or more of the properties for PCA; this example uses only detectability
                                 index=metadata_columns, 
                                 columns='name_short',
                                 fill_value=0)

In [None]:
# Look at what you got: first rows of the pivoted table that goes into the PCA
pca.head()

In [None]:
# Assemble adata from the pivoted table (index levels -> obs, column levels -> var)
adata = assemble_adata(pca)

# Apply Z-score normalisation: If you use only detectability for PCA, this is not needed
# NOTE(review): the scaling is applied here even though this option uses only detectability - confirm this is intended
sc.pp.scale(adata, zero_center=True) 

# Compute PCA
sc.tl.pca(adata, svd_solver='arpack')

In [None]:
# Make plots
fname = "PCA_interlab_neg"

# Make sure the output folder exists, otherwise savefig fails on a fresh checkout
p_out.mkdir(parents=True, exist_ok=True)

with rc_context():    
    ax = sc.pl.pca(adata, 
                   components=['1, 2'], 
                   color=[
#                     'Dataset name', 
                    'Participant lab', 
                    'Technology', 
                    'Original technology', 
                    'Ionisation source',
                    'Mass analyser', 
                    'Source pressure',
                    'Matrix short',
                    'Polarity', 
                   ],
                   size=300, 
                   ncols=2, # Number of columns to organise subplots in
                   legend_loc='right margin',
                   show=False, 
                   wspace=0.95,
                   annotate_var_explained = True
                  )
    plt.tight_layout()
#     plt.savefig(p_out/ f"{fname}.png")
    # Save inside the rc_context block, like the other plot cells, so the figure is
    # written with the same rc settings it was drawn with
    plt.savefig(p_out / f"{fname}.pdf")

## Option 2: PCA based on the aggregated values also used in coarse class dot plot
- This is useful in case you want to see PCA loadings

In [None]:
# Load classification, add class size info

classes = pd.read_csv(p_chem_class, index_col='internal_id')
chem_class = get_class_size(classes[['name_short', 'main_coarse_class']].drop_duplicates(), 
                            'main_coarse_class')
# drop_duplicates() here as well, consistent with chem_class: repeated
# (name_short, coarse_class, fine_class) rows would otherwise inflate the class sizes
# and multiply rows when the table is merged on name_short
chem_subclass = get_class_size(classes[['name_short', 'coarse_class', 'fine_class']].drop_duplicates(),  # coarse class here is to sort rows in the plot
                              'fine_class')

In [None]:
# Choose polarity, filter adducts and neutral losses
# (.copy() gives an independent frame, so the column added below is not written to a slice of `data`)
filtered_data = filter_data(data,
                            polarity='positive', 
                            neutral_losses=['']
                           ).copy()

# Add property that you want to use for PCA
filtered_data['my_property'] = filtered_data['spot_intensity_tic_norm'] * 1 # this is just an example that does nothing to the column, but you can do whatever

# Summarise data per metabolite and dataset (intensities of detected ions are summed)
molecule_data = group_by_molecule(filtered_data, groupby_columns=metadata_columns + ['name_short'])

# Map chemical classes
mapped_data = molecule_data.merge(chem_class, on='name_short', how='left')

# Summarise data per class
class_data = summarise_per_class(mapped_data, groupby_columns=metadata_columns + ['main_coarse_class'])

# Reshape: one row per combination of metadata columns, one column per (value, coarse class) pair
pca = class_data.pivot_table(values=['fraction_detected', 'my_property'],
                             index=metadata_columns, 
                             columns='main_coarse_class',
                             fill_value=0)

# Assemble adata
adata = assemble_adata(pca)

# Apply Z-score normalisation so that fraction_detected and my_property columns contribute on a comparable scale
sc.pp.scale(adata, zero_center=True)

# Compute PCA
sc.tl.pca(adata, svd_solver='arpack')

In [None]:
# First rows of the pivoted table the PCA above was computed from
pca.head()

In [None]:
fname = "PCA_interlab_coarse_class_pos"

# Metadata columns used to colour the points (one subplot per column)
colour_by = [
#     'Dataset name', 
    'Participant lab', 
    'Technology', 
    'Original technology', 
    'Ionisation source',
    'Mass analyser', 
    'Source pressure',
    'Matrix short',
    'Polarity', 
]

with rc_context():    
    ax = sc.pl.pca(
        adata, 
        components=['1, 2'], 
        color=colour_by,
        size=400, 
        ncols=2,  # number of columns to organise subplots in
        wspace=0.7,
        legend_loc='right margin',
        annotate_var_explained=True,
        show=False,
    )
    plt.tight_layout()
#     plt.savefig(p_out/ f"{fname}.png")
#     plt.savefig(p_out / f"{fname}.pdf")

In [None]:
# # Extras

# # See how much PCA components explain variance
# sc.pl.pca_variance_ratio(adata)

# # Check loading plot
# sc.pl.pca_loadings(adata, components='1,2')

## Option 3: PCA based on the aggregated values also used in fine class dot plot
- All three options provide similar results, but this one can be useful for the loading plot

In [None]:
# Choose polarity, filter adducts and neutral losses
# (.copy() gives an independent frame, so the column added below is not written to a slice of `data`)
filtered_data = filter_data(data,
                            polarity='negative', 
                            neutral_losses=['']
                           ).copy()

# Add property that you want to use for PCA
filtered_data['my_property'] = filtered_data['spot_intensity_tic_norm'] * 1 # this is just an example that does nothing to the column, but you can do whatever

# Summarise data per metabolite and dataset (intensities of detected ions are summed)
molecule_data = group_by_molecule(filtered_data, groupby_columns=metadata_columns + ['name_short'])

# Map chemical classes
mapped_data = molecule_data.merge(chem_subclass, on='name_short', how='left')

# Summarise data per class
class_data = summarise_per_class(mapped_data, groupby_columns=metadata_columns + ['fine_class'])

# Reshape: one row per combination of metadata columns, one column per (value, fine class) pair
pca = class_data.pivot_table(values=['fraction_detected', 'my_property'],
                             index=metadata_columns, 
                             columns='fine_class',
                             fill_value=0)

# Assemble adata
adata = assemble_adata(pca)

# Apply Z-score normalisation so that fraction_detected and my_property columns contribute on a comparable scale
sc.pp.scale(adata, zero_center=True)

# Compute PCA
sc.tl.pca(adata, svd_solver='arpack')

In [None]:
# First rows of the pivoted table the PCA above was computed from
pca.head()

In [None]:
fname = "PCA_interlab_fine_class_neg"

# Metadata columns used to colour the points (one subplot per column)
colour_by = [
#     'Dataset name', 
    'Participant lab', 
    'Technology', 
    'Original technology', 
    'Ionisation source',
    'Mass analyser', 
    'Source pressure',
    'Matrix short',
    'Polarity', 
]

with rc_context():    
    ax = sc.pl.pca(
        adata, 
        components=['1,2'], 
        color=colour_by,
        size=400, 
        ncols=2,  # number of columns to organise subplots in
        wspace=0.7,
        legend_loc='right margin',
        annotate_var_explained=True,
        show=False,
    )
    plt.tight_layout()
#     plt.savefig(p_out/ f"{fname}.png")
#     plt.savefig(p_out / f"{fname}.pdf")