In [1]:
import pandas as pd
import scanpy as sc
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sklearn.metrics import homogeneity_score, completeness_score, fowlkes_mallows_score, silhouette_score
from src.utils import sankey_plot
import kaleido
import plotly.io as pio

In [2]:
# Root folder of the analysis; every path in this notebook is built as f'{DIR}{dataset}/...'.
DIR = 'Data/'

# Datasets to process; each one is a sub-folder of DIR.
DATASET_NAMES = [
    'PBMC1',
    'PBMC2',
    'PBMC3',
    'PBMC4',
]

# Clustering tools under comparison; each contributes a `cluster_<tool>` label column.
TOOLS = [
    'monocle',
    'scanpy',
    'scvi-tools',
    'seurat',
    'COTAN',
]

# Label sets to evaluate per tool: read from `<tool>/<tuning>/clustering_labels.csv`.
# 'default' is reported as "default labels", the others as "matching <tuning> labels".
PARAMS_TUNING = [
    'default',
    'celltypist',
    'antibody',
]

In [5]:
def compute_scores(dir, dataset, labels_df, labels_matched, ground_truth_labels):
    """Score every tool's clustering against one reference labelling and save the table.

    For each tool in `TOOLS`, the column `cluster_<tool>` of `labels_df` is
    compared with the column `cluster_<ground_truth_labels>` using NMI
    (arithmetic averaging), ARI, homogeneity, completeness and the
    Fowlkes-Mallows index.

    Parameters
    ----------
    dir : str
        Path prefix the outputs are written under (concatenated as-is, so it
        must already end with a path separator).
    dataset : str
        Dataset sub-folder name inside `dir`.
    labels_df : pandas.DataFrame
        One row per cell, holding the `cluster_<tool>` columns and the
        reference column.
    labels_matched : str
        Tag inserted in the output file name.
    ground_truth_labels : str
        Suffix of the reference column (`cluster_<ground_truth_labels>`); also
        used in the output file name.

    Returns
    -------
    None
        The table (tools as rows, metrics as columns) is written as `.csv` and
        `.tex` and then displayed.
    """
    truth_col = f'cluster_{ground_truth_labels}'

    # metric name -> scorer; insertion order fixes the column order of the table
    metric_fns = {
        'NMI': lambda pred, truth: normalized_mutual_info_score(
            labels_pred=pred, labels_true=truth, average_method='arithmetic'),
        'ARI': lambda pred, truth: adjusted_rand_score(labels_pred=pred, labels_true=truth),
        'homogeneity': lambda pred, truth: homogeneity_score(labels_pred=pred, labels_true=truth),
        'completeness': lambda pred, truth: completeness_score(labels_pred=pred, labels_true=truth),
        'fowlkes_mallows': lambda pred, truth: fowlkes_mallows_score(labels_pred=pred, labels_true=truth),
    }

    # nested mapping {metric: {tool: value}} -> DataFrame with tools on the index
    scores = {
        metric: {
            tool: score(pred=labels_df['cluster_'+tool], truth=labels_df[truth_col])
            for tool in TOOLS
        }
        for metric, score in metric_fns.items()
    }

    scores_df = pd.DataFrame(scores)
    scores_df.to_csv(f'{dir}{dataset}/scores_{labels_matched}_{ground_truth_labels}.csv')
    scores_df.to_latex(f'{dir}{dataset}/scores_{labels_matched}_{ground_truth_labels}.tex')
    display(scores_df)

In [None]:
def _aligned_silhouette(adata, cluster_labels):
    """Silhouette score of `cluster_labels`, computed on the rows of `adata` for those same cells.

    `silhouette_score` pairs matrix rows with labels purely by position, so the
    expression rows are selected through the labels' own cell index (same cells,
    same order). Labelled cells that are not present in `adata` are left out.

    Parameters
    ----------
    adata : AnnData
        Expression data whose `obs_names` are the cell identifiers.
    cluster_labels : pandas.Series
        Cluster assignment per cell, indexed by cell identifier.

    Returns
    -------
    float
        The value returned by `sklearn.metrics.silhouette_score`.
    """
    cluster_labels = cluster_labels[cluster_labels.index.isin(adata.obs_names)]
    return silhouette_score(adata[cluster_labels.index, :].X, cluster_labels)


for tuning in PARAMS_TUNING:
    for dataset in DATASET_NAMES:
        title = f'{dataset} - matching {tuning} labels' if tuning != 'default' else f'{dataset} - default labels'
        print('------------------------------')
        print(title)

        # concat tools labels: start from COTAN, then inner-join every other tool on the cell id
        labels_df = pd.read_csv(f'{DIR}{dataset}/COTAN/{tuning}/clustering_labels.csv', index_col=0)
        labels_df = labels_df.rename(columns={"cluster": "cluster_COTAN"})
        for tool in [t for t in TOOLS if t != 'COTAN']:
            tool_labels_df = pd.read_csv(f'{DIR}{dataset}/{tool}/{tuning}/clustering_labels.csv', index_col=0)
            labels_df = labels_df.merge(tool_labels_df, how='inner', on='cell')
            labels_df = labels_df.rename(columns={"cluster": f"cluster_{tool}"})

        # load and concat celltypist labels
        celltypist_df = pd.read_csv(f'{DIR}{dataset}/celltypist/celltypist_labels.csv', index_col=0)
        # drop the last two characters of each cell id before joining
        # (presumably a barcode suffix the tools' ids do not carry - confirm)
        celltypist_df.index = celltypist_df.index.str[:-2]
        celltypist_df = labels_df.merge(celltypist_df, how='inner', on='cell')
        celltypist_df = celltypist_df.rename(columns={"cluster.ids": "cluster_celltypist"})
        celltypist_mapping_df = pd.read_csv(f'{DIR}{dataset}/celltypist/celltypist_mapping.csv', index_col=0)

        # load and concat protein surface labels
        antibody_df = pd.read_csv(f'{DIR}{dataset}/antibody_annotation/antibody_labels.csv', index_col=0)
        antibody_df = labels_df.merge(antibody_df, how='inner', on='cell')
        antibody_df = antibody_df.rename(columns={"cluster.ids": "cluster_antibody"})
        # NOTE(review): index_col=1 here versus 0 for the celltypist mapping - confirm it matches the file layout
        antibody_mapping_df = pd.read_csv(f'{DIR}{dataset}/antibody_annotation/antibody_mapping.csv', index_col=1)

        # reference annotation name -> (tool labels joined with that annotation, cluster id -> name mapping)
        references = {
            'celltypist': (celltypist_df, celltypist_mapping_df),
            'antibody': (antibody_df, antibody_mapping_df),
        }

        # read dataset
        adata = sc.read_10x_mtx(
            f'{DIR}{dataset}/filtered/10X/',
            var_names='gene_symbols',
            cache=False
        )
        # keep only labelled cells
        adata.var_names_make_unique()
        subset_cells = adata.obs_names.isin(labels_df.index)
        adata = adata[subset_cells, :]

        # compute silhouette score; rows are re-selected per label series because the
        # reference frames are inner joins and may hold fewer cells, in another order
        silhouette = {}
        for tool in TOOLS:
            silhouette[tool] = _aligned_silhouette(adata, labels_df[f'cluster_{tool}'])
        if tuning in references:
            silhouette[tuning] = _aligned_silhouette(adata, references[tuning][0][f'cluster_{tuning}'])
        silhouette_df = pd.DataFrame(silhouette, index=[0])
        silhouette_df.to_csv(f'{DIR}{dataset}/{tuning}_silhouette.csv')
        silhouette_df.to_latex(f'{DIR}{dataset}/{tuning}_silhouette.tex')

        # compute scores comparing each tool labels with the reference labels:
        # 'default' is scored against both references, a matched tuning only against its own
        for reference, (reference_df, mapping_df) in references.items():
            if tuning != reference and tuning != 'default':
                continue
            compute_scores(DIR, dataset, reference_df, tuning, reference)

            # label columns prepared for the Sankey diagram (the export itself is currently disabled)
            labels = [reference_df[f'cluster_{tool}'].to_list() for tool in TOOLS]
            labels_titles = list(TOOLS)
            labels.append(reference_df[f'cluster_{reference}'].map(mapping_df['go'].to_dict()).to_list())
            labels_titles.append(reference)
            #sankey_plot(labels=labels, labels_titles=labels_titles, title=title, path=f'{DIR}{dataset}/{tuning}_{reference}.html')

------------------------------
PBMC1 - default labels


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.578257,0.384609,0.41014,0.97993,0.602512
scanpy,0.721042,0.404607,0.82498,0.640363,0.508176
scvi-tools,0.776232,0.599664,0.80979,0.745344,0.666244
seurat,0.79363,0.649593,0.784165,0.803327,0.705921
COTAN,0.787289,0.670392,0.803876,0.771373,0.723485


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.611988,0.425929,0.446299,0.973344,0.635502
scanpy,0.659645,0.391203,0.795577,0.563386,0.50773
scvi-tools,0.708581,0.551051,0.776228,0.65178,0.63275
seurat,0.738344,0.643097,0.764146,0.714228,0.706018
COTAN,0.73214,0.651092,0.784252,0.686521,0.713737


------------------------------
PBMC2 - default labels


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.393166,0.20718,0.245998,0.978626,0.521364
scanpy,0.71882,0.457213,0.804,0.64996,0.556684
scvi-tools,0.699788,0.424696,0.78592,0.63067,0.525031
seurat,0.775988,0.56243,0.81956,0.736815,0.640108
COTAN,0.729355,0.4728,0.74555,0.713848,0.56248


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.277314,0.107135,0.165534,0.853977,0.450594
scanpy,0.682604,0.524109,0.759388,0.619922,0.602311
scvi-tools,0.652891,0.485961,0.734303,0.587729,0.567847
seurat,0.743681,0.679941,0.77765,0.712555,0.730603
COTAN,0.701021,0.636547,0.691258,0.711064,0.702879


------------------------------
PBMC3 - default labels


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.500696,0.23356,0.338609,0.960446,0.500077
scanpy,0.685919,0.462762,0.763719,0.622505,0.541286
scvi-tools,0.738418,0.579677,0.757237,0.720511,0.635237
seurat,0.770512,0.58511,0.821173,0.725738,0.644073
COTAN,0.701377,0.510672,0.798585,0.625266,0.58249


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.429744,0.168276,0.280823,0.914939,0.437511
scanpy,0.664567,0.54263,0.702416,0.630588,0.596647
scvi-tools,0.691391,0.620339,0.67793,0.705398,0.66258
seurat,0.735217,0.664188,0.744324,0.72633,0.701375
COTAN,0.660422,0.532024,0.710462,0.616968,0.583615


------------------------------
PBMC4 - default labels


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.617025,0.47007,0.453383,0.965513,0.647279
scanpy,0.701228,0.380357,0.819943,0.612541,0.48756
scvi-tools,0.739299,0.504966,0.788229,0.696088,0.5849
seurat,0.760207,0.494746,0.847372,0.689301,0.583823
COTAN,0.717429,0.440485,0.795735,0.653154,0.529504


Unnamed: 0,NMI,ARI,homogeneity,completeness,fowlkes_mallows
monocle,0.536861,0.325029,0.372515,0.960701,0.53281
scanpy,0.622945,0.371655,0.659143,0.590516,0.439575
scvi-tools,0.65155,0.425369,0.634107,0.66998,0.487767
seurat,0.669274,0.436706,0.676741,0.661971,0.496402
COTAN,0.622735,0.383434,0.626009,0.619496,0.448828


------------------------------
PBMC1 - matching celltypist labels


In [74]:
def compute_clustering_scores(celltypist_df, antibody_df, output_dir, dataset):
    """Print agreement scores between the celltypist and antibody labels of one dataset.

    Both label files are read from disk under the module-level `DIR` for
    `dataset`, joined on their index (inner join), and compared with NMI
    (arithmetic averaging), ARI, homogeneity, completeness and the
    Fowlkes-Mallows index. The celltypist labels are passed as the first
    argument of every metric and the antibody labels as the second.

    Parameters
    ----------
    celltypist_df, antibody_df
        Ignored: both names are rebound to frames freshly read from disk.
    output_dir
        Not used by any executed statement (saving is commented out).
    dataset : str
        Dataset sub-folder name inside `DIR`.

    Returns
    -------
    None
        The one-row score table is printed.
    """
    # Fresh reads; these replace whatever the caller passed in.
    celltypist_df = pd.read_csv(f'{DIR}{dataset}/celltypist/celltypist_labels.csv', index_col=0)
    # drop the last two characters of each celltypist cell id before joining
    celltypist_df.index = celltypist_df.index.str[:-2]
    antibody_df = pd.read_csv(f'{DIR}{dataset}/antibody_annotation/antibody_labels.csv', index_col=0)

    # Report whether the inner join below is going to drop any celltypist cell.
    all_in_antibody = celltypist_df.index.isin(antibody_df.index).all()
    print("All celltypist indices in antibody:", all_in_antibody)

    # Index-on-index inner join; the result is expected to hold exactly two columns.
    merged_df = celltypist_df.merge(antibody_df, how='inner', left_index=True, right_index=True)
    merged_df.columns = ['cluster_celltypist', 'cluster_antibody']

    first_labels = merged_df['cluster_celltypist']
    second_labels = merged_df['cluster_antibody']

    # metric name -> scorer; insertion order fixes the column order of the table
    metric_fns = {
        'NMI': lambda a, b: normalized_mutual_info_score(a, b, average_method='arithmetic'),
        'ARI': adjusted_rand_score,
        'Homogeneity': homogeneity_score,
        'Completeness': completeness_score,
        'Fowlkes_Mallows': fowlkes_mallows_score,
    }
    scores = {name: score(first_labels, second_labels) for name, score in metric_fns.items()}

    # Single-row table, one column per metric.
    scores_df = pd.DataFrame([scores])

    # Saving is currently disabled:
    #scores_df.to_csv(f'{output_dir}{dataset}/clustering_comparison_scores.csv')
    #scores_df.to_latex(f'{output_dir}{dataset}/clustering_comparison_scores.tex')

    print(scores_df)



In [75]:
for dataset in DATASET_NAMES:
    # separator and header for this dataset, emitted with one print call
    print(
        '------------------------------',
        f'{dataset} - Clustering Comparison between CellTypist and Antibody',
        sep='\n',
    )

    # celltypist_df / antibody_df are whatever module-level frames are currently
    # bound to those names; compute_clustering_scores re-reads both label files
    # for `dataset` itself, so the frames passed here do not affect the scores.
    compute_clustering_scores(celltypist_df, antibody_df, DIR, dataset)


------------------------------
PBMC1 - Clustering Comparison between CellTypist and Antibody
All celltypist indices in antibody: False
        NMI       ARI  Homogeneity  Completeness  Fowlkes_Mallows
0  0.718261  0.681603     0.697035      0.740821         0.738723
------------------------------
PBMC2 - Clustering Comparison between CellTypist and Antibody
All celltypist indices in antibody: False
        NMI       ARI  Homogeneity  Completeness  Fowlkes_Mallows
0  0.657818  0.497187     0.684209      0.633387         0.605241
------------------------------
PBMC3 - Clustering Comparison between CellTypist and Antibody
All celltypist indices in antibody: False
        NMI       ARI  Homogeneity  Completeness  Fowlkes_Mallows
0  0.665327  0.497556     0.710146       0.62583         0.573848
------------------------------
PBMC4 - Clustering Comparison between CellTypist and Antibody
All celltypist indices in antibody: False
        NMI       ARI  Homogeneity  Completeness  Fowlkes_Mallow