In [1]:
import pandas as pd
import scanpy as sc
import json
from utils import sankey_plot

In [2]:
# Datasets to process and the root directory that holds one folder per dataset.
datasets = ['PBMC1', 'PBMC2', 'PBMC3', 'PBMC4']
data_dir = './Data/'
# Clusters are kept when their cell count is >= this value (0 keeps all of them).
min_cluster_size = 0

# Column-oriented accumulator for the summary table: one independent, initially
# empty list per column, filled with one entry per dataset in the loop below.
datasets_dims = {
    column: []
    for column in (
        'name',
        '# cells',
        '# cells after step filtering',
        '# genes',
        '# genes after filtering',
        '# cells labelled with surface protein',
        '# antibody clusters',
        '# celltypist clusters',
    )
}

# For every dataset: summarise the antibody and celltypist annotations, record
# the raw/filtered matrix dimensions, draw a Sankey plot comparing the two
# labelings, and write the per-method cluster counts to disk.
for dataset in datasets:
    datasets_dims['name'].append(dataset)

    # read antibody labels and mapping and merge them
    antibody_labels = pd.read_csv(f'{data_dir}{dataset}/antibody_annotation/antibody_labels.csv', index_col=0)
    # One row per distinct label with its number of occurrences.
    # NOTE(review): the 'count' column used further down presumably comes from
    # value_counts() (pandas >= 2.0 naming) -- confirm the pandas version.
    antibody_counts = antibody_labels.value_counts().to_frame()
    # Rows of the label file = labelled cells; rows of the counts = distinct labels.
    datasets_dims['# cells labelled with surface protein'].append(antibody_labels.shape[0])
    datasets_dims['# antibody clusters'].append(antibody_counts.shape[0])
    antibody_mapping = pd.read_csv(f"{data_dir}{dataset}/antibody_annotation/antibody_mapping.csv", index_col=0)
    # Re-key the mapping by its 'id' column; the previous index becomes a regular column.
    antibody_mapping = antibody_mapping.reset_index().set_index('id')
    # Name the counts index 'id' so it can serve as the merge key.
    antibody_counts.index.names = ['id']
    antibody_counts_mappings = antibody_counts.merge(antibody_mapping, on='id')
    antibody_counts_mappings.to_csv(f"{data_dir}{dataset}/antibody_annotation/antibody_annotation_counts.csv")

    # collect datasets dims
    # Both the raw and the filtered 10X matrices are loaded; axis 0 is recorded
    # as the number of cells and axis 1 as the number of genes.
    adata = sc.read_10x_mtx(f"{data_dir}{dataset}/raw/10X/", var_names="gene_symbols", cache=False)
    adata_filtered = sc.read_10x_mtx(f"{data_dir}{dataset}/filtered/10X/", var_names="gene_symbols", cache=False)
    datasets_dims['# cells'].append(adata.shape[0])
    datasets_dims['# cells after step filtering'].append(adata_filtered.shape[0])
    datasets_dims['# genes'].append(adata.shape[1])
    datasets_dims['# genes after filtering'].append(adata_filtered.shape[1])

    # compute cluster sizes on filtered dataset
    celltypist_labels = pd.read_csv(f'{data_dir}{dataset}/celltypist/celltypist_labels.csv', index_col=0)
    # Drop the last two characters of every index entry.
    # NOTE(review): presumably strips a barcode suffix such as '-1' so the index
    # matches the other tables -- confirm against the input files.
    celltypist_labels.index = celltypist_labels.index.str[:-2]
    # Distinct labels are counted here BEFORE restricting to the filtered barcodes.
    datasets_dims['# celltypist clusters'].append(celltypist_labels.value_counts().shape[0])
    filtered_barcodes = adata_filtered.obs.index.to_list()
    # Keep only the labels whose index is present in the filtered matrix.
    celltypist_filtered_labels = celltypist_labels[celltypist_labels.index.isin(filtered_barcodes)]
    celltypist_counts = celltypist_filtered_labels.value_counts().to_frame()
    celltypist_counts.index.names = ['id']
    celltypist_mapping = pd.read_csv(f"{data_dir}{dataset}/celltypist/celltypist_mapping.csv", index_col=0)
    celltypist_counts_mappings = celltypist_counts.merge(celltypist_mapping, on='id')
    celltypist_counts_mappings.to_csv(f"{data_dir}{dataset}/celltypist/celltypist_annotation_counts_filtered.csv")

    # plot sankey between celltypist and antibody labels
    # Left join: every antibody-labelled row is kept; rows without a celltypist
    # match get missing values. The suffixes disambiguate overlapping column names.
    # NOTE(review): the rename below assumes both label files expose a
    # 'cluster.ids' column -- confirm.
    celltypist_antibody_labels = antibody_labels.join(celltypist_labels, how='left', lsuffix='_surface_protein', rsuffix='_celltypist')
    celltypist_antibody_labels.rename(columns={'cluster.ids_celltypist': 'celltypist', 'cluster.ids_surface_protein': 'surface_protein'}, inplace=True)
    # Each label column is translated through the 'go' column of the matching
    # counts/mapping table before being handed to the plotting helper.
    sankey_plot(
        labels = [
            celltypist_antibody_labels['celltypist'].map(celltypist_counts_mappings['go'].to_dict()).to_list(),
            celltypist_antibody_labels['surface_protein'].map(antibody_counts_mappings['go'].to_dict()).to_list()
        ],
        path = f"{data_dir}{dataset}/sankey_ground_truth_labels.html",
        labels_titles = ['celltypist', 'surface_protein'],
        title = f"{dataset} Ground Truth labels ({celltypist_antibody_labels.shape[0]} cells)"
    )

    # save the number of clusters with celltypist
    # Only clusters with at least min_cluster_size cells (filtered counts) are counted.
    with open(f'{data_dir}{dataset}/celltypist/nclusters.json', 'w') as fp:
        json.dump({'nclusters': celltypist_counts_mappings[celltypist_counts_mappings['count']>=min_cluster_size].shape[0]}, fp)

    # save the number of clusters with antibody data
    # Same threshold, applied to the antibody cluster counts.
    with open(f'{data_dir}{dataset}/antibody_annotation/nclusters.json', 'w') as fp:
        json.dump({'nclusters': antibody_counts_mappings[antibody_counts_mappings['count']>=min_cluster_size].shape[0]}, fp)

# Assemble the per-dataset summary table, indexed by dataset name.
datasets_dims_df = pd.DataFrame(datasets_dims).set_index('name')
# Write the index as well: after set_index the dataset name lives only in the
# index, so index=False would produce a CSV whose rows cannot be attributed to
# a dataset.
datasets_dims_df.to_csv(data_dir + "/pbmcs_sizes.csv")
display(datasets_dims_df)


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead




is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead




is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead




is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



Unnamed: 0_level_0,# cells,# cells after step filtering,# genes,# genes after filtering,# cells labelled with surface protein,# antibody clusters,# celltypist clusters
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PBMC1,5527,3610,33538,14366,4307,16,19
PBMC2,6574,5975,36601,13704,5460,13,18
PBMC3,11715,10944,33538,13119,10508,22,19
PBMC4,8258,6966,33538,13600,7047,27,17
