In [1]:
import pandas as pd
import scanpy as sc
import numpy as np
import h5py

import matplotlib
import matplotlib.pyplot as plt
matplotlib.use('Agg')
from matplotlib.pyplot import plot,savefig
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")
from read_count import read_data

In [2]:
col = ["#E64B35CC", "#0072B5CC", "#00A087CC", "#3C5488CC", "#F39B7FCC", "#F7DC05FF", "#FD7446E5",
       "#8491B4CC", "#7E6148CC", "#B09C85CC", "#E18727CC", "#FFDC91E5", "#6A6599E5", "#9467BDB2",
       "#FFFFFFFF", "#0000FFFF", "#FF0000FF", "#00FF00FF", "#000033FF", "#FF00B6FF", "#005300FF", "#FFD300FF",
       "#009FFFFF", "#9A4D42FF", "#00FFBEFF", "#783FC1FF", "#1F9698FF", "#FFACFDFF", "#B1CC71FF", "#F1085CFF",
       "#FE8F42FF", "#DD00FFFF", "#201A01FF", "#720055FF", "#766C95FF", "#02AD24FF", "#C8FF00FF", "#886C00FF",
       "#FFB79FFF", "#858567FF", "#A10300FF", "#14F9FFFF", "#00479EFF", "#DC5E93FF", "#93D4FFFF", "#004CFFFF"]

In [3]:
def plot_cluster(df, data_name, phase, by, y_true, n, ax):
    
    """
        phase: 'split' or 'enhance'. If phase == 'split', using the results after splitting, else using the results after enhancement.
        by: 'clusters' or 'labels'. If by == 'clusters', colored by cluster labels, else colored by true cell types.
        n: n-th dataset in [Human pancreas, Human PBMC, Human kidney, Mouse ES, Mouse hypothalamus, Mouse kidney, Turtle brain]
    """
    
    if phase == 'split':
        umap = umap_init_all[n]
        y_pred = df['Clusters'][0]
    
    else :
        umap = umap_last_all[n]
        y_pred = df['Clusters'][1]
    
    y_pred = np.asarray(y_pred, dtype='int').squeeze()
    K_pred = len(np.unique(y_pred))
    
    ari_pred = np.round(metrics.adjusted_rand_score(y_pred, y_true), 2)
    nmi_pred = np.round(metrics.normalized_mutual_info_score(y_pred, y_true), 2)
    
    if by == 'clusters':
        print('Datasets: {}_{}, ARI={}, NMI={}, k={}'.format(data_name, phase, ari_pred, nmi_pred, K_pred))
        
    adata = sc.AnnData(pd.DataFrame(np.random.rand(len(y_pred), 1)))
    adata.obs['pred'] = y_pred
    adata.obs['pred'] = adata.obs['pred'].astype(str).astype('category')
    adata.obs['true'] = y_true
    adata.obs['true'] = adata.obs['true'].astype(str).astype('category')

    adata.obsm['X_umap'] = umap
    
    if by == 'clusters':
        sc.pl.umap(adata, color=['pred'], ax=ax, show=False, legend_loc='None', size=8)
        if phase == 'split':
            ax.set_title('K={}'.format(K_pred), fontsize=15, family='Arial')
        else:
            ax.set_title('K={} ARI={}'.format(K_pred, ari_pred), fontsize=15, family='Arial')
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        
    else:
        sc.pl.umap(adata, color=['true'], ax=ax, show=False, legend_loc='None', palette=col, size=8)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)

In [5]:
data_mat = h5py.File('dataset/Human_p.h5')
y_true_human = np.array(data_mat['Y'], dtype='int')
data_mat.close()

data_mat = h5py.File('dataset/Human_PBMC.h5')
y_true_pbmc = np.array(data_mat['Y'], dtype='int')
data_mat.close()

data_mat = h5py.File('dataset/Human_k.h5')
y_true_kidney = np.array(data_mat['Y'], dtype='int')
data_mat.close()

mat, obs, var, uns = read_data('dataset/Mouse_E.h5', sparsify=False, skip_exprs=False)
x = np.array(mat.toarray())
cell_name = np.array(obs["cell_type1"])
cell_type, y_true_klein = np.unique(cell_name, return_inverse=True)

mat, obs, var, uns = read_data('dataset/Mouse_h.h5', sparsify=False, skip_exprs=False)
x = np.array(mat.toarray())
cell_name = np.array(obs["cell_type1"])
cell_type, y_true_chen_1 = np.unique(cell_name, return_inverse=True)

adata = sc.AnnData(x)
adata.obs['celltype'] = y_true_chen_1
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.filter_cells(adata, min_genes=200)
y_true_chen_2 = np.array(adata.obs['celltype']).squeeze()

mat, obs, var, uns = read_data('dataset/Mouse_k.h5', sparsify=False, skip_exprs=False)
x = np.array(mat.toarray())
cell_name = np.array(obs["cell_type1"])
cell_type, y_true_adam = np.unique(cell_name, return_inverse=True)

mat, obs, var, uns = read_data('dataset/Turtle_b.h5', sparsify=False, skip_exprs=False)
x = np.array(mat.toarray())
cell_name = np.array(obs["cell_type1"])
cell_type, y_true_turtle = np.unique(cell_name, return_inverse=True)

In [6]:
fig = plt.figure(figsize=(22, 5))
sub_figs = fig.subfigures(2, 1)
axs = []

for i, sub_fig in enumerate(sub_figs):
    axs.append(sub_fig.subplots(1, 7))
    
axs = np.array(axs)

# Split

In [7]:
human_c = np.load('results/enhancement/scAce_enhance_CIDR_human.npz')
pbmc_c = np.load('results/enhancement/scAce_enhance_CIDR_pbmc.npz')
kidney_c = np.load('results/enhancement/scAce_enhance_CIDR_kidney.npz')
klein_c = np.load('results/enhancement/scAce_enhance_CIDR_klein.npz')
chen_c = np.load('results/enhancement/scAce_enhance_CIDR_chen.npz')
adam_c = np.load('results/enhancement/scAce_enhance_CIDR_adam.npz')
turtle_c = np.load('results/enhancement/scAce_enhance_CIDR_turtle.npz')

In [None]:
datasets = [human_c, pbmc_c, kidney_c, klein_c, chen_c, adam_c, turtle_c]

umap_init_all, umap_last_all = [], []
for data in datasets:
    adata_init = sc.AnnData(data['Embedding'][0])
    sc.pp.neighbors(adata_init)
    sc.tl.umap(adata_init, random_state=0)
    umap_init_all.append(np.array(adata_init.obsm['X_umap']))
    adata_last = sc.AnnData(data['Embedding'][1])
    sc.pp.neighbors(adata_last)
    sc.tl.umap(adata_last, random_state=0)
    umap_last_all.append(np.array(adata_last.obsm['X_umap']))

In [9]:
umap_init_all = np.load("umap/umap_enhance_c_init.npz")['UMAP']
# np.savez("umap/umap_enhance_c_init.npz", UMAP=umap_init_all)

umap_last_all = np.load("umap/umap_enhance_c_last.npz")['UMAP']
# np.savez("umap/umap_enhance_c_last.npz", UMAP=umap_last_all)

In [10]:
plot_cluster(human_c, 'Human pancreas', 'split', 'clusters', y_true_human, 0, axs[0][0])
plot_cluster(pbmc_c, 'Human PBMC', 'split', 'clusters', y_true_pbmc, 1, axs[0][1])
plot_cluster(kidney_c, 'Human kidney', 'split', 'clusters', y_true_kidney, 2, axs[0][2])
plot_cluster(klein_c, 'Mouse ES', 'split', 'clusters', y_true_klein, 3, axs[0][3])
plot_cluster(chen_c, 'Mouse kidney', 'split', 'clusters', y_true_chen_2, 4, axs[0][4])
plot_cluster(adam_c, 'Mouse hypothalamus', 'split', 'clusters', y_true_adam, 5, axs[0][5])
plot_cluster(turtle_c, 'Turtle brain', 'split', 'clusters', y_true_turtle, 6, axs[0][6])

Datasets: Human pancreas_split, ARI=0.56, NMI=0.65, k=5
Datasets: Human PBMC_split, ARI=0.64, NMI=0.72, k=4
Datasets: Human kidney_split, ARI=0.26, NMI=0.45, k=4
Datasets: Mouse ES_split, ARI=0.68, NMI=0.66, k=10
Datasets: Mouse kidney_split, ARI=0.36, NMI=0.43, k=7
Datasets: Mouse hypothalamus_split, ARI=0.05, NMI=0.12, k=12
Datasets: Turtle brain_split, ARI=0.48, NMI=0.63, k=4


In [11]:
plot_cluster(human_c, 'Human pancreas', 'split', 'labels', y_true_human, 0, axs[1][0])
plot_cluster(pbmc_c, 'Human PBMC', 'split', 'labels', y_true_pbmc, 1, axs[1][1])
plot_cluster(kidney_c, 'Human kidney', 'split', 'labels', y_true_kidney, 2, axs[1][2])
plot_cluster(klein_c, 'Mouse ES', 'split', 'labels', y_true_klein, 3, axs[1][3])
plot_cluster(chen_c, 'Mouse kidney', 'split', 'labels', y_true_chen_2, 4, axs[1][4])
plot_cluster(adam_c, 'Mouse hypothalamus', 'split', 'labels', y_true_adam, 5, axs[1][5])
plot_cluster(turtle_c, 'Turtle brain', 'split', 'labels', y_true_turtle, 6, axs[1][6])

In [12]:
fig

<Figure size 2200x500 with 14 Axes>

In [13]:
plt.savefig('Figures/FigureS10A.svg', dpi=300, format='svg', bbox_inches='tight')

# Enhance

In [14]:
fig = plt.figure(figsize=(22, 5))
sub_figs = fig.subfigures(2, 1)
axs = []

for i, sub_fig in enumerate(sub_figs):
    axs.append(sub_fig.subplots(1, 7))

axs = np.array(axs)

In [15]:
plot_cluster(human_c, 'Human pancreas', 'enhance', 'clusters', y_true_human, 0, axs[0][0])
plot_cluster(pbmc_c, 'Human PBMC', 'enhance', 'clusters', y_true_pbmc, 1, axs[0][1])
plot_cluster(kidney_c, 'Human kidney', 'enhance', 'clusters', y_true_kidney, 2, axs[0][2])
plot_cluster(klein_c, 'Mouse ES', 'enhance', 'clusters', y_true_klein, 3, axs[0][3])
plot_cluster(chen_c, 'Mouse kidney', 'enhance', 'clusters', y_true_chen_2, 4, axs[0][4])
plot_cluster(adam_c, 'Mouse hypothalamus', 'enhance', 'clusters', y_true_adam, 5, axs[0][5])
plot_cluster(turtle_c, 'Turtle brain', 'enhance', 'clusters', y_true_turtle, 6, axs[0][6])

Datasets: Human pancreas_enhance, ARI=0.91, NMI=0.87, k=5
Datasets: Human PBMC_enhance, ARI=0.67, NMI=0.74, k=4
Datasets: Human kidney_enhance, ARI=0.38, NMI=0.54, k=4
Datasets: Mouse ES_enhance, ARI=0.98, NMI=0.96, k=4
Datasets: Mouse kidney_enhance, ARI=0.57, NMI=0.64, k=4
Datasets: Mouse hypothalamus_enhance, ARI=0.23, NMI=0.48, k=4
Datasets: Turtle brain_enhance, ARI=0.51, NMI=0.64, k=4


In [16]:
plot_cluster(human_c, 'Human pancreas', 'enhance', 'labels', y_true_human, 0, axs[1][0])
plot_cluster(pbmc_c, 'Human PBMC', 'enhance', 'labels', y_true_pbmc, 1, axs[1][1])
plot_cluster(kidney_c, 'Human kidney', 'enhance', 'labels', y_true_kidney, 2, axs[1][2])
plot_cluster(klein_c, 'Mouse ES', 'enhance', 'labels', y_true_klein, 3, axs[1][3])
plot_cluster(chen_c, 'Mouse kidney', 'enhance', 'labels', y_true_chen_2, 4, axs[1][4])
plot_cluster(adam_c, 'Mouse hypothalamus', 'enhance', 'labels', y_true_adam, 5, axs[1][5])
plot_cluster(turtle_c, 'Turtle brain', 'enhance', 'labels', y_true_turtle, 6, axs[1][6])

In [17]:
fig

<Figure size 2200x500 with 14 Axes>

In [18]:
plt.savefig('Figures/FigureS10B.svg', dpi=300, format='svg', bbox_inches='tight')