In [1]:
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ALLCools.integration import calculate_diagonal_score

from wmb import *

CEMBA_SNM3C_3C_CHROM100K_RAW_ZARR_PATH do not exist: /gale/netapp/cemba3c/BICCN/CEMBA_3C/mcds/CEMBA.snm3C.chrom100k_raw.zarr
CEMBA_SNM3C_3C_COMPARTMENT_ZARR_PATH do not exist: /gale/netapp/cemba3c/BICCN/CEMBA_3C/mcds/CEMBA.snm3C.3C.mcds/chrom100k
CEMBA_SNM3C_3C_DOMAIN_INSULATION_ZARR_PATH do not exist: /gale/netapp/cemba3c/BICCN/CEMBA_3C/mcds/CEMBA.snm3C.3C.mcds/chrom25k
CEMBA_SNM3C_CELL_TYPE_ANNOTATION_PATH do not exist: /gale/netapp/cemba3c/BICCN/wmb/cemba/CEMBA.snm3C.Annotations.zarr
CEMBA_SNM3C_GENE_CHUNK_ZARR_PATH do not exist: /gale/netapp/cemba3c/BICCN/wmb/GeneChunks/CEMBA.snm3C
CEMBA_SNMC_GENE_CHUNK_ZARR_PATH do not exist: /gale/netapp/cemba3c/BICCN/wmb/GeneChunks/CEMBA.snmC
AIBS_SMART_GENE_CHUNK_ZARR_PATH do not exist: /gale/netapp/cemba3c/BICCN/wmb/GeneChunks/AIBS.SMART
AIBS_TENX_GENE_CHUNK_ZARR_PATH do not exist: /gale/netapp/cemba3c/BICCN/wmb/GeneChunks/AIBS.TENX
BROAD_TENX_GENE_CHUNK_ZARR_PATH do not exist: /gale/netapp/cemba3c/BICCN/wmb/GeneChunks/BROAD.TENX


## Parameters

In [2]:
# Folder of the first integration round: results are globbed from ../{level}/Neuron/InteGroup*/
level = 'L2'
# Folder of the deeper integration round: results are globbed from ../{deep_level}/Neuron/InteGroup*/
deep_level = 'L4'
# Annotation column (and overlap-score file prefix) used together with `level`
category_key = 'L3'
# Annotation column (and overlap-score file prefix) used together with `deep_level`
deep_category_key = 'L4'

## Load Annot

In [3]:
import anndata

# Load the L1 neuron AnnData; the path is relative to this notebook's folder.
adata_merge = anndata.read_h5ad('../L1/Neuron/final_with_coords.h5ad')

In [4]:
# Number of cells per modality in the loaded object
adata_merge.obs['Modality'].value_counts() 

ATAC    1088328
mC       259071
Name: Modality, dtype: int64

In [5]:
# L1 annotations excluded from `mc_annot`; every other cell is kept via the `mc_neurons` mask.
mc_excluded_l1_annot = ['ODC', 'OPC', 'ASC', 'MGC', 'CB', 'CBX', 'DG']

mc_annot = cemba.get_mc_annot()
mc_neurons = ~mc_annot['L1_annot'].isin(mc_excluded_l1_annot)
mc_annot = mc_annot.sel(cell=mc_neurons)

In [6]:
# ATAC cell annotation; `cemba_atac` is presumably provided by `from wmb import *` — confirm
atac_annot = cemba_atac.get_atac_annot()

In [7]:
# L2 annotations excluded when building the `atac_neurons` mask.
atac_excluded_l2_annot = [
    'VPIA', 'VLMC', 'MGL', 'PER', 'VEC', 'RGL', 'ASC', 'EPEN', 'BERG', 'OPC',
    'IOL', 'OGC', 'GRC', 'GRANGL'
]
# NOTE: this mask is computed but is not what selects the cells below.
atac_neurons = ~atac_annot['L2_annot'].isin(atac_excluded_l2_annot)

# due to inconsistent atac_annot, we use integration group here to select cells
atac_inte_group = pd.read_csv('../L1/Neuron/atac_integration_group.csv.gz',
                              index_col=0)
selected_atac_cells = atac_inte_group.index
atac_annot = atac_annot.sel(cell=selected_atac_cells)

In [8]:
# Cluster labels present at `category_key` resolution in each modality
# (value_counts orders them by descending cell count).
mc_cluster_counts = mc_annot[category_key].to_pandas().value_counts()
atac_cluster_counts = atac_annot[category_key].to_pandas().value_counts()
all_mc_cluster = mc_cluster_counts.index
all_atac_cluster = atac_cluster_counts.index

## Get L1 Confusion

In [10]:
def extend_confusion_matrix(matrix, mc_in_level, mc_out_level, atac_in_level,
                            atac_out_level):
    """Expand a coarse confusion matrix to a finer cluster resolution.

    Each out-level (finer) cluster receives the row/column of the in-level
    (coarser) cluster recorded for it in the annotation, so values are
    repeated across all fine clusters sharing the same coarse cluster.

    Parameters
    ----------
    matrix
        DataFrame whose columns are mC in-level clusters and whose index
        holds ATAC in-level clusters.
    mc_in_level, mc_out_level
        Keys of the module-level ``mc_annot`` giving the resolution of
        ``matrix`` columns and the resolution to expand to.
    atac_in_level, atac_out_level
        Same, for the module-level ``atac_annot`` and the ``matrix`` index.

    Returns
    -------
    DataFrame indexed by ATAC out-level clusters with mC out-level clusters
    as columns. Out-level clusters whose in-level cluster is absent from
    ``matrix`` are left out.
    """

    def _fine_to_coarse(annot, in_level, out_level, known_coarse):
        # Per-cell coarse labels keyed by per-cell fine labels; to_dict()
        # collapses this to a single entry per fine label (last one wins).
        per_fine = pd.Series(annot[in_level], index=annot[out_level]).to_dict()
        # Keep only fine labels whose coarse label exists in the matrix axis.
        kept = {
            fine: coarse
            for fine, coarse in per_fine.items() if coarse in known_coarse
        }
        return pd.Series(kept)

    mc_fine_to_coarse = _fine_to_coarse(mc_annot, mc_in_level, mc_out_level,
                                        matrix.columns)
    atac_fine_to_coarse = _fine_to_coarse(atac_annot, atac_in_level,
                                          atac_out_level, matrix.index)

    # Repeat coarse columns, then coarse rows, once per fine cluster.
    expanded = matrix.reindex(mc_fine_to_coarse.values, axis=1)
    expanded = expanded.reindex(atac_fine_to_coarse.values)

    # Relabel both axes with the fine cluster names.
    expanded.columns = mc_fine_to_coarse.index
    expanded.index = atac_fine_to_coarse.index
    return expanded

In [11]:
# L1 integration scores are stored at L2 cluster resolution; repeat them
# across the L4 clusters of both modalities.
l1_levels = dict(mc_in_level='L2',
                 mc_out_level='L4',
                 atac_in_level='L2',
                 atac_out_level='L4')
l1_confusion_matrix = extend_confusion_matrix(
    pd.read_hdf('../L1/Neuron/L2.overlap_score.hdf'), **l1_levels)

In [12]:
# Persist the L4-expanded L1 matrix (the file name spells "matrx"; kept unchanged)
l1_confusion_matrix.to_hdf('L1_confusion_matrx.L4_clusters.hdf', key='data')

In [15]:
# The expanded L1 matrix must not contain any missing score.
assert not l1_confusion_matrix.isna().values.any()

In [16]:
# Preview: rows are ATAC L4 clusters, columns are mC L4 clusters
l1_confusion_matrix

Unnamed: 0,c1_c3_c0_c1,c1_c3_c0_c0,c1_c3_c4_c1,c1_c3_c4_c0,c1_c0_c2_c1,c1_c0_c0_c0,c1_c1_c1_c0,c1_c1_c0_c0,c1_c3_c3_c0,c1_c3_c4_c2,...,c17_c3_c2_c0,c17_c3_c2_c1,c17_c3_c0_c1,c17_c1_c1_c0,c17_c1_c2_c0,c17_c1_c2_c1,c17_c5_c0_c3,c13_c5_c1_c0,c13_c5_c3_c0,c13_c5_c2_c0
3_9_2_11,0.056603,0.056603,0.056603,0.056603,0.936288,0.936288,0.036464,0.036464,0.056603,0.056603,...,0.000045,0.000045,0.000045,0.000022,0.000022,0.000022,0.000045,0.000164,0.000164,0.000164
3_10_1_8,0.010015,0.010015,0.010015,0.010015,0.010954,0.010954,0.008083,0.008083,0.010015,0.010015,...,0.000064,0.000064,0.000064,0.000013,0.000013,0.000013,0.000064,0.000782,0.000782,0.000782
3_11_7_13,0.969517,0.969517,0.969517,0.969517,0.032261,0.032261,0.990472,0.990472,0.969517,0.969517,...,0.000037,0.000037,0.000037,0.000037,0.000037,0.000037,0.000018,0.000091,0.000091,0.000091
3_12_1_1,0.949271,0.949271,0.949271,0.949271,0.047930,0.047930,0.932249,0.932249,0.949271,0.949271,...,0.000099,0.000099,0.000099,0.000079,0.000079,0.000079,0.000099,0.000615,0.000615,0.000615
3_13_2_7,0.006863,0.006863,0.006863,0.006863,0.007739,0.007739,0.006614,0.006614,0.006863,0.006863,...,0.000050,0.000050,0.000050,0.000025,0.000025,0.000025,0.000050,0.001132,0.001132,0.001132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2_57_12_8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000214,0.000214,0.000214,0.000214,0.000214,0.000214,0.000214,0.000000,0.000000,0.000000
2_57_12_7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000214,0.000214,0.000214,0.000214,0.000214,0.000214,0.000214,0.000000,0.000000,0.000000
2_57_12_6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000214,0.000214,0.000214,0.000214,0.000214,0.000214,0.000214,0.000000,0.000000,0.000000
2_43_37_12,0.000000,0.000000,0.000000,0.000000,0.000112,0.000112,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## Get Inte L2 Confusion

In [17]:
def get_inte_groups(path, annot, category):
    """Read a per-cell integration-group table and collapse it to one entry per cluster.

    Parameters
    ----------
    path
        CSV file ([mc|atac]_integration_group.csv.gz) whose first column is
        the cell id and whose remaining column is the integration group.
    annot
        Annotation container; ``annot[category].to_pandas()`` must give a
        cell-indexed Series of cluster labels.
    category
        Annotation key of the cluster resolution to map the cells to.

    Returns
    -------
    pandas.Series mapping each cluster label to an integration group. When
    several cells of one cluster carry different groups, the last cell read
    determines the value.
    """
    # Squeeze the column axis only: a bare .squeeze() would collapse a
    # single-row table to a scalar and the index handling below would fail.
    groups = pd.read_csv(path, index_col=0).squeeze('columns')  # cell to group
    cell_to_category = annot[category].to_pandas()
    groups.index = groups.index.map(cell_to_category)  # category to group
    # Round-trip through a dict to keep one entry per category.
    groups = pd.Series(groups.to_dict())
    return groups  # each category to its integration group

In [18]:
# Scratch pass over the integration-group result folders: it only binds
# `group` to the name of the last folder found. The glob root is
# `../{level}/Neuron`, the same layout used when the matrices are loaded;
# the earlier `../Neuron/{level}` ordering pointed at a different directory.
for path in pathlib.Path(f'../{level}/Neuron').glob(
        f'InteGroup*/{category_key}.overlap_score.hdf'):
    group = path.parent.name
   

In [20]:
# Results of the `level` integration round, keyed by InteGroup folder name.
confusion_matrix_dict = {}  # folder name -> reordered confusion matrix
row_groups = {}  # folder name -> (ATAC cluster -> integration group)
col_groups = {}  # folder name -> (mC cluster -> integration group)

overlap_score_paths = pathlib.Path(f'../{level}/Neuron').glob(
    f'InteGroup*/{category_key}.overlap_score.hdf')

for path in overlap_score_paths:
    group_dir = path.parent
    group = group_dir.name

    # integration group
    # from leiden clustering on confusion matrix
    # may be manually merged in 07.ipynb
    row_group = get_inte_groups(group_dir / 'atac_integration_group.csv.gz',
                                annot=atac_annot,
                                category=category_key)
    col_group = get_inte_groups(group_dir / 'mc_integration_group.csv.gz',
                                annot=mc_annot,
                                category=category_key)
    row_groups[group], col_groups[group] = row_group, col_group

    # confusion matrix and its diagonal score (original note: mean score)
    df = pd.read_hdf(path)
    diag_score = calculate_diagonal_score(df,
                                          col_group=col_group,
                                          row_group=row_group)
    print(f'{group} diag score: {diag_score:.2f}')

    df.index.name = f'atac.{category_key}'
    df.columns.name = f'mC.{category_key}'

    # reorder row and col based on inte groups
    row_order = row_group.sort_values().index
    col_order = col_group.sort_values().index
    df = df.loc[row_order, col_order].copy()
    confusion_matrix_dict[group] = df

# Stack all per-folder matrices along the rows.
all_confusion = pd.concat(confusion_matrix_dict.values())

InteGroup12 diag score: 1.41
InteGroup9 diag score: 7.46
InteGroup10 diag score: 5.65
InteGroup0 diag score: 16.64
InteGroup6 diag score: 5.61
InteGroup16 diag score: 1.09
InteGroup13 diag score: 3.17
InteGroup2 diag score: 8.72
InteGroup8 diag score: 5.38
InteGroup4 diag score: 7.53


In [21]:
# Preview: each folder's matrix only fills its own columns, so the remaining cells are NaN
all_confusion

Unnamed: 0,c38_c4_c0,c22_c1_c2,c22_c2_c0,c22_c2_c1,c22_c0_c5,c22_c0_c4,c38_c4_c1,c22_c0_c3,c22_c0_c1,c22_c2_c2,...,c30_c0_c1,c30_c0_c2,c24_c3_c1,c24_c0_c0,c24_c3_c4,c24_c3_c2,c24_c3_c3,c24_c2_c1,c24_c2_c0,c24_c3_c0
2_57_1,0.000000,0.005515,0.005515,0.007353,0.007353,0.007353,0.000000,0.005515,0.019187,0.007353,...,,,,,,,,,,
2_60_14,0.004926,0.042377,0.379310,0.546798,0.408943,0.472959,0.004926,0.543871,0.604017,0.248936,...,,,,,,,,,,
2_60_20,0.005556,0.067949,0.611908,0.811382,0.541085,0.603226,0.005556,0.183005,0.510026,0.374747,...,,,,,,,,,,
2_57_17,0.000000,0.012821,0.166667,0.166667,0.166667,0.166667,0.000000,0.088561,0.178501,0.056818,...,,,,,,,,,,
2_60_12,0.000000,0.043794,0.623894,0.603410,0.640495,0.702636,0.000000,0.411782,0.514662,0.118765,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3_25_8,,,,,,,,,,,...,0.005780,0.005780,0.000000,0.994220,0.000000,0.016393,0.000000,0.994220,0.994220,0.961538
3_25_0,,,,,,,,,,,...,0.000371,0.000371,0.000000,0.999629,0.000000,0.016393,0.000000,0.999629,0.999629,0.961538
3_25_10,,,,,,,,,,,...,0.005597,0.005597,0.003731,0.925373,0.003731,0.020125,0.003731,0.925373,0.925373,0.929104
3_25_4,,,,,,,,,,,...,0.000575,0.000575,0.000000,0.999425,0.000000,0.016393,0.000000,0.999425,0.999425,0.961538


In [22]:
# Persist the concatenated matrix at `category_key` cluster resolution.
# NOTE(review): `all_confusion` is built from the ../{level}/Neuron results (level = 'L2'),
# yet the file prefix says "L1" — confirm the intended name before relying on it.
all_confusion.to_hdf('L1_confusion_matrx.L3_clusters.hdf', key='data')

In [23]:
# Repeat the L3-resolution scores across the L4 clusters of both modalities,
# then persist the expanded matrix.
l3_to_l4_levels = dict(mc_in_level='L3',
                       mc_out_level='L4',
                       atac_in_level='L3',
                       atac_out_level='L4')
all_confusion_l4 = extend_confusion_matrix(all_confusion, **l3_to_l4_levels)
all_confusion_l4.to_hdf('L2_confusion_matrx.L4_clusters.hdf', key='data')

## Check Missing Clusters

In [25]:
# Every annotated cluster must appear in the concatenated matrix, exactly once.
n_mc_expected, n_atac_expected = all_mc_cluster.size, all_atac_cluster.size
assert n_mc_expected == all_confusion.columns.size
assert n_atac_expected == all_confusion.index.size

assert not all_confusion.index.duplicated().any()
assert not all_confusion.columns.duplicated().any()

## Get L3/4 Integration

### Get atac L4 to Inte L2

In [26]:
# One Series per integration folder: ATAC cluster -> "<folder name>_<group id>".
l3_group_names = [
    inte_folder + '_' + cluster_to_group.astype(str)
    for inte_folder, cluster_to_group in row_groups.items()
]
atac_l3_to_inte_l2 = pd.concat(l3_group_names)

In [27]:
# Per-cell `category_key` labels indexed by per-cell `deep_category_key`
# labels. The keys come from the parameter cell instead of repeating the
# literal level names, and only the column axis is squeezed so a one-row
# table still yields a Series rather than a scalar.
atac_l4_to_atac_l3 = pd.DataFrame(
    atac_annot[category_key],
    index=atac_annot[deep_category_key]).squeeze('columns')
# Round-trip through a dict to keep a single entry per fine cluster.
atac_l4_to_atac_l3 = pd.Series(atac_l4_to_atac_l3.to_dict())
# Fine cluster -> coarse cluster -> integration group name.
atac_l4_to_inte_l2 = atac_l4_to_atac_l3.map(atac_l3_to_inte_l2)

In [28]:
# Preview: ATAC L4 cluster -> "<InteGroup folder>_<group id>" label
atac_l4_to_inte_l2

3_9_2_11      InteGroup4_5
3_10_1_8      InteGroup4_0
3_11_7_13     InteGroup4_1
3_12_1_1      InteGroup4_1
3_13_2_7      InteGroup4_2
                  ...     
2_57_12_8     InteGroup0_0
2_57_12_7     InteGroup0_0
2_57_12_6     InteGroup0_0
2_43_37_12    InteGroup0_0
2_35_13_5     InteGroup0_7
Length: 9136, dtype: object

### Get mC L4 to Inte L2

In [29]:
# One Series per integration folder: mC cluster -> "<folder name>_<group id>".
l3_group_names = [
    inte_folder + '_' + cluster_to_group.astype(str)
    for inte_folder, cluster_to_group in col_groups.items()
]
mc_l3_to_inte_l2 = pd.concat(l3_group_names)

In [30]:
# Per-cell `category_key` labels indexed by per-cell `deep_category_key`
# labels. The keys come from the parameter cell instead of repeating the
# literal level names, and only the column axis is squeezed so a one-row
# table still yields a Series rather than a scalar.
mc_l4_to_mc_l3 = pd.DataFrame(
    mc_annot[category_key],
    index=mc_annot[deep_category_key]).squeeze('columns')
# Round-trip through a dict to keep a single entry per fine cluster.
mc_l4_to_mc_l3 = pd.Series(mc_l4_to_mc_l3.to_dict())
# Fine cluster -> coarse cluster -> integration group name.
mc_l4_to_inte_l2 = mc_l4_to_mc_l3.map(mc_l3_to_inte_l2)

In [31]:
# Preview: mC L4 cluster -> "<InteGroup folder>_<group id>" label
mc_l4_to_inte_l2

c1_c3_c0_c1     InteGroup4_1
c1_c3_c0_c0     InteGroup4_1
c1_c3_c4_c1     InteGroup4_1
c1_c3_c4_c0     InteGroup4_1
c1_c0_c2_c1     InteGroup4_5
                    ...     
c17_c1_c2_c1    InteGroup6_0
c17_c5_c0_c3    InteGroup6_3
c13_c5_c1_c0    InteGroup6_4
c13_c5_c3_c0    InteGroup6_4
c13_c5_c2_c0    InteGroup6_4
Length: 2400, dtype: object

## Get Inte L4 Confusion

In [32]:
# Results of the `deep_level` integration round, keyed by InteGroup folder name.
l4_confusion_matrix_dict = {}  # folder name -> reordered confusion matrix
l4_row_groups = {}  # folder name -> (ATAC cluster -> integration group)
l4_col_groups = {}  # folder name -> (mC cluster -> integration group)

for path in pathlib.Path(f'../{deep_level}/Neuron').glob(
        f'InteGroup*/{deep_category_key}.overlap_score.hdf'):
    group = path.parent.name

    # integration group
    # from leiden clustering on confusion matrix
    # may be manually merged in 07.ipynb
    row_group = get_inte_groups(path.parent / 'atac_integration_group.csv.gz',
                                annot=atac_annot,
                                category=deep_category_key)
    col_group = get_inte_groups(path.parent / 'mc_integration_group.csv.gz',
                                annot=mc_annot,
                                category=deep_category_key)
    l4_row_groups[group] = row_group
    l4_col_groups[group] = col_group

    # confusion matrix
    df = pd.read_hdf(path)
    diag_score = calculate_diagonal_score(df,
                                          col_group=col_group,
                                          row_group=row_group)
    print(f'{group} diag score: {diag_score:.2f}')

    # These matrices are at `deep_category_key` resolution, so the axes are
    # labelled with that key (they were labelled with `category_key` before,
    # a leftover from the coarser loop).
    df.index.name = f'atac.{deep_category_key}'
    df.columns.name = f'mC.{deep_category_key}'
    # reorder row and col based on inte groups
    df = df.loc[row_group.sort_values().index,
                col_group.sort_values().index].copy()
    l4_confusion_matrix_dict[group] = df

InteGroup0_9 diag score: 6.00
InteGroup10_3 diag score: 5.97
InteGroup0_6 diag score: 8.39
InteGroup10_4 diag score: 0.08
InteGroup6_6 diag score: 21.96
InteGroup0_8 diag score: 6.35
InteGroup8_6 diag score: 2.51
InteGroup2_5 diag score: 12.01
InteGroup13_2 diag score: 2.25
InteGroup0_16 diag score: 13.56
InteGroup0_3 diag score: 3.95
InteGroup16_0 diag score: 1.84
InteGroup9_2 diag score: 5.70
InteGroup10_7 diag score: 1.16
InteGroup9_0 diag score: 1.39
InteGroup13_5 diag score: 0.94
InteGroup13_0 diag score: 1.98
InteGroup0_18 diag score: 2.41
InteGroup8_5 diag score: 1.40
InteGroup8_3 diag score: 4.40
InteGroup4_7 diag score: 3.28
InteGroup2_4 diag score: 8.41
InteGroup4_5 diag score: 1.80
InteGroup4_0 diag score: 3.68
InteGroup0_15 diag score: 4.67
InteGroup9_5 diag score: 2.40
InteGroup0_11 diag score: 2.60
InteGroup2_6 diag score: 5.72
InteGroup0_10 diag score: 5.15
InteGroup4_2 diag score: 2.19
InteGroup8_2 diag score: 5.21
InteGroup13_4 diag score: 0.89
InteGroup10_5 diag score

## Create L4 Patch on L2 Confusion Matrix

In [33]:
# Work on a copy so `all_confusion_l4` keeps the unpatched values
l2_with_l4_patch = all_confusion_l4.copy()

In [34]:
# Overwrite the block covered by each deeper-round matrix with its own scores.
for l4_block in l4_confusion_matrix_dict.values():
    patch_rows, patch_cols = l4_block.index, l4_block.columns
    l2_with_l4_patch.loc[patch_rows, patch_cols] = l4_block

In [35]:
# Persist the patched matrix (the file name spells "matrx"; kept unchanged)
l2_with_l4_patch.to_hdf('L4_confusion_matrx.L4_clusters.hdf', key='data')

## Deal with Cluster Match

In [36]:
# One Series per deeper-round folder: ATAC cluster -> "<folder name>_<group id>".
l4_group_names = [
    inte_folder + '_' + cluster_to_group.astype(str)
    for inte_folder, cluster_to_group in l4_row_groups.items()
]
atac_l4_to_inte_l4 = pd.concat(l4_group_names)

In [37]:
# One Series per deeper-round folder: mC cluster -> "<folder name>_<group id>".
l4_group_names = [
    inte_folder + '_' + cluster_to_group.astype(str)
    for inte_folder, cluster_to_group in l4_col_groups.items()
]
mc_l4_to_inte_l4 = pd.concat(l4_group_names)

## Final Cluster Map

In [38]:
# Give every ATAC L4 cluster its most specific integration group: the
# deeper-round label when the cluster took part in that round, otherwise the
# label from the first round.
atac_l4_to_final_group = {}
for atac_cluster, l2_inte_group in atac_l4_to_inte_l2.items():
    if atac_cluster in atac_l4_to_inte_l4.index:
        l4_inte_group = atac_l4_to_inte_l4[atac_cluster]
        # The deeper label must extend the first-round label by a whole
        # "_<id>" component; a bare startswith(l2_inte_group) would also
        # accept e.g. "InteGroup1_10_0" for "InteGroup1_1".
        assert l4_inte_group.startswith(f'{l2_inte_group}_'), (
            f'{atac_cluster}: {l4_inte_group} is not nested in {l2_inte_group}')
        final_group = l4_inte_group
    else:
        final_group = l2_inte_group
    atac_l4_to_final_group[atac_cluster] = final_group
atac_l4_to_final_group = pd.Series(atac_l4_to_final_group)

In [39]:
# Export the ATAC cluster -> final integration group table next to this notebook
atac_l4_to_final_group.to_csv('atac_l4_to_integration_group.csv')

In [40]:
# Give every mC L4 cluster its most specific integration group: the
# deeper-round label when the cluster took part in that round, otherwise the
# label from the first round.
mc_l4_to_final_group = {}
for mc_cluster, l2_inte_group in mc_l4_to_inte_l2.items():
    if mc_cluster in mc_l4_to_inte_l4.index:
        l4_inte_group = mc_l4_to_inte_l4[mc_cluster]
        # The deeper label must extend the first-round label by a whole
        # "_<id>" component; a bare startswith(l2_inte_group) would also
        # accept e.g. "InteGroup1_10_0" for "InteGroup1_1".
        assert l4_inte_group.startswith(f'{l2_inte_group}_'), (
            f'{mc_cluster}: {l4_inte_group} is not nested in {l2_inte_group}')
        final_group = l4_inte_group
    else:
        final_group = l2_inte_group
    mc_l4_to_final_group[mc_cluster] = final_group
mc_l4_to_final_group = pd.Series(mc_l4_to_final_group)

In [41]:
# Export the mC cluster -> final integration group table next to this notebook
mc_l4_to_final_group.to_csv('mc_l4_to_integration_group.csv')

In [42]:
# Preview: ATAC L4 cluster -> final integration group label
atac_l4_to_final_group

3_9_2_11      InteGroup4_5_2
3_10_1_8      InteGroup4_0_1
3_11_7_13     InteGroup4_1_1
3_12_1_1      InteGroup4_1_2
3_13_2_7      InteGroup4_2_3
                   ...      
2_57_12_8     InteGroup0_0_1
2_57_12_7     InteGroup0_0_1
2_57_12_6     InteGroup0_0_1
2_43_37_12    InteGroup0_0_0
2_35_13_5     InteGroup0_7_5
Length: 9136, dtype: object

In [43]:
# Preview: mC L4 cluster -> final integration group label
mc_l4_to_final_group

c1_c3_c0_c1     InteGroup4_1_3
c1_c3_c0_c0     InteGroup4_1_3
c1_c3_c4_c1     InteGroup4_1_3
c1_c3_c4_c0     InteGroup4_1_3
c1_c0_c2_c1     InteGroup4_5_0
                     ...      
c17_c1_c2_c1    InteGroup6_0_0
c17_c5_c0_c3    InteGroup6_3_1
c13_c5_c1_c0    InteGroup6_4_0
c13_c5_c3_c0    InteGroup6_4_0
c13_c5_c2_c0    InteGroup6_4_0
Length: 2400, dtype: object