In [1]:
import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ALLCools.clustering import *
from ALLCools.integration.seurat_class import SeuratIntegration
from ALLCools.plot import *
from wmb import aibs, brain, cemba

In [None]:
# Parameters
categorical_key = ['new_celltypes_0808']
cpu = 3
mem_gb = 1
ref_dataset = "mc"


## Input LSI before integration

In [4]:
ref_adata = anndata.read_h5ad("mc_pca.h5ad")
query_adata = anndata.read_h5ad("m3c_pca.h5ad")

In [6]:
adata_list = [ref_adata, query_adata]

### Init empty adata_merge

In [7]:
from scipy.sparse import csr_matrix

cells = sum([a.shape[0] for a in adata_list])
features = adata_list[0].shape[1]

adata_merge = anndata.AnnData(
    X=csr_matrix((cells, features), dtype=np.float32),
    obs=pd.concat([a.obs for a in adata_list]),
    var=adata_list[0].var,
)

In [8]:
_dir = '/home/qzeng_salk_edu/project/230712_m3c-mc-integration/'
mc_annot = pd.read_csv(f'{_dir}/mC_META_230814.csv', index_col = 0 )
m3c_annot = pd.read_csv(f'{_dir}/m3C_META_230814.csv', index_col = 0 )

In [11]:
mc_annot['new_celltypes_0808'] = mc_annot['new_celltypes_0808']

In [12]:
import numpy as np
m3c_annot['new_celltypes_0808'] = np.nan

In [13]:
# order doesn't matter, index will be matched
for key in categorical_key:
    adata_merge.obs[key] = pd.concat(
        [mc_annot[key], m3c_annot[key]]).astype(str)

In [14]:
adata_list

[AnnData object with n_obs × n_vars = 12501 × 1328
     obs: 'Modality'
     var: 'chrom', 'cef', 'end-m3C', 'start-m3C', 'bin_start-mC', 'bin_end-mC'
     obsm: 'X_pca',
 AnnData object with n_obs × n_vars = 5543 × 1328
     obs: 'Modality'
     var: 'chrom', 'cef', 'end-m3C', 'start-m3C', 'bin_start-mC', 'bin_end-mC'
     obsm: 'X_pca']

In [17]:
n_pc = adata_list[0].obsm["X_pca"].shape[1]
if n_pc < 10:
    n_cca_components = n_pc
else:
    n_cca_components = max(n_pc - 10, 10)

n_cca_components

10

In [18]:
min_sample = adata_merge.obs["Modality"].value_counts().min()

## Integration and transform

In [19]:
integrator = SeuratIntegration()

In [20]:
# take ~2.5-3h for 300K mC + 4M 10X RNA
anchor = integrator.find_anchor(
    adata_list,
    k_local=None,
    key_local="X_pca",
    k_anchor=5,
    key_anchor="X",
    dim_red="cca",
    max_cc_cells=100000,
    k_score=30,
    k_filter=min(200, min_sample),
    scale1=True,
    scale2=True,
    n_components=n_cca_components,
    n_features=200,
    alignments=[[[0], [1]]],
)

Find anchors across datasets.
Run CCA
non zero dims 10
Find Anchors using k=30
Anchor selected with high CC feature graph: 12630 / 19169
Score Anchors
Identified 12630 anchors between datasets 0 and 1.


In [21]:
corrected = integrator.integrate(
    key_correct="X_pca",
    row_normalize=True,
    k_weight=100,
    sd=1,
    alignments=[[[0], [1]]],
)

adata_merge.obsm["X_pca_integrate"] = np.concatenate(corrected)

Merge datasets
[[0], [1]]
Initialize
Find nearest anchors. k_weight:  100
Normalize graph
Transform data


## Label transfer

In [18]:
# transfer_results = integrator.label_transfer(
#     ref=[0],
#     qry=[1],
#     categorical_key=categorical_key,
#     key_dist='X_pca'
# )
# for k, v in transfer_results.items():
#     v.to_hdf(f'{k}_transfer.hdf', key='data')
# integrator.save_transfer_results_to_adata(adata_merge, transfer_results)

## Save

In [22]:
adata_merge.write_h5ad("final.h5ad")

In [23]:
adata_merge

AnnData object with n_obs × n_vars = 18044 × 1328
    obs: 'Modality', 'new_celltypes_0808'
    var: 'chrom', 'cef', 'end-m3C', 'start-m3C', 'bin_start-mC', 'bin_end-mC'
    obsm: 'X_pca_integrate'

In [26]:
#integrator.save("integration")

In [27]:
import subprocess
subprocess.run(['rm', '-f', 'mc_pca.h5ad', 'm3c_pca.h5ad'])

CompletedProcess(args=['rm', '-f', 'mc_pca.h5ad', 'm3c_pca.h5ad'], returncode=0)