In [None]:
from ALLCools.clustering import *
import anndata

## Merge Adata

In [None]:
# Load the two per-modality inputs (mC first, ATAC second) from the working
# directory; each file is read into its own AnnData object.
mc_adata, atac_adata = (
    anndata.read_h5ad(path) for path in ('mc_input.h5ad', 'atac_input.h5ad')
)

In [None]:
# Keep only the features (var names) present in BOTH modalities so the two
# matrices share the same columns before they are concatenated.
use_var = mc_adata.var_names.intersection(atac_adata.var_names)

# Fail early with a clear message: an empty intersection would otherwise only
# surface later as an obscure error on an empty merged matrix.
assert len(use_var) > 0, 'mC and ATAC inputs share no var_names; nothing to merge'

# Subset through public AnnData indexing instead of the private
# `_inplace_subset_var` helper; `.copy()` turns the view into a real object.
mc_adata = mc_adata[:, use_var].copy()
atac_adata = atac_adata[:, use_var].copy()

In [None]:
# Stack the mC and ATAC cells into one AnnData. Each cell's origin is recorded
# in obs['Modality'] ('mC' / 'ATAC'), which later cells use to group and split.
# index_unique=None: presumably leaves the cell IDs untouched (no batch suffix),
# so IDs shared by both inputs would become duplicates -- TODO confirm.
# NOTE(review): newer anndata releases appear to deprecate
# `AnnData.concatenate` in favour of `anndata.concat`; the two merge obs/var
# differently, so verify outputs before migrating.
adata_merge = mc_adata.concatenate(
    atac_adata,
    batch_key='Modality',
    batch_categories=['mC', 'ATAC'],
    index_unique=None,
)
adata_merge

## Run LSI on Merged Adata

In [None]:
# Configure the LSI model (provided by the ALLCools.clustering star import).
# random_state is fixed so repeated runs use the same seed.
# NOTE(review): the exact meaning of scale_factor / algorithm is defined by
# ALLCools' LSI and is not visible here -- check its documentation before tuning.
model = LSI(
    scale_factor=10000,
    n_components=100,
    algorithm="randomized",
    random_state=0,
)

In [None]:
# Upper bound on how many cells each modality may contribute to model fitting.
max_cell_per_modality = 100000

# Collect the cell IDs to fit on: every cell of a modality when it is at or
# below the cap, otherwise a fixed-seed random sample of exactly the cap.
use_cells = []
for modality, modality_obs in adata_merge.obs.groupby('Modality'):
    n_cells = modality_obs.shape[0]
    if n_cells > max_cell_per_modality:
        modality_obs = modality_obs.sample(n=max_cell_per_modality,
                                           random_state=0)
    use_cells.extend(modality_obs.index.tolist())

# Boolean mask over all merged cells, True for the selected ones.
# NOTE(review): membership is tested by cell ID, so if the same ID occurs in
# both modalities every copy is selected -- confirm IDs are unique.
use_cells_judge = adata_merge.obs_names.isin(use_cells)

# TODO: fit LSI using only the mC cells, then transform all data

In [None]:
# Fit the LSI model on the capped per-modality subset only (all features kept).
# NOTE(review): `downsample` presumably limits how many cells the fit itself
# uses -- confirm against the ALLCools LSI.fit signature.
fit_cells = adata_merge[use_cells_judge, :]
model.fit(fit_cells, downsample=100000)

In [None]:
# Apply the fitted LSI model to every merged cell, not just the fitting subset.
# NOTE(review): presumably writes the embedding to adata_merge.obsm['X_lsi']
# (the key the following cell reads) -- confirm the default obsm key.
model.transform(adata_merge)

In [None]:
# Run ALLCools' significant_pc_test on the 'X_lsi' embedding with a p-value
# cutoff of 0.1.
# NOTE(review): presumably trims obsm['X_lsi'] to the significant components
# in place -- confirm against the function's documentation.
significant_pc_test(adata_merge, p_cutoff=0.1, obsm='X_lsi')

In [None]:
# Split the merged object back into one file per modality
# ('mc_lsi.h5ad' and 'atac_lsi.h5ad'), selecting cells by obs['Modality'].
for m in ('mC', 'ATAC'):
    modality_mask = adata_merge.obs['Modality'] == m
    adata = adata_merge[modality_mask]
    adata.write_h5ad(f'{m.lower()}_lsi.h5ad')