In [None]:
from ALLCools.clustering import *
from sklearn.decomposition import PCA
import anndata

import numpy as np

In [None]:
# Upper bound on the number of mC cells sampled to fit the PCA model
n_train_cell = 100_000
# Labels for the reference (mC) and query (m3C) datasets, used as the
# categories of the 'Modality' column when the two are merged
ref_label, query_label = 'mC', 'm3C'

## Merge Adata

In [None]:
# Read the mC and m3C input AnnData files (in that order) from the working directory
mc_adata, m3c_adata = (
    anndata.read_h5ad(h5ad_path)
    for h5ad_path in ('mc_input.h5ad', 'm3c_input.h5ad')
)

In [None]:
# Features (var names) present in both datasets
use_var = mc_adata.var_names.intersection(m3c_adata.var_names)

# Restrict each AnnData to those shared features, modifying the objects in place
# (note: _inplace_subset_var is a private AnnData method)
for adata in (mc_adata, m3c_adata):
    adata._inplace_subset_var(use_var)

In [None]:
# Stack the mC and m3C cells into a single AnnData. The 'Modality' obs column
# records which input each cell came from (ref_label for mc_adata, query_label
# for m3c_adata); index_unique=None keeps the existing obs names as they are.
adata_merge = mc_adata.concatenate(
    m3c_adata,
    batch_key='Modality',
    batch_categories=[ref_label, query_label],
    index_unique=None,
)
adata_merge

In [None]:
# Seed NumPy's global RNG so the random subsample below is reproducible
np.random.seed(0)

# Boolean mask over the mC cells marking the ones used to fit the model:
# every cell when there are at most n_train_cell of them, otherwise a random
# draw of n_train_cell cells without replacement.
train_cell = np.zeros(mc_adata.shape[0], dtype=bool)
if train_cell.size <= n_train_cell:
    train_cell[:] = True
else:
    train_cell[np.random.choice(train_cell.size, n_train_cell, replace=False)] = True

# Store the mask on the mC object so the fitting cell can look it up
mc_adata.obs['Train'] = train_cell.copy()

## Fit PCA on mC Training Cells and Transform Merged Adata

In [None]:
# Fit a 100-component PCA (ARPACK solver, fixed random state) using only the
# mC cells flagged in the 'Train' column; fit() returns the fitted estimator.
model = PCA(
    n_components=100,
    svd_solver='arpack',
    random_state=0,
).fit(mc_adata.X[mc_adata.obs['Train'].values])

# Mask of components whose singular value is non-zero; print how many there are
sel_dim = model.singular_values_ != 0
print(sel_dim.sum())

In [None]:
# Project every cell of the merged object with the fitted model,
# chunk_size rows at a time.
chunk_size = 50000
chunks = [
    model.transform(adata_merge.X[chunk_start:chunk_start + chunk_size])
    for chunk_start in range(0, adata_merge.shape[0], chunk_size)
]

# Keep only the selected components, divide each one by its singular value,
# and store the result as the merged object's 'X_pca' embedding.
pcs = np.concatenate(chunks, axis=0)[:, sel_dim]
pcs /= model.singular_values_[sel_dim]
adata_merge.obsm['X_pca'] = pcs

In [None]:
# Run the significant-PC test on the 'X_pca' embedding with a p-value cutoff of 0.1.
# NOTE(review): significant_pc_test presumably comes from the ALLCools.clustering
# star import and may modify adata_merge.obsm['X_pca'] in place — confirm.
significant_pc_test(
    adata_merge,
    obsm='X_pca',
    p_cutoff=0.1,
)

In [None]:
# Split the merged object back into one AnnData per modality and write each to
# '<label in lower case>_pca.h5ad'. The labels come from ref_label / query_label,
# the same values used as the 'Modality' categories when merging, so changing
# them in the config cell cannot leave this filter matching no cells.
for modality in [ref_label, query_label]:
    # .copy() turns the boolean-mask selection into a standalone AnnData before writing
    adata = adata_merge[adata_merge.obs['Modality'] == modality].copy()
    adata.write_h5ad(f'{modality.lower()}_pca.h5ad')

In [None]:
# Display the merged AnnData's summary as the cell output
adata_merge