## Import

In [1]:
from wmb import cemba
from ALLCools.mcds import MCDS

import pandas as pd
from ALLCools.clustering import log_scale
import glob

In [None]:
# Parameters
# Tunable settings for this notebook, gathered in a single cell.
# NOTE(review): the "# Parameters" marker suggests this cell is injected by a
# parameterization tool (e.g. papermill) — confirm before editing by hand.
# NOTE(review): chrom_to_remove, cpu, downsample and mem_gb are not referenced
# by any cell visible in this notebook.
chrom_to_remove = ["chrX", "chrY", "chrM", "chrL"]
cpu = 1
downsample = 1000
# Passed as `mc_type` to mcds.get_adata when building the AnnData.
mc_type = "CHN"
mem_gb = 1
# Features whose std across cells is not above this value are dropped.
std_cutoff = 0.05
# Feature dimension opened from the MCDS; also used to name the CEF index/column.
var_dim = "chrom100k"


In [3]:
# Project root used below to locate the metadata CSV and the MCDS stores.
# NOTE(review): hard-coded absolute, user-specific path — the notebook will not
# run on another machine without editing this line.
_dir = '/home/qzeng_salk_edu/project/230712_m3c-mc-integration'

## Select cells

In [4]:
# Metadata table from the project directory, indexed by its first column.
# NOTE(review): `meta` is not used by any other cell visible in this notebook.
meta = pd.read_csv(f'{_dir}/m3C_META_230814.csv', index_col=0)

# IDs of the cells to keep: the first column of a header-less text file,
# exposed as an Index named 'cell'.
use_cells = (
    pd.read_csv('m3c_cells.txt', header=None, index_col=0)
    .index
    .rename('cell')
)

## Get adata with CEF (cluster-enriched features)

In [5]:
# Collect every pooled MCDS store for this project. glob returns matches in
# arbitrary, filesystem-dependent order, so sort them to keep the input order
# (and anything derived from it) identical across runs and machines.
mcds_pattern = f'{_dir}/female-mcds/pool_amb*.mcds'
mcds_paths = sorted(glob.glob(mcds_pattern))
if not mcds_paths:
    # Fail early with the offending pattern instead of an opaque error downstream.
    raise FileNotFoundError(f'No MCDS stores match {mcds_pattern!r}')

# Open all stores as one MCDS on the `var_dim` feature set, restricted to the
# selected cells.
mcds = MCDS.open(mcds_paths, var_dim=var_dim, use_obs=use_cells)

mcds.add_mc_frac(
    normalize_per_cell=True,  # after calculating mC frac, per cell normalize the matrix
    clip_norm_value=10  # clip outlier values above 10 to 10
)

# Display the dataset summary as the cell output.
mcds

Unnamed: 0,Array,Chunk
Bytes,514.65 kiB,257.32 kiB
Shape,"(26350,)","(13175,)"
Count,3 Tasks,2 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 514.65 kiB 257.32 kiB Shape (26350,) (13175,) Count 3 Tasks 2 Chunks Type numpy.ndarray",26350  1,

Unnamed: 0,Array,Chunk
Bytes,514.65 kiB,257.32 kiB
Shape,"(26350,)","(13175,)"
Count,3 Tasks,2 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,205.86 kiB,205.86 kiB
Shape,"(26350,)","(26350,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 205.86 kiB 205.86 kiB Shape (26350,) (26350,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",26350  1,

Unnamed: 0,Array,Chunk
Bytes,205.86 kiB,205.86 kiB
Shape,"(26350,)","(26350,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,205.86 kiB,205.86 kiB
Shape,"(26350,)","(26350,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 205.86 kiB 205.86 kiB Shape (26350,) (26350,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",26350  1,

Unnamed: 0,Array,Chunk
Bytes,205.86 kiB,205.86 kiB
Shape,"(26350,)","(26350,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.18 GiB,514.65 kiB
Shape,"(5543, 26350, 2, 2)","(10, 13175, 1, 1)"
Count,118709 Tasks,11512 Chunks
Type,uint32,numpy.ndarray
"Array Chunk Bytes 2.18 GiB 514.65 kiB Shape (5543, 26350, 2, 2) (10, 13175, 1, 1) Count 118709 Tasks 11512 Chunks Type uint32 numpy.ndarray",5543  1  2  2  26350,

Unnamed: 0,Array,Chunk
Bytes,2.18 GiB,514.65 kiB
Shape,"(5543, 26350, 2, 2)","(10, 13175, 1, 1)"
Count,118709 Tasks,11512 Chunks
Type,uint32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.18 GiB,1.01 MiB
Shape,"(5543, 26350, 2)","(10, 13175, 1)"
Count,239585 Tasks,5756 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 2.18 GiB 1.01 MiB Shape (5543, 26350, 2) (10, 13175, 1) Count 239585 Tasks 5756 Chunks Type float64 numpy.ndarray",2  26350  5543,

Unnamed: 0,Array,Chunk
Bytes,2.18 GiB,1.01 MiB
Shape,"(5543, 26350, 2)","(10, 13175, 1)"
Count,239585 Tasks,5756 Chunks
Type,float64,numpy.ndarray


In [6]:
# Load the per-dataset CEF masks. Each CSV is read without a header and with its
# first column as the index. squeeze("columns") turns the single remaining
# column into a Series; unlike a bare squeeze() it never collapses a one-row
# table into a scalar, which would break the `|` and boolean indexing below.
mc_cef = pd.read_csv('mC.CEF.csv', header=None, index_col=0).squeeze("columns")
# NOTE(review): despite its name, `rna_cef` is loaded from the m3C CEF file.
rna_cef = pd.read_csv('m3C.CEF.csv', header=None, index_col=0).squeeze("columns")

# Element-wise OR of the two masks: a feature is kept if either file flags it.
# Assumes both files hold boolean values — TODO confirm.
cef = mc_cef | rna_cef
# Keep only the labels of the flagged features and name the index after the
# feature dimension.
cef = cef[cef].index
cef.name = var_dim

In [7]:
# Register the CEF features on the MCDS under a column named after the feature
# dimension, then build the AnnData with that column passed as `select_hvf`.
cef_column = f'{var_dim}_cef'
mcds.add_feature_selection_column(cef, cef_column)
adata = mcds.get_adata(
    mc_type=mc_type,
    select_hvf=cef_column,
    split_large_chunks=False,
)

In [8]:
# Standard deviation of every feature, taken along axis 0 of X.
feature_std = adata.X.std(axis=0)
# Mask of features whose spread exceeds the configured cutoff.
std_filter = feature_std > std_cutoff
# Subset the AnnData variables in place to the masked features.
adata._inplace_subset_var(std_filter)

In [9]:
# Display the AnnData summary to check its dimensions after the std filter.
adata

AnnData object with n_obs × n_vars = 5543 × 1328
    var: 'chrom', 'end', 'start', 'cef'

## Preprocessing and save

In [10]:
# log mC fraction and scale features
# NOTE(review): `adata` is written to disk right after this call without being
# reassigned, so log_scale presumably modifies it in place — confirm.
# with_mean=True presumably centers each feature before scaling — confirm
# against the log_scale documentation.
log_scale(adata, with_mean=True)

StandardScaler()

In [11]:
# Save the processed AnnData to the current working directory.
adata.write_h5ad('m3c_input.h5ad')