## Import

In [None]:
from wmb import brain, cemba, mm10
import seaborn as sns

from ALLCools.clustering import *
from ALLCools.mcds import MCDS
from ALLCools.plot import *

import pandas as pd
import matplotlib.pyplot as plt
from ALLCools.clustering import cluster_enriched_features, log_scale

In [None]:
# Notebook parameters.
var_dim = 'chrom100k'  # feature dimension passed to MCDS.open
# `mc_type` is read by mcds.get_adata(mc_type=mc_type) further down but was
# never assigned in this notebook, so that cell raised a NameError.
# NOTE(review): 'CHN' is an assumed default -- confirm the intended mC context.
mc_type = 'CHN'
cluster_col = 'L1'  # annotation column holding the cluster labels
chrom_to_remove = ['chrX', 'chrY', 'chrM', 'chrL']  # excluded from the features
downsample = 500  # max cells kept per cluster; None disables downsampling

## Select cells

In [None]:
# Read the header-less table and keep only its first column, which becomes
# the index of cells used to select observations below.
cells = (
    pd.read_csv('mc_cells.txt', header=None, index_col=0)
    .index
    .rename('cell')
)
cells.size

## Downsample cells

In [None]:
# Cluster label of every selected cell, taken from the annotation table.
annot = cemba.get_mc_annot()
clusters = annot[cluster_col].to_pandas().loc[cells].copy()

if downsample is None:
    # No cap requested: keep every selected cell.
    use_cells = cells
else:
    # Cap each cluster at `downsample` cells. Clusters at or below the cap
    # are kept whole; larger ones are sampled with a fixed seed so the
    # selection is reproducible.
    kept = []
    for _, members in clusters.groupby(clusters):
        if members.size > downsample:
            members = members.sample(downsample, random_state=0)
        kept.extend(members.index.tolist())
    use_cells = pd.Index(kept)

use_cells.size

## Get adata with basic feature selection

In [None]:
# Open the CEMBA snmC dataset, passing the chosen cells as `use_obs` and the
# configured feature dimension as `var_dim`.
mcds = MCDS.open(
    cemba.CEMBA_SNMC_MCDS_PATH,
    use_obs=use_cells,
    var_dim=var_dim,
)

mcds

In [None]:
# Add the mC fraction to the dataset, then build the AnnData object for the
# configured `mc_type`; highly-variable-feature selection and chunk splitting
# are both switched off here.
mcds.add_mc_frac()
adata = mcds.get_adata(
    mc_type=mc_type,
    split_large_chunks=False,
    select_hvf=False,
)

In [None]:
# Attach the cluster labels to the cells in adata.
# (presumably aligned by cell index on assignment, as adata.obs is a DataFrame)
cluster_labels = annot[cluster_col].to_pandas()
adata.obs[cluster_col] = cluster_labels

## Basic Feature Filtering

In [None]:
# Exclude the chromosomes listed in `chrom_to_remove` from adata.
remove_chromosomes(
    adata,
    exclude_chromosomes=chrom_to_remove,
)

In [None]:
# Filter adata against the mm10 ENCODE blacklist file.
remove_black_list_region(
    adata,
    black_list_path=mm10.ENCODE_BLACKLIST_PATH,
)

In [None]:
# Keep only features whose standard deviation along axis 0 of adata.X
# exceeds 0.05; the subset is applied to adata in place.
feature_std = adata.X.std(axis=0)
pass_std_filter = feature_std > 0.05
adata._inplace_subset_var(pass_std_filter)

## CEF Feature Filtering

In [None]:
# The CEF function takes the original mC fraction as input.
cluster_enriched_features(
    adata,
    cluster_col=cluster_col,
    method='mc',
    top_n=200,
    alpha=0.05,
    stat_plot=True,
)

In [None]:
# Save the per-feature CEF column of adata.var to CEF.csv without a header
# row. `header=False` is the documented way to omit the header in `to_csv`
# (`header=None` is a `read_csv` idiom that only worked here because it is
# falsy).
cef_column = f'{cluster_col}_enriched_features'
adata.var[cef_column].to_csv('CEF.csv', header=False)