# Calculate Highly Variable Features And Get mC Fraction AnnData

## Purpose
The purpose of this step is to select highly variable features (HVF) and generate a cell-by-feature methylation fraction matrix for clustering. Highly variable features are selected by comparing each feature's normalized dispersion across cells.

## Input
- Filtered cell metadata;
- MCDS files;
- Feature list from basic feature filtering.

## Output
- Cell-by-HVF methylation fraction matrices stored in AnnData format, e.g., mCH adata and mCG adata.

## Import

In [None]:
import yaml
import pandas as pd
import dask
import ALLCools
from ALLCools.mcds import MCDS
from wmb import cemba

## Parameters

In [None]:
# Parse the notebook parameters from the YAML config file; the file handle is
# only needed for parsing, so it is closed before anything else happens.
with open('config/03a.yaml') as f:
    config = yaml.safe_load(f)

# Expose every config entry as a notebook-level variable so later cells can
# refer to the parameters by name.
locals().update(config)

# Echo the parameters for the record.
print('Notebook configs:')
for _k, _v in config.items():
    print(f'{_k} = {_v}')

In [None]:
# Choose the MCDS path constant for the configured dataset:
# CEMBA_SNMC_MCDS_PATH when `dataset` is 'mC', CEMBA_SNM3C_MCDS_PATH otherwise.
mcds_path = (cemba.CEMBA_SNMC_MCDS_PATH
             if dataset == 'mC'
             else cemba.CEMBA_SNM3C_MCDS_PATH)

## Load Data

### Metadata

In [None]:
# Fetch the mapping-metric table for the configured dataset, forwarding the
# `select_cells` parameter to whichever loader applies.
metadata = (cemba.get_mc_mapping_metric(select_cells=select_cells)
            if dataset == 'mC'
            else cemba.get_m3c_mapping_metric(select_cells=select_cells))

# One metadata row per cell, so the first dimension is the cell count.
total_cells = metadata.shape[0]
print(f'Metadata of {total_cells} cells')

In [None]:
# Show the head of the metadata table as a quick sanity check of what was loaded.
metadata.head()

### MCDS

In [None]:
# Set dask's `array.slicing.split_large_chunks` option to False only for the
# duration of the open call.
with dask.config.set({'array.slicing.split_large_chunks': False}):
    # Open the MCDS along `var_dim`; `use_obs` receives the full index of the
    # metadata table loaded earlier in this notebook.
    total_mcds = MCDS.open(mcds_path, var_dim=var_dim, use_obs=metadata.index)

## Add mC Rate

In [None]:
# Add the mC rate for `var_dim` to the dataset, requesting per-cell
# normalization with `clip_norm_value` set to 10.
total_mcds.add_mc_rate(
    var_dim=var_dim,
    normalize_per_cell=True,
    clip_norm_value=10,
)

# Display the dataset after the call.
total_mcds

## Save AnnData

In [None]:
# Load the previously saved highly-variable-feature (HVF) table.
mch_hvf = pd.read_hdf('mch_hvf.hdf')

# Attach the `feature_select` column to the MCDS as a coordinate. The key is
# built from `mch_pattern` rather than a hard-coded 'CHN' so that it always
# matches the `mc_type` passed to `get_adata(..., select_hvf=True)` in the
# following cell, whatever pattern the config specifies.
feature_select = mch_hvf['feature_select']
total_mcds.coords[f'{var_dim}_{mch_pattern}_feature_select'] = feature_select

In [None]:
# Build the AnnData object for the `mch_pattern` methylation type along
# `var_dim`, with `select_hvf=True` to request the highly variable features.
mch_adata = total_mcds.get_adata(mc_type=mch_pattern,
                                 var_dim=var_dim,
                                 select_hvf=True)

# Persist the result; a plain string literal is used because the file name
# contains no placeholders.
mch_adata.write_h5ad('mCH.HVF.h5ad')

# Display the AnnData object.
mch_adata