# Feature Basic Filtering

## Purpose
Apply basic filters to remove these problematic features:
- Features with extremely low or extremely high coverage
- Features overlapping the ENCODE Blacklist
- Some chromosomes (chrX, chrY and chrM)

## Input
- Cell metadata (after basic cell filter)
- MCDS files

## Output
- FeatureList.BasicFilter.txt: List of feature IDs that passed all filters

## Import

In [None]:
import yaml
import pandas as pd
from ALLCools.mcds import MCDS
from wmb import cemba, mm10

## Parameters

In [None]:
# Load the notebook configuration and expose every top-level key as a
# notebook-level variable. Later cells use names that are not assigned
# anywhere else in the notebook (e.g. `dataset`, `select_cells`, `min_cov`,
# `max_cov`, `exclude_chromosome`) -- presumably they come from this file;
# verify against config/01.yaml.
#
# The file handle is deliberately NOT named `f`: the parameter cell stores the
# blacklist overlap fraction in `f`, and re-running this cell would otherwise
# replace that number with a closed file object.
with open('config/01.yaml', 'r') as _config_file:
    config = yaml.safe_load(_config_file)

# At notebook (module) level `locals()` is the same dict as `globals()`;
# using `globals()` states the intent explicitly instead of relying on that.
globals().update(config)

# Echo the effective configuration so it is recorded in the notebook output.
print('Notebook configs:')
for _k, _v in config.items():
    print(f'{_k} = {_v}')

In [None]:
# Fixed inputs for the feature filters.
black_list_path = mm10.ENCODE_BLACKLIST_PATH
f = 0.2  # overlap threshold handed to the blacklist filter cell

# Pick the MCDS that matches the dataset: the snmC path for 'mC', the snm3C
# path for any other value of `dataset`.
mcds_path = (
    cemba.CEMBA_SNMC_MCDS_PATH
    if dataset == 'mC'
    else cemba.CEMBA_SNM3C_MCDS_PATH
)

## Load Data

### Metadata

In [None]:
# Choose the mapping-metric loader for the dataset ('mC' -> snmC loader,
# anything else -> snm3C loader), then call it once with the cell selection.
_load_mapping_metric = (
    cemba.get_mc_mapping_metric
    if dataset == 'mC'
    else cemba.get_m3c_mapping_metric
)
metadata = _load_mapping_metric(select_cells=select_cells)

# Report how many cells were loaded (before any downsampling).
total_cells = metadata.shape[0]
print(f'Metadata of {total_cells} cells')

In [None]:
# Show the first rows of the cell metadata as a quick sanity check; as the
# last expression of the cell, the result is displayed by the notebook.
metadata.head()

## Downsample

In [None]:
# Cap the number of cells used for feature filtering. When more cells are
# available, draw a random subset with a fixed seed so reruns pick the same
# cells.
_max_cells = 50000
if metadata.shape[0] > _max_cells:
    metadata = metadata.sample(_max_cells, random_state=0)

### MCDS

In [None]:
# Open the MCDS on the 100kb-bin feature dimension ('chrom100k'), restricted
# to the cells kept in `metadata`.
_selected_cells = metadata.index
mcds = MCDS.open(mcds_path, var_dim='chrom100k', use_obs=_selected_cells)

In [None]:
# Display the opened MCDS so its contents can be inspected in the notebook
# output.
mcds

## Filter Features

### Filter by mean coverage

In [None]:
# Compute the mean coverage of each feature. Presumably the result is stored
# on `mcds` and consumed by the coverage filter in the next cell -- confirm
# against the ALLCools MCDS documentation.
mcds.add_feature_cov_mean()

In [None]:
# Keep only features whose mean coverage lies between the configured bounds:
#   min_cov -- minimum coverage
#   max_cov -- maximum coverage
_coverage_bounds = {'min_cov': min_cov, 'max_cov': max_cov}
mcds = mcds.filter_feature_by_cov_mean(**_coverage_bounds)

### Filter by ENCODE Blacklist

In [None]:
# Remove every feature whose overlap with any ENCODE blacklist region is
# greater than the threshold `f`.
mcds = mcds.remove_black_list_region(black_list_path=black_list_path, f=f)

### Remove chromosomes

In [None]:
# Remove the features located on the chromosomes named by `exclude_chromosome`.
# NOTE(review): `exclude_chromosome` is not assigned in this notebook --
# presumably it is injected from config/01.yaml (the Purpose section mentions
# chrX, chrY and chrM); confirm.
mcds = mcds.remove_chromosome(exclude_chromosome)

## Save Feature List

In [None]:
# Write the ids of the features that survived every filter, one id per line.
#
# The ids are collected BEFORE the output file is opened, so a failure while
# reading the index does not leave a truncated/empty feature list behind.
_feature_ids = mcds.get_index(mcds.var_dim).astype(str)

# The handle is deliberately NOT named `f`: `f` holds the blacklist overlap
# fraction, and rebinding it to a (closed) file object would break the
# blacklist filter cell if it were re-run after this one.
with open('FeatureList.BasicFilter.txt', 'w') as _feature_file:
    _feature_file.writelines(f'{_feature_id}\n' for _feature_id in _feature_ids)