# Preclustering and Cluster Enriched Features

## Purpose
The purpose of this step is to perform a simple pre-clustering on the highly variable features (HVF) to obtain pre-cluster labels. We then select the top cluster enriched features (CEF) of each cluster for further analysis.

## Input
- HVF adata file.

## Output
- HVF adata file with pre-clusters and CEF annotated, subset to the CEF features and saved as `mCG.CEF.h5ad`.

## Import

In [None]:
import yaml
import anndata
import scanpy as sc
import pandas as pd
from ALLCools.clustering import cluster_enriched_features, significant_pc_test, log_scale

## Parameters

In [None]:
# Load the notebook configuration from YAML and expose every top-level key as
# a notebook-level variable. Later cells read names such as `adata_path`,
# `cluster_col`, `knn`, `resolution`, `cluster_plot`, `downsample`, `top_n`
# and `alpha`, which are presumably supplied by this file.
with open('config/04b.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Writing through locals() is not a supported way to create variables; it only
# works at notebook/module scope because locals() is globals() there. Update
# globals() explicitly so the intent is unambiguous.
globals().update(config)

# Echo the effective configuration so it is recorded in the notebook output.
print('Notebook configs:')
for _k, _v in config.items():
    print(f'{_k} = {_v}')

## Load Data

In [None]:
# Read the input AnnData from the h5ad file at `adata_path`
# (`adata_path` is presumably provided by the notebook config - confirm).
adata = anndata.read_h5ad(adata_path)

## Pre-Clustering

If a cluster label column is not provided (`cluster_col` is `None`), a basic Leiden clustering is performed here.

In [None]:
# Run a basic PCA -> KNN graph -> Leiden pre-clustering, but only when no
# cluster column was supplied (`cluster_col is None`).
if cluster_col is None:
    # IMPORTANT
    # put the unscaled matrix in adata.raw
    adata.raw = adata
    # NOTE(review): presumably log-transforms and scales adata.X in place and
    # records a 'log' entry in adata.uns (removed below) - confirm in ALLCools.
    log_scale(adata)
    
    # Request at most 100 components, and never more than
    # min(n_obs, n_vars) - 1 so that small datasets are still accepted.
    sc.tl.pca(adata, n_comps=min(min(adata.shape) - 1, 100))
    # NOTE(review): with update=True this presumably trims the stored PCs to
    # those passing p_cutoff=0.1 - verify against ALLCools.
    significant_pc_test(adata, p_cutoff=0.1, update=True)
    
    # Build the neighbor graph with `knn` neighbors, then run Leiden clustering
    # at the given `resolution`; the labels are read back from obs['leiden'].
    sc.pp.neighbors(adata, n_neighbors=knn)
    sc.tl.leiden(adata, resolution=resolution)
    
    # Optional visual check of the pre-clusters on a UMAP embedding.
    if cluster_plot:
        sc.tl.umap(adata)
        sc.pl.umap(adata, color='leiden')
    
    # return to unscaled X, CEF need to use the unscaled matrix
    adata = adata.raw.to_adata()
    # Drop the 'log' entry from uns.
    # NOTE(review): presumably set by log_scale; removing it keeps the
    # restored unscaled matrix from being flagged as transformed - confirm.
    del adata.uns['log']
    
    # Downstream cells use the freshly computed Leiden labels.
    cluster_col = 'leiden'

## Downsample if the adata is too large

In [None]:
# Cap every cluster at `downsample` cells before running CEF. Clusters at or
# below the cap are kept whole; larger ones are randomly subsampled.
if downsample is None:
    # No cap requested: work directly on the full object (no copy is made).
    use_adata = adata
else:
    selected_cells = []
    for _, members in adata.obs.groupby(cluster_col):
        n_members = members.shape[0]
        kept = members.sample(downsample) if n_members > downsample else members
        selected_cells.extend(kept.index.tolist())
    # Materialize the selection as an independent AnnData.
    use_adata = adata[pd.Index(selected_cells), :].copy()

## Cluster Enriched Features (CEF)

In [None]:
# Select cluster enriched features on the (possibly downsampled) cells.
# The results are read back later from `use_adata.uns` and `use_adata.var`
# under keys prefixed with `cluster_col`.
cef_options = {
    'cluster_col': cluster_col,
    'top_n': top_n,
    'alpha': alpha,
    'stat_plot': True,
}
cluster_enriched_features(use_adata, **cef_options)

In [None]:
# Display the AnnData summary of the object CEF was run on.
use_adata

In [None]:
# Make sure `adata` (the full cell set) carries the CEF results.
if downsample is None:
    # CEF ran on the full object itself, so just keep pointing at it.
    adata = use_adata
else:
    # CEF ran on a downsampled copy: copy its feature-level results back onto
    # the full AnnData so that no cells are lost in the saved output.
    enrichment_key = f'{cluster_col}_feature_enrichment'
    features_key = f'{cluster_col}_enriched_features'
    adata.uns[enrichment_key] = use_adata.uns[enrichment_key]
    adata.var[features_key] = use_adata.var[features_key]

## Save AnnData

In [None]:
# Keep only the features selected by the CEF column, then write the result.
# NOTE(review): the column is used directly as a var-axis selector, so it is
# presumably a boolean mask - confirm against cluster_enriched_features.
cef_selector = adata.var[f'{cluster_col}_enriched_features']
adata = adata[:, cef_selector]
adata.write_h5ad('mCG.CEF.h5ad')