# snmCT Mapping Summary

This notebook provides a quick overview of some key mapping metrics. Feel free to customize it to suit your needs.

[**See documentation about mapping metrics here.**](https://app.gitbook.com/@hq-1/s/mc/mapping-summary-and-metrics)

## Parameters

## Prepare

In [None]:
# Directory holding the mapping results. Later cells read
# 'stats/MappingSummary.csv.gz', 'mapping_config.ini' and 'TotalRNAData.h5' from it.
output_dir = ''
# Column of the mapping summary that identifies the plate; used to group cells
# in the plate views and as the x axis of the per-plate plots.
plate_col = 'Plate'
# (lower, upper) quantile pair passed as `xlim_quantile` to cutoff_vs_cell_remain.
color_quantile = (0.025, 0.975)

### Load

In [None]:
import pathlib
import pandas as pd
from cemba_data.utilities import get_configuration

# Resolve the output directory once, then load the mapping summary table
# (first CSV column becomes the index) and the mapping configuration from it.
output_dir = pathlib.Path(output_dir)
mapping_summary_path = output_dir / 'stats/MappingSummary.csv.gz'
mapping_config_path = output_dir / 'mapping_config.ini'

mapping_summary = pd.read_csv(mapping_summary_path, index_col=0)
config = get_configuration(mapping_config_path)

In [None]:
# Read pairs per cell, taken from the R1 input read count (the original
# author's note states this equals mapping_summary['R2InputReads']).
mapping_summary['CellInputReadPairs'] = mapping_summary['R1InputReads'].astype(int)

# Each cell's share of the input read pairs within its own PCRIndex group.
cell_barcode_ratio = pd.concat([
    pcr_group['CellInputReadPairs'].div(pcr_group['CellInputReadPairs'].sum())
    for _, pcr_group in mapping_summary.groupby('PCRIndex')
])
mapping_summary['CellBarcodeRatio'] = cell_barcode_ratio

# Statistics table stored under key 'stats' in TotalRNAData.h5; only the
# 'GenesDetected' column is copied over, aligned to the summary by index.
feature_count_stats = pd.read_hdf(output_dir / 'TotalRNAData.h5', key='stats')
mapping_summary['GenesDetected'] = feature_count_stats['GenesDetected']

In [None]:
mapping_summary.columns

### Plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from cemba_data.mapping import cutoff_vs_cell_remain, plot_on_plate


def distplot_and_plate_view(data, hue, color_quantile=color_quantile, config=config):
    """Draw three views of a single mapping metric.

    1. ``cutoff_vs_cell_remain`` on the non-missing values of ``data[hue]``;
       the (vmin, vmax) pair it returns is reused by the next two plots.
    2. ``plot_on_plate`` with cells grouped by the notebook-level ``plate_col``
       and aggregated with the mean, coloured within (vmin, vmax).
    3. A seaborn boxen plot of the metric per plate, split by 'MultiplexGroup'
       when ``config['barcode_version']`` is 'V2' and by 'RandomIndex' otherwise,
       with the y axis limited to (vmin, vmax).

    Parameters
    ----------
    data
        Table (presumably a pandas DataFrame) containing the ``hue`` column,
        the ``plate_col`` column and the split column named above.
    hue
        Name of the metric column to plot.
    color_quantile
        Forwarded to ``cutoff_vs_cell_remain`` as ``xlim_quantile``.
    config
        Mapping configuration; only its 'barcode_version' entry is read.

    Returns
    -------
    None
        The figures are created as a side effect.
    """
    # View 1: distribution of the metric; also yields the shared value range.
    metric_values = data[hue].dropna()
    _dist_fig, value_range = cutoff_vs_cell_remain(
        data=metric_values,
        bins=50,
        kde=False,
        xlim_quantile=color_quantile,
    )
    vmin, vmax = value_range

    # View 2: the metric laid out on each plate, mean-aggregated.
    _plate_fig, _plate_names, _plate_datas = plot_on_plate(
        data=data,
        hue=hue,
        groupby=plate_col,
        vmin=vmin,
        vmax=vmax,
        aggregation_func=lambda values: values.mean(),
    )

    # View 3: per-plate boxen plot; figure width scales with the plate count.
    fig_width = data[plate_col].unique().size * 2
    _box_fig, ax = plt.subplots(figsize=(fig_width, 4))
    if config['barcode_version'] == 'V2':
        plate_hue_name = 'MultiplexGroup'
    else:
        plate_hue_name = 'RandomIndex'
    sns.boxenplot(
        data=data,
        x=plate_col,
        y=hue,
        hue=plate_hue_name,
        palette='hls',
        ax=ax,
    )
    ax.set_ylim(vmin, vmax)
    ax.xaxis.set_tick_params(rotation=90)
    # Anchor the legend outside the axes so it does not cover the boxes.
    ax.legend(bbox_to_anchor=(1.1, 1), title=plate_hue_name)
    sns.despine(ax=ax)
    return None

In [None]:
# Plot defaults: seaborn "notebook" context and 150-dpi figures.
sns.set_context('notebook')
plt.rcParams['figure.dpi'] = 150

## Summary

In [None]:
# Conventional basic QC check; change the thresholds as you need.

# Cells whose mCCC fraction exceeds the cutoff are flagged.
mccc_cutoff = 0.03
high_mccc = mapping_summary['mCCCFrac'] > mccc_cutoff

# Guess the library type from the mean final DNA read count, then pick the
# matching minimum-read threshold (much lower for a MiSeq run).
miseq_guess = mapping_summary['FinalDNAReads'].mean() < 50000
reads_cutoff = 100 if miseq_guess else 500000
low_reads = mapping_summary['FinalDNAReads'] < reads_cutoff

# A cell passes when it is neither high-mCCC nor low-reads.
success = ~high_mccc & ~low_reads
n_cell = mapping_summary.shape[0]
# Count plates through the configurable `plate_col` parameter rather than a
# hard-coded 'Plate', so changing the parameter keeps this cell working.
n_plate = mapping_summary[plate_col].unique().size
# 384 wells per plate: the theoretical maximum number of cells.
total_wells = n_plate * 384

In [None]:
# Print a plain-text summary of the QC counts and read statistics.
# - All cell percentages use `total_wells` (plates * 384) as the denominator,
#   not the number of cells that have mapping metrics.
# - 'CellInputReadPairs' is doubled to report reads from both R1 and R2.
# - NOTE(review): the duplication rates are multiplied by 100 before printing,
#   while the mapping rates are printed as-is; presumably the mapping rates are
#   already stored as percentages — confirm against MappingSummary.csv.gz.
print(f"""
This library seems to be a {'MiSeq' if miseq_guess else 'NovaSeq'} library.

Cells
    {n_plate}\t plates
    {total_wells}\t wells (total cell number in theory)

    {n_cell} ({n_cell / total_wells * 100:.1f}%)\t cells having mapping metric
    {success.sum()} ({success.sum() / total_wells * 100:.1f}%)\t cells passed basic QC (mCCC and # of final reads)
    {high_mccc.sum()} ({high_mccc.sum() / total_wells * 100:.1f}%)\tcells having high mCCC frac (> {mccc_cutoff})
    {low_reads.sum()} ({low_reads.sum() / total_wells * 100:.1f}%)\tcells having low number of final mC reads (< {reads_cutoff}).

Reads
    {mapping_summary['CellInputReadPairs'].sum()*2:.0f}\tTotal Input Reads (R1 & R2)
    {mapping_summary['CellInputReadPairs'].mean()*2:.0f}\tAverage Input Reads for cells having metric (R1 & R2)
    
    {mapping_summary['FinalDNAReads'].sum():.0f}\tTotal Final DNA Reads (R1 & R2)
    {mapping_summary['FinalDNAReads'].mean():.0f}\tAverage Final DNA Reads for cells having metric (R1 & R2)
    {mapping_summary['FinalRNAReads'].sum():.0f}\tTotal Final RNA Reads (R1)
    {mapping_summary['FinalRNAReads'].mean():.0f}\tAverage Final RNA Reads for cells having metric (R1)
    {mapping_summary['GenesDetected'].mean():.0f}\tAverage Genes Detected for cells having metric (R1)

    {mapping_summary['R1MappingRate'].mean():.1f}%\tAverage R1 Bismark Mapping Rate for cells having metric
    {mapping_summary['R2MappingRate'].mean():.1f}%\tAverage R2 Bismark Mapping Rate for cells having metric
    {mapping_summary['R1DuplicationRate'].mean() * 100:.1f}%\tAverage R1 PCR Duplicate Rate for cells having metric
    {mapping_summary['R2DuplicationRate'].mean() * 100:.1f}%\tAverage R2 PCR Duplicate Rate for cells having metric
""")

## mC Fraction

### mCCC

In [None]:
distplot_and_plate_view(mapping_summary, hue='mCCCFrac')

### mCH

In [None]:
distplot_and_plate_view(mapping_summary, hue='mCHFrac')

### mCG

In [None]:
distplot_and_plate_view(mapping_summary, hue='mCGFrac')

## FASTQ Metric

### CellInputReadPairs

In [None]:
distplot_and_plate_view(mapping_summary, hue='CellInputReadPairs')

### Cell Barcode Ratio (each cell's share of its PCR index's input read pairs)

In [None]:
distplot_and_plate_view(mapping_summary, hue='CellBarcodeRatio')

## Mapping Rate

### R1 Bismark Mapping Rate

In [None]:
distplot_and_plate_view(mapping_summary, hue='R1MappingRate')

### R2 Bismark Mapping Rate

In [None]:
distplot_and_plate_view(mapping_summary, hue='R2MappingRate')

## PCR Duplication Rate

### R1 PCR Duplication Rate

In [None]:
distplot_and_plate_view(mapping_summary, hue='R1DuplicationRate')

### R2 PCR Duplication Rate

In [None]:
distplot_and_plate_view(mapping_summary, hue='R2DuplicationRate')

## Final Reads

### DNA (mC) Reads

In [None]:
distplot_and_plate_view(mapping_summary, hue='FinalDNAReads')

### RNA Reads (R1)

In [None]:
distplot_and_plate_view(mapping_summary, hue='FinalRNAReads')

### Genes Detected

In [None]:
distplot_and_plate_view(mapping_summary, hue='GenesDetected')

## Mapping config

In [None]:
config