# Run statistics

This report provides statistics for all major pre-processing and filtering steps performed by the pipeline.

All charts are interactive so hovering over areas of interest will provide additional information.

In [1]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Stamp the report with the moment it was rendered.
generated_on = pd.to_datetime("today")
print(f'Statistics report generated on {generated_on}')

In [None]:
# Placeholder for the statistics directory; the "# Parameters" cell that
# follows reassigns it to the real location before any CSV is read.
directory = ""


In [None]:
# Parameters
# Root directory of the pipeline statistics. Every loading cell below joins
# this with a relative path such as "deduplication/deduplication.summary.csv".
# NOTE(review): the "# Parameters" header looks like a papermill-injected
# cell — confirm before editing this value by hand.
directory = "/t1-data/project/milne_group/asmith/Projects/capture_pipeline_testing/ccanalyser_statistics/"


# Fastq duplication statistics

Fastq files (after partitioning) are examined for fragments (R1 + R2) that appear to be PCR duplicates.

Duplicates are identified by comparing the concatenated R1 and R2 sequences and filtering out exact matches. 

This is only the first pass of PCR duplicate removal as single base changes will be ignored. The aim here is to remove as many duplicate fragments as possible to reduce the amount of downstream processing required.

Approximately 5-20% of fragments are typically removed by this step.

In [2]:
# Load the fastq deduplication summary; the cells below read its
# "sample", "stat_type" and "stat" columns.
dedup_summary_csv = os.path.join(directory, 'deduplication/deduplication.summary.csv')
df = pd.read_csv(dedup_summary_csv)


NameError: name 'directory' is not defined

In [None]:
# Count the distinct samples once; later cells scale figure heights with it.
sample_column = df["sample"]
N_SAMPLES = sample_column.nunique()


In [None]:
# Horizontal bars of unique vs. removed reads for each sample. The
# "reads_total" rows are excluded from the plot.
dedup_counts = df.query('stat_type != "reads_total"')
ordered_samples = sorted(df["sample"].unique())

fig = px.bar(
    dedup_counts,
    x="stat",
    y="sample",
    color="stat_type",
    category_orders={
        "sample": ordered_samples,
        "stat_type": ("reads_unique", "reads_removed"),
    },
    color_discrete_sequence=["#1f77b4", "grey"],
    template="simple_white",
)

# Legend entries: show "reads unique" rather than "reads_unique".
fig.for_each_trace(lambda trace: trace.update(name=trace.name.replace("_", " ")))
fig.update_xaxes(title="Number of Reads")
fig.update_yaxes(title="Sample")
fig.update_layout(legend_title_text="")
# Kept last so the cell's final expression is the figure itself.
fig.update_traces(marker_line_width=0)

# Trimming 

Following initial PCR duplicate removal, fastq files are trimmed to remove sequencing adapters.

These plots provide a brief summary of the number of adapters identified and removed.


In [None]:
# Load the adapter-trimming summary; the cells below read its "sample",
# "read_number", "stat_type" and "stat" columns.
trimming_summary_csv = os.path.join(directory, "trimming/trimming.summary.csv")
df = pd.read_csv(trimming_summary_csv)

In [None]:
# Refresh the sample count from the trimming table just loaded.
sample_column = df["sample"]
N_SAMPLES = sample_column.nunique()


In [None]:
# Adapter-trimming overview: one row of pies per sample and one pie per read
# (R1 / R2), each comparing the "adapters_removed" and "reads_total" counts.
df_summary = df.query(
    'stat_type == "adapters_removed" or stat_type == "reads_total"'
).sort_values(["sample", "read_number"])

# Materialise the groups once so the subplot grid, the row titles and the
# traces all follow the same (groupby-sorted) sample order. Sorting the titles
# separately *after* replacing "_" with " " can order samples differently
# (e.g. "a1" < "a_1" but "a 1" < "a1") and label rows with the wrong sample.
sample_groups = list(df_summary.groupby("sample"))
n_rows = len(sample_groups)

specs = [[{"type": "pie"} for _ in range(2)] for _ in range(n_rows)]
fig = make_subplots(
    rows=n_rows,
    cols=2,
    specs=specs,
    row_titles=[str(sample).replace("_", " ") for sample, _ in sample_groups],
    column_titles=["Read 1", "Read 2"],
)

for row, (sample, df_sample) in enumerate(sample_groups, start=1):
    for read_number in (1, 2):
        df_read_number = df_sample.loc[df_sample["read_number"] == read_number]

        # The pie is positioned through row/col alone; the previous
        # hard-coded ``domain={"row": 1}`` did not match the actual row.
        fig.add_trace(
            go.Pie(
                labels=df_read_number["stat_type"].str.replace("_", " ").str.title(),
                values=df_read_number["stat"],
                name=f"{sample} {read_number}",
            ),
            row=row,
            col=read_number,
        )

# Kept last so the cell's final expression is the figure itself.
fig.update_layout(width=750, height=(250 * n_rows))

# Read pair combination statistics (FLASh)

After the removal of adapters, read pairs are combined (if any overlap exists) using FLASh to generate combined fragments (referred to as flashed). Non-combined read pairs that do not have a sufficient overlap (referred to as paired-end or pe) are maintained as read pairs in separate fastq files.

In [None]:
# Load the combined run statistics table written by the pipeline.
run_statistics_csv = os.path.join(directory, "run_statistics.csv")
df = pd.read_csv(run_statistics_csv)


In [None]:
# Flashed vs. paired-end counts per sample, taken from the "unfiltered" rows
# of the "digestion" stage in the run statistics table.
is_unfiltered_digestion = df["stage"].isin(["digestion"]) & (
    df["stat_type"] == "unfiltered"
)

# Rows sharing sample / stage / stat_type / read_type are averaged.
# NOTE(review): presumably these are the R1 and R2 rows of the same read
# pairs — confirm against the pipeline output.
df_summary = (
    df.loc[is_unfiltered_digestion]
    .assign(
        read_type=lambda d: d["read_type"]
        .replace("flashed", "Flashed")
        .replace("pe", "PE")
    )
    .groupby(["sample", "stage", "stat_type", "read_type"])["stat"]
    .mean()
    .reset_index()
)

fig = px.bar(
    data_frame=df_summary,
    x="stat",
    y="sample",
    color="read_type",
    template="simple_white",
    category_orders={
        # Unique names only: the y-axis category array should list each
        # sample once, not once per row of the statistics table.
        "sample": sorted(df["sample"].unique()),
        "read_type": ["Flashed", "PE"],
    },
)
fig.update_layout(legend_title_text="")
fig.update_yaxes(title="Sample")
fig.update_xaxes(title="Number of Read Pairs")
# Kept last so the cell's final expression is the figure itself.
fig.update_traces(marker_line_width=0)

# Fastq *in silico* digestion statistics

Following read pair combination, the combined or non-combined fragments are examined for recognition sites of the restriction enzyme used for the assay. A valid digestion of a fragment (above the minimum threshold set) results in one or more restriction fragments, referred to as slices.

Flashed read pairs are treated differently from paired-end read pairs as we expect to observe the ligation junction in the flashed fragment. Therefore, if no recognition sites are identified, the fragment is marked as invalid and is discarded. Non-combined (paired-end) reads are unlikely to contain the ligation junction and therefore if no restriction sites are identified, the individual read pairs are not discarded.

All identified slices must be longer than the minimum length specified (default 18 bp) to be considered valid. 

In [None]:
# Load the two digestion outputs: the slices-per-read histogram and the
# read-level counts.
df_hist, df_reads = (
    pd.read_csv(os.path.join(directory, relative_path))
    for relative_path in (
        "digestion/digestion.histogram.csv",
        "digestion/digestion.reads.csv",
    )
)


## The number of read pairs with at least one valid slice

Unfiltered read pairs = The number of read pairs containing at least one restriction site

Filtered read pairs = The number of read pairs containing at least one restriction site and at least one slice above the minimum length


In [None]:
# Read-pair counts before ("All Read Pairs") and after ("Reads with slices")
# the slice filter, drawn as one facet row per sample.
# NOTE(review): dropping read_number == 2 presumably counts each paired-end
# fragment once (through its R1 row) — confirm against the digestion output.
df = df_reads.query("read_number != 2").assign(
    read_type=lambda d: d["read_type"]
    .replace("flashed", "Flashed")
    .replace("pe", "PE"),
    stat_type=lambda d: d["stat_type"]
    .replace("unfiltered", "All Read Pairs")
    .replace("filtered", "Reads with slices"),
    sample=lambda d: d["sample"].str.replace("_", " "),
)

# Each sample listed once, matching how the other charts build their order.
sample_order = sorted(df["sample"].unique())

fig = px.bar(
    data_frame=df,
    x="stat",
    y="stat_type",
    color="read_type",
    facet_row="sample",
    template="simple_white",
    height=500 * N_SAMPLES,
    width=750,
    category_orders={"sample": sample_order, "read_type": ["Flashed", "PE"]},
)
fig.update_layout(
    legend_title_text="",
    margin={"b": 10},
)
fig.update_yaxes(title="", autorange="reversed")
# Give every facet its own x range and tick labels.
fig.update_xaxes(matches=None, showticklabels=True)
# Facet labels arrive as "sample=<name>"; keep everything after the first "="
# so a sample name that itself contains "=" is not truncated.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=", 1)[-1]))
fig.layout["xaxis"]["title"]["text"] = "Number of Slices (Reads with RE sites)"
fig.update_traces(marker_line_width=0)
fig

## Histogram of digested slices

This plot shows the number of valid slices identified per fragment, separated by flashed status. For the PE reads, an undigested read is considered valid; therefore, all PE reads with > 1 slice contain a recognition site.

In [None]:
# Slices-per-read histogram, grouped by read class and faceted by sample.
# The numeric read_number codes are swapped for readable legend labels first.
read_labels = {0: "Flashed", 1: "PE R1", 2: "PE R2"}
hist_data = df_hist.assign(read_number=df_hist["read_number"].map(read_labels))

fig = px.histogram(
    hist_data,
    x="n_slices",
    y="n_reads",
    color="read_number",
    facet_row="sample",
    barmode="group",
    hover_data=["n_reads"],
    category_orders={"read_number": list(read_labels.values())},
    template="simple_white",
    height=500 * N_SAMPLES,
    width=750,
)

# Facet labels arrive as "sample=<name>"; show only the name.
fig.for_each_annotation(lambda note: note.update(text=note.text.split("=")[1]))
fig.update_xaxes(dtick=1, showticklabels=True)
# Independent y range per facet so small samples remain readable.
fig.update_yaxes(title="Frequency", matches=None, showticklabels=True)
fig.update_layout(legend_title_text="")
# Kept last so the cell's final expression is the figure itself.
fig.update_traces(marker_line_width=0)

# Alignment filtering statistics

After alignment to the reference genome and annotation with capture probes, excluded regions and restriction fragments, aligned slices are filtered. All fragments that do not contain one capture slice and one or more reporter slice(s) (i.e. slices that are neither captured nor located in excluded regions) are removed.

This chart shows the number of read pairs removed at each stage of the filtering, split by flashed/pe status.

In [None]:
# Load the three alignment-filtering outputs: read-level, slice-level and
# reporter-level statistics.
df_reads, df_slices, df_reporters = (
    pd.read_csv(os.path.join(directory, relative_path))
    for relative_path in (
        "reporters/reporters.reads.csv",
        "reporters/reporters.slices.csv",
        "reporters/reporters.reporters.csv",
    )
)


In [None]:
# Tidy the read-level filtering statistics for display: largest counts first,
# without the "not-deduplicated" rows.
df_reads = df_reads.sort_values("stat", ascending=False).query(
    'stat_type != "not-deduplicated"'
)

# Frame-wide renames of the two duplicate-removal stages, applied in order.
for raw_value, renamed in (
    ("duplicate_filtered", "partial_duplicate_removal"),
    ("deduplicated", "full_PCR_duplicate_removal"),
):
    df_reads = df_reads.replace(raw_value, renamed)

# Human-readable labels: title-cased stage names (restoring the "PCR"
# acronym), capitalised read types and sample names without underscores.
df_reads = df_reads.assign(
    stat_type=(
        df_reads["stat_type"]
        .str.replace("_", " ")
        .str.title()
        .str.replace("Pcr", "PCR")
    ),
    read_type=df_reads["read_type"].replace("flashed", "Flashed").replace("pe", "PE"),
    sample=df_reads["sample"].str.replace("_", " "),
)

sample_names = df_reads["sample"].unique()
N_SAMPLES = len(sample_names)

fig = px.bar(
    df_reads.sort_values("stat", ascending=False),
    x="stat",
    y="stat_type",
    color="read_type",
    facet_row="sample",
    category_orders={
        "stat_type": df_reads["stat_type"].unique(),
        "read_type": ["Flashed", "PE"],
        "sample": sorted(sample_names),
    },
    template="simple_white",
    height=(1000 * N_SAMPLES),
)

# Facet labels arrive as "sample=<name>"; show only the name.
fig.for_each_annotation(lambda note: note.update(text=note.text.split("=")[1]))
fig.update_yaxes(title="")
# Independent x range and visible tick labels for every facet.
fig.update_xaxes(matches=None, showticklabels=True)
fig.update_layout(legend_title_text="")
# Kept last so the cell's final expression is the figure itself.
fig.update_traces(marker_line_width=0)

## Capture and reporter statistics

This chart shows the number of cis (same chromosome as capture) or trans (different chromosome to capture) reporters identified. This is separated by capture probe.

In [None]:
# Cis/trans reporter counts per capture probe, one facet row per sample.
n_probes = df_reporters["capture"].nunique()

# Sum the reporter counts per sample / capture probe / cis-trans class and
# switch to the display form of the sample names *before* deriving the facet
# order from them.
df_reporter_counts = (
    df_reporters.groupby(["sample", "capture", "cis/trans"])
    .agg({"count": "sum"})
    .reset_index()
    .assign(sample=lambda d: d["sample"].str.replace("_", " "))
)

fig = px.bar(
    data_frame=df_reporter_counts,
    x="count",
    y="capture",
    color="cis/trans",
    facet_row="sample",
    barmode="group",
    template="simple_white",
    category_orders={
        "cis/trans": ["trans", "cis"],
        "capture": sorted(df_reporters["capture"].unique()),
        # Must be built from the plotted frame: the raw names in df_reporters
        # still contain underscores and would not match the facet values, so
        # the requested order would not be applied to those samples.
        "sample": sorted(df_reporter_counts["sample"].unique()),
    },
    # Height grows with the number of bars (samples x probes) to be drawn.
    height=500 + (N_SAMPLES * n_probes * 100),
    width=1000,
    labels={"count": "Number of reporters"},
)
fig.update_yaxes(title_text="")
# Independent x range and visible tick labels for every facet.
fig.update_xaxes(matches=None, showticklabels=True)
fig.for_each_trace(lambda t: t.update(name=t.name.split("=")[0]))
# Facet labels arrive as "sample=<name>"; keep everything after the first "=".
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=", 1)[-1]))
fig.update_layout(legend={"traceorder": "reversed", "title": ""})
# Kept last so the cell's final expression is the figure itself.
fig.update_traces(marker_line_width=0)

# Overall stats

This chart displays the combined statistics from the entire pipeline run summarised at the read pair level.

In [None]:
# Reload the combined run statistics, largest counts first.
run_statistics_csv = os.path.join(directory, "run_statistics.csv")
df = pd.read_csv(run_statistics_csv).sort_values(by="stat", ascending=False)


In [None]:
# Display labels for the raw "stat_type" values found in run_statistics.csv.
# The next cell applies this with Series.map, so any stat_type that is not
# listed here becomes NaN in the plotted frame.
# NOTE(review): the entries appear to be listed in pipeline order
# (deduplication -> digestion -> alignment filtering) — confirm.
stat_type_mapping = {
    # Used by the fastq deduplication chart above.
    "reads_total": "Total Reads",
    "reads_unique": "PCR Duplicate Filtered (1st pass)",
    # Used by the digestion charts above.
    "unfiltered": "Passed Trimming and Combining",
    "filtered": "Passed restriction site filter.",
    "mapped": "Mapped to reference genome",
    "contains_single_capture": "Contains a Capture Slice",
    "contains_capture_and_reporter": "Contains a Capture and Reporter Slice",
    # Renamed in the alignment filtering chart above.
    "duplicate_filtered": "PCR Duplicate Filtered (2nd pass, partial)",
    "deduplicated": "PCR Duplicate Filtered (final pass)",
}


In [None]:
# Swap raw identifiers for display labels: mapped stage names, capitalised
# read types and sample names without underscores.
df = df.assign(
    stat_type=df["stat_type"].map(stat_type_mapping),
    read_type=df["read_type"].replace("flashed", "Flashed").replace("pe", "PE"),
    sample=df["sample"].str.replace("_", " "),
)

# Drop the read_number == 2 rows and, via the "stage == stage" comparison
# (NaN never equals itself), every row whose stage is missing.
plotted_rows = df.query("(read_number != 2) and (stage == stage) ")

fig = px.bar(
    plotted_rows,
    x="stat",
    y="stat_type",
    color="read_type",
    facet_row="sample",
    category_orders={
        "stat_type": df["stat_type"].unique(),
        "sample": sorted(df["sample"].unique()),
        "read_type": ["Flashed", "PE"],
    },
    template="simple_white",
    height=500 * N_SAMPLES,
    width=1000,
)

# Facet labels arrive as "sample=<name>"; show only the name.
fig.for_each_annotation(lambda note: note.update(text=note.text.split("=")[1]))
fig.update_layout(legend_title_text="")
# Independent x range and visible tick labels for every facet.
fig.update_xaxes(matches=None, showticklabels=True)
fig.update_yaxes(title_text="")
fig.layout.xaxis.title.text = "Number of Read Pairs"
fig.update_traces(marker_line_width=0)
fig