In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Notebook parameters (presumably injected by the workflow runner — TODO confirm).
# Sample metadata sheet; the loader below accepts .xlsx, .tsv or .csv.
metadata_path = '../../config/metadata_ms.tsv'
# Root directory under which `results/config/...` and `resources/reads/...` are read.
wkdir = '../..'
# When True, the final cell draws the per-well "edge effects" plate plot.
plate_info = False
# Comma-separated metadata columns; only the first one is used to colour the plots.
cohort_cols = 'location,taxon'

# Show pandas frames in full: no row, column or cell-width truncation.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("max_colwidth", None)

# Run statistics

In [None]:
import json

# The first cohort column is the one used to colour every plot below.
cohort_col = cohort_cols.split(",")[0]

# Load panel metadata, choosing the reader from the file extension.
if metadata_path.endswith('.xlsx'):
    metadata = pd.read_excel(metadata_path, engine='openpyxl')
elif metadata_path.endswith('.tsv'):
    metadata = pd.read_csv(metadata_path, sep="\t")
elif metadata_path.endswith('.csv'):
    metadata = pd.read_csv(metadata_path, sep=",")
else:
    # List every extension that is actually accepted above (.tsv included).
    raise ValueError(
        f"Metadata file must be .xlsx, .tsv or .csv (got {metadata_path!r})"
    )

# Colour look-up, indexed below as color_mapping[cohort_col].
with open(f"{wkdir}/results/config/metadata_colours.json", 'r') as f:
    color_mapping = json.load(f)

# Load demultiplexing statistics.
with open(f"{wkdir}/resources/reads/Stats/Stats.json", "r") as file:
    data = json.load(file)

# Flatten the nested JSON into one record per (sample, index sequence),
# plus one "Undetermined" record per ConversionResults entry that has one.
demux_results = []
for conversion_result in data["ConversionResults"]:
    for demux_result in conversion_result["DemuxResults"]:
        sample_id = demux_result["SampleId"]
        number_reads = demux_result.get("NumberReads", None)
        for index_metric in demux_result["IndexMetrics"]:
            mismatch_counts = index_metric["MismatchCounts"]
            demux_results.append(
                {
                    "sample_id": sample_id,
                    "n_reads": number_reads,
                    "IndexSequence": index_metric["IndexSequence"],
                    # Reads whose index matched exactly / with one mismatch;
                    # a missing key is treated as zero reads.
                    "Mismatch_0": mismatch_counts.get("0", 0),
                    "Mismatch_1": mismatch_counts.get("1", 0),
                }
            )

    # Extract Undetermined data (optional; it carries no index metrics).
    if "Undetermined" in conversion_result:
        undetermined = conversion_result["Undetermined"]
        demux_results.append(
            {
                "sample_id": "Undetermined",
                "n_reads": undetermined.get("NumberReads", None),
                "IndexSequence": None,
                "Mismatch_0": None,
                "Mismatch_1": None,
            }
        )

# Convert to a DataFrame and derive the ratios. All pc_* columns are
# fractions in [0, 1], not percentages.
demultiplex_data = pd.DataFrame(demux_results).assign(
    pc_reads=lambda x: x["n_reads"] / x["n_reads"].sum(),
    pc_perfect_index=lambda x: x["Mismatch_0"] / x["n_reads"],
    pc_mismatch_index=lambda x: x["Mismatch_1"] / x["n_reads"],
)

In [None]:
# The loader adds at most one "Undetermined" row per ConversionResults entry,
# so there may be zero, one or several such rows. Summing handles all three
# cases; taking element [0] raised IndexError on zero rows and ignored every
# row after the first.
undetermined_rows = demultiplex_data.query("sample_id == 'Undetermined'")
n_undetermined = undetermined_rows['n_reads'].sum()
pc_undetermined = undetermined_rows['pc_reads'].sum()

# pc_reads is a fraction, so the % format spec scales it by 100 and rounds it.
print(f"There are {n_undetermined:,} undetermined reads, {pc_undetermined:.2%} of all reads")
print(f"There are {demultiplex_data['n_reads'].sum():,} total reads")

#### Total reads per sample

In [None]:
# Remove the Undetermined bucket, then left-join the read statistics onto the
# metadata sheet so that every metadata row is kept even without a demux
# record. No `on=` is given, so pandas joins on the column names the two
# frames have in common.
demultiplex_data = metadata.merge(
    demultiplex_data.query("sample_id != 'Undetermined'"),
    how='left',
)

# Bar chart of read counts per sample, coloured by the primary cohort column.
total_reads_bar_options = {
    'x': 'sample_id',
    'y': 'n_reads',
    'color': cohort_col,
    'color_discrete_map': color_mapping[cohort_col],
    'title': 'Total reads assigned to each sample',
    'template': 'simple_white',
}
fig = px.bar(demultiplex_data, **total_reads_bar_options)
fig.show()

#### Percentage of perfect and mismatched index reads

In [None]:
# pc_perfect_index / pc_mismatch_index are fractions (count / n_reads), while
# the titles promise a "%". Formatting the y axis as a percentage makes the
# tick labels agree with the titles.
fig = px.bar(demultiplex_data, 
       x='sample_id', 
       y='pc_perfect_index', 
       color=cohort_col, 
       color_discrete_map=color_mapping[cohort_col],
       title='The % of perfect index reads',
       template='simple_white'
      )
fig.update_yaxes(tickformat='.1%')
fig.show()

# Only samples with more than 100 reads are shown here — presumably to keep
# near-empty samples from dominating the ratio; TODO confirm the threshold.
fig = px.bar(demultiplex_data.query("n_reads > 100"), 
       x='sample_id', 
       y='pc_mismatch_index', 
       color=cohort_col, 
       color_discrete_map=color_mapping[cohort_col],
       title='The % of Index reads with one mismatch',
       template='simple_white'
      )
fig.update_yaxes(tickformat='.1%')
fig.show()

#### The number of reads for each i7 index

In [None]:
# The i7 index is taken as the first 8 characters of the index sequence
# (assumes the i7 comes first in IndexSequence — TODO confirm for this kit).
demultiplex_data['i7'] = demultiplex_data['IndexSequence'].str.slice(0, 8)

fig = px.box(demultiplex_data, 
             x='i7', 
             y='n_reads', 
             template='simple_white',
             color=cohort_col,
             color_discrete_map=color_mapping[cohort_col],
             title='Boxplot of total reads for each I7 index'
            )
fig.show()

# Plate coordinates are optional metadata. Only request the hover columns that
# exist, otherwise plotly raises when the metadata sheet has no well columns.
well_hover_cols = [
    col for col in ('well_letter', 'well_number') if col in demultiplex_data.columns
]

fig2 = px.scatter(demultiplex_data, 
           x='n_reads', 
           y='pc_perfect_index', 
           color='i7', 
           hover_data=well_hover_cols or None,
           title='The % of perfect index reads against total reads for each i7 index',
           template='simple_white')
# pc_perfect_index is a fraction; show it as a percentage to match the title.
fig2.update_yaxes(tickformat='.1%')

fig2.show()

In [None]:
def plot_96well_plate(metadata, color_var='mapped_reads', title='Plate A - Number of mapped reads'):
    """Draw a plate-layout scatter plot with one large marker per row.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Must contain `well_letter`, `well_number` and the `color_var` column.
        Every column of the frame is added to the hover tooltip.
    color_var : str
        Column used to colour the markers.
    title : str
        Figure title.

    Returns
    -------
    The plotly figure. It is not shown here; the caller must display it.
    """
    # Rows are plotted in reverse order, which affects the order in which
    # well letters appear on the y axis.
    reversed_rows = metadata.iloc[::-1]
    plate_fig = px.scatter(
        reversed_rows,
        x='well_number',
        y='well_letter',
        color=color_var,
        hover_data=metadata.columns,
        template='plotly_white',
    )
    plate_fig.update_traces(marker_size=40)

    # Column numbers go along the top edge, with a tick at every integer.
    top_axis = {'side': 'top', 'tickmode': 'linear', 'tick0': 0, 'dtick': 1}
    plate_fig.update_layout(xaxis=top_axis, title=title)
    return plate_fig


# Optional section: only drawn when the run was configured with plate layout
# information (well_letter / well_number columns in the metadata).
if plate_info:
    from IPython.display import display, Markdown
    display(Markdown('#### Visualising edge effects'))
    # Mean read count per well position, averaged over all rows sharing that well.
    tot_per_well = demultiplex_data.groupby(['well_letter', 'well_number']).agg({'n_reads':'mean'}).reset_index()

    # Jupyter only auto-renders a bare expression that is the last top-level
    # statement of a cell. Inside this `if` the returned figure was silently
    # discarded, so show it explicitly.
    edge_fig = plot_96well_plate(tot_per_well,
                    color_var='n_reads',
                    title="Visualising edge effects - total reads across i7s")
    edge_fig.show()