In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

def natural_sort(l):
    """Sort strings the way humans expect, comparing digit runs as numbers.

    Each item is split into alternating text and ASCII-digit chunks; the digit
    chunks are compared as integers, so ``"amp2"`` sorts before ``"amp10"``.

    Parameters
    ----------
    l : iterable of str
        Items to sort. Every item must be a string, because ``re.split`` is
        applied to it.

    Returns
    -------
    list
        A new list holding the items of ``l`` in natural order.
    """
    import re

    def alphanum_key(key):
        # Splitting on a capturing group returns text chunks at even positions
        # and the captured [0-9]+ runs at odd positions. Converting by position
        # (instead of str.isdigit) keeps non-ASCII "digits" such as '²' as
        # text, which int() cannot parse.
        chunks = re.split('([0-9]+)', key)
        return [int(chunk) if pos % 2 else chunk for pos, chunk in enumerate(chunks)]

    return sorted(l, key=alphanum_key)

In [None]:
# Notebook parameters.
# wkdir            : prefix under which the results/coverage/ files are read and written
# metadata_path    : sample metadata table (.xlsx, .tsv or .csv) with a sampleID column
# bed_targets_path : tab-separated amplicon panel targets (five columns)
wkdir = "../.."
metadata_path = "../../config/metadata.tsv"
bed_targets_path = "../../config/AgamDao.bed"

### Coverage

In this notebook, we calculate coverage across the whole genome and, more specifically, at the SNP targets of the amplicon panel.

In [None]:
# Load the sample metadata, choosing the reader from the file extension.
if metadata_path.endswith('.xlsx'):
    metadata = pd.read_excel(metadata_path, engine='openpyxl')
elif metadata_path.endswith('.tsv'):
    metadata = pd.read_csv(metadata_path, sep="\t")
elif metadata_path.endswith('.csv'):
    metadata = pd.read_csv(metadata_path, sep=",")
else:
    # Name every accepted extension so the message matches the branches above.
    raise ValueError(f"Metadata file must be .xlsx, .tsv or .csv, got: {metadata_path}")

# Load the amplicon panel targets (tab-separated) and give the columns fixed names.
# NOTE(review): read_csv consumes the first line of the file as a header row, and
# the names assigned below then replace it. If the BED file has no header line,
# its first target is silently dropped -- confirm against the file and pass
# header=None if that is the case.
panel_metadata = pd.read_csv(bed_targets_path, sep="\t")
panel_metadata.columns = ['chrom', 'start', 'end', 'amplicon', 'type']

# Contigs that carry at least one panel target; used to filter genome-wide coverage.
contigs = panel_metadata.chrom.unique()

#### Whole-genome

Let's plot coverage across the whole genome to see if we have off-target effects!

In [None]:
# Read every sample's per-base depth table, name its four columns, tag the rows
# with the sample ID and keep only the contigs that carry panel targets.
cov_list = [
    pd.read_csv(
        f"{wkdir}/results/coverage/{sampleID}.per-base.bed.gz",
        sep="\t",
        header=None,
    )
    .set_axis(["chrom", "start", "end", "depth"], axis=1)
    .assign(sampleID=sampleID)
    .query("chrom in @contigs")
    for sampleID in metadata['sampleID']
]

# Long-format table: one row per (sample, interval).
cov_df = pd.concat(cov_list, axis=0)

# Sum depth over samples for each interval.
# NOTE(review): rows are grouped on the exact (chrom, start, end) triple, so
# intervals whose boundaries differ between samples are not combined -- confirm
# this is the intended aggregation.
total_cov_df = (
    cov_df
    .groupby(['chrom', 'start', 'end'])['depth']
    .sum()
    .reset_index()
)
# Integer midpoint of each interval, used as the x coordinate when plotting.
total_cov_df = total_cov_df.assign(
    midpoint=((total_cov_df['start'] + total_cov_df['end']) / 2).astype(int)
)

In [None]:
# One scatter figure per panel contig: summed depth against interval midpoint,
# with a faint dashed vertical line at the start of every amplicon on that contig.
for contig in contigs:
    contig_cov = total_cov_df.query("chrom == @contig")
    fig = px.scatter(
        contig_cov,
        x='midpoint',
        y="depth",
        title=contig,
        height=300,
        template='simple_white',
        color_discrete_sequence=['lightsalmon'],
    )

    # De-duplicate on (start, amplicon) so each amplicon contributes one line.
    amp_lines = (
        panel_metadata
        .query("chrom == @contig")
        [['start', 'amplicon']]
        .drop_duplicates()
    )
    for amplicon_start in amp_lines['start']:
        fig.add_vline(x=amplicon_start, line_width=1, line_dash="dash", opacity=0.2)

    fig.show()

### Coverage at each target SNP

In [None]:
# Read each sample's per-region depth table and tag its rows with the sample ID.
target_covs = [
    pd.read_csv(
        f"{wkdir}/results/coverage/{sample}.regions.bed.gz",
        sep="\t",
        header=None,
    ).assign(sampleID=sample)
    for sample in metadata.sampleID
]

# Stack all samples, name the columns, then attach the panel annotation.
# The left join keeps every coverage row even if it has no match in the panel.
target_cov_df = (
    pd.concat(target_covs, axis=0)
    .set_axis(['chrom', 'start', 'end', 'amplicon', 'depth', 'sampleID'], axis=1)
    .merge(panel_metadata, how='left', on=['chrom', 'start', 'end', 'amplicon'])
)

# Amplicon names in natural order (digit runs compared numerically) for axis ordering.
snp_targets_sorted = natural_sort(target_cov_df['amplicon'].unique().astype(str))

In [None]:
# Depth distribution across samples for each amplicon, with the x axis in natural order.
fig = px.box(
    target_cov_df,
    x='amplicon',
    y='depth',
    hover_data=['chrom', 'type'],
    template='simple_white',
    width=1000,
).update_xaxes(categoryorder='array', categoryarray=snp_targets_sorted)
fig.show()

### Coverage by sample

In [None]:
# Sum the depth column per sample and contig, then draw one bar per sample,
# coloured by contig.
samples_cov = cov_df.groupby(['sampleID', 'chrom']).agg({'depth':'sum'})
fig = px.bar(
    samples_cov.reset_index(),
    x='sampleID',
    y='depth',
    # hover_data is documented as a list (or dict) of column names; a bare string
    # may be iterated character by character, so pass a one-element list as the
    # box plot cell does.
    hover_data=['chrom'],
    color='chrom',
    width=1000,
    template='simple_white',
)
fig.show()

### Coverage by amplicon and sample

In [None]:
from IPython.display import display, Markdown

# Pivot to heatmap shape (amplicon rows x sample columns), taking the mean where
# one amplicon has several rows for a sample. Missing combinations become 0 and
# the means are truncated to whole numbers by the integer cast.
df = (
    target_cov_df
    .pivot_table(index='amplicon', columns='sampleID', values='depth', aggfunc='mean')
    .fillna(0)
    .astype(int)
)

# Put the amplicons in natural order (digit runs compared numerically).
# natural_sort applies a regex to each label, so cast the labels to str first;
# otherwise non-string amplicon names raise a TypeError.
df.index = df.index.astype(str)
order = natural_sort(df.index.to_list())
df = df.loc[order]

# Save the table and show a download link next to the figure.
df.to_excel(f"{wkdir}/results/coverage/amplicon_by_sample_depth.xlsx")
display(Markdown(f'<a href="{wkdir}/results/coverage/amplicon_by_sample_depth.xlsx">Amplicon by sample read depth (.xlsx)</a>'))

px.imshow(df, width=1000, height=1000, color_continuous_scale='blues')