In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

def natural_sort(l):
    """Sort the given iterable of strings in the way that humans expect.

    Runs of ASCII digits are compared numerically, so ``'a2'`` sorts before
    ``'a10'``. Everything else is compared as plain (case-sensitive) text.

    Parameters
    ----------
    l : iterable of str
        Strings to sort.

    Returns
    -------
    list of str
        A new list with the items in natural order.
    """
    import re  # kept local so the helper is self-contained

    def alphanum_key(key):
        # re.split with a capturing group always alternates
        # text, digit-run, text, ... (starting with a possibly empty text
        # chunk), so odd positions are exactly the [0-9]+ runs. Converting by
        # position rather than str.isdigit() avoids int() failing on characters
        # such as '²' that isdigit() accepts but the regex does not split on.
        return [
            int(part) if pos % 2 else part
            for pos, part in enumerate(re.split('([0-9]+)', key))
        ]

    return sorted(l, key=alphanum_key)

In [None]:
# Path to the sample metadata table; the loading cell picks a reader from the
# file extension (.xlsx, .tsv or .csv).
metadata_path = '../../config/metadata.tsv'
# Path to the tab-separated file describing the panel's target regions.
bed_targets_path = "../../config/AgamDao.bed"
# Prefix placed in front of "/results/..." when the link to the exported
# spreadsheet is rendered.
wkdir = ""

### Read depth by sample and amplicon

In this notebook, we calculate the mean read depth (coverage) at each amplicon for each sample and export the result as an amplicon-by-sample table.

In [None]:
# Load the sample metadata, choosing the reader from the file extension.
if metadata_path.endswith('.xlsx'):
    metadata = pd.read_excel(metadata_path, engine='openpyxl')
elif metadata_path.endswith('.tsv'):
    metadata = pd.read_csv(metadata_path, sep="\t")
elif metadata_path.endswith('.csv'):
    metadata = pd.read_csv(metadata_path, sep=",")
else:
    # The message lists every extension accepted above (it previously omitted .tsv).
    raise ValueError("Metadata file must be .xlsx, .tsv or .csv")

# Load the panel targets and give the five columns fixed names.
# NOTE(review): read_csv uses the first line of the file as a header here and
# the names below then replace it. If the BED file has no header row, its
# first target is silently dropped -- confirm against the actual file.
panel_metadata = pd.read_csv(bed_targets_path, sep="\t")
panel_metadata.columns = ['chrom', 'start', 'end', 'snp_target', 'type']

# Contigs that carry at least one panel target; used to subset coverage rows.
contigs = panel_metadata.chrom.unique()

In [None]:
# Read every sample's per-base coverage table, tag the rows with the sample
# they came from, keep only rows on panel contigs, and stack the results.
cov_list = []
for sampleID in metadata['sampleID']:
    sample_cov = (
        pd.read_csv(
            f"../../results/coverage/{sampleID}.per-base.bed.gz",
            sep="\t",
            header=None,
        )
        .set_axis(["chrom", "start", "end", "depth"], axis=1)
        .assign(sampleID=sampleID)
        .loc[lambda frame: frame["chrom"].isin(contigs)]
    )
    cov_list.append(sample_cov)

cov_df = pd.concat(cov_list, axis=0)

def filter_and_add_snp_target_loc(dataframe, panel_metadata):
    """Keep the coverage intervals that contain a panel target start.

    For every contig in ``panel_metadata``, each row of ``dataframe`` on that
    contig is kept if at least one target ``start`` satisfies
    ``row.start <= target < row.end``. The matching target position is stored
    in a new ``snp_target_loc`` column. A row whose interval contains several
    distinct targets is emitted once per target, so no target is lost when
    neighbouring targets share one interval.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Coverage intervals with at least ``chrom``, ``start`` and ``end``
        columns. The index is ignored (it may contain duplicates).
    panel_metadata : pandas.DataFrame
        Panel targets with at least ``chrom`` and ``start`` columns.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``dataframe`` (all original columns) plus
        ``snp_target_loc``, with a fresh 0..n-1 index. If nothing matches, an
        empty frame with those same columns is returned.
    """
    matched_frames = []

    for contig, target_starts in panel_metadata.groupby('chrom', sort=False)['start']:
        intervals = dataframe[dataframe['chrom'] == contig]
        if intervals.empty:
            continue

        # Sorted, de-duplicated target positions let us locate, per interval,
        # the slice of targets with start <= target < end by binary search.
        target_locs = np.sort(target_starts.unique())
        first_hit = np.searchsorted(target_locs, intervals['start'].to_numpy(), side='left')
        past_last_hit = np.searchsorted(target_locs, intervals['end'].to_numpy(), side='left')
        # Guard against malformed rows where end < start.
        n_hits = np.maximum(past_last_hit - first_hit, 0)

        has_hit = n_hits > 0
        if not has_hit.any():
            continue

        # Positional (not label-based) row selection, because the input index
        # is not guaranteed to be unique.
        row_positions = np.repeat(np.flatnonzero(has_hit), n_hits[has_hit])
        loc_positions = np.concatenate(
            [np.arange(lo, hi) for lo, hi in zip(first_hit[has_hit], past_last_hit[has_hit])]
        )

        matched_frames.append(
            intervals.iloc[row_positions].assign(snp_target_loc=target_locs[loc_positions])
        )

    if not matched_frames:
        # Preserve the column layout so downstream column operations still work.
        return dataframe.iloc[0:0].assign(snp_target_loc=np.array([], dtype='int64'))

    return pd.concat(matched_frames, ignore_index=True)

In [None]:
# Keep only the coverage rows that overlap a panel target, then attach the
# panel annotation by joining on the target's (chrom, start). The join keys
# are spelled out so an unrelated shared column name can never change them.
cov_df = filter_and_add_snp_target_loc(cov_df, panel_metadata)
cov_df = (
    cov_df
    .drop(columns=['start', 'end'])
    .rename(columns={'snp_target_loc': 'start'})
    .merge(panel_metadata, on=['chrom', 'start'])
    .rename(columns={'snp_target': 'amplicon'})
)

# Take the mean depth where a sample has several depth values for one amplicon.
cov_df = cov_df.groupby(['sampleID', 'amplicon']).agg({'depth': 'mean'}).reset_index()

# pivot to amplicon x sample depth table
cov_sample_by_amplicon = cov_df.pivot(values='depth', index='amplicon', columns='sampleID').fillna(0)

# Add amplicons that are absent from the data entirely (zero coverage).
# dict.fromkeys de-duplicates panel names while keeping their order, so the
# added rows never introduce duplicate index labels.
present = cov_sample_by_amplicon.index.to_list()
present_lookup = set(present)
not_present = [name for name in dict.fromkeys(panel_metadata.snp_target) if name not in present_lookup]
not_present_df = pd.DataFrame(
    np.zeros((len(not_present), len(metadata.sampleID))),
    index=not_present,
    columns=metadata.sampleID,
)
# fillna(0): a sample listed in the metadata but missing from the pivot would
# otherwise be NaN in the observed rows while being 0 in the added rows.
df = pd.concat([cov_sample_by_amplicon, not_present_df]).fillna(0)

# Reorder amplicons in natural (human) order, e.g. amp2 before amp10.
order = natural_sort(df.index.to_list())
df = df.loc[order]

# NOTE(review): the inputs above are read from "../../results/coverage/" but
# this writes to "results/coverage/" relative to the working directory, and
# the link cell uses f"{wkdir}/results/coverage/..." -- confirm which location
# is intended and that the directory exists.
df.to_excel("results/coverage/amplicon_by_sample_depth.xlsx")

In [None]:
from IPython.display import display, Markdown

# Render a clickable link to the exported amplicon-by-sample spreadsheet.
xlsx_link = f'<a href="{wkdir}/results/coverage/amplicon_by_sample_depth.xlsx">Amplicon by sample read depth (.xlsx)</a>'
display(Markdown(xlsx_link))

In [None]:
# Heatmap of the amplicon x sample depth table; the figure is the cell's
# last expression so the notebook renders it.
depth_heatmap = px.imshow(df, width=1000, height=1000, color_continuous_scale='blues')
depth_heatmap