In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import allel

In [None]:
# --- Notebook parameters ---------------------------------------------------
# Sample metadata table, read as tab-separated text.
metadata_path = '../../config/metadata.tsv'
# Headerless, tab-separated BED of panel targets
# (contig, start, end, amplicon, mutation, ref, alt).
bed_targets_path = "../../config/ag-vampir.bed"
# Dataset name; used to build the VCF file names.
dataset = 'ag-vampir-002'
# Annotated VCF restricted to the panel targets.
vcf_path = f"../../results/vcfs/targets/{dataset}.annot.vcf"
# Prefix for the workflow/ and results/ directories used throughout the notebook.
wkdir = "../.."
# Comma-separated metadata column names; only the first is used as the cohort column.
cohort_cols = 'location'
# Panel name; the autosome/X depth ratio is only computed when this is 'ag-vampir'.
panel = 'ag-vampir'
# Passed through to amp.load_vcf as `platform`.
platform = 'illumina'

# Samples whose depth summed over all targets is below this are flagged for exclusion.
sample_total_read_threshold = 250
# Intended cut-off for depth summed per target SNP across samples.
# NOTE(review): not referenced by name anywhere in the visible code - confirm usage.
amplicon_total_read_threshold = 1000

In [None]:
import sys
import os
# Add <wkdir>/workflow to the import path before importing ampseekertools
# (presumably where that module lives - confirm against the repository layout).
sys.path.append(os.path.join(wkdir, 'workflow'))
import ampseekertools as amp

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including numpy/pandas runtime warnings raised by the QC calculations below.
warnings.filterwarnings('ignore')

# Sample quality control 

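In this notebook, we perform quality control on samples. Samples are excluded if they have very low read depth, too many missing calls, elevated heterozygosity for their cohort, or are PCA outliers; negative controls and diluted samples are excluded as well.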

In [None]:
import json

# Only the first entry of the comma-separated cohort columns is used downstream
cohort_col = cohort_cols.split(',')[0]

metadata = pd.read_csv(metadata_path, sep="\t")

# Colour lookup, later indexed by metadata column name when plotting
with open(f"{wkdir}/results/config/metadata_colours.json", 'r') as colours_file:
    color_mapping = json.load(colours_file)

# The targets BED has no header row, so column names are supplied here
bed_columns = ['contig', 'start', 'end', 'amplicon', 'mutation', 'ref', 'alt']
panel_metadata = pd.read_csv(bed_targets_path, sep="\t", header=None, names=bed_columns)

# load_vcf also returns a metadata frame, which replaces the one read above
vcf_contents = amp.load_vcf(vcf_path, metadata, platform=platform)
geno, pos, contigs, metadata, ref, alts, ann = vcf_contents
samples = metadata['sample_id']

## Coverage data

In [None]:
# Read the per-target depth table of every sample; for the ag-vampir panel also
# record the ratio of summed autosomal depth to summed X depth.
coverage_columns = ['contig', 'start', 'end', 'amplicon', 'depth', 'sample_id']
autosome_contigs = ['2L', '2R', '3L', '3R']

target_covs = []
x_ratios = []
for sample in metadata.sample_id:
    coverage_file = f"{wkdir}/results/coverage/{sample}.regions.bed.gz"
    target_cov = pd.read_csv(coverage_file, sep="\t", header=None, names=coverage_columns)
    # Label every row with the sample it came from
    target_cov = target_cov.assign(sample_id=sample)
    target_covs.append(target_cov)

    if panel != 'ag-vampir':
        continue

    # x-autosome ratio
    contig_depth = target_cov.groupby('contig').agg({'depth': 'sum'})
    autosome_depth = contig_depth.loc[autosome_contigs].sum()
    x_ratios.append((autosome_depth / contig_depth.loc['X']).iloc[0])

# Stack all samples and attach the panel annotation for each target
target_cov_df = pd.concat(target_covs, axis=0).merge(
    panel_metadata,
    how='left',
    on=['contig', 'start', 'end', 'amplicon'],
)

# Depth summed over all targets, one row per sample
sample_cov_df = target_cov_df.groupby('sample_id', as_index=False)['depth'].sum()

fig = px.histogram(
    sample_cov_df,
    x='depth',
    nbins=500,
    template='simple_white',
    width=800,
    height=300,
    title='Histogram of total read counts per sample',
)
fig.show()

How many samples fall below the threshold for total reads?

In [None]:
# Samples whose summed depth is under the configured per-sample threshold
low_depth_mask = sample_cov_df['depth'] < sample_total_read_threshold
exclude_samples_depth = sample_cov_df.loc[low_depth_mask, 'sample_id']
print(f"Removing {len(exclude_samples_depth)} samples due to low total depth")

#### Total reads per target SNP

In [None]:
# Depth summed across all samples, one row per target mutation
amplicon_cov_df = target_cov_df.groupby('mutation', as_index=False)['depth'].sum()

fig = px.histogram(
    amplicon_cov_df,
    x='depth',
    nbins=200,
    color='mutation',
    template='simple_white',
    width=800,
    height=350,
    title='Histogram of total read counts per SNP target',
)
fig.show()

Which target SNPs have lower total depth than the amplicon threshold?

In [None]:
# Target SNPs whose depth, summed across all samples, is below the configured
# amplicon threshold. The cut-off comes from `amplicon_total_read_threshold`
# in the parameters cell rather than a hard-coded number, so changing the
# parameter actually changes this filter.
exclude_targets_depth = amplicon_cov_df.query("depth < @amplicon_total_read_threshold")['mutation']
print(f"Removing {len(exclude_targets_depth)} target SNPs due to low total depth")

pd.DataFrame(exclude_targets_depth)

### Number of missing calls

In [None]:
# Samples with more missing calls than this are flagged
# (an alternative considered was half the panel size: int(panel_metadata.shape[0] / 2))
min_missing_calls = 60

# Count missing genotype calls per sample (sum over the variant axis)
missing_calls_per_sample = geno.is_missing().sum(axis=0)
exclude_samples_missing_calls = samples[missing_calls_per_sample > min_missing_calls]
n_missing_call_samples = len(exclude_samples_missing_calls)
print(f"{n_missing_call_samples} samples have more than {min_missing_calls} missing calls overall out of all possible target SNPs")

# how many samples are shared between the exclude missing calls and depth lists
overlap = len(set(exclude_samples_missing_calls).intersection(exclude_samples_depth))

print(f"{overlap}/{n_missing_call_samples} of these are also present in the low depth samples to be excluded")

### Autosome / Sex chromosome coverage ratios (ag-vampir only)

Females will have a lower autosome:X depth ratio, and males a higher one. It's not yet clear whether we can use this to sex samples.

In [None]:
if panel == 'ag-vampir':
    # Pair each sample with its autosome/X ratio, then drop the low-depth samples
    x_ratio_df = pd.DataFrame({'sample_id': metadata.sample_id, 'x_ratio': x_ratios})
    passes_depth = ~x_ratio_df['sample_id'].isin(exclude_samples_depth)
    x_ratio_df = x_ratio_df[passes_depth]

    fig = px.histogram(
        x_ratio_df,
        x='x_ratio',
        color='sample_id',
        template='simple_white',
        nbins=1000,
        width=800,
        height=300,
    )
    fig.update_xaxes(range=(0, 20), title=dict(text='Autosome / X depth ratio'))
    fig.show()

### Sample heterozygosity

In [None]:
def calc_heterozygosity(gt, gt_samples):
    """
    Mean observed heterozygosity for each sample.

    Parameters:
    -----------
    gt : 3-dimensional genotype array accepted by allel.heterozygosity_observed
        Indexed as gt[:, [i], :], one slice per entry along the second axis
        (presumably variants x samples x ploidy - confirm against the loader).
    gt_samples : list-like
        Sample identifiers, one per entry along the second axis of `gt`.

    Returns:
    --------
    DataFrame indexed by sample_id with a single 'heterozygosity' column.
    """
    mean_het = []
    for sample_idx in range(gt.shape[1]):
        # Keep the sample axis (length 1) so the slice stays 3-dimensional
        single_sample_gt = gt[:, [sample_idx], :]
        # NaN entries are skipped when averaging
        mean_het.append(np.nanmean(allel.heterozygosity_observed(single_sample_gt)))

    per_sample = pd.DataFrame({'sample_id': gt_samples, 'heterozygosity': mean_het})
    return per_sample.set_index("sample_id")

# Per-sample heterozygosity joined to the sample metadata
het_df = calc_heterozygosity(gt=geno, gt_samples=samples).reset_index().merge(metadata)

# Both figures share colouring, template and size
het_plot_style = dict(
    color=cohort_col,
    color_discrete_map=color_mapping[cohort_col],
    template='simple_white',
    height=400,
    width=900,
)

fig = px.bar(
    het_df,
    x='sample_id',
    y='heterozygosity',
    title="Individual sample heterozygosity",
    **het_plot_style,
)

fig2 = px.histogram(
    het_df,
    x='heterozygosity',
    title="Histogram of sample heterozygosity",
    **het_plot_style,
)

fig.show()
fig2.show()

#### Locate heterozygosity outliers

Within each cohort, we then flag samples whose heterozygosity exceeds the 75% quantile by more than 2.5 × IQR, so that samples with very high heterozygosity for their cohort are excluded.

In [None]:
from scipy.stats import iqr

iqr_multiplier = 2.5 # determines how strict we are in throwing out outliers 

# Within each cohort, flag samples whose heterozygosity exceeds
# Q3 + iqr_multiplier * IQR of that cohort.
exclude_samples_heterozygosity = []
for coh in het_df[cohort_col].unique():
    # Select the cohort with a boolean mask rather than an interpolated query
    # string, so cohort column names that are not plain identifiers still work.
    cohort_het = het_df[het_df[cohort_col] == coh]
    hets = cohort_het['heterozygosity']

    threshold = np.nanquantile(hets, 0.75) + (iqr_multiplier * iqr(hets, nan_policy='omit'))

    # Compute the outlier mask once; extending with an empty list is a no-op,
    # so no separate "any outliers?" guard is needed.
    is_outlier = hets > threshold
    exclude_samples_heterozygosity.extend(cohort_het.loc[is_outlier, 'sample_id'].to_list())

    print(f"For {coh} the heterozygosity threshold is {np.round(threshold, 3)}, out of {len(hets)} samples, {is_outlier.sum()} are outliers")

print(f"\nRemoving {len(exclude_samples_heterozygosity)} samples in total due to high heterozygosity")

### Preliminary PCA - remove outliers 

In [None]:
import allel
import numpy as np
import pandas as pd 
from scipy import stats
import allel
    
def find_pca_outliers(pca_df, zscore_threshold=3):
    """
    Flag rows of a PCA table whose absolute Z-score is extreme on any component.

    Parameters:
    -----------
    pca_df : pandas DataFrame
        Table with one row per point; only columns whose name contains 'PC'
        are used, all other columns are ignored.
    zscore_threshold : float
        A point is an outlier when its absolute Z-score on at least one
        component is strictly greater than this value.

    Returns:
    --------
    DataFrame indexed like `pca_df`, sorted by descending max_zscore:
        - max_zscore: largest absolute Z-score across the components
        - is_outlier: True when max_zscore exceeds the threshold
        - outlier_components: names of the components exceeding the threshold
    """
    components = pca_df.filter(like='PC')

    # Absolute Z-score of every point, computed separately for each component
    abs_z = pd.DataFrame(
        np.abs(stats.zscore(components)),
        columns=components.columns,
        index=components.index,
    )

    # Largest deviation per point, and a per-component mask of threshold breaches
    peak_z = abs_z.max(axis=1)
    exceeds = abs_z > zscore_threshold

    # For each point, the component names on which it breaches the threshold
    breached_components = exceeds.apply(lambda row: list(row.index[row]), axis=1)

    summary = pd.DataFrame({
        'max_zscore': peak_z,
        'is_outlier': peak_z > zscore_threshold,
        'outlier_components': breached_components,
    })
    return summary.sort_values('max_zscore', ascending=False)

# Reload genotypes from the amplicon VCF. Note this rebinds geno, pos, contigs,
# metadata, ref and ann, so later cells see the amplicon-level versions.
vcf_amplicon_path = f"{wkdir}/results/vcfs/amplicons/{dataset}.annot.vcf"
amplicon_vcf = amp.load_vcf(vcf_amplicon_path, metadata, platform=platform)
geno, pos, contigs, metadata, ref, alt, ann = amplicon_vcf

# Run a PCA per cohort and collect samples that are extreme (|Z| > 4) on any component
pca_exclude_samples = []
for coh in metadata[cohort_col].unique():
    cohort_query = f"{cohort_col} == '{coh}'"
    pca_df, model = amp.pca(geno, metadata, query=cohort_query, n_components=3, missing_threshold=0.2)
    df_outliers = find_pca_outliers(pca_df.set_index('sample_id'), zscore_threshold=4)

    outlier_mask = df_outliers['is_outlier']
    n_samples = len(df_outliers)
    n_outliers = outlier_mask.sum()
    print(f"{coh} - Found {n_outliers} PCA outliers in {n_samples} samples using Z-scores")

    pca_exclude_samples.extend(df_outliers.index[outlier_mask].tolist())

### Summary of samples to exclude

In [None]:
# Control samples are picked out by substring match on the sample id
negative_samples = metadata.query("sample_id.str.contains('Negative|negative')", engine='python').sample_id.to_list()
diluted_samples = metadata.query("sample_id.str.contains('dil')", engine='python').sample_id.to_list()

# Union of every exclusion list built above (np.unique also de-duplicates)
exclude_samples = np.unique(
    exclude_samples_depth.to_list()
    + exclude_samples_heterozygosity
    + list(exclude_samples_missing_calls)
    + pca_exclude_samples
    + negative_samples
    + diluted_samples
)

# Number of excluded samples per cohort, plus a final 'total' row.
# The table is built directly from the counts Series because the column names
# produced by value_counts().to_frame().reset_index() differ between pandas
# versions; going through them could index the table by the counts instead
# of by cohort.
removed_counts = metadata.query("sample_id in @exclude_samples")[cohort_col].value_counts()
removed_metadata = (
    pd.concat([removed_counts, pd.Series({'total': removed_counts.sum()})])
    .rename_axis(cohort_col)
    .to_frame('count')
)

removed_metadata.reset_index()

In [None]:
# Keep only the samples that were not flagged by any QC step and save them
passes_qc = ~metadata['sample_id'].isin(exclude_samples)
new_metadata = metadata[passes_qc]
qcpass_metadata_path = f"{wkdir}/results/config/metadata.qcpass.tsv"
new_metadata.to_csv(qcpass_metadata_path, sep="\t", index=False)

####  Sample QC complete!
A new metadata file with low-quality samples removed has been written to results/config/ :)