In [19]:
import pandas as pd
import altair as alt
import numpy as np

In [22]:
# Add headers
num_samples = 16
sample_columns = [f"Sample{i+1}_DP" for i in range(num_samples)]

# Read data without a header and assign column names
column_names = ['CHROM', 'POS'] + sample_columns
depth_df = pd.read_csv('/Users/genetics/Documents/code/altair/colo829/long_reads/5way_5perc_sample_depth.tsv', sep='\t', header=None, names=column_names)

# Create a unique variant ID (could also combine CHROM_POS)
depth_df['variant_id'] = depth_df['CHROM'].astype(str) + ':' + depth_df['POS'].astype(str)

# Extract sample DP columns (assuming they start at 3rd column)
dp_columns = depth_df.columns[2:-1]  # or simply 2: if variant_id is not yet present
if 'variant_id' in depth_df.columns:
    dp_columns = depth_df.columns[2:-1]  # CHROM, POS, ..., variant_id
else:
    dp_columns = depth_df.columns[2:]

# Compute summary metrics
depth_df['avg_coverage'] = depth_df[dp_columns].mean(axis=1)
depth_df['stddev_coverage'] = depth_df[dp_columns].std(axis=1)
depth_df['num_samples_found'] = (depth_df[dp_columns] > 0).sum(axis=1)

# Select output columns
summary_df = depth_df[['variant_id', 'avg_coverage', 'stddev_coverage', 'num_samples_found']]

# Save to TSV
summary_df.to_csv('variants_summary_5way.tsv', sep='\t', index=False)

In [29]:

# Load data from TSV file
data = pd.read_csv('/Users/genetics/Documents/code/altair/colo829/long_reads/variants_summary_5way.tsv', sep='\t')

# Filter data
filt = data[data['num_samples_found'] >= 8]  # Keep only variants found in at least N samples
print(data.__len__)
print(filt.__len__)

# Define color scale (adjust to your needs) 
color_scale = alt.Scale(
    domain=[0, 16],  # Adjust this range based on your data
    range=['green', 'steelblue']
)

# Create points with error bars
points = alt.Chart(data).mark_point(filled=True, size=30).encode(
    x=alt.X('variant_id:N', title='Variant ID', axis=None),
    y=alt.Y('avg_coverage:Q', title='Average Coverage'),
    color=alt.Color('num_samples_found:Q', scale=color_scale, legend=alt.Legend(title='Number of Samples Found')),
    tooltip=['variant_id', 'avg_coverage', 'stddev_coverage', 'num_samples_found']
)

error_bars = alt.Chart(data).mark_errorbar().encode(
    x=alt.X('variant_id:N'),
    y=alt.Y('avg_coverage:Q'),
    yError=alt.YError('stddev_coverage:Q')
)

# Combine layers
chart = points

chart = chart.properties(
    width=1100,
    height=500,
    title='Long Read Average Coverage for 5%+ 5-Way RUFUS-Only Sites'
)

chart


<bound method DataFrame.__len__ of          variant_id  avg_coverage  stddev_coverage  num_samples_found
0      chr1:1052316       27.6250        21.391198                 14
1      chr1:3516896       30.6250        22.621155                 14
2      chr1:3735054       20.5625        26.964096                  9
3      chr1:4395814       24.2500        30.588669                  9
4     chr1:10523836       32.3750        34.594556                 11
...             ...           ...              ...                ...
3568  chrY:19514040       18.6875        13.108362                 15
3569  chrY:20442884       16.4375        19.479797                 10
3570  chrY:20694979       17.6250        14.573378                 12
3571  chrY:20845286       20.8750        12.774323                 15
3572  chrY:21604793       17.2500        16.250128                 12

[3573 rows x 4 columns]>
<bound method DataFrame.__len__ of          variant_id  avg_coverage  stddev_coverage  num_samples_