In [54]:
import pandas as pd
import altair as alt
import numpy as np

In [49]:
#/scratch/ucgd/lustre-labs/marth/scratch/u0746015/COLO829/intersections/rufus_exclusive_long_read_isecs/five_way/per_sample_depth.tsv

In [66]:
# Add headers
num_samples = 16
sample_columns = [f"Sample{i+1}_DP" for i in range(num_samples)]

# Read data without a header and assign column names
column_names = ['CHROM', 'POS'] + sample_columns
depth_df = pd.read_csv('/Users/genetics/Documents/code/altair/colo829/long_reads/5way_5perc_sample_depth.tsv', sep='\t', header=None, names=column_names)

# Create a unique variant ID (could also combine CHROM_POS)
depth_df['variant_id'] = depth_df['CHROM'].astype(str) + ':' + depth_df['POS'].astype(str)

# Extract sample DP columns (assuming they start at 3rd column)
dp_columns = depth_df.columns[2:-1]  # or simply 2: if variant_id is not yet present
if 'variant_id' in depth_df.columns:
    dp_columns = depth_df.columns[2:-1]  # CHROM, POS, ..., variant_id
else:
    dp_columns = depth_df.columns[2:]


# Compute summary metrics
depth_df['avg_coverage'] = depth_df[dp_columns].mean(axis=1)
depth_df['stddev_coverage'] = depth_df[dp_columns].std(axis=1)
depth_df['num_samples_found'] = (depth_df[dp_columns] > 0).sum(axis=1)

# Select output columns
summary_df = depth_df[['variant_id', 'avg_coverage', 'stddev_coverage', 'num_samples_found']]

# Save to TSV
summary_df.to_csv('variants_summary_5way.tsv', sep='\t', index=False)

In [100]:

# Load data from TSV file
data = pd.read_csv('/Users/genetics/Documents/code/altair/colo829/long_reads/variants_summary_5way.tsv', sep='\t')
data = data.apply(pd.to_numeric, errors='ignore')  # Convert numeric columns to appropriate types

# Filter data
#filt = data[data['num_samples_found'] >= 8]  # Keep only variants found in at least N samples

# Error bounds
data['lower_bound'] = data['avg_coverage'] - data['stddev_coverage']
data['upper_bound'] = data['avg_coverage'] + data['stddev_coverage']


# Define color scale (adjust to your needs) 
color_scale = alt.Scale(
    domain=[0, 16],  # Adjust this range based on your data
    range=['yellow', 'royalblue']
)

error_bars = alt.Chart(data).mark_errorbar(clip=True, color='lightgray').encode(
    x=alt.X('variant_id:N', title='Variant ID', axis=None),
    y=alt.Y('lower_bound:Q'),
    y2=alt.Y2('upper_bound:Q')
)

# Create points with error bars
points = alt.Chart(data).mark_point(filled=True, size=30, clip=True).encode(
    x=alt.X('variant_id:N', title='Variant ID', axis=None),
    y=alt.Y('avg_coverage:Q', title='Average Coverage'),
    color=alt.Color('num_samples_found:Q', scale=color_scale, legend=alt.Legend(title='# Samples Found In')),
    tooltip=['variant_id', 'avg_coverage', 'stddev_coverage', 'num_samples_found']
)

# Combine layers
chart = error_bars + points

chart = chart.properties(
    width=1400,
    height=600,
    title='Long Read Average Coverage for 5%+ 5-Way RUFUS-Only Sites in BLT50 Admixture'
).configure_title(fontSize=24).configure_axis(labelFontSize=16, titleFontSize=16).configure_legend(labelFontSize=14, titleFontSize=14)

chart


  data = data.apply(pd.to_numeric, errors='ignore')  # Convert numeric columns to appropriate types
