In [142]:
import altair as alt
import pandas as pd

# Validated-variant counts per category (same order as data['category']).
# The suffix names the read threshold the list was collected at; only the
# 1+ pair is wired into `data` below, the others are kept so a different
# threshold can be swapped in.
pos_1plus = [3573, 478, 116, 98, 611]
neg_1plus = [296, 32, 9, 8, 90]

pos_2plus = [3573, 478, 116, 80, 324]
neg_2plus = [75, 12, 1, 4, 29]

pos_5plus = [3573, 477, 114, 73, 156]
neg_5plus = [4, 2, 0, 0, 4]

pos_8plus = [3573, 476, 113, 68, 132]
neg_8plus = [1, 1, 0, 0, 3]

pos_10plus = [3573, 473, 110, 55, 124]
neg_10plus = [0, 0, 0, 0, 3]

# One row per category. "positive" columns describe the actual sites,
# "negative" columns the +15bp sites (author's note: the two total columns
# are hoped to be identical).
site_counts = {
    'category': ['Σ C(5,5)', 'Σ C(5,4)', 'Σ C(5,3)', 'Σ C(5,2)', 'Σ C(5,1)'],
    # BLT50 total variants for actual sites with 1+ read
    'positive_total': [3578, 480, 118, 124, 1310],
    # BLT50 validated variants for actual sites
    'positive_validated': pos_1plus,
    # BLT50 total variants for +15bp sites with 1+ read
    'negative_total': [3578, 480, 118, 124, 1310],
    # BLT50 validated variants for +15bp sites
    'negative_validated': neg_1plus,
}
data = pd.DataFrame(site_counts)

# Column names of every stacked record after 'category', in output order.
segment_fields = (
    'direction', 'variant_type', 'variant_direction', 'count', 'stack_order',
    'validation_rate', 'total_count', 'validated_count',
)

# Expand each category into four bar segments: unvalidated + validated for
# the positive side, and the same two (with negated counts, so they are
# drawn below zero) for the negative side.
data_stacked = []
for _, site in data.iterrows():
    pos_total, pos_valid = site['positive_total'], site['positive_validated']
    neg_total, neg_valid = site['negative_total'], site['negative_validated']

    # Percentage validated; 0 when there is nothing to validate against.
    pos_rate = pos_valid / pos_total * 100 if pos_total > 0 else 0
    neg_rate = neg_valid / neg_total * 100 if neg_total > 0 else 0

    # Fields repeated on both segments of a side.
    pos_shared = (pos_rate, pos_total, pos_valid)
    neg_shared = (neg_rate, neg_total, neg_valid)

    # stack_order 0 = unvalidated remainder, 1 = validated part.
    segments = (
        ('positive', 'unvalidated', 'unvalidated_positive', pos_total - pos_valid, 0) + pos_shared,
        ('positive', 'validated', 'validated_positive', pos_valid, 1) + pos_shared,
        ('negative', 'unvalidated', 'unvalidated_negative', -(neg_total - neg_valid), 0) + neg_shared,
        ('negative', 'validated', 'validated_negative', -neg_valid, 1) + neg_shared,
    )
    for segment in segments:
        record = {'category': site['category']}
        record.update(zip(segment_fields, segment))
        data_stacked.append(record)

data_long = pd.DataFrame(data_stacked)

# Bidirectional stacked bar chart: positive-direction segments rise above
# zero, negative-direction segments (stored with negated counts) hang below.

# Left-to-right order of the category axis.
category_order = ['Σ C(5,5)', 'Σ C(5,4)', 'Σ C(5,3)', 'Σ C(5,2)', 'Σ C(5,1)']

bar_x = alt.X(
    'category:O',
    sort=category_order,
    title='Categories',
    axis=alt.Axis(labelAngle=60, labelFontSize=14, titleFontSize=16),
)

bar_y = alt.Y(
    'count:Q',
    title='Count',
    scale=alt.Scale(domain=[-3600, 3600]),  # Adjust domain as needed
    stack='zero',
    # Tick labels show magnitudes, hiding the sign used for the lower half.
    axis=alt.Axis(labelFontSize=14, titleFontSize=16, labelExpr="abs(datum.value)"),
)

# One colour per (variant type, direction) combination.
# Alternative palette: ['#fd8d3c', '#756bb1', '#fdbe85', '#bcbddc']
bar_color_scale = alt.Scale(
    domain=['validated_positive', 'unvalidated_positive', 'validated_negative', 'unvalidated_negative'],
    range=['#018571', '#80cdc1', '#a6611a', '#dfc27d'],
)

# Legend with human-readable names substituted for the raw field values.
# (legendX=150 / legendY=700 were tried for manual placement and left unused.)
bar_legend = alt.Legend(
    title="Variant Type",
    orient='right',
    direction='vertical',
    symbolType='square',
    symbolSize=200,
    titleFontSize=16,
    labelFontSize=14,
    labelExpr="datum.value == 'unvalidated_positive' ? 'Non-Validated' : datum.value == 'validated_positive' ? 'Validated' : datum.value == 'unvalidated_negative' ? 'Non-Validated Control' : 'Validated Control'",
)

bar_title = alt.TitleParams(
    text="BLT50 5%+ VAF Long Read Variant Validation Counts (1+ Reads)",
    fontSize=20,
    fontWeight='bold',
    dy=-10,
    dx=100,
    anchor='start',
)

chart = (
    alt.Chart(data_long)
    .mark_bar()
    .encode(
        x=bar_x,
        y=bar_y,
        color=alt.Color('variant_direction:N', scale=bar_color_scale, legend=bar_legend),
        # Segments are stacked in stack_order within each bar.
        order=alt.Order('stack_order:O'),
        tooltip=['category:O', 'direction:N', 'variant_type:N', 'count:Q', 'validation_rate:Q'],
    )
    .properties(width=700, height=600, title=bar_title)
)

# Build one text-label record per category and direction, reading
# "rate% (validated/total)" and anchored just past the outer end of the stack.
label_data = []
for category in data['category'].unique():
    # First row carrying this category name.
    site = data[data['category'] == category].iloc[0]

    # (direction, validated count, total count, sign of the y offset)
    for direction, validated, total, sign in (
        ('positive', site['positive_validated'], site['positive_total'], 1),
        ('negative', site['negative_validated'], site['negative_total'], -1),
    ):
        rate = validated / total * 100 if total > 0 else 0
        label_data.append({
            'category': category,
            # 50 count-units beyond the full stack height: above the bar on
            # the positive side, below it on the negative side.
            'y_position': sign * (total + 50),
            'percentage': f"{rate:.1f}% ({validated}/{total})",
            'direction': direction,
        })

label_df = pd.DataFrame(label_data)

# Text layer that prints each label_df 'percentage' string at its
# pre-computed y_position, one label per category and direction.
percentage_labels = alt.Chart(label_df).mark_text(
    align='center',
    baseline='middle',
    fontSize=15,
    color='black',
    # Extra 3px nudge away from the bar: up for positive labels, down for negative
    dy=alt.expr("datum.direction == 'positive' ? -3 : 3")
).encode(
    # Repeat the bar layer's explicit category order. Layers share a single
    # x scale, and Vega-Lite falls back to its default ordering when the
    # layers declare conflicting sorts, which would reorder the bars.
    x=alt.X('category:O',
            sort=['Σ C(5,5)', 'Σ C(5,4)', 'Σ C(5,3)', 'Σ C(5,2)', 'Σ C(5,1)']),
    y=alt.Y('y_position:Q'),
    text=alt.Text('percentage:N')
)

# Reference rule drawn at y = 0, separating the upper and lower stacks
baseline_df = pd.DataFrame({'y': [0]})
zero_line = (
    alt.Chart(baseline_df)
    .mark_rule(strokeWidth=1, color='black')
    .encode(y=alt.Y('y:Q'))
)

# Captions naming the upper and lower halves of the chart. Each caption has
# its own y anchor, so the two are positioned independently.
section_label_data = pd.DataFrame({
    'label': ['RUFUS-Called Sites', 'Control Sites +15bp'],  # top, bottom
    'y_position': [3200, -1800],
    'x_pos': [0, 0],  # not referenced by the encoding below
})

# Rotated, bold text pinned to pixel x = 0
section_text = alt.Chart(section_label_data).mark_text(
    angle=270,  # Rotate text vertically like y-axis title
    dy=-50,  # Position to the left of the chart area
    align='right',
    baseline='middle',
    fontSize=12,
    fontWeight='bold',
    color='#333333',
)
section_labels = section_text.encode(
    x=alt.value(0),  # Fixed x position
    # Same y domain as the bar layer
    y=alt.Y('y_position:Q', scale=alt.Scale(domain=[-3600, 3600])),
    text=alt.Text('label:N'),
)

# Overlay the layers (bars first, then the rule and the two text layers),
# giving each layer its own colour scale.
layers = [chart, zero_line, percentage_labels, section_labels]
final_chart = alt.layer(*layers).resolve_scale(color='independent')

# Render the combined figure
final_chart.show()

# Optional export:
# final_chart.save('bidirectional_bar_chart.html')
# final_chart.save('bidirectional_bar_chart.png', scale_factor=2.0)

In [154]:
import altair as alt
import pandas as pd

# Validated counts per category at each read threshold
# (pos_* = RUFUS-Called sites, neg_* = Control sites).
pos_1plus = [3573, 478, 116, 98, 611]
neg_1plus = [296, 32, 9, 8, 90]

pos_2plus = [3573, 478, 116, 80, 324]
neg_2plus = [75, 12, 1, 4, 29]

pos_5plus = [3573, 477, 114, 73, 156]
neg_5plus = [4, 2, 0, 0, 4]

pos_8plus = [3573, 476, 113, 68, 132]
neg_8plus = [1, 1, 0, 0, 3]

pos_10plus = [3573, 473, 110, 55, 124]
neg_10plus = [0, 0, 0, 0, 3]

# Denominators, shared by every threshold
pos_totals = [3578, 480, 118, 124, 1310]
neg_totals = [3578, 480, 118, 124, 1310]

categories = ['Σ C(5,5)', 'Σ C(5,4)', 'Σ C(5,3)', 'Σ C(5,2)', 'Σ C(5,1)']

# Threshold label -> (RUFUS-Called validated counts, Control validated counts)
datasets = {
    '1+ Reads': (pos_1plus, neg_1plus),
    '2+ Reads': (pos_2plus, neg_2plus),
    '5+ Reads': (pos_5plus, neg_5plus),
    '8+ Reads': (pos_8plus, neg_8plus),
    '10+ Reads': (pos_10plus, neg_10plus)
}

# Long-format records: for every threshold and category, one row for the
# RUFUS-Called sites followed by one row for the Control sites.
trend_data = []
for threshold, (called_hits, control_hits) in datasets.items():
    per_category = zip(categories, called_hits, pos_totals, control_hits, neg_totals)
    for category, called, called_total, control, control_total in per_category:
        for site_type, validated, total in (
            ('RUFUS-Called', called, called_total),
            ('Control', control, control_total),
        ):
            trend_data.append({
                'category': category,
                'threshold': threshold,
                # Percentage validated; 0 when the denominator is empty
                'validation_rate': validated / total * 100 if total > 0 else 0,
                'site_type': site_type,
                'line_id': f'{threshold} {site_type}',
                'validated_count': validated,
                'total_count': total,
            })

trend_df = pd.DataFrame(trend_data)

# Single line chart of validation percentage per category. Colour encodes
# the read threshold and dash pattern encodes the site type, so each
# (threshold, site type) pair is drawn as its own line.
line_chart = alt.Chart(trend_df).mark_line(
    strokeWidth=3,
    point=alt.OverlayMarkDef(size=80, filled=True)
).encode(
    x=alt.X('category:O',
            title='Category',
            sort=['Σ C(5,5)', 'Σ C(5,4)', 'Σ C(5,3)', 'Σ C(5,2)', 'Σ C(5,1)'],
            axis=alt.Axis(labelAngle=45, labelFontSize=14, titleFontSize=16)),
    y=alt.Y('validation_rate:Q',
            title='Validation Percentage (%)',
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(labelFontSize=14, titleFontSize=16)),
    color=alt.Color('threshold:N',
                    scale=alt.Scale(
                        domain=['1+ Reads', '2+ Reads', '5+ Reads', '8+ Reads', '10+ Reads'],
                        range=['#e41a1c', '#ff7f00', '#4daf4a', '#377eb8', '#984ea3']  # One color per threshold
                    ),
                    legend=alt.Legend(
                        title="Read Threshold",
                        titleFontSize=14,
                        labelFontSize=12,
                        orient='top-left'
                    )),
    strokeDash=alt.StrokeDash('site_type:N',
                            scale=alt.Scale(
                                domain=['RUFUS-Called', 'Control'],
                                range=[[0], [5,5]]  # Solid for RUFUS-Called, dashed for Control
                            ),
                            legend=alt.Legend(
                                title="Site Type",
                                titleFontSize=14,
                                labelFontSize=16,
                                orient='top-right',
                                symbolType='stroke',
                                symbolStrokeWidth=3,
                                # Legend entries must be values from the scale
                                # domain. 'Neg Control' is not one, so listing it
                                # produced an entry that did not match the dashed
                                # 'Control' lines. Use the real value here and
                                # keep the intended display text via labelExpr.
                                values=['RUFUS-Called', 'Control'],
                                labelExpr="datum.value == 'Control' ? 'Neg Control' : datum.value"
                            )),
    tooltip=['threshold:O', 'category:O', 'site_type:N', 'validation_rate:Q', 'validated_count:Q', 'total_count:Q']
).properties(
    width=800,
    height=500,
    title=alt.TitleParams(
        text="Validation Rate Trends: RUFUS-Called vs Control Sites Across Read Thresholds",
        fontSize=20,
        anchor="middle",
        fontWeight='bold',
    )
)

# Display the chart
line_chart.show()

# Optional: Save the chart
# line_chart.save('validation_trends.html')
# line_chart.save('validation_trends.png', scale_factor=2.0)

In [1]:
# Long read coverage plots
# Prints the mean coverage for each platform and for all values pooled.

# Coverage values (reported below with an "x" suffix), one entry per dataset
# NOTE(review): presumably one value per sequenced sample — confirm
pacbio = [35, 43, 28, 27, 22, 102, 37, 36, 33, 24]
ont = [26, 87, 38, 36, 30]

# Built-in sum() replaces the manual index-based accumulation loops
pacbio_sum = sum(pacbio)
avg_pacbio = pacbio_sum / len(pacbio)
print(f"PacBio Average Coverage: {avg_pacbio:.1f}x")

ont_sum = sum(ont)
avg_ont = ont_sum / len(ont)
print(f"ONT Average Coverage: {avg_ont:.1f}x")

# Pooled mean over every value, i.e. weighted by the number of entries per
# platform rather than the mean of the two platform averages
total_sum = pacbio_sum + ont_sum
total_count = len(pacbio) + len(ont)
avg_total = total_sum / total_count
print(f"Overall Average Coverage: {avg_total:.1f}x")



PacBio Average Coverage: 38.7x
ONT Average Coverage: 43.4x
Overall Average Coverage: 40.3x
