# ALS Perturbation Latent Space Navigator
**GeneFormer-driven embeddings for ALS rescue hypotheses**
Steph Ritchie | Helical Coding Challenge | October 2025


## How to Use This Deck
- Launch RISE (`Alt+R`) or click the rocket button in Jupyter
- Press `Space` to advance, `Shift+Space` to go back
- Run every cell top-to-bottom before presenting so the widgets are initialised and responsive
- Use the controls on each interactive slide to explore the analysis live


## Analysis Roadmap
- Apply in-silico knock-up and knock-down perturbations to ALS genes (SOD1, TARDBP, FUS, OPTN, TBK1)
- Embed perturbed AnnData matrices with GeneFormer_V2 (gf-12L-95M-i4096)
- Compare latent neighbourhoods, rescue trajectories, and differential expression against controls
- Package metrics, figures, and tables for fast scenario walkthroughs


In [36]:
from pathlib import Path
import pickle
import pandas as pd
import numpy as np
from ipywidgets import HTML, VBox, HBox, Dropdown, ToggleButtons, IntSlider, Layout, Output
from ipywidgets import interact
from IPython.display import display, Markdown, Image

pd.options.display.float_format = '{:,.3f}'.format


In [37]:
# Locations of the precomputed artefacts this deck reads.
# NOTE: OUTPUT_DIR is a relative path, so it resolves against the current working directory.
OUTPUT_DIR = Path('third attempt/outputs')
SCENARIO_PATH = OUTPUT_DIR / 'hierarchical_files' / 'perturbation_scenarios.pkl'
DEG_PATH = OUTPUT_DIR / 'task3_significant_degs.csv'

# Headline dataset figures, hard-coded as literals (rendered by dataset_cards()).
DATA_OVERVIEW = {
    'Total cells': 112_014,
    'Total genes': 22_832,
    'ALS samples': 66_960,
    'Control samples': 45_054,
    'Unique cell types': 19,
}

# Embedding summary values, hard-coded as literals (rendered by embedding_cards()).
EMBEDDING_METRICS = {
    'Best scenario': 'scenario3_control_disease_rescue',
    'Silhouette score': 0.363,
    'Optimal clusters (Leiden)': 6,
    'Most shifted cell type': 'Fibro',
}

# DEG totals, hard-coded as literals; they are not derived from the DEG CSV at DEG_PATH.
DEG_COUNTS = {
    'Significant DEGs': 3_880,
    'Upregulated in ALS': 2_983,
    'Downregulated in ALS': 897,
}

# Scenario metadata loaded from the pickle at SCENARIO_PATH; stays an empty
# dict (with a printed warning) when the file is missing.
# NOTE: pickle.load can execute arbitrary code, so SCENARIO_PATH must only
# ever point at a trusted, locally produced file.
SCENARIOS = {}
if not SCENARIO_PATH.exists():
    print(f'Warning: scenario metadata not found at {SCENARIO_PATH}')
else:
    with SCENARIO_PATH.open('rb') as handle:
        SCENARIOS = pickle.load(handle)

# Load the DEG table and derive two helper columns from 'logfoldchanges':
# 'direction' (>= 0 counts as upregulated) and 'abs_logfc'. When the CSV is
# missing, warn and fall back to an empty frame with the expected columns.
if not DEG_PATH.exists():
    print(f'Warning: DEG table not found at {DEG_PATH}')
    deg_df = pd.DataFrame(columns=['names', 'scores', 'logfoldchanges', 'pvals', 'pvals_adj', 'pvals_adj_BH', 'direction', 'abs_logfc'])
else:
    deg_df = pd.read_csv(DEG_PATH)
    deg_df = deg_df.assign(
        direction=np.where(deg_df['logfoldchanges'] >= 0, 'Upregulated in ALS', 'Downregulated in ALS'),
        abs_logfc=deg_df['logfoldchanges'].abs(),
    )

# Display label -> image path, restricted to the files that actually exist
# under OUTPUT_DIR at the time this cell runs.
FIGURES = {
    label: OUTPUT_DIR / filename
    for label, filename in (
        ('Rescue vs disease embeddings', 'task3_scenario1_disease_rescue_disease_rescue_embeddings.png'),
        ('Scenario comparison dashboard', 'task3_scenario_comparison.png'),
        ('Optimal clusters', 'task3_optimal_clusters.png'),
        ('ALS vs control distances', 'task3_als_control_distances.png'),
        ('Volcano plot', 'task3_volcano_plot.png'),
        ('DEG heatmap', 'task3_deg_heatmap.png'),
    )
    if (OUTPUT_DIR / filename).exists()
}


In [38]:
# Inline CSS applied to the outer <div> of every stat card built by make_stat_card().
CARD_STYLE = (
    "padding:12px 16px; border-radius:12px; background:#0b1120; color:#f8fafc; "
    "border:1px solid #1f2933; min-width:200px;"
)


def make_stat_card(label, value, subtitle=''):
    """Build an ``HTML`` widget that renders a single statistic card.

    Args:
        label: Caption shown in small uppercase text at the top of the card.
        value: Headline figure or text shown on the larger, bold line.
        subtitle: Optional note rendered under the value; omitted when falsy.

    Returns:
        An ``HTML`` widget whose value is the card markup styled with
        ``CARD_STYLE``.

    Note:
        The arguments are interpolated into the markup as-is (no escaping),
        so callers should pass display-ready text.
    """
    # CSS lengths must be written without whitespace between the number and
    # the unit ("0.75rem", not "0.75 rem"); the spaced form is an invalid
    # declaration that browsers drop, so the sizes/margins never applied.
    parts = [
        f'<div style="{CARD_STYLE}">',
        f'<div style="font-size:0.75rem; text-transform:uppercase; opacity:0.75;">{label}</div>',
        f'<div style="font-size:1.4rem; font-weight:600; margin-top:6px;">{value}</div>',
    ]
    if subtitle:
        parts.append(f'<div style="font-size:0.8rem; opacity:0.8; margin-top:4px;">{subtitle}</div>')
    parts.append('</div>')
    return HTML(''.join(parts))


def stack_cards(cards, per_row=3):
    """Arrange card widgets into rows of at most ``per_row`` items.

    Args:
        cards: Sequence of widgets to lay out.
        per_row: Maximum number of cards placed in one horizontal row.

    Returns:
        A ``VBox`` holding one ``HBox`` per row, or an ``HTML`` placeholder
        when ``cards`` is empty.
    """
    if not cards:
        return HTML('<em>No data available.</em>')
    row_boxes = [
        HBox(cards[offset:offset + per_row], layout=Layout(gap='12px'))
        for offset in range(0, len(cards), per_row)
    ]
    return VBox(row_boxes, layout=Layout(gap='12px'))


def dataset_cards():
    """Return a card grid with one thousands-separated card per ``DATA_OVERVIEW`` entry."""
    tiles = []
    for caption, count in DATA_OVERVIEW.items():
        tiles.append(make_stat_card(caption, f"{count:,}"))
    return stack_cards(tiles, per_row=3)


def embedding_cards():
    """Return a three-card row summarising the values held in ``EMBEDDING_METRICS``."""
    silhouette_note = 'Leiden silhouette {:.3f}'.format(EMBEDDING_METRICS['Silhouette score'])
    scenario_card = make_stat_card('Best scenario', EMBEDDING_METRICS['Best scenario'], silhouette_note)
    cluster_card = make_stat_card(
        'Optimal clusters',
        EMBEDDING_METRICS['Optimal clusters (Leiden)'],
        'Method: Leiden',
    )
    shift_card = make_stat_card(
        'Most shifted cell type',
        EMBEDDING_METRICS['Most shifted cell type'],
        'Largest ALS-control distance',
    )
    return stack_cards([scenario_card, cluster_card, shift_card], per_row=3)


def deg_cards():
    """Return a three-card row showing the totals stored in ``DEG_COUNTS``."""
    # (card title, DEG_COUNTS key, subtitle)
    card_specs = (
        ('Significant DEGs', 'Significant DEGs', '|logFC| > 0.5, FDR < 0.05'),
        ('Up in ALS', 'Upregulated in ALS', 'subset of significant'),
        ('Down in ALS', 'Downregulated in ALS', 'subset of significant'),
    )
    tiles = [
        make_stat_card(title, f"{DEG_COUNTS[count_key]:,}", note)
        for title, count_key, note in card_specs
    ]
    return stack_cards(tiles, per_row=3)


def scenario_to_markdown(name, scenarios=None):
    """Render one scenario's metadata as a Markdown snippet.

    Args:
        name: Scenario key to look up.
        scenarios: Optional mapping of scenario key to metadata dict. When
            omitted, the module-level ``SCENARIOS`` mapping is used.

    Returns:
        Markdown text with a heading, the target cells and description as
        bullets, and either a bulleted perturbation list or a note that no
        perturbations were applied. Returns a fallback sentence when the
        scenario is missing or its metadata is empty.
    """
    source = SCENARIOS if scenarios is None else scenarios
    meta = source.get(name)
    if not meta:
        return 'No metadata found for this scenario.'

    # Blank entries become empty lines: Markdown needs them (and real line
    # breaks) to separate the heading, the bullet lists and the bold label.
    lines = [
        f"### {name.replace('_', ' ').title()}",
        '',
        f"- **Target cells:** {meta.get('target_cells', 'n/a')}",
        f"- **Description:** {meta.get('description', 'n/a')}",
        '',
    ]
    perturbations = meta.get('perturbations', [])
    if perturbations:
        lines.append('**Perturbations**')
        lines.append('')
        for spec in perturbations:
            # str() keeps non-string gene ids / types from breaking the join and replace.
            genes = ', '.join(str(gene) for gene in (spec.get('genes') or [])) or 'n/a'
            factor = spec.get('factor', 'n/a')
            ptype = str(spec.get('type', 'n/a')).replace('_', ' ').title()
            lines.append(f"- {ptype} {genes} (factor {factor})")
    else:
        lines.append('No perturbations applied (control scenario).')
    return '\n'.join(lines)


def show_figure(label):
    """Display the image registered under ``label`` in ``FIGURES``.

    Shows a short Markdown notice instead when no path is registered for
    ``label``.
    """
    figure_path = FIGURES.get(label)
    if figure_path:
        display(Image(filename=str(figure_path)))
    else:
        display(Markdown('No figure available.'))


def view_degs(direction, top_n, sort_by):
    """Display the top ``top_n`` genes of ``deg_df`` for one direction.

    Args:
        direction: Value of the ``direction`` column to keep.
        top_n: Maximum number of rows to show.
        sort_by: Name of the ``deg_df`` column used for ranking.

    Displays a Markdown notice instead of a table when ``deg_df`` is empty or
    no rows match ``direction``.
    """
    if deg_df.empty:
        display(Markdown('No DEG table loaded.'))
        return
    subset = deg_df[deg_df['direction'] == direction]
    if subset.empty:
        display(Markdown('No rows match the current filters.'))
        return
    # Rank by magnitude rather than signed value: a plain descending sort on a
    # signed column (e.g. 'scores') would list the *weakest* genes first when
    # the selected rows carry negative values. 'abs_logfc' is unaffected.
    ranked = subset.sort_values(
        sort_by,
        ascending=False,
        key=lambda column: column.abs(),
    ).head(top_n)
    tidy = ranked[['names', 'logfoldchanges', 'scores', 'pvals_adj_BH']].rename(columns={
        'names': 'Gene',
        'logfoldchanges': 'logFC',
        'scores': 'Score',
        'pvals_adj_BH': 'FDR_BH',
    })
    display(tidy.reset_index(drop=True))


## Dataset At A Glance
Summary of the ALS BA4 single-cell dataset and its perturbation analysis


In [43]:
# Stack the three card rows (dataset, embedding, DEG) into one column and show it.
overview_block = VBox(
    children=[dataset_cards(), embedding_cards(), deg_cards()],
    layout=Layout(gap='18px'),
)

display(overview_block)


VBox(children=(VBox(children=(HBox(children=(HTML(value='<div style="padding:12px 16px; border-radius:12px; ba…

## Perturbation Scenarios
Interactively inspect the recipe for each simulated experiment


In [44]:
# (display label, scenario key) pairs in sorted key order.
scenario_options = [
    (key.replace('_', ' ').title(), key)
    for key in sorted(SCENARIOS)
]

# Fall back to a single placeholder entry when no scenarios were loaded.
scenario_dropdown = Dropdown(
    options=scenario_options if scenario_options else [('No scenarios found', None)],
    value=scenario_options[0][1] if scenario_options else None,
    description='Scenario:',
    layout=Layout(width='360px'),
)
scenario_output = Output(
    layout=Layout(border='1px solid #1f2933', padding='12px', min_height='160px'),
)


def refresh_scenario(_=None):
    """Redraw ``scenario_output`` for the scenario selected in ``scenario_dropdown``.

    The single argument is ignored; it exists so the function can be used as
    a widget observer as well as called directly.
    """
    selected = scenario_dropdown.value
    with scenario_output:
        scenario_output.clear_output()
        if selected:
            body = scenario_to_markdown(selected)
        else:
            body = 'No scenario metadata available.'
        display(Markdown(body))


# Re-render the detail panel whenever the dropdown's value changes.
scenario_dropdown.observe(refresh_scenario, names='value')
# Render once up front for the initial selection.
refresh_scenario()

# Show the selector above its output panel.
display(VBox([scenario_dropdown, scenario_output], layout=Layout(gap='12px')))


VBox(children=(Dropdown(description='Scenario:', layout=Layout(width='360px'), options=(('Scenario1 Disease Re…

## Latent Space Highlights
Switch between key figures rendered during the embedding analysis


In [45]:
# One toggle per available figure label; a single placeholder option otherwise.
figure_buttons = ToggleButtons(
    options=list(FIGURES) if FIGURES else ['No figures available'],
    description='Figure:',
    button_style='',
    layout=Layout(width='100%'),
)
figure_output = Output(
    layout=Layout(border='1px solid #1f2933', padding='12px'),
)


def refresh_figure(change=None):
    """Redraw ``figure_output`` with the figure selected in ``figure_buttons``.

    ``change`` is ignored; it lets the function double as a widget observer.
    When ``FIGURES`` is empty a hint is shown instead of an image.
    """
    with figure_output:
        figure_output.clear_output()
        if FIGURES:
            show_figure(figure_buttons.value)
        else:
            display(Markdown('Add exported figures to the FIGURES dictionary to enable this view.'))


# Select the first available figure, or the placeholder option when there are none.
figure_buttons.value = next(iter(FIGURES), 'No figures available')

# Redraw on every selection change, and once now for the initial selection.
figure_buttons.observe(refresh_figure, names='value')
refresh_figure()

display(VBox([figure_buttons, figure_output], layout=Layout(gap='12px')))


VBox(children=(ToggleButtons(description='Figure:', layout=Layout(width='100%'), options=('Rescue vs disease e…

## Differential Expression Explorer
Filter significant genes by direction, ranking and panel size


In [49]:
# Build the DEG explorer controls: direction and sort column become dropdowns,
# top_n a slider. The trailing semicolon suppresses the cell's return value --
# interact() hands back the wrapped function, whose repr
# ("<function __main__.view_degs ...>") would otherwise be printed on the slide.
interact(
    view_degs,
    direction=['Upregulated in ALS', 'Downregulated in ALS'],
    top_n=IntSlider(value=10, min=5, max=50, step=5, description='Top N'),
    sort_by=['abs_logfc', 'scores']
);


interactive(children=(Dropdown(description='direction', options=('Upregulated in ALS', 'Downregulated in ALS')…

<function __main__.view_degs(direction, top_n, sort_by)>

## Next Steps
- Pair this deck with live AnnData subsets for targeted cell type dives
- Extend the widget set with pathway enrichment summaries per scenario
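- Export to HTML via `jupyter nbconvert --to slides interactive_presentation.ipynb` for sharing (the export is static: widget callbacks need a live kernel, so the interactive controls will not respond)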
