In [None]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path so the
# `src.*` imports below can resolve (assumes the notebook is launched from a
# directory one level below the repository root — TODO confirm).
repo_root = Path.cwd().parent
# Guard the append: re-running this cell must not pile up duplicate entries.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

from src.config import Paths, ModelConfig
from src.utils import ensure_dir, save_json, stratified_downsample
from src.data import load_variant_summary, clean_variants
from src.features import build_features
from src.train import learn_structure, fit_bn
from src.evaluate import predict_bn, compute_metrics

import pandas as pd
from sklearn.model_selection import train_test_split


In [None]:
# Shared configuration objects: `paths` supplies the project directories and
# `cfg` supplies the modelling settings read by later cells.
paths = Paths()
cfg = ModelConfig()

# Run the project helper `ensure_dir` on every output directory, in the same
# order as before (presumably creates missing directories — see src.utils).
for out_dir in (paths.data_processed, paths.results, paths.figures):
    ensure_dir(out_dir)

# File locations used by the rest of the notebook.
RAW_PATH = paths.data_raw / "variant_summary.txt.gz"
PROCESSED_PATH = paths.data_processed / "features_clean.csv"
METRICS_PATH = paths.results / "metrics.json"

# Echo the input and output locations as the cell result.
(RAW_PATH, PROCESSED_PATH)


In [None]:
# Load the raw variant table and report its size before any filtering.
df = load_variant_summary(str(RAW_PATH))
raw_count = len(df)
print("Total variants (raw):", raw_count)

# Apply the project's cleaning step and report how many rows remain.
df = clean_variants(df)
cleaned_count = len(df)
print("Total variants (cleaned):", cleaned_count)

# Preview the cleaned table.
df.head()


In [None]:
# Derive the modelling features from the cleaned variants.
features = build_features(df)

# Show the preview and the normalised target distribution as two separate
# rich outputs. Wrapping them in a tuple (as before) falls back to the tuple's
# plain-text repr, which is hard to read for a table.
# NOTE: `display` is the built-in provided by the IPython/Jupyter kernel.
display(
    features.head(),
    features[cfg.target].value_counts(normalize=True),
)


In [None]:
# Model on a down-sampled subset rather than the full feature table.
# SAMPLE_N is passed as `n` to the project helper `stratified_downsample`
# (presumably the number of rows to keep — confirm in src.utils).
SAMPLE_N = 200_000

downsample_kwargs = dict(
    target_col=cfg.target,
    n=SAMPLE_N,
    random_state=cfg.random_state,
)

# Rows with any missing value are dropped and the index is renumbered before
# the helper sees the frame.
features_run = stratified_downsample(
    features.dropna().reset_index(drop=True),
    **downsample_kwargs,
)

print("Rows used for modeling:", len(features_run))

# Normalised target distribution of the sampled subset.
features_run[cfg.target].value_counts(normalize=True)


In [None]:
# Write the sampled feature table to PROCESSED_PATH without the index column.
features_run.to_csv(path_or_buf=PROCESSED_PATH, index=False)
print("Saved:", PROCESSED_PATH)


In [None]:
# Columns removed from the table before modelling.
drop_cols = [
    "PhenotypeCount_bin",
    "StopGain",
    "Frameshift",
    "LengthChange_bin",
    "PositionBin",
]

# Produces a new frame; `features_run` itself is left untouched.
data_clean = features_run.drop(labels=drop_cols, axis="columns")

data_clean.head()


In [None]:
# Stratify on the target column so both partitions are split with respect to
# the class labels; size and seed come from the shared config.
target_labels = data_clean[cfg.target]

train_df, test_df = train_test_split(
    data_clean,
    stratify=target_labels,
    test_size=cfg.test_size,
    random_state=cfg.random_state,
)

# Row counts of the two partitions.
(len(train_df), len(test_df))


In [None]:
# Learn the graph structure from the training partition only, using the
# target name and in-degree limit from the shared config.
structure_options = {
    "target": cfg.target,
    "max_indegree": cfg.max_indegree,
}
dag = learn_structure(train_df, **structure_options)

print("Learned edges:")
edges = [*dag.edges()]

# First 20 edges plus the total edge count.
(edges[:20], len(edges))


In [None]:
# Fit the model on the training partition using the learned structure.
model = fit_bn(train_df, dag)

# Report how many CPDs the fitted model holds.
learned_cpds = model.get_cpds()
print("CPDs learned:", len(learned_cpds))


In [None]:
# Predict on the held-out partition, then compute metrics from the three
# returned values.
y_true, y_pred, y_proba = predict_bn(model, test_df, target=cfg.target)
metrics = compute_metrics(y_true, y_pred, y_proba)

# Headline numbers shown as the cell result, in this order.
headline_keys = ("accuracy", "roc_auc", "pr_auc_average_precision")
tuple(metrics[key] for key in headline_keys)


In [None]:
# Persist the evaluation metrics to METRICS_PATH via the project helper
# `save_json` (exact serialisation behaviour lives in src.utils — not shown here).
save_json(metrics, METRICS_PATH)
print("Saved:", METRICS_PATH)
