In [None]:
"""See markdown"""

# pylint: disable=line-too-long, redefined-outer-name, import-error, duplicate-code, unreachable

# Prepare background and evaluation data for SHAP analysis.

In [None]:
from __future__ import annotations

import copy
from pathlib import Path
from typing import List

from epiclass.core.metadata import Metadata
from epiclass.utils.general_utility import write_hdf5_paths_to_file
from epiclass.utils.shap.prep_shap_run import evaluate_background_ratios

# Names of the metadata label categories used in this notebook.
# NOTE(review): presumably these are keys of the per-sample metadata records —
# verify against the metadata JSON.
BIOMATERIAL_TYPE = "harmonized_biomaterial_type"
CELL_TYPE = "harmonized_sample_ontology_intermediate"
ASSAY = "assay_epiclass"
SEX = "harmonized_donor_sex"
CANCER = "harmonized_sample_cancer_high"
DISEASE = "harmonized_sample_disease_high"
LIFE_STAGE = "harmonized_donor_life_stage"
TRACK = "track_type"

In [None]:
def display_gen_info(metadata: Metadata, extra_categories: List[str] | None = None):
    """Display track type, assay and cell type class counts.

    Args:
        metadata: Metadata object; its `display_labels` method is called once
            per category, in order.
        extra_categories: Optional additional category names, displayed after
            the three default ones. `None` or an empty list adds nothing.
    """
    # Every default category comes from a module-level constant so that each
    # category name is spelled in exactly one place.
    categories = [TRACK, ASSAY, CELL_TYPE]
    if extra_categories:
        categories.extend(extra_categories)

    for label_category in categories:
        metadata.display_labels(label_category)

In [None]:
# Metadata directory, located under the current user's home directory.
base = Path.home().joinpath("Projects/epilap/input/metadata")
# Metadata JSON used for the whole notebook.
path = base.joinpath(
    "dfreeze-v2", "hg38_2023-epiatlas-dfreeze_v2.1_w_encode_noncore_2.json"
)
base_metadata = Metadata(path)

In [None]:
# Directory of the trained model: the logs root on the mounted filesystem,
# followed by the run-specific sub-directory (split0 of the donor sex run,
# going by the directory names).
model_path = Path.home().joinpath(
    "mounts/narval-mount/project-rabyj/epilap/output/logs/epiatlas-dfreeze-v2.1/hg38_100kb_all_none",
    "harmonized_donor_sex_1l_3000n/w-mixed/10fold-oversample/split0/",
)

In [None]:
# Label category handed to the background selection below.
# NOTE(review): presumably this must match the category of the model under
# `model_path` (its directory name starts with "harmonized_donor_sex") —
# verify when switching to another model.
category = SEX

In [None]:
def _first_match(directory: Path, pattern: str) -> Path:
    """Return the first file of `directory` matching the glob `pattern`.

    Matches are sorted so the choice is deterministic when several files match.

    Raises:
        FileNotFoundError: if nothing matches; the message names the pattern
            and the directory instead of surfacing an opaque IndexError.
    """
    matches = sorted(directory.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No file matching '{pattern}' found in {directory}")
    return matches[0]


def _read_nonempty_lines(filepath: Path) -> List[str]:
    """Return the lines of a UTF-8 text file, skipping empty lines."""
    return [line for line in filepath.read_text(encoding="utf8").splitlines() if line]


# Files describing split0 of the model located at `model_path`.
training_md5_path = _first_match(model_path, "split0_training_*.md5")
valid_md5_path = _first_match(model_path, "split0_validation_*.md5")
training_mapping_path = model_path / "training_mapping.tsv"

# One md5 per line; empty lines are dropped so "" never ends up in the sets.
training_md5 = set(_read_nonempty_lines(training_md5_path))
valid_md5 = set(_read_nonempty_lines(valid_md5_path))

# dict() requires every line to split into exactly two tab-separated fields;
# skipping empty lines keeps a stray blank line from raising ValueError.
training_mapping = dict(
    line.split("\t") for line in _read_nonempty_lines(training_mapping_path)
)

In [None]:
# Work on a deep copy so `base_metadata` itself is left untouched.
valid_metadata = copy.deepcopy(base_metadata)

# Iterate over a snapshot of the md5s, since entries are deleted while looping.
for md5 in tuple(valid_metadata.md5s):
    if md5 in valid_md5:
        continue
    # Not part of the validation split: remove it from the copy.
    del valid_metadata[md5]

### Background list

In [None]:
# Evaluate the candidate values given in `n_samples_list` on the training md5s.
# NOTE(review): the two returned values are presumably the selected background
# md5s and the matching per-trio sample count — confirm in prep_shap_run.
background_selection = evaluate_background_ratios(
    category=category,
    metadata=base_metadata,
    training_md5s=training_md5,
    n_samples_list=[2, 3, 4],
    verbose=True,
)
best_background_md5s, best_n_per_trio = background_selection

In [None]:
# raise UserWarning("Stop here")

In [None]:
# Directory receiving the SHAP input lists, created inside the model directory.
shap_dir = model_path.joinpath("shap")
shap_dir.mkdir(exist_ok=True)

# Tag used in the background list file name.
name = f"{best_n_per_trio}pertrio"

# Writing the background list is currently disabled.
# write_hdf5_paths_to_file(
#     md5s=sorted(best_background_md5s),
#     parent=".",
#     suffix="100kb_all_none",
#     filepath=shap_dir / f"shap_background_{name}.list",
# )

### Evaluation list

In [None]:
# Class counts of the validation split, before any optional subsetting.
display_gen_info(valid_metadata)

In [None]:
# valid_metadata.select_category_subsets(ASSAY, ["rna_seq", "mrna_seq"])
# valid_metadata.select_category_subsets(CELL_TYPE, ["T cell", "lymphocyte of B lineage", "muscle organ", "monocyte", "neutrophil", "myeloid cell"])
# valid_metadata.remove_small_classes(min_class_size=10, label_category=CELL_TYPE, verbose=False)

In [None]:
# Class counts of the samples whose md5s go into the evaluation list.
display_gen_info(valid_metadata)

In [None]:
# Tag used in the evaluation list file name.
name = "all_files_split0_validation"

# Sorted md5s of the remaining validation samples, and the list file to write.
eval_md5s = sorted(valid_metadata.md5s)
eval_list_path = shap_dir / f"shap_eval_{name}.list"

write_hdf5_paths_to_file(
    md5s=eval_md5s,
    parent=".",
    suffix="100kb_all_none",
    filepath=eval_list_path,
)