In [None]:
"""Figure core creation: Fig3

Formatting of the figures may not be identical to the paper, but they contain the same data points.
"""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, too-many-branches, consider-using-f-string

In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

## Setup

In [None]:
from __future__ import annotations

from pathlib import Path
from typing import Dict, List

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display

from epi_ml.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    ASSAY_MERGE_DICT,
    CELL_TYPE,
    SEX,
    IHECColorMap,
    MetadataHandler,
    SplitResultsHandler,
)

### General paths setup

In [None]:
# Root directory of the paper outputs, located under the user's home folder.
base_dir = Path.home() / "Projects" / "epiclass" / "output" / "paper"
paper_dir = base_dir

# Fail early when the expected output tree is not present on this machine.
if not paper_dir.exists():
    raise FileNotFoundError(f"Directory {paper_dir} does not exist.")

# Sub-folders used by the rest of the notebook.
base_data_dir, base_fig_dir = (base_dir / sub for sub in ("data", "figures"))

In [None]:
# Build the colour-map helper and extract the per-category colour maps.
#
# The imported class name `IHECColorMap` is rebound to an instance below; the
# binding is kept so that existing references to `IHECColorMap` keep working.
# The class is resolved first, so re-running this cell does not attempt to call
# the instance (which would raise "TypeError: object is not callable").
_ihec_color_map_cls = (
    IHECColorMap if isinstance(IHECColorMap, type) else type(IHECColorMap)
)
IHECColorMap = _ihec_color_map_cls(base_fig_dir)
assay_colors = IHECColorMap.assay_color_map
cell_type_colors = IHECColorMap.cell_type_color_map
sex_colors = IHECColorMap.sex_color_map

In [None]:
# Helper object from the project's paper utilities, constructed without arguments.
split_results_handler = SplitResultsHandler()

# The metadata handler is given the paper directory. The "v2" metadata is then
# loaded twice: once via load_metadata and once via load_metadata_df
# (presumably a pandas DataFrame, going by the method name; TODO confirm).
metadata_handler = MetadataHandler(paper_dir)
metadata_v2 = metadata_handler.load_metadata("v2")
metadata_v2_df = metadata_handler.load_metadata_df("v2")

### ChIP-Atlas predictions file `CA_metadata_4DB+all_pred.20240606_mod2.2.tsv`

In [None]:
# Directory holding all prediction files; stop early if it is missing.
base_pred_dir = base_data_dir / "training_results" / "predictions"
if not base_pred_dir.exists():
    raise FileNotFoundError(f"Directory {base_pred_dir} does not exist.")

# The ChIP-Atlas ("C-A") predictions file sits in a sub-folder named after ASSAY.
chip_atlas_preds_dir = base_pred_dir / "C-A"
chip_atlas_preds_path = chip_atlas_preds_dir.joinpath(
    ASSAY, "CA_metadata_4DB+all_pred.20240606_mod2.2.tsv"
)

# Tab-separated file; low_memory=False disables pandas' chunked parsing.
chip_atlas_preds_df = pd.read_csv(chip_atlas_preds_path, sep="\t", low_memory=False)

print(f"ChIP-Atlas: {len(chip_atlas_preds_df.index)} total files")
# TODO: have clear encode metadata file specified in markdown, so 13c content is explicit

In [None]:
# For every column whose name contains "Max_pred", print the part of the
# column name that follows the last underscore.
print("ChIP-Atlas - Available model predictions")
for col in chip_atlas_preds_df.columns:
    if "Max_pred" not in col:
        continue
    model_task = col.rsplit("_", 1)[-1]
    print(model_task)

The file contains predictions from the following models (among others):
| Metadata category|   Nb classes |     Experiment Key (comet.com)      | Nb Files | Training Size |
|------------------|--------------|------------------------------|----------|---------------|
| assay_epiclass   |     7      | 69488630801b4a05a53b5d9e572f0aaa       | 16788    | 34413 |
| assay_epiclass   |     11      | 0f8e5eb996114868a17057bebe64f87c      | 20922    | 46128 |
| assay_epiclass   |     13      | dd3710b73c0341af85a17ce1998362d0      | 24989    | 116550|
| harmonized_donor_sex | 3    | 4b908b83e0ec45c3ab991e65aa27af0c | 18299    | 28078  |
| harmonized_donor_life_stage | 5    | 91214ed0b1664395b1826dc69a495ed4 | 15372    | 66372  |
| harmonized_sample_cancer_high | 2    | 15da476b92f140eab818ece369248f4c | 20922    | 34491  |

The training size is higher than the number of files because minority classes were randomly oversampled up to approximately the size of the majority class.

Classes:

- assay 7c: 6 h3k* histone marks + input
- assay 11c: assay7c + rna_seq + mrna_seq + wgbs_standard + wgbs_pbat
- assay 13c: assay11c + encode non-core + encode CTCF
- harmonized_donor_sex: male, female, mixed
- harmonized_donor_life_stage: adult, child, newborn, fetal, embryonic
- harmonized_sample_cancer_high (modification of harmonized_sample_disease_high): cancer, non-cancer (healthy/None+disease)

Training metadata: `IHEC_metadata_harmonization.v1.1.extended.csv` + encode XXXX

### ENCODE predictions file `encode_predictions_augmented_merged.csv`

See `model.list` in `encode_preds_dir`.  
Same models as ChIP-Atlas predictions. The assay_epiclass model is the 7c one.  
Additionally, the `harmonized_sample_ontology_intermediate` model was used on a subset of files with known EpiATLAS sample ontologies.

| Metadata category|   Nb classes |     Experiment Key (comet.com)      | Nb Files | Training Size |
|------------------|--------------|------------------------------|----------|---------------|
| harmonized_sample_ontology_intermediate   |     16      | bb66b72ae83645d587e50b34aebb39c3      | 16379    | 49054|


In [None]:
# ENCODE predictions are stored in their own sub-folder of the predictions directory.
encode_preds_dir = base_pred_dir / "encode"
encode_preds_path = encode_preds_dir.joinpath("encode_predictions_augmented_merged.csv")

# NOTE(review): the file has a ".csv" extension but is parsed as tab-separated; confirm.
encode_preds_df = pd.read_csv(encode_preds_path, sep="\t", low_memory=False)

print(f"ENCODE: {len(encode_preds_df.index)} total files")

## Fig 3

### Fig 3A - ChIP-Atlas assay prediction

### Fig 3B-C-D - ChIP-Atlas donor sex, cancer and life stage predictions

### Fig 3E - ENCODE dataset predictions (5 tasks)

### Fig 3F - ENCODE non-core predictions with assay category mapping

## Supp Fig 5A