In [None]:
"""Compare the metrics on assay_epiclass and harmonized_sample_ontology_intermediate
when using alternative track_types for ChIP-seq data.

Combinations of track_types:
- all (fc, pval, raw)
- noFC
- only-pval
- only-raw
"""

from __future__ import annotations

from pathlib import Path

import pandas as pd
from IPython.display import display

In [2]:
# Root directory of the training logs for this dataset version / resolution.
results_dir = (
    Path.home() / "mounts" / "narval-mount" / "logs-dfreeze-2.1" / "hg38_100kb_all_none/"
)

# One prediction file per run directory. The same pattern is used for both
# tasks so that the bare "10fold" directory (presumably the run trained on all
# track types -- confirm) is picked up alongside its "10fold-<variant>"
# siblings; the cell-type search previously used "10fold-*", which can never
# match the bare directory.
prediction_glob = "10fold*/full-10fold-validation_prediction_augmented-all.csv"

# Path.glob() yields entries in arbitrary order; sorting keeps the displayed
# lists (and every loop over them) stable between runs and machines.
assay_logs = sorted((results_dir / "assay_epiclass_1l_3000n").glob(prediction_glob))
cell_type_logs = sorted(
    (results_dir / "harmonized_sample_ontology_intermediate_1l_3000n").glob(
        prediction_glob
    )
)

display(assay_logs)
display(cell_type_logs)

[PosixPath('/home/local/USHERBROOKE/rabj2301/mounts/narval-mount/logs-dfreeze-2.1/hg38_100kb_all_none/assay_epiclass_1l_3000n/10fold-noFC/full-10fold-validation_prediction_augmented-all.csv'),
 PosixPath('/home/local/USHERBROOKE/rabj2301/mounts/narval-mount/logs-dfreeze-2.1/hg38_100kb_all_none/assay_epiclass_1l_3000n/10fold-only_raw/full-10fold-validation_prediction_augmented-all.csv'),
 PosixPath('/home/local/USHERBROOKE/rabj2301/mounts/narval-mount/logs-dfreeze-2.1/hg38_100kb_all_none/assay_epiclass_1l_3000n/10fold-only_pval/full-10fold-validation_prediction_augmented-all.csv'),
 PosixPath('/home/local/USHERBROOKE/rabj2301/mounts/narval-mount/logs-dfreeze-2.1/hg38_100kb_all_none/assay_epiclass_1l_3000n/10fold/full-10fold-validation_prediction_augmented-all.csv')]

[PosixPath('/home/local/USHERBROOKE/rabj2301/mounts/narval-mount/logs-dfreeze-2.1/hg38_100kb_all_none/harmonized_sample_ontology_intermediate_1l_3000n/10fold-only_raw/full-10fold-validation_prediction_augmented-all.csv'),
 PosixPath('/home/local/USHERBROOKE/rabj2301/mounts/narval-mount/logs-dfreeze-2.1/hg38_100kb_all_none/harmonized_sample_ontology_intermediate_1l_3000n/10fold-noFC/full-10fold-validation_prediction_augmented-all.csv')]

In [3]:
# Read every assay prediction file into a DataFrame, keyed by the name of the
# directory that directly contains the CSV (e.g. "10fold-noFC").
assay_dfs = {
    csv_path.parent.name: pd.read_csv(csv_path, sep=",", header=0)
    for csv_path in assay_logs
}

In [4]:
# assay_dfs["10fold"].columns  # assay_dfs is a dict keyed by run name, not a list

In [6]:
# Track types reported in the comparison tables.
relevant_tracks = {"pval", "raw"}


def accuracy_by_track_and_class(
    predictions: pd.DataFrame, class_column: str, track_types: set[str]
) -> pd.DataFrame:
    """Count wrong/right predictions per (track_type, class) and derive accuracy.

    Args:
        predictions: Prediction table with at least the columns "track_type",
            "Same?", "md5sum" and `class_column`.
        class_column: Name of the column holding the class label to group by.
        track_types: Values of "track_type" to keep; other rows are ignored.

    Returns:
        DataFrame indexed by (track_type, class_column) with columns
        False, True, "Total" and "Accuracy (%)" (rounded to 2 decimals).
    """
    kept = predictions[predictions["track_type"].isin(track_types)]
    counts = (
        kept.groupby(["track_type", class_column, "Same?"])["md5sum"]
        .count()
        .unstack(fill_value=0)
        # Guarantee both outcome columns exist: without this, a table that has
        # no True (or no rows at all after filtering) raises KeyError below.
        .reindex(columns=[False, True], fill_value=0)
    )
    counts["Total"] = counts.sum(axis=1)
    counts["Accuracy (%)"] = (counts[True] / counts["Total"] * 100).round(2)
    return counts


# One accuracy table per run: shown inline and saved as a CSV in the current
# working directory.
for name, df in assay_dfs.items():
    result = accuracy_by_track_and_class(df, "assay_epiclass", relevant_tracks)
    print(name)
    display(result)
    result.to_csv(f"assay_{name}_acc.csv")

10fold-noFC


Unnamed: 0_level_0,Same?,False,True,Total,Accuracy (%)
track_type,assay_epiclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pval,h3k27ac,2,1561,1563,99.87
pval,h3k27me3,1,674,675,99.85
pval,h3k36me3,1,694,695,99.86
pval,h3k4me1,1,962,963,99.9
pval,h3k4me3,0,799,799,100.0
pval,h3k9me3,0,642,642,100.0
raw,h3k27ac,7,1556,1563,99.55
raw,h3k27me3,5,670,675,99.26
raw,h3k36me3,2,693,695,99.71
raw,h3k4me1,1,962,963,99.9


10fold-only_raw


Unnamed: 0_level_0,Same?,False,True,Total,Accuracy (%)
track_type,assay_epiclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
raw,h3k27ac,7,1556,1563,99.55
raw,h3k27me3,1,674,675,99.85
raw,h3k36me3,1,694,695,99.86
raw,h3k4me1,1,962,963,99.9
raw,h3k4me3,2,797,799,99.75
raw,h3k9me3,2,640,642,99.69


10fold-only_pval


Unnamed: 0_level_0,Same?,False,True,Total,Accuracy (%)
track_type,assay_epiclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pval,h3k27ac,3,1560,1563,99.81
pval,h3k27me3,1,674,675,99.85
pval,h3k36me3,1,694,695,99.86
pval,h3k4me1,1,962,963,99.9
pval,h3k4me3,3,796,799,99.62
pval,h3k9me3,0,642,642,100.0


10fold


Unnamed: 0_level_0,Same?,False,True,Total,Accuracy (%)
track_type,assay_epiclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pval,h3k27ac,2,1561,1563,99.87
pval,h3k27me3,1,674,675,99.85
pval,h3k36me3,1,694,695,99.86
pval,h3k4me1,0,963,963,100.0
pval,h3k4me3,0,799,799,100.0
pval,h3k9me3,0,642,642,100.0
raw,h3k27ac,4,1559,1563,99.74
raw,h3k27me3,2,673,675,99.7
raw,h3k36me3,1,694,695,99.86
raw,h3k4me1,2,961,963,99.79
