In [None]:
"""Perform gene ontology analysis on bed files."""
# pylint: disable=import-error
from __future__ import annotations

import re
import subprocess
from pathlib import Path

import gffpandas.gffpandas as gffpd
import pandas as pd
from gprofiler import GProfiler

from epi_ml.utils.general_utility import get_valid_filename

In [None]:
# Root directory of the project; every input and output path below is derived from it.
base_path = Path.home().joinpath("Projects/epilap")

## Taking care of the gff file

In [None]:
# Location of the unfiltered GFF3 annotation file.
gff_path = base_path.joinpath("input", "gff", "Homo_sapiens.GRCh38.109.chr.gff3")

# Parse the annotation with gffpandas; the records are then available through `gff_df.df`.
gff_df = gffpd.read_gff3(gff_path)

In [None]:
# Feature types to keep: every type present in the annotation whose name contains
# "gene", leaving out the ones whose name contains "segment".
accepted_types = [
    feature_type
    for feature_type in gff_df.df["type"].unique().tolist()
    if "gene" in feature_type and "segment" not in feature_type
]

In [None]:
# Restrict the annotation to the feature types listed in `accepted_types`.
# The result replaces `gff_df`, so the following cells work on the filtered annotation.
gff_df = gff_df.filter_feature_of_type(accepted_types)

In [None]:
# Prefix the sequence IDs with "chr" (e.g. "1" -> "chr1"), presumably to match the
# chromosome naming used by the bed files (TODO confirm).
# IDs that already start with "chr" are left untouched, so re-running this cell
# does not produce "chrchr1".
seq_ids = gff_df.df["seq_id"].astype(str)
gff_df.df["seq_id"] = seq_ids.where(seq_ids.str.startswith("chr"), "chr" + seq_ids)

In [None]:
# Preview the first rows to check the filtering and the "chr" prefix.
gff_df.df.head(n=5)

In [None]:
# Save the filtered annotation in the same directory as the input GFF3 file.
gff_df.to_gff3(gff_path.with_name("Homo_sapiens.GRCh38.109.chr.filtered.gff3"))

Note: the `##sequence-region` header lines of the GFF3 file also had to be modified to add the `chr` prefix; this step is not performed by the code above.

## Using bedtools intersect

Use the biggest file as the `-b` input of `bedtools intersect` (in the cells below, the GFF3 annotation).

In [None]:
# Directory whose first-level sub-folders hold the bed files to annotate.
bed_base_dir = base_path.joinpath(
    "output/logs/epiatlas-dfreeze-v2.1/hg38_100kb_all_none/shap/harmonized_sample_ontology_intermediate_1l_3000n/10fold/split0/shap/rna_only/frequent_features/feature_frequency_method"
)

In [None]:
# Path handed to subprocess as the bedtools executable.
bedtools_path = Path.home().joinpath("downloads", "installations", "bedtools")

# From here on, `gff_path` points to the filtered annotation written earlier in the notebook.
gff_path = base_path.joinpath(
    "input", "gff", "Homo_sapiens.GRCh38.109.chr.filtered.gff3"
)

In [None]:
# Fail early with a clear message instead of letting the loops below error out.
if not bed_base_dir.exists():
    raise ValueError(f"{bed_base_dir} does not exist.")
# The path is traversed with iterdir()/rglob() afterwards, so it must be a directory.
if not bed_base_dir.is_dir():
    raise ValueError(f"{bed_base_dir} is not a directory.")

In [None]:
# Intersect every bed file found one level below `bed_base_dir` with the GFF
# annotation at `gff_path`. Each result is written next to its bed file as
# `<category>_<bed stem>_intersect.tsv`.
for bed_folder in bed_base_dir.iterdir():
    # only looking into folders at first level
    if bed_folder.is_file():
        continue

    category_name = get_valid_filename(bed_folder.name)

    for bed_file in bed_folder.glob("*.bed"):
        # Match against the file name only, so that nothing in the parent
        # directories can interfere with the pattern.
        label_match = re.search(
            pattern=r"frequent_features_\d+_(.*?)\.bed", string=bed_file.name
        )
        # The label is only used for logging: fall back to the file stem rather
        # than crashing on a bed file that does not follow the naming scheme.
        class_label = label_match[1] if label_match else bed_file.stem
        print(category_name, class_label)

        results_filename = f"{category_name}_{bed_file.stem}_intersect.tsv"
        result_path = bed_folder / results_filename

        # don't redo work
        if result_path.is_file():
            continue

        print(results_filename)

        # -wo: report the A and B entries plus the number of overlapping bases.
        # -F 0.5: require the overlap to cover at least half of the B (GFF) feature.
        cmd = [
            str(bedtools_path),
            "intersect",
            "-a",
            str(bed_file),
            "-b",
            str(gff_path),
            "-wo",
            "-F",
            "0.5",
        ]
        # check_output raises CalledProcessError on a non-zero exit status, so
        # no result file is created for a failed run.
        output = subprocess.check_output(cmd).decode()

        print(f"Writing to {result_path}")
        with open(result_path, "w", encoding="utf8") as out:
            # `output` is a single string: write it in one call.
            out.write(output)

## Using gProfiler

In [None]:
# g:Profiler client; `return_dataframe=True` is requested so that the results can be
# saved with `DataFrame.to_csv` below.
gp = GProfiler(return_dataframe=True)

In [None]:
# Run a g:Profiler query on the genes of every intersect file found under
# `bed_base_dir`. Results are written next to the intersect file as
# `<intersect stem>_gprofiler.tsv`.
for intersect_file in bed_base_dir.rglob("*_intersect.tsv"):
    # Don't redo work
    new_file = intersect_file.parent / f"{intersect_file.stem}_gprofiler.tsv"
    if new_file.is_file():
        continue

    try:
        intersect_df = pd.read_csv(intersect_file, sep="\t", header=None)
    except pd.errors.EmptyDataError:
        # bedtools found no overlap for this bed file: nothing to profile.
        continue

    # Column 11 is expected to hold the GFF attributes field.
    # NOTE(review): this assumes 3-column bed inputs (3 bed + 9 GFF columns) - confirm.
    # Rows without a gene ID yield NaN and are dropped so they never reach the query.
    genes = (
        intersect_df[11]
        .str.extract(r"ID=gene:(\w+);")
        .dropna()
        .drop_duplicates()
    )
    genes_list = genes[0].tolist()

    # Skip the web query when there is nothing to send.
    if not genes_list:
        print(f"No gene ID found in {intersect_file}, skipping.")
        continue

    print(f"Writing GO results to {new_file}")
    go_profile = gp.profile(query=genes_list)
    go_profile.to_csv(new_file, sep="\t", index=False)