In [5]:
"""Perform gene ontology analysis on bed files."""
# pylint: disable=import-error,unused-import,redefined-outer-name
from __future__ import annotations

import os
import subprocess
from pathlib import Path

import gffpandas.gffpandas as gffpd
import pandas as pd
from gprofiler import GProfiler

from epi_ml.utils.general_utility import get_valid_filename

In [6]:
# Root directory of the project; the other paths in this notebook are built from it.
base_path = Path.home().joinpath("Projects/epilap")

## Taking care of the gff file

In [7]:
def filter_gff() -> None:
    """Write a gene-only copy of the GFF3 annotation with 'chr'-prefixed sequence ids.

    Reads ``Homo_sapiens.GRCh38.109.chr.gff3`` from ``base_path / "input" / "gff"``,
    prefixes every ``seq_id`` with "chr", keeps the features whose type contains
    "gene" but not "segment", and writes the result in the same directory as
    ``Homo_sapiens.GRCh38.109.chr.filtered.gff3``.

    Only feature rows are modified; this function does not edit header lines.
    """
    gff_path = base_path / "input" / "gff" / "Homo_sapiens.GRCh38.109.chr.gff3"
    annotation = gffpd.read_gff3(gff_path)
    features: pd.DataFrame = annotation.df  # type: ignore

    # Prefix BEFORE filtering. The filter call hands back another object, and a
    # prefix applied to this frame afterwards would not reach the frame that is
    # actually written out (the previous statement order had that problem).
    features["seq_id"] = "chr" + features["seq_id"].astype(str)

    accepted_types = [
        feature_type
        for feature_type in features["type"].unique().tolist()
        if "gene" in feature_type and "segment" not in feature_type
    ]

    filtered = annotation.filter_feature_of_type(accepted_types)
    filtered.to_gff3(gff_path.parent / "Homo_sapiens.GRCh38.109.chr.filtered.gff3")

Note: the `##sequence-region` header lines of the filtered file also had to be modified to add the 'chr' prefix; `filter_gff` does not do this.

## Using bedtools intersect

Pass the larger of the two files as the `-b` argument of `bedtools intersect`.

In [8]:
# bed_base_dir = (
#     base_path
#     / "output/logs/epiatlas-dfreeze-v2.1/hg38_100kb_all_none/shap/harmonized_sample_ontology_intermediate_1l_3000n/10fold/split0/shap/rna_only/frequent_features/feature_frequency_method"
# )
# if not bed_base_dir.exists():
#     raise ValueError(f"{bed_base_dir} does not exist.")

In [9]:
# Executable used as the first element of the bedtools command line.
BEDTOOLS_PATH = Path.home().joinpath("downloads", "installations", "bedtools")
# Filtered annotation file given to bedtools as the "-b" input.
GFF_PATH = base_path.joinpath(
    "input", "gff", "Homo_sapiens.GRCh38.109.chr.filtered.gff3"
)

In [10]:
def intersect_one_bed(input_bed_path: Path, output_filename: Path) -> None:
    """Intersect one BED file with the filtered GFF using ``bedtools intersect``.

    Runs ``bedtools intersect -a <input_bed_path> -b <GFF_PATH> -wo -F 0.5`` and
    saves the command's standard output to ``output_filename``. If
    ``output_filename`` already exists, a notice is printed and nothing is run.

    Args:
        input_bed_path: BED file passed to bedtools as ``-a``.
        output_filename: File that receives the intersect output.

    Raises:
        subprocess.CalledProcessError: If bedtools exits with a non-zero status.
            The output file is not created in that case, so a later call retries.
    """
    # don't redo work
    if output_filename.is_file():
        print(f"{output_filename} already exists.")
        return

    # Per the bedtools documentation: -wo reports both original entries plus the
    # overlap size; -F 0.5 requires the overlap to cover at least half of the B feature.
    cmd = [
        str(BEDTOOLS_PATH),
        "intersect",
        "-a",
        str(input_bed_path),
        "-b",
        str(GFF_PATH),
        "-wo",
        "-F",
        "0.5",
    ]
    # Capture first, write second: a failed run must not leave a file behind that
    # the "already exists" guard would later mistake for a finished result.
    output = subprocess.check_output(cmd).decode()

    print(f"Writing to {output_filename}")
    with open(output_filename, "w", encoding="utf8") as out:
        # `output` is a single string; write() emits it in one call, whereas
        # writelines() would iterate over it character by character.
        out.write(output)

In [11]:
# bed_folder = Path.home() / "scratch/epiclass/join_important_features/global_info"
# for bed_file in bed_folder.glob("*.bed"):
#     if os.stat(str(bed_file)).st_size == 0:
#         os.remove(str(bed_file))
#         continue
#     output_name = Path(bed_file.stem + "_intersect_gff.tsv")
#     output_filename = bed_folder / output_name
#     intersect_one_bed(bed_file, output_filename)

## Using gProfiler

In [12]:
# Shared gProfiler client for the GO queries in this section.
# NOTE(review): return_dataframe=True presumably makes queries return pandas
# DataFrames — confirm against the gprofiler-official documentation.
gp = GProfiler(return_dataframe=True)

In [13]:
# Folder scanned for the "*_intersect_gff.tsv" files; the gene lists are written here too.
bed_folder = Path.home().joinpath(
    "scratch/epiclass/join_important_features/harmonized_sample_ontology_intermediate_1l_3000n/10fold-oversampling/global_shap_analysis/select_beds_top303/"
)

# Fail early with an explicit message rather than silently globbing nothing later.
folder_found = bed_folder.exists()
if not folder_found:
    raise ValueError(f"{bed_folder} does not exist.")

In [17]:
# Column of the bedtools output that holds the GFF attributes string.
# NOTE(review): index 11 assumes a 3-column BED followed by the 9 GFF columns — confirm.
GFF_ATTRIBUTES_COLUMN = 11

# For every intersect result, write the sorted, de-duplicated gene ids it mentions
# to "<stem>_genes.list" in the same folder. sorted() fixes the processing order.
for intersect_file in sorted(bed_folder.glob("*_intersect_gff.tsv")):
    try:
        intersect_df = pd.read_csv(intersect_file, sep="\t", header=None)
    except pd.errors.EmptyDataError:
        # Empty intersect file: there is no gene to extract.
        print(f"Skipping empty file {intersect_file}")
        continue

    # Rows whose attributes do not match the pattern yield NaN; they are dropped
    # because NaN cannot be sorted or joined together with strings.
    gene_ids = (
        intersect_df[GFF_ATTRIBUTES_COLUMN]
        .str.extract(r"ID=gene:(\w+);", expand=False)
        .dropna()
        .unique()
    )
    genes_list = sorted(gene_ids)

    gene_list_path = bed_folder / f"{intersect_file.stem}_genes.list"
    with open(gene_list_path, "w", encoding="utf8") as out:
        out.write("\n".join(genes_list))

    # gProfiler query, currently disabled. Uncomment the whole group to enable it.
    # new_file = bed_folder / f"{intersect_file.stem}_gprofiler.tsv"
    # if new_file.is_file():  # Don't redo work
    #     continue
    # print(f"Writing GO results to {new_file}")
    # go_profile = gp.profile(query=genes_list)
    # go_profile.to_csv(new_file, sep="\t", index=False)  # type: ignore