In [None]:
"""Download and process information from the UniProt database on transciption factors available on factorbook.org"""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines

In [18]:
import json
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pandas as pd
import requests

## Download UniProt content 

In [11]:
def get_gene_id(name: str, timeout: float = 30.0) -> Tuple[str, Optional[str]]:
    """Look up the GeneID listed on the ENCODE target page of a human target.

    Args:
        name: Target name, inserted as-is into the ENCODE URL
            (``/targets/{name}-human/``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block a worker thread forever.

    Returns:
        ``(name, gene_id)`` where ``gene_id`` is the first ``GeneID:<digits>``
        found in the page, or ``None`` when the request fails, the status is
        not 200, or no GeneID is present.
    """
    url = f"https://www.encodeproject.org/targets/{name}-human/"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A network failure is reported like any other "not found" so that one
        # bad request does not abort a whole batch of lookups.
        return name, None

    if response.status_code != 200:
        return name, None

    match = re.search(r"GeneID:(\d+)", response.text)
    return name, (match.group(1) if match else None)


def get_uniprot_id(
    name_gene_id: Tuple[str, Optional[str]], timeout: float = 30.0
) -> Tuple[str, Optional[str]]:
    """Look up the UniProtKB identifier listed on the ENCODE page of a gene.

    Args:
        name_gene_id: ``(name, gene_id)`` pair as returned by ``get_gene_id``.
            A ``None`` gene id is passed through without any request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block a worker thread forever.

    Returns:
        ``(name, uniprot_id)`` where ``uniprot_id`` is the first
        ``UniProtKB:<word>`` found in the page, or ``None`` when there is no
        gene id, the request fails, the status is not 200, or nothing matches.
    """
    name, gene_id = name_gene_id
    if gene_id is None:
        return name, None

    url = f"https://www.encodeproject.org/genes/{gene_id}/"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A network failure is reported like any other "not found" so that one
        # bad request does not abort a whole batch of lookups.
        return name, None

    if response.status_code != 200:
        return name, None

    match = re.search(r"UniProtKB:(\w+)", response.text)
    return name, (match.group(1) if match else None)


def download_uniprot_file(
    name_uniprot_id: Tuple[str, Optional[str]],
    output_folder: Path,
    timeout: float = 30.0,
) -> Tuple[str, bool]:
    """Download the text and JSON UniProtKB records of one entry.

    Each successfully fetched format is written to
    ``output_folder / "{name}_{uniprot_id}.{txt|json}"``.

    Args:
        name_uniprot_id: ``(name, uniprot_id)`` pair as returned by
            ``get_uniprot_id``. A ``None`` id is passed through without any
            request.
        output_folder: Directory the files are written to.
        timeout: Seconds to wait for the server on each request. Without a
            timeout a stalled connection would block a worker thread forever.

    Returns:
        ``(name, is_success)``; ``is_success`` is ``True`` as soon as at least
        one of the two formats was downloaded and written.
    """
    name, uniprot_id = name_uniprot_id
    if uniprot_id is None:
        return name, False

    base_url = "https://rest.uniprot.org/uniprotkb"
    is_success = False
    for extension in ("txt", "json"):
        url = f"{base_url}/{uniprot_id}.{extension}"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Keep going: the other format may still be retrievable.
            continue
        if response.status_code != 200:
            continue

        filename = output_folder / f"{name}_{uniprot_id}.{extension}"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(response.text)
        is_success = True

    return name, is_success


def extract_function(filepath: Path) -> Dict[str, Optional[str]]:
    """Extract the first ``-!- FUNCTION:`` comment from a UniProt text file.

    The captured text ends at the first of: a blank line, the next ``-!-``
    topic (with or without the ``CC   `` line prefix), the ``CC   ---``
    separator line, or the end of the file. Stopping only at a blank line is
    not enough for UniProt flat files, where consecutive ``CC`` topics are not
    separated by blank lines and the capture would run to the end of the file.

    Args:
        filepath: Path of the text file to read (UTF-8).

    Returns:
        ``{"name": <file stem>, "function": <text or None>}``. Line breaks are
        replaced by single spaces and ``CC`` continuation prefixes are removed.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    match = re.search(
        r"-!- FUNCTION: (.+?)(?=\n\n|\n(?:CC {3})?-!- |\nCC {3}-{3,}|\Z)",
        content,
        re.DOTALL,
    )
    if match is None:
        return {"name": filepath.stem, "function": None}

    # Continuation lines look like "CC       text"; require at least three
    # spaces so ordinary text that merely starts with "CC " is left untouched.
    lines = [re.sub(r"^CC {3,}", "", line) for line in match.group(1).split("\n")]
    function = " ".join(lines).strip()
    return {"name": filepath.stem, "function": function}


def process_names(
    names: List[str], output_folder: Path, max_workers: int = 10
) -> List[Tuple[str, bool]]:
    """Run the three-stage lookup/download pipeline for every name.

    Stages, each run concurrently on a shared thread pool and fully completed
    before the next one starts:

    1. ``get_gene_id`` on each name.
    2. ``get_uniprot_id`` on each result of stage 1.
    3. ``download_uniprot_file`` on each result of stage 2.

    Args:
        names: Names to process.
        output_folder: Existing directory handed to ``download_uniprot_file``.
        max_workers: Size of the thread pool.

    Returns:
        The results of stage 3, in completion order (not input order).

    Raises:
        FileNotFoundError: If ``output_folder`` does not exist.
    """
    if not output_folder.exists():
        raise FileNotFoundError(f"Output folder {output_folder} does not exist")

    with ThreadPoolExecutor(max_workers=max_workers) as pool:

        def run_stage(task, items, *extra_args):
            # Submit everything first, then collect results as they finish.
            pending = [pool.submit(task, item, *extra_args) for item in items]
            return [done.result() for done in as_completed(pending)]

        gene_id_pairs = run_stage(get_gene_id, names)
        uniprot_id_pairs = run_stage(get_uniprot_id, gene_id_pairs)
        download_status = run_stage(
            download_uniprot_file, uniprot_id_pairs, output_folder
        )

    return download_status

In [12]:
# Table whose first column holds the names to look up (tab-separated, no header row).
names_path = Path.home() / "Projects/epiclass/output/paper/data/TF/TF_table.tsv"

# Read the first column and upper-case every entry.
names = [
    raw_name.upper()
    for raw_name in pd.read_csv(names_path, header=None, sep="\t").iloc[:, 0].tolist()
]

In [14]:
# Folder next to the names table that holds the downloaded UniProt files.
# It is never created here: a missing folder is reported as an error.
output_dir = names_path.with_name("uniprot_files")
if not output_dir.exists():
    raise FileNotFoundError(f"Output folder {output_dir} does not exist")

In [None]:
# success_values = process_names(names, output_dir)

## Process json files

In [63]:
# Collect the FUNCTION and SIMILARITY comment of every downloaded UniProt JSON
# record into a list of flat dicts, one per file.
functions = []
for file in output_dir.glob("*.json"):
    with open(file, "r", encoding="utf-8") as f:
        content = json.load(f)

    # Records without a "comments" section are skipped entirely.
    all_comments = content.get("comments")
    if all_comments is None:
        continue

    fct_val = "NA"
    sim_val = "NA"
    for comment in all_comments:
        comment_type = comment.get("commentType")
        if comment_type not in ("FUNCTION", "SIMILARITY"):
            continue
        texts = comment.get("texts")
        if not texts:
            # Nothing to read; keep the current value.
            continue
        # Only the first text of a comment is used.
        # NOTE(review): when a record has several comments of the same type,
        # the last one wins — confirm this is the intended choice.
        value = texts[0]["value"]
        if comment_type == "FUNCTION":
            fct_val = value
        else:
            sim_val = value

    # File stems look like "<name>_<uniprot_id>"; split on the LAST underscore
    # so that a name containing "_" does not break the unpacking.
    name, db_id = file.stem.rsplit("_", 1)
    functions.append(
        {"name": name, "uniprot_id": db_id, "function": fct_val, "similarity": sim_val}
    )

In [65]:
# Write the collected annotations, ordered by name, next to the names table.
tf_functions = pd.DataFrame(functions).sort_values(by="name")
tf_functions.to_csv(names_path.parent / "TF_functions.tsv", sep="\t", index=False)