In [None]:
## TODO: Sanity-check the gene conversion -- make sure the merged homolog tables come back in the
# same order as the queried gene IDs (query and answer must line up row-for-row). Still to check:
## Mouse codes
## Canine codes

In [None]:
## This notebook needs to be run on the head node, as both mygene and Biomart query online databases.

In [11]:
# Imports
import csv
import glob2
import pandas as pd
import matplotlib.pyplot as plt
import mygene
import numpy as np
import scanpy as sc
import squidpy as sq
import warnings
from anndata import AnnData
from gseapy import Biomart
from scipy import sparse

In [12]:
# Globals
# Directory layout of the project; every path is derived from the project root.
PRJ_DIR = "/scratch/gpfs/KANG/sereno/spatialstem"
SRC_DIR = PRJ_DIR + "/sourcefiles"
RAW_DIR = SRC_DIR + "/raw"
HAD_DIR = SRC_DIR + "/h5ad"  # location of the .h5ad datasets globbed below
INT_DIR = PRJ_DIR + "/intermediates"
FIG_DIR = PRJ_DIR + "/figs"

In [None]:
# Global processors for mouse-to-human (M2H) and canine-to-human (C2H) conversion from biomart.
# NOTE: `bm` was previously used without ever being created (NameError on a fresh kernel);
# the Biomart client is now instantiated here.
bm = Biomart()

# Attributes requested for every species: the species' own gene id / name plus its human homolog.
_HOMOLOG_ATTRIBUTES = ['ensembl_gene_id', 'external_gene_name',
                       'hsapiens_homolog_ensembl_gene',
                       'hsapiens_homolog_associated_gene_name']


def _query_human_homologs(dataset: str):
    """Query Biomart for the human homologs of every gene in ``dataset``.

    Args:
        dataset: Ensembl Biomart dataset name (e.g. ``'mmusculus_gene_ensembl'``).

    Returns:
        The query result with one row per ``ensembl_gene_id`` (first occurrence kept),
        indexed by ``ensembl_gene_id``; the id is also kept as a regular column.
    """
    homologs = bm.query(dataset=dataset, attributes=_HOMOLOG_ATTRIBUTES)
    # A gene with several human homologs comes back as several rows; keep the first only
    # so that the index is unique.
    homologs = homologs.drop_duplicates(subset='ensembl_gene_id')
    return homologs.set_index('ensembl_gene_id', drop=False)


M2H = _query_human_homologs('mmusculus_gene_ensembl')
C2H = _query_human_homologs('clfamiliaris_gene_ensembl')

In [3]:
# Collect the paths of all H5AD datasets under HAD_DIR, in sorted order.
h5ad_paths = sorted(glob2.glob(f"{HAD_DIR}/*.h5ad"))

In [None]:
# Handles an anndata object with human gene symbols as features by placing them in the correct slot.
def handle_human_symbols(ad: AnnData) -> AnnData:
    """Copy the human gene symbols already stored in ``ad.var`` into ``ad.var["human_symbol"]``.

    The symbols are read from ``ad.var["gene_ids"]`` if that column exists, otherwise from
    ``ad.var["SYMBOL"]``.

    Args:
        ad: Object whose ``var`` table holds the gene symbols; modified in place.

    Returns:
        The same ``ad`` object, with the ``"human_symbol"`` column added to ``ad.var``.

    Raises:
        KeyError: If ``ad.var`` has neither a ``"gene_ids"`` nor a ``"SYMBOL"`` column
            (previously this surfaced as an opaque UnboundLocalError).
    """
    for source_key in ("gene_ids", "SYMBOL"):
        if source_key in ad.var.keys():
            ad.var["human_symbol"] = list(ad.var[source_key])
            return ad
    raise KeyError('Neither "gene_ids" nor "SYMBOL" found in ad.var; cannot set "human_symbol".')

In [None]:
# Handles an anndata object with human gene codes as features by translating them to human symbols.
def handle_human_codes(ad: AnnData) -> AnnData:
    """Store the human gene codes of ``ad`` and add the matching human symbols.

    The codes are read from ``ad.var["gene_ids"]`` (or ``ad.var["SYMBOL"]`` as a fallback) and
    saved to ``ad.var["human_codes"]``. The symbols written to ``ad.var["human_symbol"]`` are:

    * the var index itself, when its first label is neither numeric nor an ``ENS...`` code
      (i.e. the index already holds symbols);
    * otherwise the result of ``translate_human_gene_codes`` on the codes. This now also covers
      a numeric (default) index, which previously left ``gene_symbols`` unassigned and crashed.

    Args:
        ad: Object whose ``var`` table holds the gene codes; modified in place.

    Returns:
        The same ``ad`` object with ``"human_codes"`` and ``"human_symbol"`` columns in ``ad.var``.

    Raises:
        KeyError: If ``ad.var`` has neither a ``"gene_ids"`` nor a ``"SYMBOL"`` column.
    """
    for source_key in ("gene_ids", "SYMBOL"):
        if source_key in ad.var.keys():
            gene_list = list(ad.var[source_key])
            break
    else:
        raise KeyError('Neither "gene_ids" nor "SYMBOL" found in ad.var; cannot read gene codes.')
    ad.var["human_codes"] = gene_list

    var_idx = list(ad.var.index)
    if not var_idx:
        # Nothing to translate; avoids indexing into an empty index.
        gene_symbols = []
    elif _index_holds_symbols(var_idx[0]):
        gene_symbols = var_idx
    else:
        gene_symbols = translate_human_gene_codes(gene_list)
    ad.var["human_symbol"] = gene_symbols
    return ad


def _index_holds_symbols(first_label) -> bool:
    """Return True when a var-index label looks like a gene symbol (not numeric, not an ENS code)."""
    try:
        float(first_label)
    except ValueError:
        return first_label[0:3] != "ENS"
    # Label parses as a number: default indexing, not symbols.
    return False

In [None]:
# Handles an anndata object with mouse gene codes as features by translating them to human symbols.
def handle_mouse_codes(ad: AnnData) -> AnnData:
    """Add ``ad.var["human_symbol"]`` by looking up each mouse Ensembl gene id in ``M2H``.

    The lookup is a ``reindex`` of the homolog-name column of ``M2H`` (indexed by mouse
    ``ensembl_gene_id``) with the ids from ``ad.var["gene_ids"]``. The result therefore has
    exactly one entry per row of ``ad.var``, in the same order, without relying on the row
    order produced by a merge. Ids with no entry in ``M2H`` get NaN.

    Unlike the previous implementation, the index of the Series taken from ``ad.var`` is not
    overwritten.

    Args:
        ad: Object whose ``var["gene_ids"]`` column holds mouse Ensembl gene ids; modified in place.

    Returns:
        The same ``ad`` object with the ``"human_symbol"`` column added to ``ad.var``.
    """
    gene_ids = list(ad.var["gene_ids"])
    # reindex needs a unique source index; M2H is deduplicated on creation, this is a cheap guard.
    unique_rows = ~M2H.index.duplicated(keep="first")
    homolog_by_code = M2H.loc[unique_rows, 'hsapiens_homolog_associated_gene_name']
    ad.var["human_symbol"] = list(homolog_by_code.reindex(gene_ids))
    return ad

In [None]:
# NOTE: leftover scratch cell removed. It held only an unfinished `for gene_id` statement,
# which is a SyntaxError and stops "Restart Kernel -> Run All" at this point.

In [None]:
# Handles an anndata object with mouse symbols as features by translating them to human symbols.
def handle_mouse_symbols(ad: AnnData) -> AnnData:
    """Add ``ad.var["human_symbol"]`` by looking up each mouse gene symbol in ``M2H``.

    A symbol -> human-homolog-name lookup is built from ``M2H`` (first row kept when a mouse
    symbol occurs more than once) and applied to ``ad.var["gene_ids"]`` with ``Series.map``.
    The result has exactly one entry per row of ``ad.var`` in the same order. Symbols not
    present in ``M2H`` get NaN.

    The previous merge + ``drop_duplicates`` approach removed rows whenever a symbol was repeated
    in ``ad.var["gene_ids"]``, leaving a list shorter than ``ad.var``; it also overwrote the
    index of the Series taken from ``ad.var``.

    Args:
        ad: Object whose ``var["gene_ids"]`` column holds mouse gene symbols; modified in place.

    Returns:
        The same ``ad`` object with the ``"human_symbol"`` column added to ``ad.var``.
    """
    homolog_by_symbol = (
        M2H.dropna(subset=["external_gene_name"])
        # M2H is unique per Ensembl id, not per symbol; map() needs unique keys.
        .drop_duplicates(subset="external_gene_name")
        .set_index("external_gene_name")["hsapiens_homolog_associated_gene_name"]
    )
    ad.var["human_symbol"] = list(ad.var["gene_ids"].map(homolog_by_symbol))
    return ad

In [None]:
# Handles an anndata object with canine gene codes as features by translating them to human symbols.
def handle_canine_codes(ad: AnnData) -> AnnData:
    """Add ``ad.var["human_symbol"]`` by looking up each canine Ensembl gene id in ``C2H``.

    The lookup is a ``reindex`` of the homolog-name column of ``C2H`` (indexed by canine
    ``ensembl_gene_id``) with the ids from ``ad.var["gene_ids"]``. The result therefore has
    exactly one entry per row of ``ad.var``, in the same order, without relying on the row
    order produced by a merge. Ids with no entry in ``C2H`` get NaN.

    Unlike the previous implementation, the index of the Series taken from ``ad.var`` is not
    overwritten.

    Args:
        ad: Object whose ``var["gene_ids"]`` column holds canine Ensembl gene ids; modified in place.

    Returns:
        The same ``ad`` object with the ``"human_symbol"`` column added to ``ad.var``.
    """
    gene_ids = list(ad.var["gene_ids"])
    # reindex needs a unique source index; C2H is deduplicated on creation, this is a cheap guard.
    unique_rows = ~C2H.index.duplicated(keep="first")
    homolog_by_code = C2H.loc[unique_rows, 'hsapiens_homolog_associated_gene_name']
    ad.var["human_symbol"] = list(homolog_by_code.reindex(gene_ids))
    return ad

In [None]:
# Gets (human) gene symbols of each (human) gene code in an anndata object.

# Placeholder prefix per Ensembl gene type, used when mygene returns no symbol for a gene.
# The prefixes are kept exactly as before so previously written files stay comparable.
_PLACEHOLDER_PREFIXES = {
    "missing": "m_missing",
    "lncRNA": "m_lncRNA",
    "transcribed_unprocessed_pseudogene": "m_TUP",
    "transcribed_processed_pseudogene": "m_TPP",
    "TEC": "m_TEC",
    "protein_coding": "m_protein",
    "transcribed_unitary_pseudogene": "m_TAP",
    "processed_pseudogene": "m_PP",
    "unprocessed_pseudogene": "m_UP",
    "IG_V_gene": "m_IGV",
    "IG_V_pseudogene": "m_IGVP",
    "artifact": "m_artifact",
}


def _ensembl_gene_type(gene_info: dict) -> str:
    """Return the Ensembl ``type_of_gene`` of a mygene record, or ``"missing"`` if unavailable.

    The ``"ensembl"`` field may be a dict or a list of dicts (several Ensembl ids mapping to one
    entry); for a list the first element is used. A list element without ``type_of_gene`` is
    reported as ``"missing"`` (previously the KeyError escaped from inside the ``except
    TypeError`` handler and crashed the translation).
    """
    ensembl = gene_info.get("ensembl")
    if isinstance(ensembl, list):
        ensembl = ensembl[0] if ensembl else None
    if isinstance(ensembl, dict):
        return ensembl.get("type_of_gene", "missing")
    return "missing"


def translate_human_gene_codes(gene_list: list) -> list:
    """Translate human gene codes to gene symbols via mygene.

    mygene is queried once for the whole list (this contacts an online service). For every code
    in ``gene_list`` the first returned record for that query is used. If the record has a
    ``"symbol"`` it is returned; otherwise a numbered placeholder based on the gene's Ensembl
    type is generated (e.g. ``"m_lncRNA1"``, ``"m_missing3"``), numbered per type in order of
    appearance.

    Args:
        gene_list: Gene codes to translate.

    Returns:
        A list of symbols / placeholders, one per element of ``gene_list``, in the same order.

    Raises:
        ValueError: If mygene returned no record for one of the codes.
        Exception: If a symbol-less gene has a type with no placeholder prefix defined.
    """
    mg = mygene.MyGeneInfo()
    gene_info_list = mg.getgenes(gene_list, fields='name, symbol, entrezgene, taxid, ensembl')

    # Position of the first record per query; a dict lookup replaces the former list.index()
    # call per gene, which was quadratic in the number of genes.
    first_record_idx = {}
    for idx, gene_info in enumerate(gene_info_list):
        first_record_idx.setdefault(gene_info["query"], idx)

    # Fail before translating anything if a code has no record (same error type as list.index).
    for gene_code in gene_list:
        if gene_code not in first_record_idx:
            raise ValueError(f"{gene_code!r} is not in list")

    gene_symbols = []
    placeholder_counts = {}
    for gene_code in gene_list:
        gene_info = gene_info_list[first_record_idx[gene_code]]
        if "symbol" in gene_info:
            gene_symbols.append(gene_info["symbol"])
            continue
        # No assigned symbol: fall back to a numbered placeholder for the gene type.
        gene_type = _ensembl_gene_type(gene_info)
        if gene_type not in _PLACEHOLDER_PREFIXES:
            raise Exception("Unhandled gene type: " + gene_type)
        placeholder_counts[gene_type] = placeholder_counts.get(gene_type, 0) + 1
        gene_symbols.append(_PLACEHOLDER_PREFIXES[gene_type] + str(placeholder_counts[gene_type]))
    return gene_symbols

In [None]:
## Snippet for finding gene types present in the data with missing symbols.
# encountered_types = []
# for gene_info in gene_info_list:
#     if "symbol" in gene_info.keys():
#         continue
#     try:
#         gene_type = gene_info["ensembl"]["type_of_gene"]
#     except KeyError:
#         gene_type = "missing"
#     except TypeError:
#         gene_type = gene_info["ensembl"][0]["type_of_gene"]
#     if gene_type not in encountered_types:
#         print(gene_type)
#         encountered_types.append(gene_type)

In [5]:
# Iterates through H5ads, finds format of gene ids, and handles each one based on its format.
# Goal: make new vars columns with original id and human-converted id.

# Datasets that already have a "human_symbol" column are skipped unless `force` is True.
# `force` was never defined in this notebook (NameError on a fresh kernel); default it to False
# while still honoring a value set in an earlier cell.
force = globals().get("force", False)

# Detected gene-id format -> (handler, human-readable label).
# "ENSG"/"ENSM"/"ENSC" are the first four characters of human/mouse/canine Ensembl gene ids;
# "MMUS"/"HSAS" are internal tags for mouse and human gene symbols.
GENE_FORMAT_HANDLERS = {
    "ENSG": (handle_human_codes, "human codes"),
    "ENSM": (handle_mouse_codes, "mouse codes"),
    "ENSC": (handle_canine_codes, "canine codes"),
    "MMUS": (handle_mouse_symbols, "mouse symbols"),
    "HSAS": (handle_human_symbols, "human symbols"),
}

for h5ad_path in h5ad_paths:
    ad = sc.read_h5ad(h5ad_path)
    # Skips datasets that have already been processed unless force flag set to True
    if "human_symbol" in ad.var.keys() and not force:
        continue
    if "gene_ids" in ad.var.keys():
        test_gene = ad.var["gene_ids"].iloc[0]
    elif "SYMBOL" in ad.var.keys():
        test_gene = ad.var["SYMBOL"].iloc[0]
    else:
        # Previously execution fell through here and used an unbound (or stale, from the
        # previous dataset) `test_gene`; skip the dataset instead.
        print(h5ad_path, "Gene ID key not found in vars.")
        continue
    gene_id = test_gene[0:4]
    # Detects gene symbols and splits into human or mouse symbols:
    # any lowercase character in the test gene is taken to mean a mouse symbol.
    if gene_id[0:3] != "ENS":
        gene_id = "MMUS" if any(c.islower() for c in test_gene) else "HSAS"
    # Flags unhandled genes
    if gene_id not in GENE_FORMAT_HANDLERS:
        raise Exception(f"Unhandled gene symbol: {gene_id} from {test_gene}.")
    # The leftover debugging `continue` statements that made every branch except mouse symbols
    # unreachable have been removed; all five formats are now processed.
    handler, gene_type = GENE_FORMAT_HANDLERS[gene_id]
    ad = handler(ad)
    ad.write_h5ad(h5ad_path)
    print(f"Standardized: {h5ad_path}")
    print(f"    Original format: {gene_type}")

TSPAN6 Human symbol
TSPAN6 Human symbol
TSPAN6 Human symbol
ENSMUSG00000051951 Mouse Gene Code
ENSMUSG00000051951 Mouse Gene Code
ENSMUSG00000051951 Mouse Gene Code
ENSMUSG00000051951 Mouse Gene Code
ENSG00000243485 Human Gene Code
