In [None]:
## Run this notebook on the head node: mygene queries an online database (mygene.info), so network access is required.

In [11]:
# Imports
import csv
import glob2
import pandas as pd
import matplotlib.pyplot as plt
import mygene
import numpy as np
import scanpy as sc
import squidpy as sq
import warnings
from anndata import AnnData
from scipy import sparse

In [12]:
# Globals
# Project root; every other directory below is derived from it.
PRJ_DIR = "/scratch/gpfs/KANG/sereno/spatialstem"

# Source data: raw inputs and the .h5ad datasets.
SRC_DIR = PRJ_DIR + "/sourcefiles"
RAW_DIR = SRC_DIR + "/raw"
HAD_DIR = SRC_DIR + "/h5ad"

# Output locations for intermediate results and figures.
INT_DIR = PRJ_DIR + "/intermediates"
FIG_DIR = PRJ_DIR + "/figs"

In [3]:
# Collect the path of every .h5ad dataset in HAD_DIR, in sorted order.
h5ad_paths = sorted(glob2.glob(f"{HAD_DIR}/*.h5ad"))

In [None]:
# Placeholder prefix for each Ensembl gene type, used when a record has no symbol.
# Placeholders are numbered per type in order of appearance (e.g. "m_lncRNA1").
_PLACEHOLDER_PREFIXES = {
    "missing": "m_missing",
    "lncRNA": "m_lncRNA",
    "transcribed_unprocessed_pseudogene": "m_TUP",
    "transcribed_processed_pseudogene": "m_TPP",
    "TEC": "m_TEC",
    "protein_coding": "m_protein",
    "transcribed_unitary_pseudogene": "m_TAP",
    "processed_pseudogene": "m_PP",
    "unprocessed_pseudogene": "m_UP",
    "IG_V_gene": "m_IGV",
    "IG_V_pseudogene": "m_IGVP",
    "artifact": "m_artifact",
}


def _gene_type_of(gene_info: dict) -> str:
    """Return the Ensembl ``type_of_gene`` of a mygene record, or "missing" if absent."""
    ensembl = gene_info.get("ensembl")
    # Multiple Ensembl ids can map to one record, in which case "ensembl" is a
    # list of entries; the first entry is used.
    if isinstance(ensembl, list):
        ensembl = ensembl[0] if ensembl else None
    if not isinstance(ensembl, dict):
        return "missing"
    return ensembl.get("type_of_gene", "missing")


def handle_gene_symbols(ad: AnnData) -> AnnData:
    """Look up a gene symbol for every gene in ``ad`` and store it in ``ad.var``.

    Gene identifiers are read from ``ad.var["gene_ids"]`` if present, otherwise
    from ``ad.var["SYMBOL"]``, and sent to mygene in one ``getgenes`` call.
    Genes whose record has no symbol get a numbered placeholder derived from
    their Ensembl gene type (see ``_PLACEHOLDER_PREFIXES``).

    Requires network access, since mygene queries an online database.

    Parameters
    ----------
    ad : AnnData
        Dataset whose ``var`` table holds a "gene_ids" or "SYMBOL" column.

    Returns
    -------
    AnnData
        The same object, modified in place, with a new ``var["native_symbol"]``
        column aligned with the input gene order.

    Raises
    ------
    KeyError
        If ``ad.var`` has neither a "gene_ids" nor a "SYMBOL" column.
    ValueError
        If mygene returns no entry for a queried gene, or a gene without a
        symbol has a gene type that has no placeholder prefix.
    """
    if "gene_ids" in ad.var.keys():
        gene_list = list(ad.var["gene_ids"])
    elif "SYMBOL" in ad.var.keys():
        gene_list = list(ad.var["SYMBOL"])
    else:
        # Previously this fell through and failed later with an unbound-variable error.
        raise KeyError("Neither 'gene_ids' nor 'SYMBOL' found in ad.var.")

    mg = mygene.MyGeneInfo()
    gene_info_list = mg.getgenes(gene_list, fields='name, symbol, entrezgene, taxid, ensembl')

    # A query can come back with more than one record; keep only the first one
    # per query. A dict makes each lookup O(1) instead of a list scan per gene.
    first_entry_by_query = {}
    for gene_info in gene_info_list:
        first_entry_by_query.setdefault(gene_info["query"], gene_info)

    # Converts to gene symbols and handles missing ones.
    gene_symbols = []
    placeholder_counts = {}
    for gene_code in gene_list:
        if gene_code not in first_entry_by_query:
            raise ValueError(f"mygene returned no entry for query {gene_code!r}")
        gene_info = first_entry_by_query[gene_code]

        if "symbol" in gene_info:
            gene_symbols.append(gene_info["symbol"])
            continue

        # No assigned symbol: fall back to a per-type numbered placeholder.
        gene_type = _gene_type_of(gene_info)
        if gene_type not in _PLACEHOLDER_PREFIXES:
            raise ValueError(f"Unhandled gene type: {gene_type}")
        placeholder_counts[gene_type] = placeholder_counts.get(gene_type, 0) + 1
        gene_symbols.append(_PLACEHOLDER_PREFIXES[gene_type] + str(placeholder_counts[gene_type]))

    ad.var["native_symbol"] = gene_symbols
    return ad

In [None]:
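## Debug snippet (kept commented out): prints each gene type found among records that lack a symbol.
## Expects `gene_info_list` as returned by `mg.getgenes(...)` inside handle_gene_symbols.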
# encountered_types = []
# for gene_info in gene_info_list:
#     if "symbol" in gene_info.keys():
#         continue
#     try:
#         gene_type = gene_info["ensembl"]["type_of_gene"]
#     except KeyError:
#         gene_type = "missing"
#     except TypeError:
#         gene_type = gene_info["ensembl"][0]["type_of_gene"]
#     if gene_type not in encountered_types:
#         print(gene_type)
#         encountered_types.append(gene_type)

querying 1-1000...

ConnectionError: HTTPConnectionPool(host='mygene.info', port=80): Max retries exceeded with url: /v3/gene/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x1537e5bc3b90>: Failed to resolve 'mygene.info' ([Errno -2] Name or service not known)"))

In [None]:
def handle_human_symbols(gene_list: list) -> list:
    """Placeholder for the human-symbol gene handler; not implemented yet.

    The intended behaviour is not defined anywhere yet, so this stub raises
    instead of leaving the ``def`` without a body, which is a syntax error.

    Parameters
    ----------
    gene_list : list
        Gene identifiers to handle (presumably human gene symbols; confirm
        once the handler is written).

    Raises
    ------
    NotImplementedError
        Always, until the handler is implemented.
    """
    raise NotImplementedError("handle_human_symbols is not implemented yet.")

In [5]:
# TODO: Write gene handling functions:
## Human symbol
## Mouse symbol
## Human id
## Mouse id
## Canine id
# Iterates through H5ads, finds format of gene ids, and handles each one based on its format.
# Goal: make new vars columns with original id and human-converted id. 
for h5ad_path in h5ad_paths:
    h5ad_path_strip = h5ad_path.replace(f"{HAD_DIR}/", "")
    ad = sc.read_h5ad(h5ad_path)
    # Skips datasets that have already been processed unless force flag set to True
    if ("human_symbol" in ad.var.keys() and not force):
        continue
    if ("gene_ids" in ad.var.keys()):
        test_gene = ad.var["gene_ids"].iloc[0]
    elif ("SYMBOL" in ad.var.keys()):
        test_gene = ad.var["SYMBOL"].iloc[0]
    else:
        print(h5ad_path, "Gene ID key not found in vars.")
    gene_id = test_gene[0:4]
    # Detects gene symbols and splits into human or mouse symbols
    if (gene_id[0:3]!="ENS"):
        # Check for lowercase characters anywhere in the test gene
        if any([c for c in test_gene if c.islower()]):
            gene_id = "MMUS"
        else:
            gene_id = "HSAS"
    match gene_id:
        case "ENSG":
            print(test_gene, "Human Gene Code")
            break
            # Handle human gene
        case "ENSM":
            print(test_gene, "Mouse Gene Code")
            # Handle mouse gene
        case "ENSC":
            print(test_gene, "Canine Gene Code")
            # Handle canine gene
        case "MMUS":
            print(test_gene, "Mouse symbol")
            # Handle mouse symbol
        case "HSAS":
            print(test_gene, "Human symbol")
            # Handle human symbol
        case _:
            print(test_gene, "Unidentified")
            # Handle unhandled genes

TSPAN6 Human symbol
TSPAN6 Human symbol
TSPAN6 Human symbol
ENSMUSG00000051951 Mouse Gene Code
ENSMUSG00000051951 Mouse Gene Code
ENSMUSG00000051951 Mouse Gene Code
ENSMUSG00000051951 Mouse Gene Code
ENSG00000243485 Human Gene Code
