# 23/03/30

- Fix gene names
- Minimum number of genes: 50
- Add celebellum data from Cao 2020

In [1]:
import scanpy as sc
import os,sys,glob
import pandas as pd
import numpy as np
import anndata as ad
import matplotlib.pyplot as plt

import seaborn as sns
from matplotlib.colors import ListedColormap

In [2]:
# Configure integration
label_key = "cluster_main"
batch_key = "Dataset"

run_date = '230330'
join_method = 'outer'
min_genes = '50'

# Columns to use
cols_to_use = ['batch', 'sampleID', 'Age', 'Assay', 'Stage', 'Race', 'PMI', 'Hemisphere', 'Library',
               'Brain_Region', 'Dataset', 'Sex', 'Diagnosis', 'DF_classification', 
              'cluster_original', 'cluster_main']

In [3]:
# Output name
infile = '_'.join(['/home/sonic/scData/anndata/Input', run_date, 'adata',
                   join_method, 
                   'minGenes', min_genes + '.' + 'h5ad']
                 )
print (infile)

/home/sonic/scData/anndata/Input_230330_adata_outer_minGenes_50.h5ad


In [4]:
opt_create_input = True
if opt_create_input:
    # Allen: Multi cortical regions
    adata_amc = sc.read_h5ad('/home/sonic/scData/anndata/AllenMultiCortex.h5ad')

    # Allen: primary motor cortex
    adata_m1 = sc.read_h5ad('/home/sonic/scData/anndata/AllenM1.h5ad')

    # Braun
    adata_braun = sc.read_h5ad('/home/sonic/scData/anndata/HCA_Braun_minCell_1_pct30.h5ad')
#     adata_braun = sc.read_h5ad('/home/sonic/scData/anndata/HCA_Braun.subset_cortex.h5ad')
#     adata_braun_fc = sc.read_h5ad('/home/sonic/scData/anndata/HCA_Braun.subset_cortexFrontal.h5ad')
#     adata_braun_str = sc.read_h5ad('/home/sonic/scData/anndata/HCA_Braun.subset_Striatum.h5ad')

    # Cameron
    adata_cameron = sc.read_h5ad('/home/sonic/scData/anndata/Cameron2023.h5ad')

    # Cao
    adata_cao_cerebrum = sc.read_h5ad('/home/sonic/scData/anndata/Cao2020.Cerebrum.pct10.h5ad')
    adata_cao_cerebellum = sc.read_h5ad('/home/sonic/scData/anndata/Cao2020.Cerebellum.pct10.h5ad')

    # Eze
    adata_eze = sc.read_h5ad('/home/sonic/scData/anndata/HCA_Eze.h5ad')

    # Hardwick
    adata_hardwick = sc.read_h5ad('/home/sonic/scData/anndata/Hardwick2022.h5ad')

    # Hearring
    adata_herring = sc.read_h5ad('/home/sonic/scData/anndata/Herring2022.h5ad')

    # Morabito
    adata_morabito = sc.read_h5ad('/home/sonic/scData/anndata/Morabito2021.h5ad')

    # ROSMAP batch 2
    adata_rosmap2 = sc.read_h5ad('/home/sonic/scData/anndata/ROSMAP2.h5ad')
    sc.pp.subsample(adata_rosmap2, fraction=0.3, random_state=42)

    # Trevino (Var unique issue)
    adata_trevino = sc.read_h5ad('/home/sonic/scData/anndata/Trevino2021.h5ad')

    # Turecki
    adata_turecki = sc.read_h5ad('/home/sonic/scData/anndata/HCA_Turecki.h5ad')

    # Zhang (PD, Yale)
    adata_zhangPD = sc.read_h5ad('/home/sonic/scData/anndata/HCA_ZhangPD.h5ad')

    print ('List datasets')
    list_data = [adata_amc, 
                 adata_m1, 
                 adata_braun, 
#                  adata_braun_fc, 
#                  adata_braun_str,
                 adata_cameron, 
                 adata_cao_cerebrum,
                 adata_cao_cerebellum,
                 adata_eze, 
                 adata_hardwick, 
                 adata_herring,
                 adata_morabito, 
                 adata_rosmap2,
                 adata_trevino, 
                 adata_turecki, 
                 adata_zhangPD]

    for a in list_data:
        a.obs_names_make_unique() 
        a.var_names_make_unique() 
else:
    print ('Skip: merging input file')

  utils.warn_names_duplicates("var")


List datasets


In [5]:
# wget https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz
# https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/non_alt_loci_set.txt 
ncbi_genes = pd.read_csv('/home/sonic/scData/misc/Homo_sapiens.gene_info.gz', sep='\t')
hgnc = pd.read_csv('/home/sonic/scData/misc/non_alt_loci_set.230328.txt', sep='\t')

# Create a dictionary to contain symbol:symbol and alias_symbol:symbol
# NCBI
gene_dict = {}
for index, row in ncbi_genes.iterrows():
    gene_dict[row['Symbol']] = row['Symbol']
    if pd.notna(row['Synonyms']):
        aliases = row['Synonyms'].split('|')
        for alias in aliases:
            gene_dict[alias] = row['Symbol']    

# HGNC
for index, row in hgnc.iterrows():
    gene_dict[row['symbol']] = row['symbol']
    if pd.notna(row['alias_symbol']):
        aliases = row['alias_symbol'].split('|')
        for alias in aliases:
            gene_dict[alias] = row['symbol']    
            
    if pd.notna(row['prev_symbol']):
        aliases = row['prev_symbol'].split('|')
        for alias in aliases:
            gene_dict[alias] = row['symbol']
            
    if pd.notna(row['lncipedia']):
        aliases = row['lncipedia'].split('|')
        for alias in aliases:
            gene_dict[alias] = row['symbol']

# Function to update gene name
def replace_gene_name(gene, gene_dict):
    if gene in gene_dict.keys():
        new_gene = gene_dict[gene]
    else:
        new_gene = 'None'
    return new_gene

len(gene_dict.keys())

  hgnc = pd.read_csv('/home/sonic/scData/misc/non_alt_loci_set.230328.txt', sep='\t')


222023

In [6]:
# adata_amc.var['new_gene'] = adata_amc.var.index.to_series().apply(lambda x: replace_gene_name(x, gene_dict))

In [7]:
# adata_amc.var[adata_amc.var['new_gene']=='None']

In [8]:
new_list_data = []
for a in list_data:
    data_name = a.obs.Dataset.unique()[0]
    a.var['new_gene'] = a.var.index.to_series().apply(lambda x: replace_gene_name(x, gene_dict)) 
    
    number_genes = sum(a.var['new_gene']!='None')
    number_nogenes = sum(a.var['new_gene']=='None')
    
    print("{}:{} genes present.".format(data_name, number_genes))
    print("{}: {} genes missing.".format(data_name, number_nogenes))
    print ("")
    
    # Remove genes without gene information
    a = a[:, a.var['new_gene']!='None']
    
    # Set up index using updated genes
    a.var = a.var.set_index('new_gene')
    a.var_names_make_unique()
    
    # Append a new dataset
    new_list_data.append(a)

AllenMultiCortex:42218 genes present.
AllenMultiCortex: 8057 genes missing.

AllenM1:42224 genes present.
AllenM1: 8057 genes missing.

Braun:30149 genes present.
Braun: 16312 genes missing.

Cameron:20413 genes present.
Cameron: 8417 genes missing.

Cao:30353 genes present.
Cao: 19979 genes missing.



  self._set_dim_df(value, "var")


Cao:30077 genes present.
Cao: 19651 genes missing.



  self._set_dim_df(value, "var")


Eze:22557 genes present.
Eze: 11137 genes missing.

Hardwick:23487 genes present.
Hardwick: 10051 genes missing.

Herring:21234 genes present.
Herring: 11504 genes missing.

Morabito:37673 genes present.
Morabito: 21048 genes missing.

ROSMAP2:23597 genes present.
ROSMAP2: 12261 genes missing.

Trevino:23355 genes present.
Trevino: 10000 genes missing.

Turecki:20500 genes present.
Turecki: 9562 genes missing.

ZhangPD:23487 genes present.
ZhangPD: 10051 genes missing.



In [9]:
new_list_data[1].var

DDX11L10
WASH7P
MIR6859-1
MIR1302-2
FAM138A
...
MT-ND6
TRE-TTC3-1
MT-CYB
TRNT
TRNP1


In [10]:
print ('Concat the list of dataset')
adata = ad.concat(new_list_data, join=join_method)
adata.obs = adata.obs[cols_to_use]
sc.pp.filter_cells(adata, min_genes=int(min_genes))

print ('Write to file')
categorical_cols = adata.obs.select_dtypes(include=['category', 'object']).columns
adata.obs[categorical_cols] = adata.obs[categorical_cols].astype(str)
adata.write(infile)

Concat the list of dataset


  warn(


Write to file


In [11]:
adata

AnnData object with n_obs × n_vars = 1918696 × 49133
    obs: 'batch', 'sampleID', 'Age', 'Assay', 'Stage', 'Race', 'PMI', 'Hemisphere', 'Library', 'Brain_Region', 'Dataset', 'Sex', 'Diagnosis', 'DF_classification', 'cluster_original', 'cluster_main', 'n_genes'