In [2]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import keras as ks
import sklearn.metrics as metrics
import pandas as pd
import re

In [3]:
# Paths to the input .h5ad files, relative to the notebook's working directory.
controls_final = "../Data/controls_final.h5ad"
granulomas_final = "../Data/granulomas_final.h5ad"
# Additional input files, currently disabled:
# sc78_final = "../Data/sc78_final.h5ad"
# sc92_final = "../Data/sc92_final.h5ad"
# sc93_final = "../Data/sc93_final.h5ad"

In [4]:
# Read both datasets from disk with scanpy.
controls_final_anndata = sc.read_h5ad(controls_final) 
granulomas_final_anndata = sc.read_h5ad(granulomas_final)

def fix_and_print(anndata):
    """Reset the stored log1p base on ``anndata`` and print a short summary.

    Sets ``anndata.uns['log1p']['base']`` to ``None`` (creating the ``'log1p'``
    entry when it is absent instead of raising ``KeyError``), then prints the
    per-sample cell counts, whether ``X`` is a scipy sparse matrix, and the
    shape of ``X``.

    Parameters
    ----------
    anndata : object exposing ``uns`` (dict-like), ``obs`` (with a ``'sample'``
        column) and ``X``. Modified in place.
    """
    # setdefault keeps any other keys already stored under 'log1p' and avoids
    # a KeyError on objects that carry no 'log1p' entry at all.
    anndata.uns.setdefault('log1p', {})['base'] = None
    print(anndata.obs['sample'].value_counts())
    print('X matrix is sparse:', scipy.sparse.issparse(anndata.X))
    print('X size =', anndata.X.shape)
    print()

# Apply the log1p-base reset and print a summary for each dataset.
fix_and_print(controls_final_anndata)
fix_and_print(granulomas_final_anndata)

sample
control3    7395
control2    6385
control1    2185
Name: count, dtype: int64
X matrix is sparse: False
X size = (15965, 21870)

sample
granuloma2    9392
granuloma1    9302
granuloma3    8909
Name: count, dtype: int64
X matrix is sparse: False
X size = (27603, 23693)



In [5]:
# Cluster label -> cell type name for the controls dataset.
# Keys are matched against the values of obs['my_clust_1'] by create_annotation.
controls_final_annotation_dict = {
    '0': 'CAP1',
    '12': 'CAP2',
    '21': 'VEC',
    '17': 'AEC',
    '28': 'LEC',
    '14': 'Ciliated',
    '5': 'Secretory',
    '1': 'AT1',
    '2': 'AT2',
    '3': 'AF',
    '20': 'Pericyte',
    '26': 'SMC',
    '18': 'Mesothelial',
    '8': 'B1',
    '23b': 'Th1',
    '11': 'Tnaive',
    '24': 'NK',
    '10': 'AM',
    '15b': 'M-C1q',
    '25': 'iMon',
    '15': 'DC',
    '15c': 'pDC',
    '22': 'N1',
}

# Cluster label -> cell type name for the granulomas dataset.
# Cluster labels are dataset-specific: the same key can name a different
# cell type here than in the controls dictionary.
granulomas_final_annotation_dict = {
    '9': 'CAP1',
    '24': 'CAP2',
    '9b': 'VEC',
    '27': 'LEC',
    '17': 'Ciliated',
    '15': 'Secretory',
    '22': 'AT1',
    '6': 'AT2',
    '12': 'AT2-t1',
    '19': 'AT2-t2',
    '14': 'AF',
    '25': 'Pericyte',
    '20': 'Mesothelial',
    '3': 'B1',
    '3b': 'B2',
    '0': 'Th1',
    '8': 'Tnaive',
    '11': 'Tex',
    '77': 'Treg',
    '11b': 'NK',
    '4a': 'AM',
    '4': 'M-t1',
    '10': 'M-lc',
    '7': 'M-t2',
    '7b': 'M-C1q',
    '7c': 'iMon',
    '23': 'pDC',
    '13': 'DC',
    '5b': 'N1',
    '5': 'N2',
}

In [6]:
def create_annotation(anndata, annotation_dict):
    """Add ``obs['single_cell_types']`` by translating ``obs['my_clust_1']``.

    The key-coverage report is printed *before* the translation is attempted,
    so a cluster label missing from ``annotation_dict`` shows up in the report
    instead of surfacing only as a bare ``KeyError``.

    Parameters
    ----------
    anndata : object whose ``obs`` has a ``'my_clust_1'`` column. Modified in
        place (only when every cluster label can be translated).
    annotation_dict : dict mapping cluster labels to cell type names.

    Raises
    ------
    KeyError
        If ``obs['my_clust_1']`` contains a label that is not a key of
        ``annotation_dict``. ``obs`` is left unchanged in that case.
    """
    clusters = anndata.obs['my_clust_1']
    dict_list = list(annotation_dict.keys())
    anndata_list = list(clusters.unique())

    # Sets make the membership checks O(1); the lists keep the printed order.
    dict_keys = set(dict_list)
    anndata_keys = set(anndata_list)
    unused = [item for item in dict_list if item not in anndata_keys]
    unmapped = [item for item in anndata_list if item not in dict_keys]

    print('Keys in dictionary not in anndata:', unused)
    print('Keys in anndata not in dictionary:', unmapped)
    print()

    if unmapped:
        raise KeyError(f"cluster labels without an annotation: {unmapped}")

    anndata.obs['single_cell_types'] = [annotation_dict[clust] for clust in clusters]

In [7]:
# Annotate each dataset with its own cluster -> cell type dictionary.
create_annotation(controls_final_anndata, controls_final_annotation_dict)
create_annotation(granulomas_final_anndata, granulomas_final_annotation_dict)

Keys in dictionary not in anndata: []
Keys in anndata not in dictionary: []

Keys in dictionary not in anndata: []
Keys in anndata not in dictionary: []



In [8]:
def print_stats(anndata):
    """Print the distinct values of ``obs['single_cell_types']`` and their count.

    Output is three lines: the unique labels, a
    "Number of unique sub cell types" line, and a blank line.
    """
    cell_types = anndata.obs['single_cell_types']
    print(cell_types.unique())
    print(f"Number of unique sub cell types: {cell_types.nunique()}")
    print()

In [9]:
# Show which cell types each dataset contains after annotation.
print_stats(controls_final_anndata)
print_stats(granulomas_final_anndata)

['Mesothelial' 'CAP1' 'AF' 'AT2' 'AT1' 'Secretory' 'B1' 'Tnaive'
 'Ciliated' 'AEC' 'DC' 'VEC' 'AM' 'Pericyte' 'NK' 'CAP2' 'pDC' 'iMon'
 'M-C1q' 'LEC' 'N1' 'SMC' 'Th1']
Number of unique sub cell types: 23

['AT2' 'B1' 'M-t1' 'DC' 'Th1' 'M-t2' 'Secretory' 'AM' 'N1' 'M-C1q'
 'AT2-t2' 'AF' 'VEC' 'CAP1' 'N2' 'AT2-t1' 'Pericyte' 'pDC' 'Ciliated' 'NK'
 'AT1' 'Tnaive' 'Treg' 'M-lc' 'Mesothelial' 'Tex' 'CAP2' 'LEC' 'iMon' 'B2']
Number of unique sub cell types: 30



In [10]:
def holdout_subset(name, anndata, split, seed):
    """Split ``anndata`` into train/test subsets and write both to ``../Data``.

    The observation index is split with ``train_test_split``, stratified on
    ``obs['single_cell_types']``. The sizes of both index sets are printed,
    then the two subsets are copied and written to
    ``../Data/{name}_train_anndata.h5ad`` and ``../Data/{name}_test_anndata.h5ad``.

    Parameters
    ----------
    name : prefix used in the printed labels and in the output file names.
    anndata : dataset to split; must carry ``obs['single_cell_types']``.
    split : forwarded to ``train_test_split`` as ``test_size``.
    seed : forwarded to ``train_test_split`` as ``random_state``.
    """
    strata = anndata.obs['single_cell_types'].values
    train_ids, test_ids = train_test_split(
        anndata.obs.index,
        test_size=split,
        random_state=seed,
        stratify=strata,
    )
    print(f'{name}_train_anndata shape', train_ids.shape)
    print(f'{name}_test_anndata shape', test_ids.shape)
    print()

    # Materialise independent copies so the written files are not views.
    train_subset = anndata[train_ids].copy()
    test_subset = anndata[test_ids].copy()

    train_subset.write(f"../Data/{name}_train_anndata.h5ad")
    test_subset.write(f"../Data/{name}_test_anndata.h5ad")

In [11]:
# Hold out a stratified test set from each dataset (test_size = split) using
# a fixed seed. Note that `seed` and `split` are module-level names that
# later cells assign again.
seed = 8653
split = 0.2
holdout_subset("controls_final", controls_final_anndata, split, seed)
holdout_subset("granulomas_final", granulomas_final_anndata, split, seed)

controls_final_train_anndata shape (12772,)
controls_final_test_anndata shape (3193,)

granulomas_final_train_anndata shape (22082,)
granulomas_final_test_anndata shape (5521,)



In [12]:
# Rebind the dataset names to the train split written by holdout_subset.
# From here on these variables no longer hold the full datasets.
controls_final_anndata = sc.read_h5ad("../Data/controls_final_train_anndata.h5ad") 
granulomas_final_anndata = sc.read_h5ad("../Data/granulomas_final_train_anndata.h5ad")

In [13]:
# Grouping tables for the annotation hierarchy. Each dict maps a group name
# to the fine-grained cell types (values of obs['single_cell_types']) that
# belong to it. The tables are inverted below into per-level lookups.

# Level 1: four broad compartments.
top_dict = {
    'Endothelial': ['CAP1','CAP2','VEC','AEC','LEC'],
    'Epithelial': ['Ciliated','Secretory','AT1','AT2','AT2-t1','AT2-t2'],
    'Mesenchyme': ['AF','Pericyte','SMC','Mesothelial'],
    'Immune': ['B1','B2','Th1','Tnaive','Treg','Tex','NK','AM','M-t1','M-t2','M-C1q','M-lc','iMon','DC','pDC','N1','N2']
}

# Level 2: each compartment split into two groups.
second_dict = {
    'Blood vessels': ['CAP1','CAP2','VEC','AEC'],
    'Lymphatic EC': ['LEC'],
    'Airway epithelium': ['Ciliated','Secretory'],
    'Alveolar epithelium' : ['AT1','AT2','AT2-t1','AT2-t2'],
    'Stromal': ['AF','Pericyte','SMC'],
    'Mesothelial': ['Mesothelial'],
    'Lymphoid': ['B1','B2','Th1','Tnaive','Treg','Tex','NK'],
    'Myeloid': ['AM','M-t1','M-t2','M-C1q','M-lc','iMon','DC','pDC','N1','N2']
}

# Level 3: same as level 2 except that Stromal, Lymphoid and Myeloid are subdivided.
third_dict = {
    'Blood vessels': ['CAP1','CAP2','VEC','AEC'],
    'Lymphatic EC': ['LEC'],
    'Airway epithelium': ['Ciliated','Secretory'],
    'Alveolar epithelium': ['AT1','AT2','AT2-t1','AT2-t2'],
    'Fibroblast': ['AF','Pericyte'],
    'Smooth muscle': ['SMC'],
    'Mesothelial': ['Mesothelial'],
    'B lineage': ['B1','B2'],
    'T lineage': ['Th1','Tnaive','Treg','Tex'],
    'NK': ['NK'],
    'mononuclear broad': ['AM','M-t1','M-t2','M-C1q','M-lc','iMon','DC','pDC'], # originally two mononuclear
    'Neutrophil': ['N1','N2'] # originally polymorphonuclear=Neutrophil
}

# Level 4: same as level 3 except that 'mononuclear broad' is split into
# 'Macrophage' and 'mononuclear fine'.
fourth_dict = {
    'Blood vessels': ['CAP1','CAP2','VEC','AEC'],
    'Lymphatic EC': ['LEC'],
    'Airway epithelium': ['Ciliated','Secretory'],
    'Alveolar epithelium': ['AT1','AT2','AT2-t1','AT2-t2'],
    'Fibroblast': ['AF','Pericyte'],
    'Smooth muscle': ['SMC'],
    'Mesothelial': ['Mesothelial'],
    'B lineage': ['B1','B2'],
    'T lineage': ['Th1','Tnaive','Treg','Tex'],
    'NK': ['NK'],
    'Macrophage': ['AM','M-t1','M-t2','M-C1q','M-lc'],        
    'mononuclear fine': ['iMon','DC','pDC'], # originally two mononuclear
    'Neutrophil': ['N1','N2']
}

# Invert every "group -> [cell types]" table into a flat
# "cell type -> group" lookup, one per hierarchy level.
L1_annotation = {
    member: group
    for group, members in top_dict.items()
    for member in members
}

L2_annotation = {
    member: group
    for group, members in second_dict.items()
    for member in members
}

L3_annotation = {
    member: group
    for group, members in third_dict.items()
    for member in members
}

L4_annotation = {
    member: group
    for group, members in fourth_dict.items()
    for member in members
}

def create_hierarchical_annotations(anndata):
    """Add the four coarse annotation columns derived from ``single_cell_types``.

    Writes ``obs['top_level']``, ``obs['second_level']``, ``obs['third_level']``
    and ``obs['fourth_level']`` by looking each cell type up in the
    corresponding ``L*_annotation`` table.

    Every level table is validated before anything is written:
    ``Series.map`` would otherwise turn an unknown cell type into NaN without
    any warning, and the problem would only surface later when the hierarchy
    is built from these columns.

    Parameters
    ----------
    anndata : object whose ``obs`` has a ``'single_cell_types'`` column.
        Modified in place.

    Raises
    ------
    ValueError
        If a value of ``obs['single_cell_types']`` is missing from any of the
        level tables. ``obs`` is left unchanged in that case.
    """
    cell_types = anndata.obs['single_cell_types']
    level_tables = {
        "top_level": L1_annotation,
        "second_level": L2_annotation,
        "third_level": L3_annotation,
        "fourth_level": L4_annotation,
    }

    observed = list(cell_types.unique())
    for column, table in level_tables.items():
        unknown = [cell_type for cell_type in observed if cell_type not in table]
        if unknown:
            raise ValueError(f"cell types with no '{column}' group: {unknown}")

    for column, table in level_tables.items():
        anndata.obs[column] = cell_types.map(table)

# Add the four coarse annotation columns to both (train-split) datasets.
create_hierarchical_annotations(controls_final_anndata)
create_hierarchical_annotations(granulomas_final_anndata)

In [14]:
# obs columns ordered from coarsest to finest; the last entry is treated as
# the finest level by the preprocessing functions (they read hierarchy[-1]).
hierarchy = ['top_level', 'second_level', 'third_level', 'fourth_level', 'single_cell_types']

In [15]:
# Nested label trees, one per dataset; filled in place by create_hierarchy_dict.
controls_final_hierarchy_dict = {}
granulomas_final_hierarchy_dict = {}

def add_path(root, path):
    """Insert ``path`` into the nested-dict tree ``root`` (in place).

    Each label becomes a key of the dict reached so far, with a nested dict as
    its value. A label equal to the one immediately before it is skipped, so
    consecutive repeats in ``path`` collapse into a single tree level.
    """
    cursor = root
    last_label = None
    for label in path:
        if label != last_label:
            cursor = cursor.setdefault(label, {})
            last_label = label

def create_hierarchy_dict(anndata, hierarchy_dict):
    """Fill ``hierarchy_dict`` with every distinct label path found in ``anndata``.

    The columns named in the module-level ``hierarchy`` list are read from
    ``anndata.obs``; each distinct row is inserted into ``hierarchy_dict``
    with ``add_path``. ``hierarchy_dict`` is modified in place.
    """
    level_columns = anndata.obs[hierarchy]
    distinct_rows = level_columns.drop_duplicates().values
    for row in distinct_rows:
        add_path(hierarchy_dict, row)

In [16]:
# Build the label tree of each dataset from its annotation columns.
create_hierarchy_dict(controls_final_anndata, controls_final_hierarchy_dict)
create_hierarchy_dict(granulomas_final_anndata, granulomas_final_hierarchy_dict)

In [17]:
def create_name(input):
    """Turn a label into a lowercase, underscore-separated identifier.

    Every run of characters other than ASCII letters and digits is replaced by
    a single underscore, leading/trailing underscores are removed, and the
    result is lower-cased (e.g. ``"AT2-t1"`` -> ``"at2_t1"``).
    """
    underscored = re.sub(r"[^A-Za-z0-9]+", "_", input)
    trimmed = underscored.strip("_")
    return trimmed.lower()

In [18]:
def get_leaves(tree):
    """Return the keys of ``tree`` whose value is empty, in depth-first order.

    ``tree`` is a nested dict; a key with a falsy value (e.g. ``{}``) is a
    leaf, any other key is descended into. Implemented with an explicit stack
    of item iterators instead of recursion.
    """
    leaves = []
    pending = [iter(tree.items())]
    while pending:
        for label, subtree in pending[-1]:
            if subtree:
                # Descend now; the parent iterator resumes once this one is exhausted.
                pending.append(iter(subtree.items()))
                break
            leaves.append(label)
        else:
            pending.pop()
    return leaves

In [19]:
def preprocess_node(cell_name, dataset_name, node, anndata, split, seed):
    """Build and save the train/validation data for one internal tree node.

    Each key of ``node`` is one class. Cells are selected by their
    finest-level label (``hierarchy[-1]``) and assigned to the child of
    ``node`` under which that label sits; classes are numbered in the
    iteration order of ``node``. Only the genes flagged in
    ``var['highly_variable']`` are kept.

    Files written (``{name}`` is ``create_name(cell_name)``):
      - ``../Arrays/{dataset_name}_{train,val}_{features,labels}_hvg_{name}.npy``
      - ``../Arrays/{dataset_name}_weights_hvg_{name}.npy`` (balanced class
        weights computed on the training labels)
      - ``../Data/{dataset_name}_train_anndata_hvg_{name}.h5ad`` (the whole
        subset, before the train/validation split)

    Parameters
    ----------
    cell_name : label of this node; only used for the file names.
    dataset_name : prefix of the output file names.
    node : dict mapping each child label to its (possibly empty) subtree.
    anndata : source dataset; a copy of the selected cells is used.
    split : forwarded to ``train_test_split`` as ``test_size``.
    seed : forwarded to ``train_test_split`` as ``random_state``.
    """
    # child label -> finest-level labels below it; a childless entry stands for itself
    leaves_by_child = {}
    for child, subtree in node.items():
        leaves_by_child[child] = get_leaves(subtree) if subtree else [child]

    int_mapping = {child: position for position, child in enumerate(leaves_by_child)}

    reverse_mapping = {}
    for child, leaves in leaves_by_child.items():
        for leaf in leaves:
            reverse_mapping[leaf] = child

    finest_level = hierarchy[-1]
    selected = anndata.obs[finest_level].isin(reverse_mapping)
    anndata_subset = anndata[selected].copy()
    anndata_subset.obs["cell_names"] = anndata_subset.obs[finest_level].map(reverse_mapping)
    anndata_subset.obs["cell_integers"] = anndata_subset.obs["cell_names"].map(int_mapping)

    hvg_mask = anndata_subset.var['highly_variable']
    anndata_subset_hvg = anndata_subset[:, hvg_mask].copy()

    matrix = anndata_subset_hvg.X
    X = matrix.toarray() if scipy.sparse.issparse(matrix) else matrix
    y = anndata_subset_hvg.obs["cell_integers"].values

    train_features, val_features, train_labels, val_labels = train_test_split(
        X, y, test_size=split, random_state=seed, stratify=y
    )
    weights = compute_class_weight(
        class_weight='balanced', classes=np.unique(train_labels), y=train_labels
    )

    cell_name = create_name(cell_name)
    np.save(f'../Arrays/{dataset_name}_train_features_hvg_{cell_name}.npy', np.array(train_features))
    np.save(f'../Arrays/{dataset_name}_val_features_hvg_{cell_name}.npy', np.array(val_features))
    np.save(f'../Arrays/{dataset_name}_train_labels_hvg_{cell_name}.npy', np.array(train_labels))
    np.save(f'../Arrays/{dataset_name}_val_labels_hvg_{cell_name}.npy', np.array(val_labels))
    np.save(f'../Arrays/{dataset_name}_weights_hvg_{cell_name}.npy', np.array(weights))

    anndata_subset_hvg.write(f"../Data/{dataset_name}_train_anndata_hvg_{cell_name}.h5ad")

In [20]:
def preprocess_leaf(cell_name, dataset_name, leaf, anndata, split, seed):
    """Build and save the train/validation data for a node whose children are all leaves.

    ``leaf`` lists the finest-level labels (``hierarchy[-1]``) to classify;
    classes are numbered in the order of ``leaf``. Unlike ``preprocess_node``
    no ``cell_names`` column is created. Only the genes flagged in
    ``var['highly_variable']`` are kept.

    Files written (``{name}`` is ``create_name(cell_name)``):
      - ``../Arrays/{dataset_name}_{train,val}_{features,labels}_hvg_{name}.npy``
      - ``../Arrays/{dataset_name}_weights_hvg_{name}.npy`` (balanced class
        weights computed on the training labels)
      - ``../Data/{dataset_name}_train_anndata_hvg_{name}.h5ad`` (the whole
        subset, before the train/validation split)

    Parameters
    ----------
    cell_name : label of the parent node; only used for the file names.
    dataset_name : prefix of the output file names.
    leaf : iterable of finest-level labels to include.
    anndata : source dataset; a copy of the selected cells is used.
    split : forwarded to ``train_test_split`` as ``test_size``.
    seed : forwarded to ``train_test_split`` as ``random_state``.
    """
    int_mapping = {}
    for position, label in enumerate(leaf):
        int_mapping[label] = position

    finest_level = hierarchy[-1]
    selected = anndata.obs[finest_level].isin(leaf)
    anndata_subset = anndata[selected].copy()
    anndata_subset.obs["cell_integers"] = anndata_subset.obs[finest_level].map(int_mapping)

    hvg_mask = anndata_subset.var['highly_variable']
    anndata_subset_hvg = anndata_subset[:, hvg_mask].copy()

    matrix = anndata_subset_hvg.X
    X = matrix.toarray() if scipy.sparse.issparse(matrix) else matrix
    y = anndata_subset_hvg.obs["cell_integers"].values

    train_features, val_features, train_labels, val_labels = train_test_split(
        X, y, test_size=split, random_state=seed, stratify=y
    )
    weights = compute_class_weight(
        class_weight='balanced', classes=np.unique(train_labels), y=train_labels
    )

    cell_name = create_name(cell_name)
    np.save(f'../Arrays/{dataset_name}_train_features_hvg_{cell_name}.npy', np.array(train_features))
    np.save(f'../Arrays/{dataset_name}_val_features_hvg_{cell_name}.npy', np.array(val_features))
    np.save(f'../Arrays/{dataset_name}_train_labels_hvg_{cell_name}.npy', np.array(train_labels))
    np.save(f'../Arrays/{dataset_name}_val_labels_hvg_{cell_name}.npy', np.array(val_labels))
    np.save(f'../Arrays/{dataset_name}_weights_hvg_{cell_name}.npy', np.array(weights))

    anndata_subset_hvg.write(f"../Data/{dataset_name}_train_anndata_hvg_{cell_name}.h5ad")

In [21]:
def hierarchical_classification(cell_name, dataset_name, dict, anndata, split, seed):
    """Walk the label tree and write one dataset per classification node.

    - An empty tree does nothing.
    - If none of the children has a subtree, the children are handed to
      ``preprocess_leaf`` and the walk stops there.
    - Otherwise ``preprocess_node`` handles this level and the function
      recurses into every child, using the child's label as ``cell_name``.

    Parameters
    ----------
    cell_name : label of the current node (used for output file names).
    dataset_name : prefix of the output file names.
    dict : nested dict of labels for the current node. The parameter name
        shadows the builtin inside this function.
    anndata, split, seed : passed through unchanged to the preprocessing
        functions.
    """
    subtree = dict
    children = list(subtree)
    if not children:
        return

    any_child_has_subtree = any(subtree[child] for child in children)
    if not any_child_has_subtree:
        preprocess_leaf(cell_name, dataset_name, children, anndata, split, seed)
        return

    preprocess_node(cell_name, dataset_name, subtree, anndata, split, seed)
    for child in children:
        hierarchical_classification(child, dataset_name, subtree[child], anndata, split, seed)

In [22]:
# Write the per-node datasets for both label trees, starting at the root
# (file-name stem "top_level"). This rebinds the module-level `seed`, which
# the holdout cell had set to a different value.
seed = 6296
split = 0.2
hierarchical_classification("top_level", "controls_final", controls_final_hierarchy_dict, controls_final_anndata, split, seed)
hierarchical_classification("top_level", "granulomas_final", granulomas_final_hierarchy_dict, granulomas_final_anndata, split, seed)

In [28]:
# Sanity check: reload the files written for one node and inspect them.

# Must match the file-name stem produced by create_name in the preprocessing functions.
cell_name = create_name("blood vessels")
dataset_name = "controls_final"

anndata = sc.read_h5ad(f"../Data/{dataset_name}_train_anndata_hvg_{cell_name}.h5ad")

print(anndata)
print()
print(anndata.obs["single_cell_types"].value_counts())
print(anndata.obs["single_cell_types"].unique())
print()
# "cell_names" is only written by preprocess_node; enable these lines when
# inspecting a node that was not handled by preprocess_leaf.
# print(anndata.obs["cell_names"].value_counts())
# print(anndata.obs["cell_names"].unique())
# print()
print(anndata.obs["cell_integers"].value_counts())
print(anndata.obs["cell_integers"].unique())
print()

# Reload the arrays saved alongside the .h5ad file.
train_features = np.load(f"../Arrays/{dataset_name}_train_features_hvg_{cell_name}.npy")
val_features = np.load(f"../Arrays/{dataset_name}_val_features_hvg_{cell_name}.npy")
train_labels = np.load(f"../Arrays/{dataset_name}_train_labels_hvg_{cell_name}.npy")
val_labels = np.load(f"../Arrays/{dataset_name}_val_labels_hvg_{cell_name}.npy")
weights = np.load(f"../Arrays/{dataset_name}_weights_hvg_{cell_name}.npy")

print('train features shape:', train_features.shape)
print('val features shape:', val_features.shape)
print('train labels shape:', train_labels.shape)
print('val labels shape:', val_labels.shape)
print('weights shape:', weights.shape)
print()

# Position i of `weights` becomes the weight for integer label i.
class_weights = dict(enumerate(weights))
print(class_weights)

AnnData object with n_obs × n_vars = 3067 × 3104
    obs: 'scDblFinder_score', 'scDblFinder_class', 'doublet', 'n_genes_by_counts', 'total_counts', 'type', 'sample', 'batch_type', 'batch', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'n_genes', 'n_counts', 'size_factors', 'leiden_0.2', 'leiden_0.3', 'leiden_0.4', 'leiden_0.6', 'leiden_0.8', 'leiden_1.0', 'leiden_1.4', 'leiden_1.7', 'leiden_2.0', 'leiden_2.5', 'leiden_3.0', 'leiden_3.5', 'kmeans', 'my_clust_1', 'immune', 'mesothelial', 'stromal', 'endothelial', 'epithelial', 'muscle', 'Immune', 'Myeloid', 'Lymphoid', 'T lineage', 'Th1', 'Tnaive', 'Treg', 'Tex', 'NK', 'Mesenchyme', 'Mesothelial', 'Stromal', 'Endothelial', 'Blood vessels', 'LEC', 'Epithelial', 'Ciliated', 'Secretory', 'single_cell_types'

In [23]:
def flat_classification(dataset_name, anndata, split, seed):
    """Build and save train/validation data for a single flat classifier.

    Every distinct finest-level label (``hierarchy[-1]``) is one class;
    classes are numbered in sorted label order. Only the genes flagged in
    ``var['highly_variable']`` are kept.

    Side effect: ``obs['cell_integers']`` is written on the ``anndata``
    object passed in, not only on the copy that gets saved.

    Files written:
      - ``../Arrays/{dataset_name}_{train,val}_{features,labels}_hvg_flat.npy``
      - ``../Arrays/{dataset_name}_weights_hvg_flat.npy`` (balanced class
        weights computed on the training labels)
      - ``../Data/{dataset_name}_train_anndata_hvg_flat.h5ad`` (all cells,
        before the train/validation split)

    Parameters
    ----------
    dataset_name : prefix of the output file names.
    anndata : source dataset (modified in place, see above).
    split : forwarded to ``train_test_split`` as ``test_size``.
    seed : forwarded to ``train_test_split`` as ``random_state``.
    """
    finest_level = hierarchy[-1]

    cell_types = list(anndata.obs[finest_level].unique())
    cell_types.sort()
    int_mapping = {}
    for position, label in enumerate(cell_types):
        int_mapping[label] = position
    anndata.obs["cell_integers"] = anndata.obs[finest_level].map(int_mapping)

    hvg_mask = anndata.var['highly_variable']
    anndata_hvg = anndata[:, hvg_mask].copy()

    matrix = anndata_hvg.X
    X = matrix.toarray() if scipy.sparse.issparse(matrix) else matrix
    y = anndata_hvg.obs["cell_integers"].values

    train_features, val_features, train_labels, val_labels = train_test_split(
        X, y, test_size=split, random_state=seed, stratify=y
    )
    weights = compute_class_weight(
        class_weight='balanced', classes=np.unique(train_labels), y=train_labels
    )

    np.save(f'../Arrays/{dataset_name}_train_features_hvg_flat.npy', np.array(train_features))
    np.save(f'../Arrays/{dataset_name}_val_features_hvg_flat.npy', np.array(val_features))
    np.save(f'../Arrays/{dataset_name}_train_labels_hvg_flat.npy', np.array(train_labels))
    np.save(f'../Arrays/{dataset_name}_val_labels_hvg_flat.npy', np.array(val_labels))
    np.save(f'../Arrays/{dataset_name}_weights_hvg_flat.npy', np.array(weights))

    anndata_hvg.write(f"../Data/{dataset_name}_train_anndata_hvg_flat.h5ad")

In [24]:
# Write the flat (single-classifier) datasets with the same seed and split
# values as the hierarchical run.
seed = 6296
split = 0.2
flat_classification("controls_final", controls_final_anndata, split, seed)
flat_classification("granulomas_final", granulomas_final_anndata, split, seed)