In [1]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import random



In [2]:
# Location of the input .h5ad file that the next cell reads with `sc.read_h5ad`.
# NOTE(review): this is an absolute, machine-specific Windows path; it must be
# edited before the notebook can run on another machine.
data = "C:\\Users\\bence\\Projects\\BIO446\\McKinnon-Rosati-Laboratory\\Project 1\\Data\\granulomas_final.h5ad"

In [3]:
adata = sc.read_h5ad(data)
# Workaround (labelled "bug fix" by the original author): overwrite the stored
# log1p base with None. NOTE(review): presumably needed so later scanpy calls
# do not fail on a missing 'base' entry -- confirm against the scanpy version in use.
adata.uns['log1p']["base"] = None

# Sanity report: cells per sample, then storage format and shape of the matrix.
print(adata.obs['sample'].value_counts(), end='\n\n')
print(f'X matrix is sparse: {scipy.sparse.issparse(adata.X)}')
print(f'X size = {adata.X.shape}')

sample
granuloma2    9392
granuloma1    9302
granuloma3    8909
Name: count, dtype: int64

X matrix is sparse: False
X size = (27603, 23693)


In [4]:
# Name of the `adata.obs` column whose cluster IDs are translated into
# cell-type labels through `annotation_dict`.
cluster_type = 'my_clust_1'

In [5]:
# Cluster ID (as found in `adata.obs[cluster_type]`) -> cell-type label.
# Entries keep their original order; the group comments follow the lineage
# groupings used by the hierarchy dictionaries later in this notebook.
annotation_dict = dict([
    # Endothelial
    ('9', 'CAP1'),
    ('24', 'CAP2'),
    ('9b', 'VEC'),
    ('27', 'LEC'),
    # Epithelial
    ('17', 'Ciliated'),
    ('15', 'Secretory'),
    ('22', 'AT1'),
    ('6', 'AT2'),
    ('12', 'AT2-t1'),
    ('19', 'AT2-t2'),
    # Mesenchyme: stromal
    ('14', 'AF'),
    ('25', 'Pericyte'),
    # Mesenchyme: mesothelial
    ('20', 'Mesothelial'),
    # Lymphoid: B lineage
    ('3', 'B1'),
    ('3b', 'B2'),
    # Lymphoid: T lineage
    ('0', 'Th1'),
    ('8', 'Tnaive'),
    ('11', 'Tex'),
    ('77', 'Treg'),
    # Lymphoid: NK
    ('11b', 'NK'),
    # Myeloid: macrophage subsets and iMon
    ('4a', 'AM'),
    ('4', 'M-t1'),
    ('10', 'M-lc'),
    ('7', 'M-t2'),
    ('7b', 'M-C1q'),
    ('7c', 'iMon'),
    # Myeloid: dendritic cells and polymorphonuclear cells
    ('23', 'pDC'),
    ('13', 'DC'),
    ('5b', 'N1'),
    ('5', 'N2'),
])

In [6]:
# Report mapping coverage *before* applying the mapping: if a cluster ID in the
# data has no entry in `annotation_dict`, the lookup below raises KeyError, and
# this report is what tells the reader which key is missing.
dict_list = list(annotation_dict.keys())
adata_list = list(adata.obs[cluster_type].unique())
print('Keys in dictionary not in adata:', [item for item in dict_list if item not in adata_list])
print('Keys in adata not in dictionary:', [item for item in adata_list if item not in dict_list])

# Translate every cell's cluster ID into its cell-type label.
adata.obs['cell_type_edit'] = [annotation_dict[clust] for clust in adata.obs[cluster_type]]

Keys in dictionary not in adata: []
Keys in adata not in dictionary: []


In [7]:
# Cell-type label -> integer class code. Codes are assigned consecutively
# (0..29) in the order the labels are listed here, so the order is significant.
replacement_dict = {
    label: code
    for code, label in enumerate([
        'AT2', 'B1', 'M-t1', 'DC', 'Th1',
        'M-t2', 'Secretory', 'AM', 'N1', 'M-C1q',
        'AT2-t2', 'AF', 'VEC', 'CAP1', 'N2',
        'AT2-t1', 'Pericyte', 'pDC', 'Ciliated', 'NK',
        'AT1', 'Tnaive', 'Treg', 'M-lc', 'Mesothelial',
        'Tex', 'CAP2', 'LEC', 'iMon', 'B2',
    ])
}

In [8]:
# Encode each cell-type label as its integer code from `replacement_dict`.
# `Series.map` yields NaN for any label without an entry, so every label in
# `cell_type_edit` is expected to be a key of `replacement_dict`.
adata.obs['celltype'] = adata.obs['cell_type_edit'].map(replacement_dict)

In [9]:
FINAL_SEED = 99

# Split the cell IDs 80/20, stratified on the integer cell-type code, with a
# fixed seed. `global_test` is the 20% part; `subset` is the remaining 80%.
subset, global_test = train_test_split(
    adata.obs.index.values,
    test_size=0.2,
    random_state=FINAL_SEED,
    stratify=adata.obs['celltype'].values,
)
print(f'subset shape {subset.shape}')
print(f'global_test shape {global_test.shape}')

subset shape (22082,)
global_test shape (5521,)


In [10]:
# Materialise the held-out cells and write them to disk.
# NOTE(review): the output location is an absolute, machine-specific path.
adata_global_test = adata[global_test].copy()
adata_global_test.write("C:\\Users\\bence\\Projects\\BIO446\\McKinnon-Rosati-Laboratory\\Project 1\\Data\\adata_global_test.h5ad")

# From here on `adata` is rebound to the non-held-out cells only; the
# preprocessing functions below read this reduced object.
# NOTE(review): because `adata` is rebound, this cell and the split cell above
# are not idempotent -- re-running them without reloading operates on the
# already-reduced data.
adata_subset = adata[subset].copy()
adata = adata_subset

In [11]:
def preprocess_node(cell_name, cell_dict, replacement_dict, seed1, seed2):
    """Build and save the train/val/test arrays for one internal node.

    Reads the notebook-level `adata`. Cells whose `cell_type_edit` label occurs
    in any list of `cell_dict` are selected, relabelled with the group (the
    `cell_dict` key) containing their label, encoded to integers through
    `replacement_dict`, and restricted to the genes flagged in
    `var['highly_variable']`. The result is split with stratification into
    test (20%), validation (12.5% of the remainder, i.e. 10% overall) and
    train (the rest), and balanced class weights are computed from the
    training labels.

    Parameters
    ----------
    cell_name : str
        Suffix used in the names of the saved `.npy` files.
    cell_dict : dict[str, list[str]]
        Group name -> cell-type labels belonging to that group. Labels that
        do not occur in the data are simply not selected.
    replacement_dict : dict[str, int]
        Group name -> integer class code.
    seed1, seed2 : int
        Random states of the train/test and the train/validation split.

    Raises
    ------
    ValueError
        If a group present in the selected cells has no code in
        `replacement_dict` (previously this silently produced NaN labels).

    Side effects
    ------------
    Writes seven arrays to `../Arrays/<kind>_hvg_<cell_name>.npy`, where
    `<kind>` is train/test/val `_features`, train/test/val `_labels`, and
    `weights`.
    """
    cell_list = [c for members in cell_dict.values() for c in members]
    node_adata = adata[adata.obs['cell_type_edit'].isin(cell_list)].copy()

    # Invert {group: [cell types]} into {cell type: group}.
    annotation = {
        member: group
        for group, members in cell_dict.items()
        for member in members
    }

    node_adata.obs["annotation"] = node_adata.obs['cell_type_edit'].map(annotation)
    node_adata.obs['celltype'] = node_adata.obs["annotation"].map(replacement_dict)

    # A group without a code maps to NaN; fail loudly instead of stratifying on it.
    unmapped = node_adata.obs['celltype'].isna()
    if unmapped.any():
        missing = sorted(set(node_adata.obs.loc[unmapped, 'annotation'].astype(str)))
        raise ValueError(
            f"preprocess_node({cell_name!r}): no integer code in replacement_dict for {missing}"
        )

    node_hvg = node_adata[:, node_adata.var['highly_variable']].copy()

    X = node_hvg.X
    if scipy.sparse.issparse(X):
        # np.array() on a SciPy sparse matrix gives a 0-d object array, which
        # np.save would pickle instead of storing numbers; densify first.
        X = X.toarray()
    y = node_hvg.obs['celltype'].values

    train_features, test_features, train_labels, test_labels = train_test_split(
        X, y, test_size=0.2, random_state=seed1, stratify=y
    )
    train_features, val_features, train_labels, val_labels = train_test_split(
        train_features, train_labels, test_size=0.125, random_state=seed2, stratify=train_labels
    )
    weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)

    # Same files, same order as before; dicts preserve insertion order.
    outputs = {
        'train_features': train_features,
        'test_features': test_features,
        'val_features': val_features,
        'train_labels': train_labels,
        'test_labels': test_labels,
        'val_labels': val_labels,
        'weights': weights,
    }
    for kind, values in outputs.items():
        np.save(f'../Arrays/{kind}_hvg_{cell_name}.npy', np.array(values))

In [12]:
def preprocess_leaf(cell_name, cell_list, replacement_dict, seed1, seed2):
    """Build and save the train/val/test arrays for one leaf of the hierarchy.

    A leaf distinguishes the individual cell types in `cell_list`, so it is a
    node in which every cell type forms its own group. The selection, label
    encoding, highly-variable-gene filtering, stratified splitting,
    class-weight computation and saving are therefore delegated to
    `preprocess_node` instead of being duplicated here.

    Parameters
    ----------
    cell_name : str
        Suffix used in the names of the saved `.npy` files.
    cell_list : list[str]
        Cell-type labels (values of `cell_type_edit`) to include. Labels that
        do not occur in the data are simply not selected.
    replacement_dict : dict[str, int]
        Cell-type label -> integer class code.
    seed1, seed2 : int
        Random states of the train/test and the train/validation split.

    Side effects
    ------------
    Writes the same seven `../Arrays/*_hvg_<cell_name>.npy` files as
    `preprocess_node`.
    """
    # Identity grouping: each cell type is annotated with its own label, so the
    # node pipeline ends up encoding `cell_type_edit` directly.
    singleton_groups = {cell_type: [cell_type] for cell_type in cell_list}
    preprocess_node(cell_name, singleton_groups, replacement_dict, seed1, seed2)

In [13]:
# Leaf-level groups: the cell types handed to `preprocess_leaf`.
blood_vessels_list = ['CAP1', 'CAP2', 'VEC', 'AEC']
airway_epithelium_list = ['Ciliated', 'Secretory']
alveolar_epithelium_list = ['AT1', 'AT2', 'AT2-t1', 'AT2-t2']
stromal_list = ['AF', 'Pericyte', 'SMC']
b_lineage_list = ['B1', 'B2']
t_lineage_list = ['Th1', 'Tnaive', 'Treg', 'Tex']
macrophage_list = ['AM', 'M-t1', 'M-t2', 'M-C1q', 'M-lc']
mononuclear_list = ['iMon', 'DC', 'pDC']
polymorphonuclear_list = ['N1', 'N2']

# Internal nodes ({group name: member cell types}), composed bottom-up from the
# leaf lists. `+` and `list(...)` create fresh lists, so no dict shares a list
# object with the variables above.
mononuclear_dict = {
    'Macrophage': list(macrophage_list),
    'mononuclear': list(mononuclear_list),
}

myeloid_dict = {
    'mononuclear': macrophage_list + mononuclear_list,
    'polymorphonuclear': list(polymorphonuclear_list),
}

lymphoid_dict = {
    'B lineage': list(b_lineage_list),
    'T lineage': list(t_lineage_list),
    'NK': ['NK'],
}

immune_dict = {
    'Lymphoid': b_lineage_list + t_lineage_list + ['NK'],
    'Myeloid': macrophage_list + mononuclear_list + polymorphonuclear_list,
}

mesenchyme_dict = {
    'Stromal': list(stromal_list),
    'Mesothelial': ['Mesothelial'],
}

epithelial_dict = {
    'Airway epithelium': list(airway_epithelium_list),
    'Alveolar epithelium': list(alveolar_epithelium_list),
}

endothelial_dict = {
    'Blood vessels': list(blood_vessels_list),
    'Lymphatic EC': ['LEC'],
}

# Root of the hierarchy: the four top-level compartments.
top_level_dict = {
    'Endothelial': blood_vessels_list + ['LEC'],
    'Epithelial': airway_epithelium_list + alveolar_epithelium_list,
    'Mesenchyme': stromal_list + ['Mesothelial'],
    'Immune': immune_dict['Lymphoid'] + immune_dict['Myeloid'],
}

In [14]:
# Label -> integer class code for every level of the hierarchy. In each
# dictionary the codes are consecutive, starting at 0, in the order listed.

# --- internal nodes: group name -> code ---
replacement_dict_top_level = {
    label: code
    for code, label in enumerate(('Endothelial', 'Epithelial', 'Mesenchyme', 'Immune'))
}

replacement_dict_endothelial = {
    label: code for code, label in enumerate(('Blood vessels', 'Lymphatic EC'))
}

replacement_dict_epithelial = {
    label: code for code, label in enumerate(('Airway epithelium', 'Alveolar epithelium'))
}

replacement_dict_mesenchyme = {
    label: code for code, label in enumerate(('Stromal', 'Mesothelial'))
}

replacement_dict_immune = {
    label: code for code, label in enumerate(('Lymphoid', 'Myeloid'))
}

replacement_dict_lymphoid = {
    label: code for code, label in enumerate(('B lineage', 'T lineage', 'NK'))
}

replacement_dict_myeloid = {
    label: code for code, label in enumerate(('mononuclear', 'polymorphonuclear'))
}

replacement_dict_mononuclear = {
    label: code for code, label in enumerate(('Macrophage', 'mononuclear'))
}

# --- leaves: cell-type label -> code ---
replacement_dict_blood_vessels = {
    label: code for code, label in enumerate(('VEC', 'CAP1', 'CAP2', 'AEC'))
}

replacement_dict_airway_epithelium = {
    label: code for code, label in enumerate(('Ciliated', 'Secretory'))
}

replacement_dict_alveolar_epithelium = {
    label: code for code, label in enumerate(('AT1', 'AT2', 'AT2-t1', 'AT2-t2'))
}

replacement_dict_stromal = {
    label: code for code, label in enumerate(('AF', 'Pericyte', 'SMC'))
}

replacement_dict_b_lineage = {
    label: code for code, label in enumerate(('B1', 'B2'))
}

replacement_dict_t_lineage = {
    label: code for code, label in enumerate(('Th1', 'Tnaive', 'Treg', 'Tex'))
}

replacement_dict_macrophage = {
    label: code for code, label in enumerate(('AM', 'M-t1', 'M-t2', 'M-C1q', 'M-lc'))
}

replacement_dict_mononuclear_child = {
    label: code for code, label in enumerate(('iMon', 'DC', 'pDC'))
}

replacement_dict_polymorphonuclear = {
    label: code for code, label in enumerate(('N1', 'N2'))
}

In [15]:
# Internal nodes of the hierarchy: each call writes the seven
# `../Arrays/*_hvg_<name>.npy` files for one node, with its own pair of seeds.
preprocess_node("top_level", top_level_dict, replacement_dict_top_level, seed1=6, seed2=20)
preprocess_node("endothelial", endothelial_dict, replacement_dict_endothelial, seed1=29, seed2=96)
preprocess_node("epithelial", epithelial_dict, replacement_dict_epithelial, seed1=773, seed2=413)
preprocess_node("mesenchyme", mesenchyme_dict, replacement_dict_mesenchyme, seed1=438, seed2=210)
preprocess_node("immune", immune_dict, replacement_dict_immune, seed1=589, seed2=262)
preprocess_node("lymphoid", lymphoid_dict, replacement_dict_lymphoid, seed1=92, seed2=956)
preprocess_node("myeloid", myeloid_dict, replacement_dict_myeloid, seed1=310, seed2=341)
preprocess_node("mononuclear", mononuclear_dict, replacement_dict_mononuclear, seed1=278, seed2=999)

# Leaves of the hierarchy.
preprocess_leaf("blood_vessels", blood_vessels_list, replacement_dict_blood_vessels, seed1=881, seed2=114)
preprocess_leaf("airway_epithelium", airway_epithelium_list, replacement_dict_airway_epithelium, seed1=248, seed2=80)
preprocess_leaf("alveolar_epithelium", alveolar_epithelium_list, replacement_dict_alveolar_epithelium, seed1=95, seed2=102)
preprocess_leaf("stromal", stromal_list, replacement_dict_stromal, seed1=430, seed2=879)
preprocess_leaf("b_lineage", b_lineage_list, replacement_dict_b_lineage, seed1=666, seed2=36)
preprocess_leaf("t_lineage", t_lineage_list, replacement_dict_t_lineage, seed1=622, seed2=995)
preprocess_leaf("macrophage", macrophage_list, replacement_dict_macrophage, seed1=699, seed2=604)
# This leaf used to be saved as "mononuclear", the same name as the node above,
# so its files silently overwrote the node's arrays. It now gets its own name
# (matching `replacement_dict_mononuclear_child`).
# NOTE: anything that loads the iMon/DC/pDC leaf arrays must read the
# `*_hvg_mononuclear_child.npy` files; `*_hvg_mononuclear.npy` now holds the
# Macrophage-vs-mononuclear node data.
preprocess_leaf("mononuclear_child", mononuclear_list, replacement_dict_mononuclear_child, seed1=228, seed2=462)
preprocess_leaf("polymorphonuclear", polymorphonuclear_list, replacement_dict_polymorphonuclear, seed1=365, seed2=370)