In [2]:
import os

import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import keras as ks
import sklearn.metrics as metrics
import pandas as pd

In [3]:
# Location of the controls .h5ad file. Set the BIO446_CONTROLS_H5AD environment
# variable to point at a copy stored elsewhere; when it is unset the original
# local path is used, so existing runs behave exactly as before.
data = os.environ.get(
    "BIO446_CONTROLS_H5AD",
    "C:\\Users\\bence\\Projects\\BIO446\\McKinnon-Rosati-Laboratory\\Project 1\\Data\\controls_final.h5ad",
)

In [4]:
# Read the AnnData object from the file path held in `data`.
adata = sc.read_h5ad(data)

# Workaround carried over from the original notebook ("bug fix"): reset the
# stored log1p base to None.
# NOTE(review): presumably this avoids a later lookup failure on
# uns['log1p']['base'] after reloading from disk -- confirm.
adata.uns['log1p']["base"] = None

# Sanity report: rows per sample (followed by a blank line), then whether X is
# a scipy sparse matrix and its shape.
print(adata.obs['sample'].value_counts(), end='\n\n')
print('X matrix is sparse:', scipy.sparse.issparse(adata.X))
print('X size =', adata.X.shape)

sample
control3    7395
control2    6385
control1    2185
Name: count, dtype: int64

X matrix is sparse: False
X size = (15965, 21870)


In [5]:
# Display the AnnData summary repr (dimensions plus the obs/var field names).
adata

AnnData object with n_obs × n_vars = 15965 × 21870
    obs: 'scDblFinder_score', 'scDblFinder_class', 'doublet', 'n_genes_by_counts', 'total_counts', 'type', 'sample', 'batch_type', 'batch', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'n_genes', 'n_counts', 'size_factors', 'leiden_0.2', 'leiden_0.3', 'leiden_0.4', 'leiden_0.6', 'leiden_0.8', 'leiden_1.0', 'leiden_1.4', 'leiden_1.7', 'leiden_2.0', 'leiden_2.5', 'leiden_3.0', 'leiden_3.5', 'kmeans', 'my_clust_1', 'immune', 'mesothelial', 'stromal', 'endothelial', 'epithelial', 'muscle', 'Immune', 'Myeloid', 'Lymphoid', 'T lineage', 'Th1', 'Tnaive', 'Treg', 'Tex', 'NK', 'Mesenchyme', 'Mesothelial', 'Stromal', 'Endothelial', 'Blood vessels', 'LEC', 'Epithelial', 'Ciliated', 'Secretory'
    var: 'gene_ids

In [6]:
# Name of the obs column whose cluster labels are looked up in annotation_dict.
cluster_type = 'my_clust_1'

In [7]:
# Maps each cluster label in adata.obs[cluster_type] to a fine-grained
# cell-type name. Keys are strings, and some carry a letter suffix
# (e.g. '15b', '23b'). The group comments below follow the group names used
# in top_dict / second_dict.
annotation_dict= {
    # Endothelial
    '0': 'CAP1',
    '12': 'CAP2',
    '21': 'VEC',
    '17': 'AEC',
    '28': 'LEC',

    # Epithelial
    '14': 'Ciliated',
    '5': 'Secretory',
    '1': 'AT1',
    '2': 'AT2',

    # Mesenchyme: stromal
    '3': 'AF',
    '20': 'Pericyte',
    '26': 'SMC',

    # Mesenchyme: mesothelial
    '18': 'Mesothelial',

    # Immune: lymphoid
    '8': 'B1',
        
    '23b': 'Th1',
    '11': 'Tnaive',

    '24': 'NK',
        
    # Immune: myeloid
    '10': 'AM',
    '15b': 'M-C1q',
    '25': 'iMon',
    '15': 'DC',
    '15c': 'pDC',
    '22': 'N1',
}

In [8]:
# Validate the mapping BEFORE applying it. Previously the lookup ran first, so
# a cluster label missing from annotation_dict raised a bare KeyError before
# the diagnostic lines below were ever printed.
dict_list = list(annotation_dict.keys())
adata_list = list(adata.obs[cluster_type].unique())
unused_keys = [item for item in dict_list if item not in adata_list]
unmapped_clusters = [item for item in adata_list if item not in dict_list]
print('Keys in dictionary not in adata:', unused_keys)
print('Keys in adata not in dictionary:', unmapped_clusters)

# Same exception type the raw lookup would raise, but naming every offender.
if unmapped_clusters:
    raise KeyError(
        f"Clusters in adata.obs[{cluster_type!r}] with no entry in annotation_dict: {unmapped_clusters}"
    )

# Every cluster label is now known to be a key, so this lookup cannot fail.
adata.obs['single_cell_types'] = [annotation_dict[clust] for clust in adata.obs[cluster_type]]

Keys in dictionary not in adata: []
Keys in adata not in dictionary: []


In [9]:
# Distinct fine-grained labels and how many of them there are.
unique_celltype_sub = adata.obs['single_cell_types'].unique()
num_unique_celltype_sub = adata.obs['single_cell_types'].nunique()

print(unique_celltype_sub)
print(f"Number of unique sub cell types: {num_unique_celltype_sub}")

['Mesothelial' 'CAP1' 'AF' 'AT2' 'AT1' 'Secretory' 'B1' 'Tnaive'
 'Ciliated' 'AEC' 'DC' 'VEC' 'AM' 'Pericyte' 'NK' 'CAP2' 'pDC' 'iMon'
 'M-C1q' 'LEC' 'N1' 'SMC' 'Th1']
Number of unique sub cell types: 23


In [10]:
# Number of obs rows carrying each fine-grained label.
adata.obs['single_cell_types'].value_counts()

single_cell_types
AT2            3384
CAP1           2650
AF             2004
AT1            1577
Secretory      1090
B1              768
Tnaive          675
CAP2            560
AM              552
Ciliated        407
AEC             350
Mesothelial     341
Pericyte        280
VEC             274
N1              197
NK              177
DC              173
M-C1q           157
iMon            116
SMC              94
Th1              60
LEC              43
pDC              36
Name: count, dtype: int64

In [11]:
# Four groupings of the fine-grained labels, one per hierarchy level. Each dict
# maps a group name to the list of single_cell_types labels it contains.
# Several listed labels (e.g. 'AT2-t1', 'B2', 'Treg', 'M-t1', 'N2') are not
# produced by annotation_dict, so they simply never match in this dataset.

# Level 1: four broad compartments.
top_dict = {
    'Endothelial': ['CAP1','CAP2','VEC','AEC','LEC'],
    'Epithelial': ['Ciliated','Secretory','AT1','AT2','AT2-t1','AT2-t2'],
    'Mesenchyme': ['AF','Pericyte','SMC','Mesothelial'],
    'Immune': ['B1','B2','Th1','Tnaive','Treg','Tex','NK','AM','M-t1','M-t2','M-C1q','M-lc','iMon','DC','pDC','N1','N2']
}

# Level 2: each level-1 compartment split in two.
second_dict = {
    'Blood vessels': ['CAP1','CAP2','VEC','AEC'],
    'Lymphatic EC': ['LEC'],
    'Airway epithelium': ['Ciliated','Secretory'],
    'Alveolar epithelium' : ['AT1','AT2','AT2-t1','AT2-t2'],
    'Stromal': ['AF','Pericyte','SMC'],
    'Mesothelial': ['Mesothelial'],
    'Lymphoid': ['B1','B2','Th1','Tnaive','Treg','Tex','NK'],
    'Myeloid': ['AM','M-t1','M-t2','M-C1q','M-lc','iMon','DC','pDC','N1','N2']
}

# Level 3: splits Stromal, Lymphoid and Myeloid further; the other groups
# repeat their level-2 name.
third_dict = {
    'Blood vessels': ['CAP1','CAP2','VEC','AEC'],
    'Lymphatic EC': ['LEC'],
    'Airway epithelium': ['Ciliated','Secretory'],
    'Alveolar epithelium': ['AT1','AT2','AT2-t1','AT2-t2'],
    'Fibroblast': ['AF','Pericyte'],
    'Smooth muscle': ['SMC'],
    'Mesothelial': ['Mesothelial'],
    'B lineage': ['B1','B2'],
    'T lineage': ['Th1','Tnaive','Treg','Tex'],
    'NK': ['NK'],
    'mononuclear broad': ['AM','M-t1','M-t2','M-C1q','M-lc','iMon','DC','pDC'],
    'Neutrophil': ['N1','N2']
}

# Level 4: identical to level 3 except that 'mononuclear broad' is split into
# 'Macrophage' and 'mononuclear fine'.
fourth_dict = {
    'Blood vessels': ['CAP1','CAP2','VEC','AEC'],
    'Lymphatic EC': ['LEC'],
    'Airway epithelium': ['Ciliated','Secretory'],
    'Alveolar epithelium': ['AT1','AT2','AT2-t1','AT2-t2'],
    'Fibroblast': ['AF','Pericyte'],
    'Smooth muscle': ['SMC'],
    'Mesothelial': ['Mesothelial'],
    'B lineage': ['B1','B2'],
    'T lineage': ['Th1','Tnaive','Treg','Tex'],
    'NK': ['NK'],
    'Macrophage': ['AM','M-t1','M-t2','M-C1q','M-lc'],        
    'mononuclear fine': ['iMon','DC','pDC'],
    'Neutrophil': ['N1','N2']
}

def _invert_grouping(grouping):
    """Turn ``{group: [label, ...]}`` into a flat ``{label: group}`` lookup.

    Parameters
    ----------
    grouping : dict
        Group name -> iterable of member labels.

    Returns
    -------
    dict
        Member label -> group name. A label listed under several groups keeps
        the group that comes last in ``grouping``.
    """
    return {label: group for group, labels in grouping.items() for label in labels}


# Per-level lookups from a fine-grained label to its parent group.
L1_annotation = _invert_grouping(top_dict)
L2_annotation = _invert_grouping(second_dict)
L3_annotation = _invert_grouping(third_dict)
L4_annotation = _invert_grouping(fourth_dict)

# obs column to create -> lookup used to fill it.
level_lookups = {
    "top_level": L1_annotation,
    "second_level": L2_annotation,
    "third_level": L3_annotation,
    "fourth_level": L4_annotation,
}

# Series.map writes NaN for any label missing from the lookup, which would
# silently corrupt the level columns. Check coverage first and fail with the
# offending labels instead.
present_labels = set(adata.obs['single_cell_types'].unique())
for column, lookup in level_lookups.items():
    missing_labels = sorted(present_labels.difference(lookup))
    if missing_labels:
        raise ValueError(
            f"Labels in 'single_cell_types' with no group for {column!r}: {missing_labels}"
        )
    adata.obs[column] = adata.obs['single_cell_types'].map(lookup)

In [26]:
# Inspect obs, which now includes single_cell_types and the four *_level columns.
adata.obs

Unnamed: 0,scDblFinder_score,scDblFinder_class,doublet,n_genes_by_counts,total_counts,type,sample,batch_type,batch,log1p_n_genes_by_counts,...,Blood vessels,LEC,Epithelial,Ciliated,Secretory,single_cell_types,top_level,second_level,third_level,fourth_level
AAACCTGAGAGAACAG_sc69_2,0.003917,singlet,0.0,2981,6566.0,control,control1,batch1,0,8.000349,...,0.052187,0.000000,0.070075,0.0,0.037748,Mesothelial,Mesenchyme,Mesothelial,Mesothelial,Mesothelial
AAACCTGGTCCGAAGA_sc69_2,0.002627,singlet,0.0,1815,3569.0,control,control1,batch1,0,7.504392,...,0.360203,0.138177,0.000000,0.0,0.036964,CAP1,Endothelial,Blood vessels,Blood vessels,Blood vessels
AAACCTGGTGAAATCA_sc69_2,0.223977,singlet,0.0,1883,3768.0,control,control1,batch1,0,7.541152,...,0.435829,0.000000,0.000000,0.0,0.037995,CAP1,Endothelial,Blood vessels,Blood vessels,Blood vessels
AAAGATGAGAGCCTAG_sc69_2,0.002211,singlet,0.0,2406,6212.0,control,control1,batch1,0,7.786136,...,0.000000,0.000000,0.000000,0.0,0.044055,AF,Mesenchyme,Stromal,Fibroblast,Fibroblast
AAAGATGTCCAGTAGT_sc69_2,0.001814,singlet,0.0,2398,6019.0,control,control1,batch1,0,7.782807,...,0.000000,0.000000,0.029073,0.0,0.048102,AF,Mesenchyme,Stromal,Fibroblast,Fibroblast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAGTATCTGCA_sc72_2,0.001468,singlet,0.0,2631,7065.0,control,control3,batch2,2,7.875499,...,0.000000,0.000000,0.402864,0.0,0.000000,AT1,Epithelial,Alveolar epithelium,Alveolar epithelium,Alveolar epithelium
TTTGTCAGTGATGTGG_sc72_2,0.000857,singlet,0.0,2637,5836.0,control,control3,batch2,2,7.877776,...,0.372741,0.000000,0.000000,0.0,0.082962,CAP2,Endothelial,Blood vessels,Blood vessels,Blood vessels
TTTGTCATCAAACCAC_sc72_2,0.000857,singlet,0.0,2030,3948.0,control,control3,batch2,2,7.616284,...,0.000000,0.000000,0.409263,0.0,0.063188,AT2,Epithelial,Alveolar epithelium,Alveolar epithelium,Alveolar epithelium
TTTGTCATCCGGCACA_sc72_2,0.000483,singlet,0.0,1384,2184.0,control,control3,batch2,2,7.233455,...,0.383070,0.000000,0.000000,0.0,0.000000,CAP1,Endothelial,Blood vessels,Blood vessels,Blood vessels


In [77]:
# Hierarchy level to inspect: print its distinct labels, then how many there are.
level = 'top_level'
print(adata.obs[level].unique(), adata.obs[level].nunique(), sep='\n')

['Mesenchyme' 'Endothelial' 'Epithelial' 'Immune']
4


In [78]:
# obs columns ordered from the coarsest grouping to the fine-grained label.
hierarchy = ['top_level', 'second_level', 'third_level', 'fourth_level', 'single_cell_types']
# Nested dict that add_path fills in: each key is a label, each value its children.
hierarchy_dict = {}

def add_path(root, path):
    """Insert one root-to-leaf label path into the nested dict ``root``.

    Walks ``path`` in order, creating an empty child dict for every label not
    yet present at the current depth. A label equal to the one handled just
    before it is skipped, so a group name repeated across consecutive levels
    occupies a single node. ``root`` is modified in place; nothing is returned.
    """
    current = root
    last_label = None
    for label in path:
        if label != last_label:
            # Descend into (or create) the child node for this label.
            current = current.setdefault(label, {})
            last_label = label

# One row per distinct combination of labels across the hierarchy columns.
unique_paths = adata.obs[hierarchy].drop_duplicates().to_numpy()

# Fold every distinct path into the shared nested dict.
for path in unique_paths:
    add_path(hierarchy_dict, path)

In [91]:
# Display the nested hierarchy assembled from the obs level columns.
hierarchy_dict

{'Mesenchyme': {'Mesothelial': {},
  'Stromal': {'Fibroblast': {'AF': {}, 'Pericyte': {}},
   'Smooth muscle': {'SMC': {}}}},
 'Endothelial': {'Blood vessels': {'CAP1': {},
   'AEC': {},
   'VEC': {},
   'CAP2': {}},
  'Lymphatic EC': {'LEC': {}}},
 'Epithelial': {'Alveolar epithelium': {'AT2': {}, 'AT1': {}},
  'Airway epithelium': {'Secretory': {}, 'Ciliated': {}}},
 'Immune': {'Lymphoid': {'B lineage': {'B1': {}},
   'T lineage': {'Tnaive': {}, 'Th1': {}},
   'NK': {}},
  'Myeloid': {'mononuclear broad': {'mononuclear fine': {'DC': {},
     'pDC': {},
     'iMon': {}},
    'Macrophage': {'AM': {}, 'M-C1q': {}}},
   'Neutrophil': {'N1': {}}}}}

In [None]:
# Sorted list of the distinct labels in obs['cell_type_edit'].
# NOTE(review): 'cell_type_edit' is not among the obs columns listed or created
# earlier in this notebook, and this cell has never been executed -- confirm
# which label column is intended before running.
unique_groups = sorted(adata.obs['cell_type_edit'].unique())

In [None]:
# Assign each label in unique_groups its position in that list as an integer code.
replacement_dict = dict(zip(unique_groups, range(len(unique_groups))))

In [None]:
# Display the label -> integer code mapping.
replacement_dict

In [None]:
# Store the integer code of each row's label in a new obs column 'celltype'.
# NOTE(review): 'cell_type_edit' is not among the obs columns listed or created
# earlier in this notebook -- confirm the intended source column.
adata.obs['celltype'] = adata.obs['cell_type_edit'].map(replacement_dict)