In [1]:
from pathlib import Path

import anndata as ad
import scanpy as sc
import scanpy.external as sce
import scipy.sparse as sp

In [2]:
# Figure defaults for every scanpy plot in this notebook.
sc.set_figure_params(dpi=300)

# Folder holding the GSE171892 files; all reads/writes below are relative to it.
BASE_DIR = Path("datasets") / "GSE171892"

In [3]:
# Load the integrated human dataset (h5ad) from BASE_DIR into `adata`.
adata = sc.read(BASE_DIR / "GSE171892_human_integrated.h5ad")

In [None]:
# Gene lists per candidate population, passed to the dot plot as `var_names`.
# Key order is the order in which the groups are laid out in the plot.
# Several genes (e.g. SOX2, SOX10) intentionally appear in more than one group.
cell_populations = dict(
    # --- Neural tube populations (RP / FP presumably roof plate / floor plate) ---
    RP=["SOX2", "LMX1A", "MSX1", "MSX2", "PAX3", "WNT1"],
    FP=["SOX2", "FOXA2", "FERD3L", "ARX", "SHH", "LMX1B", "NKX6-1"],
    Progenitor=["SOX2"],
    Neuron=["STMN2", "MAP2", "ELAVL3"],
    Oligodendrocyte=["SOX10", "OLIG2", "MBP"],
    # --- Mesoderm: one single-gene group per marker ---
    Mesoderm_I=["FOXC1"],
    Mesoderm_II=["FOXC2"],
    Mesoderm_III=["TWIST1"],
    Mesoderm_IV=["TWIST2"],
    Mesoderm_V=["MEOX1"],
    Mesoderm_VI=["MEOX2"],
    # --- Myoblast and crest-derived progenitors ---
    Myoblast=["MYOG"],
    Neural_crest_progenitor=["SOX10", "SOX2"],
    DRG_Progenitor=["SOX10"],
    Sensory_neuron_progenitor=["NEUROD1", "NEUROG1", "NEUROG2"],
)


In [None]:
# Dot plot of the `cell_populations` genes, one row per Leiden cluster.
# standard_scale='var' rescales each gene across the groups (see scanpy docs),
# so genes with very different expression ranges stay comparable.
sc.pl.dotplot(adata, cell_populations, groupby='leiden', standard_scale='var')

In [None]:
# Manual annotation: Leiden cluster id (as a string) -> cell-type label.
# Labels such as 'Prob Neuron' / 'Not Sure' are tentative calls kept as-is.
cluster_dict = {
'0': 'Neuron', '1': 'Neuron', '2': 'Neuron', '3': 'Mesoderm', '4': 'Prob Neuron', '5': 'Not Sure', '6': 'Mesoderm', '7': 'Mesoderm', '8': 'SOX2', '9': 'Mesoderm',
'10': 'Mesoderm', '11': 'Sensory Neuron', '12': 'Prob Neuron', '13': 'Peripheral Glia', '14': 'Neuron', '15': 'Mesoderm', '16': 'Motor Neuron', '17': 'Prob Neuron', '18': 'Roofplate', '19': 'Roofplate',
'20': 'Neuron', '21': 'Neuron', '22': 'Neuron', '23': 'Peripheral Glia', '24': 'SOX2', '25': 'Oligodendrocyte', '26': 'Mesoderm', '27': 'Floorplate', '28': 'Neuron', '29': 'Neuron',
'30': 'Neuron', '31': 'Blood', '32': 'Mesoderm', '33': 'Mesoderm', '34': 'Sensory Neuron Progenitor', '35': 'Skin', '36': 'Myoblast', '37': 'Mesoderm', '38': 'Prob Neuron', '39': 'Blood',
'40': 'Prob Neuron', '41': 'Prob Neuron', '42': 'Mesoderm', '43': 'Prob Neuron'}
celltype_labels = adata.obs['leiden'].map(cluster_dict)

# Series.map yields NaN for every cluster id that is missing from cluster_dict.
# Such cells would slip through any later `isin`-based filtering unnoticed, so
# stop here with the list of offending clusters instead of annotating silently.
unmapped_clusters = sorted(
    str(cluster) for cluster in adata.obs.loc[celltype_labels.isna(), 'leiden'].unique()
)
if unmapped_clusters:
    raise ValueError(
        f"Leiden clusters without an entry in cluster_dict: {unmapped_clusters}"
    )

adata.obs['celltype'] = celltype_labels

In [None]:
# UMAP embedding colored by Leiden cluster id.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# UMAP embedding colored by Leiden cluster id.
# NOTE(review): this cell is identical to the one directly before it; it looks
# like a leftover re-run and can probably be removed — confirm.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# UMAP embedding colored by the manually assigned `celltype` labels.
sc.pl.umap(adata, color=['celltype'])

In [None]:
# Persist the annotated object.
# NOTE(review): this is the same path the notebook loads `adata` from, so the
# input file is overwritten in place; keep a backup of the original if needed.
adata.write(BASE_DIR / "GSE171892_human_integrated.h5ad")

In [None]:
# Store the expression matrix in CSR form and save a gzip-compressed copy.
# The conversion is skipped when X is already CSR to avoid a needless copy.
if not (sp.issparse(adata.X) and adata.X.format == "csr"):
    adata.X = sp.csr_matrix(adata.X)

# Same location as before, expressed through BASE_DIR like the other file paths.
adata.write(BASE_DIR / "GSE171892_sparse.h5ad", compression="gzip")

In [None]:
# Cell-type labels to drop from the analysis.
exclude = ["Mesoderm", "Blood", "Skin"]

# True for each cell to keep, i.e. whose label is not listed in `exclude`
# (cells with a missing label are not "in" the list and are therefore kept).
mask = ~adata.obs["celltype"].isin(exclude)

# Row-index the AnnData with the mask; deliberately no .copy() here, so the
# result is a view onto the original object rather than a second full copy.
adata = adata[mask, :]

In [9]:
# Turn the current `adata` (a view after subsetting) into an independent object.
# NOTE(review): the MemoryError recorded for this cell (14.9 GiB, float64,
# 71655 x 27940) suggests X was still a dense array when it ran; presumably X
# has to be converted to a sparse matrix before subsetting/copying — confirm.
adata = adata.copy()

MemoryError: Unable to allocate 14.9 GiB for an array with shape (71655, 27940) and data type float64

In [None]:
# Show which container type backs the expression matrix.
# NOTE(review): reads the private `_X` attribute, presumably to avoid touching
# the public `adata.X` on a view; looks like a debugging leftover — confirm or remove.
print(type(adata._X))

In [8]:
# Ensure X ends up as a float32 sparse matrix.
# A dense X is converted straight to float32 CSR, so no intermediate float64
# sparse matrix is built; an already-sparse X keeps its format and is only downcast.
# NOTE(review): the MemoryError recorded for this cell suggests `adata` was a
# view over a dense X when it ran; presumably this conversion needs to happen
# before the subsetting step — confirm.
if sp.issparse(adata.X):
    adata.X = adata.X.astype("float32")
else:
    adata.X = sp.csr_matrix(adata.X, dtype="float32")

MemoryError: Unable to allocate 14.9 GiB for an array with shape (71655, 27940) and data type float64

In [5]:
# Re-embed and re-cluster the current `adata`.
# PCA with the ARPACK solver.
# NOTE(review): the neighbor graph below is built from 'X_pca_harmony', not from
# this fresh PCA, and no Harmony step is visible in this notebook — presumably
# the stored Harmony embedding is reused on purpose; confirm, or re-run Harmony.
sc.tl.pca(adata, svd_solver='arpack')
# Neighbor graph on the existing Harmony-corrected representation in obsm.
sc.pp.neighbors(adata, use_rep='X_pca_harmony')
sc.tl.umap(adata)
# NOTE(review): with the default key this presumably overwrites obs['leiden'],
# the column the `celltype` labels were mapped from — confirm that is intended.
sc.tl.leiden(adata, resolution=2)

MemoryError: Unable to allocate 14.9 GiB for an array with shape (71655, 27940) and data type float64

In [None]:
# UMAP embedding colored by the values currently stored in obs['leiden'].
sc.pl.umap(adata, color=['leiden'])