In [1]:
import celltypist
from celltypist import models

In [2]:
import scanpy as sc
import os,sys,glob
import pandas as pd
import numpy as np
import anndata as ad
import matplotlib.pyplot as plt

import seaborn as sns
from matplotlib.colors import ListedColormap

import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

sc.settings.n_jobs = 30
sc.set_figure_params(figsize=(4,4), vector_friendly = True)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

In [3]:
adata = sc.read_h5ad("Data/Output_230911_adata_scvi_random_sampleID_annot.h5ad") # 12m

In [4]:
adata.X = adata.layers["logcounts"].copy() # 8.4s # adata.X must be logcounts value

# CellTypist modelling using all genes

In [5]:
adata # 0.0s

AnnData object with n_obs √ó n_vars = 393060 √ó 49133
    obs: 'batch', 'sampleID', 'Age', 'Assay', 'Stage', 'Race', 'PMI', 'Hemisphere', 'Library', 'Brain_Region', 'Dataset', 'Sex', 'Diagnosis', 'DF_classification', 'cluster_original', 'cluster_main', 'n_genes', 'Stage2', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_scores', 'predicted_doublets', '_scvi_batch', '_scvi_labels', 'leiden_scvi', 'leiden_0.5', 'leiden_0.7', 'leiden_1.0', 'cluster_main2', 'Brain_Region2', 'Brain_Region3', 'Brain_Region_Unit', 'cluster_number'
    uns: 'Brain_Region3_colors', 'Stage2_colors', 'cluster_main2_colors', 'leiden_scvi_colors', 'log1p'
    obsm: 'X_scVI_sampleID', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts', 'logcounts', 'scaled'

In [11]:
new_model = celltypist.train(adata, 
                             labels = adata.obs["cluster_number"], 
                             check_expression = False, 
                             n_jobs = 40, use_SGD = True) # 90~m

üç≥ Preparing data before training
‚úÇÔ∏è 13046 non-expressed genes are filtered out
‚öñÔ∏è Scaling input data
üèãÔ∏è Training data using SGD logistic regression


KeyboardInterrupt: 

In [None]:
#Write out the model.
new_model.write('test_all.pkl') #

# Celltypist modelling using HVG

In [6]:
adata_hvg = sc.read_h5ad("Data/Output_230907_adata_scvi_random_sampleID_hvg.h5ad")
adata_hvg # 1m

AnnData object with n_obs √ó n_vars = 393064 √ó 5000
    obs: 'batch', 'sampleID', 'Age', 'Assay', 'Stage', 'Race', 'PMI', 'Hemisphere', 'Library', 'Brain_Region', 'Dataset', 'Sex', 'Diagnosis', 'DF_classification', 'cluster_original', 'cluster_main', 'n_genes', 'Stage2', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_scores', 'predicted_doublets', '_scvi_batch', '_scvi_labels', 'leiden_scvi', 'leiden_0.5', 'leiden_0.7', 'leiden_1.0'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
    uns: 'Dataset_colors', '_scvi_manager_uuid', '_scvi_uuid', 'cluster_main_colors', 'hvg', 'leiden', 'leiden_0.5_colors', 'leiden_0.7_colors', 'leiden_1.0_colors', 'leiden_scvi_colors', 'log1p', 'neighbors', 'umap'
    obsm: 'X_scVI_sampleID', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts', 'logcounts'
    obsp: 'connectivities', 'distances'

In [11]:
adata_hvg.var.highly_variable.index

Index(['A1CF', 'A2M', 'A2ML1', 'A4GALT', 'AAMDC', 'AASS', 'ABAT', 'ABCA1',
       'ABCA5', 'ABCA8',
       ...
       'ZNF844', 'ZNF90', 'ZNF98', 'ZNRF2', 'ZNRF3', 'ZRANB2-DT', 'ZRANB3',
       'ZSCAN5A', 'ZSWIM5', 'ZWINT'],
      dtype='object', name='new_gene', length=5000)

In [14]:
hvg_model = celltypist.train(adata[:,adata_hvg.var.highly_variable.index], 
                             labels = adata.obs["cluster_number"], 
                             check_expression = False, 
                             n_jobs = 40, use_SGD = True) # 20-25m

üç≥ Preparing data before training
‚öñÔ∏è Scaling input data
üèãÔ∏è Training data using SGD logistic regression
‚úÖ Model training done!


In [15]:
#Write out the model.
hvg_model.write('test_hvg.pkl') #

# Celltypist Annotation

In [47]:
adata_cancer = sc.read_h5ad("EGAD00001008811_SF3391_filtered_adata.h5ad")

In [48]:
adata_cancer

AnnData object with n_obs √ó n_vars = 5184 √ó 36601
    obs: 'SampleID', 'Study', 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_scores', 'predicted_doublets'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'

In [22]:
adata_cancer.layers["counts"] = adata_cancer.X.copy()
sc.pp.normalize_total(adata_cancer)
sc.pp.log1p(adata_cancer)
adata_cancer.layers["logcounts"] = adata_cancer.X.copy()

In [28]:
predictions = celltypist.annotate(adata_cancer, model = 'test_hvg.pkl', majority_voting = True) # 5s

üî¨ Input data has 5184 cells and 36601 genes
üîó Matching reference genes in the model
üß¨ 4805 features used for prediction
‚öñÔ∏è Scaling input data
üñãÔ∏è Predicting labels
‚úÖ Prediction done!
üëÄ Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
‚õìÔ∏è Over-clustering input data with resolution set to 10
üó≥Ô∏è Majority voting the predictions
‚úÖ Majority voting done!


In [25]:
# Get an `AnnData` with predicted labels embedded into the cell metadata columns.
adata_cancer_pre = predictions.to_adata()
adata_cancer_pre

AnnData object with n_obs √ó n_vars = 5184 √ó 36601
    obs: 'SampleID', 'Study', 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_scores', 'predicted_doublets', 'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'log1p', 'neighbors', 'leiden'
    obsm: 'X_pca'
    layers: 'counts', 'logcounts'
    obsp: 'connectivities', 'distances'

In [26]:
adata_cancer_pre.obs["majority_voting"].value_counts() # oligo, Astro, RG

C0    3253
C2    1178
C3     753
Name: majority_voting, dtype: int64

In [49]:
adata_cancer2 = sc.pp.subsample(adata_cancer, fraction = 0.1, copy = True)

In [50]:
adata_cancer2

AnnData object with n_obs √ó n_vars = 518 √ó 36601
    obs: 'SampleID', 'Study', 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_scores', 'predicted_doublets'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'

In [52]:
adata_cancer2.layers["counts"] = adata_cancer2.X.copy()
sc.pp.normalize_total(adata_cancer2)
sc.pp.log1p(adata_cancer2)
adata_cancer2.layers["logcounts"] = adata_cancer2.X.copy()

In [53]:
predictions2 = celltypist.annotate(adata_cancer2, model = 'test_hvg.pkl', majority_voting = True) # 5s

üî¨ Input data has 518 cells and 36601 genes
üîó Matching reference genes in the model
üß¨ 4805 features used for prediction
‚öñÔ∏è Scaling input data
üñãÔ∏è Predicting labels
‚úÖ Prediction done!
üëÄ Can not detect a neighborhood graph, will construct one before the over-clustering
‚õìÔ∏è Over-clustering input data with resolution set to 5
üó≥Ô∏è Majority voting the predictions
‚úÖ Majority voting done!


In [54]:
# Get an `AnnData` with predicted labels embedded into the cell metadata columns.
adata_cancer2_pre = predictions2.to_adata()
adata_cancer2_pre

AnnData object with n_obs √ó n_vars = 518 √ó 36601
    obs: 'SampleID', 'Study', 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_scores', 'predicted_doublets', 'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'log1p', 'neighbors', 'leiden'
    obsm: 'X_pca'
    layers: 'counts', 'logcounts'
    obsp: 'connectivities', 'distances'

In [55]:
adata_cancer2_pre.obs["majority_voting"].value_counts()

C0    342
C2    131
C3     45
Name: majority_voting, dtype: int64