In [2]:
import scanpy as sc
import scvi
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import scipy.sparse as sp

seed = 42
scvi.settings.seed = seed
torch.manual_seed(seed)
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm
Seed set to 42


In [4]:
adata = sc.read_h5ad("data/raw-count-full-genes-with-cell-type-annotation.h5ad")

In [17]:
adata

AnnData object with n_obs × n_vars = 107974 × 36601
    obs: 'sample_batch', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'dataset', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'doublet_scores', 'predicted_doublets', 'n_counts', 'n_genes', 'sample description', 'experiment', 'patient', 'level1', 'level2', 'level3', 'sample'
    var: 'gene_ids', 'feature_types', 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'mt'
    layers: 'ambiguous', 'raw', 'spliced', 'unspliced'

In [24]:
adata.obs['dataset'].unique().tolist()

['neCRSwNP_2',
 'eCRSwNP_2',
 'Control_1',
 'Control_3',
 'neCRSwNP_4',
 'neCRSwNP_5',
 'eCRSwNP_3',
 'eCRSwNP_4',
 'eCRSwNP_5',
 'Control_4',
 'Control_5',
 'Control_6',
 'Control_7',
 'Control_8']

In [5]:
def clean_anndata(adata):
    # Create a copy to avoid modifying the original
    adata_cleaned = adata.copy()
    
    # Check and clean raw layer
    if 'raw' in adata_cleaned.layers:
        # Replace inf and -inf with NaN
        raw_layer = adata_cleaned.layers['raw'].copy()
        
        if sp.issparse(raw_layer):
            # For sparse matrices
            raw_layer.data[np.isinf(raw_layer.data)] = 0
            raw_layer.data[np.isnan(raw_layer.data)] = 0
        else:
            # For dense arrays
            raw_layer[np.isinf(raw_layer)] = 0
            raw_layer[np.isnan(raw_layer)] = 0
        
        adata_cleaned.layers['raw'] = raw_layer
    
    # Clean X layer if needed
    if sp.issparse(adata_cleaned.X):
        adata_cleaned.X.data[np.isinf(adata_cleaned.X.data)] = 0
        adata_cleaned.X.data[np.isnan(adata_cleaned.X.data)] = 0
    else:
        adata_cleaned.X[np.isinf(adata_cleaned.X)] = 0
        adata_cleaned.X[np.isnan(adata_cleaned.X)] = 0
    
    return adata_cleaned

In [6]:
def preprocess_scanvi_data(adata, cell_type_levels=['level1', 'level2', 'level3'], n_top_genes=4000):
    # Clean the AnnData object first
    adata_cleaned = clean_anndata(adata)
    
    # Ensure raw layer exists
    if 'raw' not in adata_cleaned.layers:
        adata_cleaned.layers['raw'] = adata_cleaned.X.copy()
    
    # Create a copy for processing
    adata_processed = adata_cleaned.copy()
    
    # Ensure X is using raw counts
    adata_processed.X = adata_processed.layers['raw'].copy()
    
    # Preprocessing steps
    sc.pp.filter_cells(adata_processed, min_genes=200)
    sc.pp.filter_genes(adata_processed, min_cells=3)
    
    # Log normalize and find variable genes
    sc.pp.normalize_total(adata_processed, target_sum=1e4)
    sc.pp.log1p(adata_processed)
    
    # Find highly variable genes with robust method
    sc.pp.highly_variable_genes(
        adata_processed, 
        n_top_genes=n_top_genes, 
        min_mean=0.0125, 
        max_mean=3, 
        min_disp=0.5
    )
    
    # Subset to highly variable genes
    adata_processed = adata_processed[:, adata_processed.var['highly_variable']]
    
    # Prepare label encoders
    from sklearn.preprocessing import LabelEncoder
    label_encoders = {}
    
    for level in cell_type_levels:
        # Check if the level exists in obs
        if level not in adata_processed.obs.columns:
            print(f"Warning: {level} not found in observation columns")
            continue
        
        # Remove any NaN values
        if pd.api.types.is_categorical_dtype(adata_processed.obs[level]):
        # Add 'Unknown' to the categories if it's not already present
            adata_processed.obs[level] = adata_processed.obs[level].cat.add_categories(['Unknown'])
        # Fill NaN values with 'Unknown'
        adata_processed.obs[level] = adata_processed.obs[level].fillna('Unknown')

        
        # Encode labels
        le = LabelEncoder()
        adata_processed.obs[f'{level}_encoded'] = le.fit_transform(
            adata_processed.obs[level]
        )
        label_encoders[level] = le
    
    return adata_processed, label_encoders

In [12]:
def train_scanvi_model(adata_processed, label_encoders, cell_type_levels=['level1', 'level2', 'level3']):
    # Prepare results dictionary
    results = {}
   
    # Train scANVI for each cell type level
    for level in cell_type_levels:
        print(f"\nTraining scANVI for {level}")
       
        # Prepare the data
        adata_level = adata_processed.copy()
       
        # Convert to string and then categorical if necessary
        adata_level.obs[f'{level}_encoded'] = adata_level.obs[f'{level}_encoded'].astype(str)
        if not pd.api.types.is_categorical_dtype(adata_level.obs[f'{level}_encoded']):
            adata_level.obs[f'{level}_encoded'] = adata_level.obs[f'{level}_encoded'].astype('category')
       
        # Add "Unknown" to the categories for encoded labels if not already present
        if 'Unknown' not in adata_level.obs[f'{level}_encoded'].cat.categories:
            adata_level.obs[f'{level}_encoded'] = adata_level.obs[f'{level}_encoded'].cat.add_categories(['Unknown'])
       
        # Update label encoder to include "Unknown"
        if 'Unknown' not in label_encoders[level].classes_:
            new_classes = list(label_encoders[level].classes_) + ['Unknown']
            label_encoders[level].classes_ = np.array(new_classes)
       
        # Setup scVI model
        scvi.model.SCVI.setup_anndata(
            adata_level,
            layer='raw',
            labels_key=f'{level}_encoded'
        )
       
        # Initialize and train scVI model
        vae = scvi.model.SCVI(
            adata_level,
            n_layers=2,
            n_latent=30,
            dropout_rate=0.2
        )
        vae.train(max_epochs=100, early_stopping=True)
       
        # Initialize and train scANVI model
        scanvi = scvi.model.SCANVI.from_scvi_model(
            vae,
            labels_key=f'{level}_encoded',
            unlabeled_category='Unknown'
        )
        scanvi.train(max_epochs=100, early_stopping=True)
       
        # Get predicted labels 
        # Use max probability to determine the predicted label
        predictions_prob = scanvi.predict_label(adata_level)
        predictions = predictions_prob.argmax(axis=1)
        
        # Convert predictions to original label names
        decoded_predictions = label_encoders[level].inverse_transform(predictions)
       
        # Compute classification metrics
        true_labels = adata_level.obs[level]
       
        print("\nClassification Report:")
        print(classification_report(true_labels, decoded_predictions))
       
        # Plot confusion matrix
        plt.figure(figsize=(12, 10))
        cm = confusion_matrix(true_labels, decoded_predictions)
        unique_labels = label_encoders[level].classes_
        sns.heatmap(cm, annot=True, fmt='d',
                    xticklabels=unique_labels,
                    yticklabels=unique_labels)
        plt.title(f'Confusion Matrix - {level}')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        plt.show()
       
        # Store results
        results[level] = {
            'model': scanvi,
            'predictions': decoded_predictions,
            'true_labels': true_labels,
            'label_encoder': label_encoders[level]
        }
   
    return results

In [8]:
def visualize_latent_space(results, adata_processed, cell_type_levels=['level1', 'level2', 'level3']):
    for level in cell_type_levels:
        # Get the trained scANVI model
        scanvi = results[level]['model']
        
        # Get latent representation
        latent = scanvi.get_latent_representation(adata_processed)
        
        # Create a new AnnData object with latent representation
        adata_latent = sc.AnnData(X=latent)
        adata_latent.obs[level] = adata_processed.obs[level]
        
        # Perform UMAP on latent space
        sc.pp.neighbors(adata_latent)
        sc.tl.umap(adata_latent)
        
        # Plot UMAP
        plt.figure(figsize=(12, 10))
        sc.pl.umap(adata_latent, color=level, palette='tab20', 
                   title=f'UMAP of Latent Space - {level}', 
                   show=True, 
                   save=f'_umap_{level}.png')

In [9]:
adata_prep = clean_anndata(adata)

In [10]:
cell_type_levels=['level1', 'level2', 'level3']
adata_processed, label_encoders = preprocess_scanvi_data(
        adata_prep, 
        cell_type_levels=cell_type_levels
    )

  if pd.api.types.is_categorical_dtype(adata_processed.obs[level]):
  adata_processed.obs[level] = adata_processed.obs[level].cat.add_categories(['Unknown'])
  if pd.api.types.is_categorical_dtype(adata_processed.obs[level]):
  if pd.api.types.is_categorical_dtype(adata_processed.obs[level]):


In [13]:
results = train_scanvi_model(adata_processed, label_encoders, cell_type_levels=cell_type_levels)


Training scANVI for level1


  if not pd.api.types.is_categorical_dtype(adata_level.obs[f'{level}_encoded']):
Trainer will use only 1 of 8 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=8)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
/home/dhakal/anaconda3/envs/URT_NSCL/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=39` in the `DataLoader` to improve performance.
/home/dhakal/anaconda3/envs/URT_NSCL/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_data

Epoch 100/100: 100%|██████████| 100/100 [20:14<00:00, 11.05s/it, v_num=1, train_loss_step=758, train_loss_epoch=701]

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 100/100: 100%|██████████| 100/100 [20:14<00:00, 12.15s/it, v_num=1, train_loss_step=758, train_loss_epoch=701]
[34mINFO    [0m Training for [1;36m100[0m epochs.                                                                                  


Trainer will use only 1 of 8 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=8)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
/home/dhakal/anaconda3/envs/URT_NSCL/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=39` in the `DataLoader` to improve performance.
/home/dhakal/anaconda3/envs/URT_NSCL/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing

Epoch 95/100:  95%|█████████▌| 95/100 [52:00<02:44, 32.85s/it, v_num=1, train_loss_step=722, train_loss_epoch=705] 
Monitored metric elbo_validation did not improve in the last 45 records. Best score: 725.767. Signaling Trainer to stop.


AttributeError: 'SCANVI' object has no attribute 'predict_label'