In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import scanpy as sc
import scvi
import anndata
import scipy.sparse as sp


from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
SEED = 42
np.random.seed(SEED)


In [94]:
lca_data = sc.read_h5ad("data/cell_atlas_of_the_human_lung_in_health_and_disease_full.h5ad")

In [98]:
print(lca_data)

AnnData object with n_obs × n_vars = 2282447 × 56239
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', "3'_or_5'", 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'core_or_extension', 'dataset', 'fresh_or_frozen', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'original_ann_level_1', 'original_ann_level_2', 'original_ann_level_3', 'original_ann_level_4', 'original_ann_level_5', 'original_ann_nonharmonized', 'reannotation_type', 'sample', 'scanvi_label', 'sequencing_platform', 'smoking_status', 'study', 'subject_type', 'tissue_coarse_unharmonized', 

In [101]:
def preprocess_lung_cell_atlas(adata, 
                                cell_level_1='Epithelial', 
                                cell_levels_2=['Airway epithelium', 'Alveolar epithelium', 'Submucosal Gland'],
                                cell_levels_3=['Basal', 'Secretory', 'Submucosal Secretory'],
                                cell_levels_4=['Basal resting', 'Club', 'Deuterosomal', 'Goblet', 
                                               'Hillock-like', 'SMG duct', 'SMG mucous', 
                                               'SMG serous', 'Suprabasal', 'Transitional Club-AT2'],
                                cell_levels_5=['Club (non-nasal)', 'Goblet (bronchial)', 
                                               'Goblet (nasal)', 'Goblet (subsegmental)', 
                                               'SMG serous (bronchial)', 'SMG serous (nasal)', 
                                               'pre-TB secretory'],
                                n_top_genes=4000):
    """
    Preprocess Lung Cell Atlas AnnData for deep learning
    
    Parameters:
    -----------
    adata : AnnData
        Input annotated data matrix
    cell_level_*: list of str
        Specified cell types to filter at different hierarchical levels
    n_top_genes : int
        Number of top genes to select
    
    Returns:
    --------
    adata_filtered : AnnData
        Preprocessed and filtered AnnData object
    """
    # 1. Filter cell types based on specified levels
    # Create a boolean mask for cell type selection
    cell_type_mask = (
        (adata.obs['ann_level_1'] == cell_level_1) & 
        (adata.obs['ann_level_2'].isin(cell_levels_2)) &
        (adata.obs['ann_level_3'].isin(cell_levels_3)) &
        (adata.obs['ann_level_4'].isin(cell_levels_4)) &
        (adata.obs['ann_level_5'].isin(cell_levels_5))
    )
    
    # Filter the AnnData object
    adata_filtered = adata[cell_type_mask].copy()
    
    # 2. Preprocessing steps
    # Log normalize the data
    sc.pp.normalize_total(adata_filtered, target_sum=1e4)
    sc.pp.log1p(adata_filtered)
    
    # 3. Select top genes
    # Use highly variable gene selection with specified number of top genes
    sc.pp.highly_variable_genes(adata_filtered, 
                                 min_mean=0.0125, 
                                 max_mean=3, 
                                 min_disp=0.5,
                                 n_top_genes=n_top_genes)
    
    # Select only the top n_top_genes
    adata_filtered = adata_filtered[:, adata_filtered.var.highly_variable]
    
    # 4. Prepare label encoding for disease
    # Create a label encoder for diseases
    disease_encoder = {disease: idx for idx, disease in enumerate(adata_filtered.obs['disease'].unique())}
    adata_filtered.obs['disease_label'] = adata_filtered.obs['disease'].map(disease_encoder)
    
    # 5. Optional: Scaling features
    sc.pp.scale(adata_filtered, max_value=10)
    
    # 6. Store additional metadata
    adata_filtered.uns['disease_encoder'] = disease_encoder
    adata_filtered.uns['n_top_genes'] = n_top_genes
    
    return adata_filtered

In [100]:
prep_data = preprocess_lung_cell_atlas(lca_data)

  adata_filtered.obs['disease_label'] = adata_filtered.obs['disease'].map(disease_encoder)


In [102]:
exp_data = pd.DataFrame(prep_data.X, columns=prep_data.var['feature_name'].values)

In [103]:
exp_data['disease'] = prep_data.obs['disease'].values

In [104]:
exp_data.shape

(106931, 4001)

In [135]:
sample_data=exp_data[:10]

#save this file as a csv
sample_data.to_csv('sample_data.csv', index=False)


In [106]:
target_diseases = [
    "normal",
    "chronic obstructive pulmonary disease",
    "chronic rhinitis",
    "pulmonary fibrosis"
]

In [107]:
data = exp_data[exp_data['disease'].isin(target_diseases)]

In [108]:
data['disease'].unique()

['normal', 'chronic rhinitis', 'pulmonary fibrosis', 'chronic obstructive pulmonary disease']
Categories (15, object): ['pulmonary sarcoidosis', 'pulmonary fibrosis', 'lung large cell carcinoma', 'chronic rhinitis', ..., 'hypersensitivity pneumonitis', 'non-specific interstitial pneumonia', 'COVID-19', 'normal']

In [109]:
disease_counts = data['disease'].value_counts()
print(disease_counts)

disease
normal                                   92924
chronic rhinitis                          3057
pulmonary fibrosis                        2577
chronic obstructive pulmonary disease      173
lung large cell carcinoma                    0
lung adenocarcinoma                          0
pulmonary sarcoidosis                        0
squamous cell lung carcinoma                 0
pneumonia                                    0
lymphangioleiomyomatosis                     0
cystic fibrosis                              0
interstitial lung disease                    0
hypersensitivity pneumonitis                 0
non-specific interstitial pneumonia          0
COVID-19                                     0
Name: count, dtype: int64


In [110]:
# Encode the disease column using one-hot encoding
onehot_encoder = OneHotEncoder(sparse_output=False)
y = onehot_encoder.fit_transform(data[['disease']])

In [111]:
X = data.drop(columns=['disease'])

In [112]:
print(X.shape, y.shape)

(98731, 4000) (98731, 4)


In [113]:
# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [114]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)


In [115]:
y_train_tensor = y_train_tensor.long()
y_test_tensor = y_test_tensor.long()

In [116]:
num_classes = len(torch.unique(y_train_tensor))
input_dim = X_train_tensor.shape[1]

In [117]:
print(num_classes, input_dim)

2 4000


In [118]:
# Create datasets and loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [119]:
class HLCANet(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(HLCANet, self).__init__()
        # A deeper network with multiple layers, batch norm, and dropout
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(64, num_classes)  # final layer for multi-class
        )

    def forward(self, x):
        return self.model(x)

In [120]:
model = HLCANet(input_dim, num_classes)

In [125]:

device = torch.device("cpu")

In [126]:
# Use CrossEntropyLoss for multi-class classification
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [131]:
epochs = 10
model.train()
for epoch in range(epochs):
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        # If batch_y is one-hot encoded, convert to indices
        if batch_y.dim() > 1 and batch_y.size(1) > 1:
            batch_y = torch.argmax(batch_y, dim=1)

        # Remove extra dimensions if necessary
        if batch_y.dim() > 1:
            batch_y = batch_y.squeeze()

        # Ensure correct dtype for loss function
        batch_y = batch_y.to(dtype=torch.long)

        # Debugging prints (optional)
        print(f"Batch_y shape: {batch_y.shape}, unique labels: {torch.unique(batch_y)}")

        optimizer.zero_grad()
        logits = model(batch_X)  # Output of shape [N, C]
        loss = criterion(logits, batch_y)  # CrossEntropyLoss expects targets [N]
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * batch_X.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{epochs}] Loss: {epoch_loss:.4f}")


Batch_y shape: torch.Size([32]), unique labels: tensor([0, 1, 2])


IndexError: Target 2 is out of bounds.

torch.float32 torch.float32
0.0 1.0
