In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import scanpy as sc
import scvi
import anndata
import scipy.sparse as sp

from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
SEED = 42

# Force CPU execution for debugging by hiding all CUDA devices.
# NOTE(review): this runs after `import tensorflow` / `import torch`; it only
# takes effect if neither library has initialised its GPU devices yet.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Seed every RNG-bearing library imported above, not just NumPy, so that weight
# initialisation, dropout and shuffling are repeatable across kernel restarts.
np.random.seed(SEED)
torch.manual_seed(SEED)
tf.random.set_seed(SEED)  # kept last: returns None, so the cell shows no output

  from .autonotebook import tqdm as notebook_tqdm
2024-12-16 09:45:11.393968: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734309911.412313   23340 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734309911.417963   23340 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Read the lung cell atlas .h5ad file (path is relative to the notebook's working directory).
lca_h5ad_path = "data/cell_atlas_of_the_human_lung_in_health_and_disease_full.h5ad"
lca_data = sc.read_h5ad(lca_h5ad_path)

In [3]:
# Print the AnnData summary: matrix dimensions and the available annotation fields.
print(lca_data)

AnnData object with n_obs × n_vars = 2282447 × 56239
    obs: 'suspension_type', 'donor_id', 'is_primary_data', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', "3'_or_5'", 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'cause_of_death', 'core_or_extension', 'dataset', 'fresh_or_frozen', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'original_ann_level_1', 'original_ann_level_2', 'original_ann_level_3', 'original_ann_level_4', 'original_ann_level_5', 'original_ann_nonharmonized', 'reannotation_type', 'sample', 'scanvi_label', 'sequencing_platform', 'smoking_status', 'study', 'subject_type', 'tissue_coarse_unharmonized', 

In [4]:
def preprocess_lung_cell_atlas(adata, 
                                cell_level_1='Epithelial', 
                                cell_levels_2=['Airway epithelium', 'Alveolar epithelium', 'Submucosal Gland'],
                                cell_levels_3=['Basal', 'Secretory', 'Submucosal Secretory'],
                                cell_levels_4=['Basal resting', 'Club', 'Deuterosomal', 'Goblet', 
                                               'Hillock-like', 'SMG duct', 'SMG mucous', 
                                               'SMG serous', 'Suprabasal', 'Transitional Club-AT2'],
                                cell_levels_5=['Club (non-nasal)', 'Goblet (bronchial)', 
                                               'Goblet (nasal)', 'Goblet (subsegmental)', 
                                               'SMG serous (bronchial)', 'SMG serous (nasal)', 
                                               'pre-TB secretory'],
                                n_top_genes=4000):
    """
    Filter a lung cell atlas AnnData to selected cell types and prepare it for modelling.

    Pipeline: subset cells by annotation levels 1-5, normalise each cell to a
    total of 1e4 and log1p-transform, keep the ``n_top_genes`` most variable
    genes, integer-encode ``obs['disease']`` and scale each gene (clipped at 10).

    Parameters
    ----------
    adata : AnnData
        Input annotated data matrix. It is not modified; a filtered copy is made.
        Must provide ``obs`` columns ``ann_level_1`` .. ``ann_level_5`` and ``disease``.
    cell_level_1 : str
        Single value that ``obs['ann_level_1']`` must equal.
    cell_levels_2, cell_levels_3, cell_levels_4, cell_levels_5 : list of str
        Allowed values for ``obs['ann_level_2']`` .. ``obs['ann_level_5']``.
        A cell is kept only if it matches at every level. The default lists are
        never mutated by this function.
    n_top_genes : int
        Number of highly variable genes to keep.

    Returns
    -------
    adata_filtered : AnnData
        Preprocessed copy with an added ``obs['disease_label']`` column and
        ``uns['disease_encoder']`` / ``uns['n_top_genes']`` metadata.

    Raises
    ------
    ValueError
        If no cell matches the requested annotation levels.
    """
    # 1. Build a boolean mask: a cell must match at every annotation level.
    obs = adata.obs
    cell_type_mask = (
        (obs['ann_level_1'] == cell_level_1) &
        (obs['ann_level_2'].isin(cell_levels_2)) &
        (obs['ann_level_3'].isin(cell_levels_3)) &
        (obs['ann_level_4'].isin(cell_levels_4)) &
        (obs['ann_level_5'].isin(cell_levels_5))
    )

    # Fail early with a clear message instead of an obscure error further down.
    if not cell_type_mask.any():
        raise ValueError("No cells match the requested annotation levels.")

    # Copy so the caller's AnnData is left untouched by the in-place steps below.
    adata_filtered = adata[cell_type_mask].copy()

    # 2. Library-size normalisation followed by log1p.
    # NOTE(review): assumes adata.X holds raw counts — TODO confirm for this file.
    sc.pp.normalize_total(adata_filtered, target_sum=1e4)
    sc.pp.log1p(adata_filtered)

    # 3. Flag the n_top_genes most variable genes.
    # The mean/dispersion cutoffs (min_mean, max_mean, min_disp) are ignored by
    # scanpy whenever n_top_genes is given, so they are not passed here.
    sc.pp.highly_variable_genes(adata_filtered, n_top_genes=n_top_genes)

    # Subsetting yields a view; copy it so that the .obs/.uns writes and the
    # in-place scaling below act on a real object rather than triggering an
    # implicit copy-on-write (and its warning).
    adata_filtered = adata_filtered[:, adata_filtered.var.highly_variable].copy()

    # 4. Integer-encode the disease labels in order of first appearance.
    disease_encoder = {disease: idx for idx, disease in enumerate(adata_filtered.obs['disease'].unique())}
    adata_filtered.obs['disease_label'] = adata_filtered.obs['disease'].map(disease_encoder)

    # 5. Scale each gene; values above 10 after scaling are clipped.
    sc.pp.scale(adata_filtered, max_value=10)

    # 6. Keep the encoder and gene count alongside the data for later decoding.
    adata_filtered.uns['disease_encoder'] = disease_encoder
    adata_filtered.uns['n_top_genes'] = n_top_genes

    return adata_filtered

In [5]:
# Run the preprocessing pipeline with its default cell-type selection and gene count.
prep_data = preprocess_lung_cell_atlas(adata=lca_data)

  adata_filtered.obs['disease_label'] = adata_filtered.obs['disease'].map(disease_encoder)


In [6]:
# Expression matrix as a DataFrame: one row per cell, one column per selected gene.
exp_data = pd.DataFrame(
    data=prep_data.X,
    columns=prep_data.var['feature_name'].values,
)

In [7]:
# Attach each cell's disease label by position: `.values` drops the obs index,
# which would not align with exp_data's default integer index.
disease_per_cell = prep_data.obs['disease'].values
exp_data['disease'] = disease_per_cell

In [8]:
# Sanity check: (n_cells, n_genes + 1) — the extra column is 'disease'.
exp_data.shape

(106931, 4001)

In [9]:
# Preview of the first ten cells.
# TODO: export this preview to CSV — nothing is written to disk in this cell.
sample_data = exp_data.iloc[:10]

sample_data


Unnamed: 0,A2M,A2ML1,ABALON,ABCA1,ABCA8,ABCB1,ABCC11,ABCG4,ABHD5,ABLIM3,...,ZNF460,ZNF474-AS1,ZNF542P_ENSG00000240225,ZNF593OS,ZNF728,ZNRD2-DT,ZRANB2-DT,ZSWIM2,ZSWIM8-AS1,disease
0,-0.094079,-0.192755,-0.088329,-0.246342,-0.038406,-0.02932,-0.06462,-0.011724,-0.468072,-0.141087,...,-0.200778,-0.005439,-0.013607,-0.054294,-0.012434,-0.011179,-0.036577,-0.003058,-0.011319,normal
1,-0.094079,-0.192755,-0.088329,-0.246342,-0.038406,-0.02932,-0.06462,-0.011724,-0.468072,-0.141087,...,-0.200778,-0.005439,-0.013607,-0.054294,-0.012434,-0.011179,-0.036577,-0.003058,-0.011319,normal
2,-0.094079,-0.192755,-0.088329,-0.246342,-0.038406,-0.02932,-0.06462,-0.011724,-0.468072,-0.141087,...,-0.200778,-0.005439,-0.013607,-0.054294,-0.012434,-0.011179,-0.036577,-0.003058,-0.011319,normal
3,-0.094079,-0.192755,-0.088329,0.850272,-0.038406,-0.02932,-0.06462,-0.011724,-0.468072,-0.141087,...,-0.200778,-0.005439,-0.013607,-0.054294,-0.012434,-0.011179,-0.036577,-0.003058,-0.011319,normal
4,-0.094079,-0.192755,-0.088329,-0.246342,-0.038406,-0.02932,-0.06462,-0.011724,-0.468072,-0.141087,...,-0.200778,-0.005439,-0.013607,-0.054294,-0.012434,-0.011179,-0.036577,-0.003058,-0.011319,normal
5,-0.094079,-0.192755,-0.088329,2.011213,-0.038406,-0.02932,-0.06462,-0.011724,0.941356,-0.141087,...,-0.200778,-0.005439,-0.013607,-0.054294,-0.012434,-0.011179,-0.036577,-0.003058,-0.011319,normal
6,-0.094079,-0.192755,-0.088329,-0.246342,-0.038406,-0.02932,-0.06462,-0.011724,-0.468072,-0.141087,...,-0.200778,-0.005439,-0.013607,-0.054294,-0.012434,-0.011179,-0.036577,-0.003058,-0.011319,normal
7,-0.094079,-0.192755,-0.088329,-0.246342,-0.038406,-0.02932,-0.06462,-0.011724,-0.468072,-0.141087,...,-0.200778,-0.005439,-0.013607,-0.054294,-0.012434,-0.011179,-0.036577,-0.003058,-0.011319,normal
8,-0.094079,-0.192755,-0.088329,-0.246342,-0.038406,-0.02932,-0.06462,-0.011724,0.42516,-0.141087,...,-0.200778,-0.005439,-0.013607,-0.054294,-0.012434,-0.011179,-0.036577,-0.003058,-0.011319,normal
9,-0.094079,-0.192755,-0.088329,-0.246342,-0.038406,-0.02932,-0.06462,-0.011724,-0.468072,-0.141087,...,-0.200778,-0.005439,-0.013607,-0.054294,-0.012434,-0.011179,-0.036577,-0.003058,-0.011319,normal


In [10]:
# Disease labels kept for the classifier; rows with any other label are filtered out.
target_diseases = [
    "normal", "chronic obstructive pulmonary disease",
    "chronic rhinitis", "pulmonary fibrosis",
]

In [11]:
# Keep only the cells whose disease label is one of the target classes.
keep_rows = exp_data['disease'].isin(target_diseases)
data = exp_data.loc[keep_rows]

In [12]:
# Labels actually present after filtering. The column is categorical, so its
# dtype still lists categories that no longer occur in `data`.
data['disease'].unique()

['normal', 'chronic rhinitis', 'pulmonary fibrosis', 'chronic obstructive pulmonary disease']
Categories (15, object): ['pulmonary sarcoidosis', 'pulmonary fibrosis', 'lung large cell carcinoma', 'chronic rhinitis', ..., 'hypersensitivity pneumonitis', 'non-specific interstitial pneumonia', 'COVID-19', 'normal']

In [13]:
# Cells per disease class. Categories removed by the filter still appear, with
# a count of 0, because the categorical dtype keeps them.
disease_counts = data['disease'].value_counts()
print(disease_counts)

disease
normal                                   92924
chronic rhinitis                          3057
pulmonary fibrosis                        2577
chronic obstructive pulmonary disease      173
lung large cell carcinoma                    0
lung adenocarcinoma                          0
pulmonary sarcoidosis                        0
squamous cell lung carcinoma                 0
pneumonia                                    0
lymphangioleiomyomatosis                     0
cystic fibrosis                              0
interstitial lung disease                    0
hypersensitivity pneumonitis                 0
non-specific interstitial pneumonia          0
COVID-19                                     0
Name: count, dtype: int64


In [14]:
# One-hot encode the disease labels; dense output so `y` is a plain 2-D array
# with one column per label present in `data`.
onehot_encoder = OneHotEncoder(sparse_output=False)
labels_frame = data[['disease']]
y = onehot_encoder.fit(labels_frame).transform(labels_frame)

In [15]:
# Feature matrix: every gene column, without the label column.
X = data.drop('disease', axis=1)

In [16]:
# Confirm features and one-hot labels have the same number of rows.
print(X.shape, y.shape)

(98731, 4000) (98731, 4)


In [17]:
# Standardise every gene column.
# NOTE: the scaler is fitted on all rows, i.e. before the train/test split below.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [18]:
# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [19]:
class ProbabilisticModel(tf.keras.Model):
    """Feed-forward classifier: Dense(128) -> Dropout -> Dense(64) -> Dropout -> softmax.

    Parameters
    ----------
    input_dim : int
        Accepted by the constructor but not used by any layer defined here.
    num_classes : int
        Width of the softmax output layer.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # Hidden block 1
        self.dense1 = layers.Dense(128, activation='relu')
        self.dropout1 = layers.Dropout(0.3)
        # Hidden block 2
        self.dense2 = layers.Dense(64, activation='relu')
        self.dropout2 = layers.Dropout(0.3)
        # Class-probability head
        self.out = layers.Dense(num_classes, activation='softmax')

    def call(self, inputs):
        """Run the hidden layers in order and return the softmax class probabilities."""
        hidden = inputs
        for layer in (self.dense1, self.dropout1, self.dense2, self.dropout2):
            hidden = layer(hidden)
        return self.out(hidden)

In [20]:
# `y` is one-hot encoded, so the number of classes is its number of columns.
# (np.unique(y) only ever sees the two values 0.0 and 1.0, which would build a
# 2-way output layer that cannot match the one-hot targets.)
num_classes = y.shape[1]
input_dim = X_train.shape[1]
model = ProbabilisticModel(input_dim=input_dim, num_classes=num_classes)

2024-12-16 10:00:36.400694: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [21]:
# Disabled draft: projects the second hidden layer's activations to 2-D with t-SNE.
# NOTE(review): `TSNE` is not imported in the import cell shown in this notebook;
# it would need to be imported before this helper is re-enabled.
# def latent_visualization(model, data):
#     o1 = model.dense1(data)
#     o2 = model.dropout1(o1)
#     o3 = model.dense2(o2)
#     # TSNE for visualization
#     tsne = TSNE(n_components=2, random_state=42)
#     o3_tsne = tsne.fit_transform(o3)
#     return o3_tsne