In [1]:
import os
import torch

import scanpy as sc
import numpy as np
import pandas as pd
import gseapy as gp

from tqdm import tqdm
from sklearn.cluster import KMeans
from contrastive_vi.model.contrastive_vi import ContrastiveVIModel
from scripts import constants
from scvi._settings import settings

Global seed set to 0
1: package ‘methods’ was built under R version 3.6.1 
2: package ‘datasets’ was built under R version 3.6.1 
3: package ‘utils’ was built under R version 3.6.1 
4: package ‘grDevices’ was built under R version 3.6.1 
5: package ‘graphics’ was built under R version 3.6.1 
6: package ‘stats’ was built under R version 3.6.1 


In [2]:
# Fix the global scvi seed so that stochastic steps are repeatable across runs.
settings.seed = 0

# Device passed as `map_location` when the saved models are loaded. GPU 2 is the
# preferred device; fall back to the first GPU, then to the CPU, so the notebook
# still runs on machines that expose fewer (or no) CUDA devices.
preferred_gpu_index = 2
available_gpu_count = torch.cuda.device_count()
if available_gpu_count > preferred_gpu_index:
    device = f"cuda:{preferred_gpu_index}"
elif available_gpu_count > 0:
    device = "cuda:0"
else:
    device = "cpu"

# Dataset key used for the constants lookup and for the data / result paths.
dataset = "zheng_2017"

Global seed set to 0


In [3]:
# Significance level for pathway enrichment: used both as the Enrichr `cutoff`
# and as the upper bound on "Adjusted P-value" when filtering its results.
pathway_enr_fdr = 0.05
# Passed as `delta` to the model's differential expression call (mode="change").
expression_delta = 0.15

In [4]:
# All split-related settings for the chosen dataset live under one lookup entry.
split_config = constants.DATASET_SPLIT_LOOKUP[dataset]
# obs column (and its background value) separating target from background cells.
split_key = split_config["split_key"]
background_value = split_config["background_value"]
# obs column whose most frequent value is used to name each cluster.
label_key = split_config["label_key"]

# One saved model / latent representation is loaded per seed.
seeds = constants.DEFAULT_SEEDS
# Latent dimensionality; selects the `latent_<size>` result directory.
latent_size = 10

In [5]:
# Location of the preprocessed AnnData file for the selected dataset.
preprocessed_adata_path = os.path.join(
    constants.DEFAULT_DATA_PATH,
    f"{dataset}/preprocessed/adata_top_2000_genes.h5ad",
)
adata = sc.read_h5ad(preprocessed_adata_path)

# Register the "count" layer as the model input.
ContrastiveVIModel.setup_anndata(adata, layer="count")

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Successfully registered anndata object containing [1;36m16856[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


In [6]:
# A cell is background when its split column equals the background value;
# every other cell is treated as a target cell.
is_background = adata.obs[split_key] == background_value

background_indices = np.where(is_background)[0]
background_adata = adata[background_indices]

target_indices = np.where(~is_background)[0]
target_adata = adata[target_indices]

In [7]:
# Gene annotation table stored next to the dataset's MEX matrices; the file is
# read without a header row, so its columns are named here.
genes_tsv_path = os.path.join(
    constants.DEFAULT_DATA_PATH,
    dataset,
    "aml027_post_transplant_filtered_gene_bc_matrices",
    "filtered_matrices_mex/hg19",
    "genes.tsv",
)
gene_table = pd.read_table(genes_tsv_path, header=None)
gene_table = gene_table.rename(columns={0: "ensembl_id", 1: "gene_symbol"})

# Keep only the genes that appear in the loaded AnnData's variable index.
is_in_adata = gene_table["ensembl_id"].isin(adata.var.index)
genes = gene_table[is_in_adata]

In [8]:
# Root directory holding one sub-directory of results per seed.
results_root = os.path.join(
    "/projects/leelab/contrastiveVI/results-fixed-background-size",
    f"{dataset}/contrastiveVI/latent_{latent_size}",
)

model_list, latent_rep_list = [], []
for seed in tqdm(seeds):
    seed_dir = os.path.join(results_root, f"{seed}")

    # NOTE: torch.load unpickles the checkpoint, so only load files from a
    # trusted location.
    checkpoint_path = os.path.join(seed_dir, "model.ckpt")
    model_list.append(torch.load(checkpoint_path, map_location=device))

    latent_path = os.path.join(seed_dir, "latent_representations.npy")
    latent_rep_list.append(np.load(latent_path))

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:48<00:00,  9.68s/it]


In [9]:
# For every seed: cluster the latent representations into two groups, name each
# cluster after its most frequent label, run differential expression of the
# cluster against the background cells, and query Enrichr with the DE genes.
# Each result list holds one dict per seed, keyed by the cluster id as a string.

# Minimum `proba_de` for a gene to be included in the enrichment query.
de_proba_threshold = 0.95

de_results = []
enr_results = []
label_proportion_results = []
assigned_label_results = []

for seed, model, latent_rep in zip(seeds, model_list, latent_rep_list):
    latent_clusters = KMeans(n_clusters=2, random_state=123).fit(latent_rep).labels_
    cluster_label = f"cluster_{seed}"

    # Annotate copies so the shared target/background subsets stay untouched.
    # assumes rows of latent_rep are in the same order as target_adata — TODO confirm
    tmp_target_adata = target_adata.copy()
    tmp_target_adata.obs[cluster_label] = latent_clusters.astype(str)
    tmp_background_adata = background_adata.copy()
    tmp_background_adata.obs[cluster_label] = "background"

    # Register this seed's containers up front so partial results survive an
    # interruption part-way through the clusters.
    seed_de_results = {}
    seed_enr_results = {}
    seed_label_proportions = {}
    seed_assigned_labels = {}
    de_results.append(seed_de_results)
    enr_results.append(seed_enr_results)
    label_proportion_results.append(seed_label_proportions)
    assigned_label_results.append(seed_assigned_labels)

    for cluster in np.unique(latent_clusters):
        cluster_str = f"{cluster}"

        # Target cells assigned to this cluster (subset built once and reused).
        in_cluster = tmp_target_adata.obs[cluster_label] == cluster_str
        cluster_target_adata = tmp_target_adata[in_cluster]

        # Fraction of the cluster's cells carrying each label; the cluster is
        # named after the most frequent one.
        label_proportions = (
            cluster_target_adata.obs[label_key].value_counts() / in_cluster.sum()
        )
        seed_label_proportions[cluster_str] = label_proportions
        seed_assigned_labels[cluster_str] = label_proportions.idxmax()

        # Background cells followed by this cluster's target cells.
        cluster_adata = tmp_background_adata.concatenate(cluster_target_adata)

        # NOTE(review): `target_indices` was computed on the full `adata`,
        # whereas the data passed here is the concatenated `cluster_adata`.
        # Confirm which of the two `target_idx` is expected to index into.
        cluster_de_result = model.differential_expression(
            adata=cluster_adata,
            groupby=cluster_label,
            group1="background",
            group2=cluster_str,
            idx1=None,
            idx2=None,
            mode="change",
            delta=expression_delta,
            batch_size=128,
            all_stats=True,
            batch_correction=False,
            batchid1=None,
            batchid2=None,
            fdr_target=0.05,
            silent=False,
            target_idx=target_indices,
        )

        # Expose the result's index as an `ensembl_id` column so the table can
        # be joined to `genes` and gain the `gene_symbol` column.
        cluster_de_result["ensembl_id"] = cluster_de_result.index
        cluster_de_result = cluster_de_result.merge(genes, on="ensembl_id")
        cluster_de_result["seed"] = seed
        seed_de_results[cluster_str] = cluster_de_result

        # Gene symbols whose DE probability exceeds the threshold.
        is_de = cluster_de_result["proba_de"] > de_proba_threshold
        top_genes = cluster_de_result[is_de]["gene_symbol"].tolist()
        enr = gp.enrichr(
            gene_list=top_genes,
            gene_sets="KEGG_2016",
            organism="human",
            cutoff=pathway_enr_fdr,
        )

        # Keep only pathways below the enrichment significance level.
        cluster_enr_result = enr.results
        cluster_enr_result = cluster_enr_result[
            cluster_enr_result["Adjusted P-value"] < pathway_enr_fdr
        ]
        seed_enr_results[cluster_str] = cluster_enr_result

[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m8608[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:34<00:00, 34.43s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m12705[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:34<00:00, 34.93s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m8651[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:34<00:00, 34.93s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m12662[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:34<00:00, 34.56s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m12712[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:34<00:00, 34.78s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m8601[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:35<00:00, 35.38s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m8672[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:34<00:00, 34.75s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m12641[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:35<00:00, 35.04s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m8484[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:33<00:00, 33.66s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m12829[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:34<00:00, 34.66s/it]


In [10]:
# Flatten the per-seed, per-cluster enrichment tables into one DataFrame,
# tagging every row with the cluster id, the label assigned to that cluster and
# the seed. The seed is taken from `seeds` (the list the results were computed
# over) so the tag always matches the run that produced the rows.
enr_df_list = []
for seed, assigned_labels, enrs in zip(seeds, assigned_label_results, enr_results):
    for cluster, assigned_label in assigned_labels.items():
        # Copy so the stored per-cluster tables are not modified.
        enr = enrs[cluster].copy()
        enr["assigned_label"] = assigned_label
        enr["cluster"] = cluster
        enr["seed"] = seed
        enr_df_list.append(enr)
enr_df = pd.concat(enr_df_list)

In [11]:
# Restrict the enrichment table to the columns of interest: the Enrichr fields
# first, followed by the annotations attached to each row.
cols = [
    "Gene_set",
    "Term",
    "Adjusted P-value",
    "Overlap",
    "Genes",
    "assigned_label",
    "cluster",
    "seed",
]
enr_df = enr_df.loc[:, cols]

### Enriched pathways for cluster associated with pre-transplant vs. healthy controls.

In [12]:
# Enrichment rows from clusters whose most frequent label is "pre_transplant",
# ordered by term name in descending order.
is_pre_transplant = enr_df["assigned_label"] == "pre_transplant"
pre_translant_enr = enr_df[is_pre_transplant].sort_values(
    by="Term",
    ascending=False,
)

In [13]:
# Number of enrichment rows per pathway term among the pre-transplant clusters.
pre_transplant_term_counts = pre_translant_enr["Term"].value_counts()
pre_transplant_term_counts

Inflammatory bowel disease (IBD) Homo sapiens hsa05321                5
Leishmaniasis Homo sapiens hsa05140                                   5
Allograft rejection Homo sapiens hsa05330                             5
Antigen processing and presentation Homo sapiens hsa04612             5
Arginine and proline metabolism Homo sapiens hsa00330                 5
Asthma Homo sapiens hsa05310                                          5
Autoimmune thyroid disease Homo sapiens hsa05320                      5
Cell adhesion molecules (CAMs) Homo sapiens hsa04514                  5
Graft-versus-host disease Homo sapiens hsa05332                       5
HTLV-I infection Homo sapiens hsa05166                                5
Hematopoietic cell lineage Homo sapiens hsa04640                      5
Influenza A Homo sapiens hsa05164                                     5
Intestinal immune network for IgA production Homo sapiens hsa04672    5
Alcoholism Homo sapiens hsa05034                                

### Enriched pathways for cluster associated with post-transplant vs. healthy controls.

In [14]:
# Enrichment rows from clusters whose most frequent label is "post_transplant",
# ordered by term name in descending order.
is_post_transplant = enr_df["assigned_label"] == "post_transplant"
post_transplant_enr = enr_df[is_post_transplant].sort_values(
    by="Term",
    ascending=False,
)

In [15]:
# Number of enrichment rows per pathway term among the post-transplant clusters.
post_transplant_term_counts = post_transplant_enr["Term"].value_counts()
post_transplant_term_counts

Viral myocarditis Homo sapiens hsa05416                               5
Intestinal immune network for IgA production Homo sapiens hsa04672    5
Allograft rejection Homo sapiens hsa05330                             5
Antigen processing and presentation Homo sapiens hsa04612             5
Arginine and proline metabolism Homo sapiens hsa00330                 5
Asthma Homo sapiens hsa05310                                          5
Autoimmune thyroid disease Homo sapiens hsa05320                      5
Cell adhesion molecules (CAMs) Homo sapiens hsa04514                  5
Graft-versus-host disease Homo sapiens hsa05332                       5
Hematopoietic cell lineage Homo sapiens hsa04640                      5
Influenza A Homo sapiens hsa05164                                     5
Alcoholism Homo sapiens hsa05034                                      5
Rheumatoid arthritis Homo sapiens hsa05323                            5
Type I diabetes mellitus Homo sapiens hsa04940                  