In [1]:
import os
import torch

import scanpy as sc
import numpy as np
import pandas as pd
import gseapy as gp

from tqdm import tqdm
from sklearn.cluster import KMeans
from contrastive_vi.model.total_contrastive_vi import TotalContrastiveVIModel
from scripts import constants
from scvi._settings import settings

Global seed set to 0
1: package ‘methods’ was built under R version 3.6.1 
2: package ‘datasets’ was built under R version 3.6.1 
3: package ‘utils’ was built under R version 3.6.1 
4: package ‘grDevices’ was built under R version 3.6.1 
5: package ‘graphics’ was built under R version 3.6.1 
6: package ‘stats’ was built under R version 3.6.1 


In [2]:
settings.seed = 0
device = "cuda:7"
dataset = "papalexi_2021"

Global seed set to 0


In [3]:
split_key = constants.DATASET_SPLIT_LOOKUP[dataset]["split_key"]
background_value = constants.DATASET_SPLIT_LOOKUP[dataset]["background_value"]
seeds = constants.DEFAULT_SEEDS
latent_size = 10

In [4]:
adata = sc.read_h5ad(
    os.path.join(
        constants.DEFAULT_DATA_PATH,
        f"{dataset}/preprocessed/adata_top_2000_genes_tc.h5ad",
    )
)
TotalContrastiveVIModel.setup_anndata(
    adata,
    layer="count",
    protein_expression_obsm_key="protein_expression",
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Using protein expression from adata.obsm[1m[[0m[32m'protein_expression'[0m[1m][0m                      
[34mINFO    [0m Using protein names from columns of adata.obsm[1m[[0m[32m'protein_expression'[0m[1m][0m                
[34mINFO    [0m Successfully registered anndata object containing [1;36m20729[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m4[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further mo

In [5]:
target_indices = np.where(adata.obs[split_key] != background_value)[0]
target_adata = adata[target_indices]

In [6]:
genes = adata.var.index.tolist()

In [7]:
model_list = []
latent_rep_list = []
for seed in tqdm(seeds):
    result_dir = os.path.join(
        constants.DEFAULT_RESULTS_PATH,
        f"{dataset}/total_contrastiveVI/latent_{latent_size}",
        f"{seed}",
    )
    model_list.append(
        torch.load(
            os.path.join(result_dir, "model.ckpt"),
            map_location=device,
        ),
    )
    latent_rep_list.append(
        np.load(os.path.join(result_dir, "latent_representations.npy")),
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:15<00:00,  3.02s/it]


In [8]:
de_result_list = []
for seed_index, seed in enumerate(seeds):
    model = model_list[seed_index]
    latent_rep = latent_rep_list[seed_index]
    latent_clusters = KMeans(n_clusters=2, random_state=123).fit(latent_rep).labels_
    cluster_label = f"cluster_{seed}"
    
    tmp_target_adata = target_adata.copy()
    tmp_target_adata.obs[cluster_label] = latent_clusters.astype(str)
    
    de_result = model.differential_expression(
        adata=tmp_target_adata,
        groupby=cluster_label,
        group1="0",
        group2="1",
        idx1=None,
        idx2=None,
        mode="change",
        delta=0.25,
        batch_size=128,
        all_stats=True,
        batch_correction=False,
        batchid1=None,
        batchid2=None,
        fdr_target=0.05,
        silent=False,
    )
    de_result["gene_symbol"] = de_result.index
    de_result["seed"] = seed
    de_result_list.append(de_result)

DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:15<00:00, 255.13s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:01<00:00, 241.00s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:16<00:00, 256.29s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [03:47<00:00, 227.38s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:56<00:00, 297.00s/it]


In [9]:
de_result_mean = (
    de_result.groupby("gene_symbol", as_index=False)
    .mean()
    .sort_values(by="proba_de", ascending=False)
)

In [10]:
top_genes = de_result_mean[de_result_mean["proba_de"] > 0.95]["gene_symbol"].tolist()

enr = gp.enrichr(
    gene_list=top_genes,
    gene_sets="KEGG_2016",
    organism="human",
    cutoff=0.05,
)

enr_results = enr.results
enr_results = enr_results[enr_results["Adjusted P-value"] < 0.05]

In [11]:
cols = ["Gene_set", "Term", "Adjusted P-value", "Overlap", "Genes"]
enr_results[cols]

Unnamed: 0,Gene_set,Term,Adjusted P-value,Overlap,Genes
0,KEGG_2016,Graft-versus-host disease Homo sapiens hsa05332,7.888372e-16,12/41,HLA-DMA;HLA-DRB5;IL6;HLA-DMB;PRF1;HLA-DPB1;HLA...
1,KEGG_2016,Staphylococcus aureus infection Homo sapiens h...,7.888372e-16,13/56,C1QB;HLA-DRB5;CFH;C1S;HLA-DMA;HLA-DMB;HLA-DPB1...
2,KEGG_2016,Allograft rejection Homo sapiens hsa05330,1.187764e-14,11/38,HLA-DMA;HLA-DRB5;HLA-DMB;PRF1;HLA-DPB1;HLA-DRA...
3,KEGG_2016,Type I diabetes mellitus Homo sapiens hsa04940,4.151127e-14,11/43,HLA-DMA;HLA-DRB5;HLA-DMB;PRF1;HLA-DPB1;HLA-DRA...
4,KEGG_2016,Intestinal immune network for IgA production H...,1.271704e-13,11/48,CCL25;HLA-DMA;HLA-DRB5;IL6;HLA-DMB;HLA-DPB1;HL...
5,KEGG_2016,Autoimmune thyroid disease Homo sapiens hsa05320,3.485049e-13,11/53,HLA-DMA;HLA-DRB5;HLA-DMB;PRF1;HLA-DPB1;HLA-DRA...
6,KEGG_2016,Viral myocarditis Homo sapiens hsa05416,1.063759e-12,11/59,HLA-DMA;HLA-DRB5;HLA-DMB;PRF1;HLA-DPB1;HLA-DRA...
7,KEGG_2016,Asthma Homo sapiens hsa05310,2.417851e-12,9/31,HLA-DMA;HLA-DRB5;HLA-DMB;HLA-DPB1;HLA-DRA;HLA-...
8,KEGG_2016,Inflammatory bowel disease (IBD) Homo sapiens ...,2.56629e-12,11/65,HLA-DMA;HLA-DRB5;IL6;HLA-DMB;TBX21;HLA-DPB1;HL...
9,KEGG_2016,Antigen processing and presentation Homo sapie...,1.621771e-11,11/77,CD74;HLA-DMA;HLA-DRB5;HLA-DMB;HLA-DPB1;HLA-DRA...


In [12]:
enr_results["Term"].tolist()

['Graft-versus-host disease Homo sapiens hsa05332',
 'Staphylococcus aureus infection Homo sapiens hsa05150',
 'Allograft rejection Homo sapiens hsa05330',
 'Type I diabetes mellitus Homo sapiens hsa04940',
 'Intestinal immune network for IgA production Homo sapiens hsa04672',
 'Autoimmune thyroid disease Homo sapiens hsa05320',
 'Viral myocarditis Homo sapiens hsa05416',
 'Asthma Homo sapiens hsa05310',
 'Inflammatory bowel disease (IBD) Homo sapiens hsa05321',
 'Antigen processing and presentation Homo sapiens hsa04612',
 'Systemic lupus erythematosus Homo sapiens hsa05322',
 'Tuberculosis Homo sapiens hsa05152',
 'Cell adhesion molecules (CAMs) Homo sapiens hsa04514',
 'Herpes simplex infection Homo sapiens hsa05168',
 'Rheumatoid arthritis Homo sapiens hsa05323',
 'Leishmaniasis Homo sapiens hsa05140',
 'Influenza A Homo sapiens hsa05164',
 'Phagosome Homo sapiens hsa04145',
 'Toxoplasmosis Homo sapiens hsa05145',
 'Cytokine-cytokine receptor interaction Homo sapiens hsa04060',
 'H

In [13]:
len(top_genes)

122