In [1]:
import os
import torch

import scanpy as sc
import numpy as np
import pandas as pd
import gseapy as gp

from tqdm import tqdm
from sklearn.cluster import KMeans
from scvi.model import TOTALVI
from scripts import constants
from scvi._settings import settings

Global seed set to 0


In [2]:
settings.seed = 0
device = "cuda:7"
dataset = "papalexi_2021"

Global seed set to 0


In [3]:
split_key = constants.DATASET_SPLIT_LOOKUP[dataset]["split_key"]
background_value = constants.DATASET_SPLIT_LOOKUP[dataset]["background_value"]
seeds = constants.DEFAULT_SEEDS
latent_size = 10

In [5]:
adata = sc.read_h5ad(
    os.path.join(
        constants.DEFAULT_DATA_PATH,
        f"{dataset}/preprocessed/adata_top_2000_genes_tc.h5ad",
    )
)
TOTALVI.setup_anndata(
    adata,
    layer="count",
    protein_expression_obsm_key="protein_expression",
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Using protein expression from adata.obsm[1m[[0m[32m'protein_expression'[0m[1m][0m                      
[34mINFO    [0m Using protein names from columns of adata.obsm[1m[[0m[32m'protein_expression'[0m[1m][0m                
[34mINFO    [0m Successfully registered anndata object containing [1;36m20729[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m4[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further mo

In [6]:
target_indices = np.where(adata.obs[split_key] != background_value)[0]
target_adata = adata[target_indices]

In [7]:
genes = adata.var.index.tolist()

In [8]:
model_list = []
latent_rep_list = []
for seed in tqdm(seeds):
    result_dir = os.path.join(
        constants.DEFAULT_RESULTS_PATH,
        f"{dataset}/totalVI/latent_{latent_size}",
        f"{seed}",
    )
    model_list.append(
        torch.load(
            os.path.join(result_dir, "model.ckpt"),
            map_location=device,
        ),
    )
    latent_rep_list.append(
        np.load(os.path.join(result_dir, "latent_representations.npy")),
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:01<00:00, 24.20s/it]


In [9]:
de_result_list = []
for seed_index, seed in enumerate(seeds):
    model = model_list[seed_index]
    latent_rep = latent_rep_list[seed_index]
    latent_clusters = KMeans(n_clusters=2, random_state=123).fit(latent_rep).labels_
    cluster_label = f"cluster_{seed}"
    
    tmp_target_adata = target_adata.copy()
    tmp_target_adata.obs[cluster_label] = latent_clusters.astype(str)
    
    de_result = model.differential_expression(
        adata=tmp_target_adata,
        groupby=cluster_label,
        group1="0",
        group2="1",
        idx1=None,
        idx2=None,
        mode="change",
        delta=0.25,
        batch_size=128,
        all_stats=True,
        batch_correction=False,
        batchid1=None,
        batchid2=None,
        fdr_target=0.05,
        silent=False,
    )
    de_result["gene_symbol"] = de_result.index
    de_result["seed"] = seed
    de_result_list.append(de_result)

DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.53s/it]
DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.28s/it]
DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.69s/it]
DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.78s/it]
DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.14s/it]


In [10]:
de_result_mean = (
    de_result.groupby("gene_symbol", as_index=False)
    .mean()
    .sort_values(by="proba_de", ascending=False)
)

In [11]:
top_genes = de_result_mean[de_result_mean["proba_de"] > 0.95]["gene_symbol"].tolist()

enr = gp.enrichr(
    gene_list=top_genes,
    gene_sets="KEGG_2016",
    organism="human",
    cutoff=0.05,
)

enr_results = enr.results
enr_results = enr_results[enr_results["Adjusted P-value"] < 0.05]

In [12]:
cols = ["Gene_set", "Term", "Adjusted P-value", "Overlap", "Genes"]
enr_results[cols]

Unnamed: 0,Gene_set,Term,Adjusted P-value,Overlap,Genes
0,KEGG_2016,Cytokine-cytokine receptor interaction Homo sa...,0.000261,12/265,CXCL11;CXCL9;IFNL1;IL18RAP;CCL8;IFNB1;CCL4L2;K...
1,KEGG_2016,Vascular smooth muscle contraction Homo sapien...,0.000449,8/120,PPP1R14A;EDNRA;RAMP3;CALML5;ADORA2A;CALML3;ADC...
2,KEGG_2016,Melanogenesis Homo sapiens hsa04916,0.000844,7/100,EDN1;CALML5;CREB3L3;WNT7B;KIT;CALML3;ADCY8
3,KEGG_2016,Salivary secretion Homo sapiens hsa04970,0.003178,6/89,HTN3;CALML5;CALML3;ATP1B2;ADCY8;HTN1
4,KEGG_2016,Inflammatory mediator regulation of TRP channe...,0.004338,6/98,ASIC4;CALML5;HTR2C;CALML3;NGF;ADCY8
5,KEGG_2016,cAMP signaling pathway Homo sapiens hsa04024,0.005501,8/199,HCAR2;EDNRA;CALML5;CREB3L3;ADORA2A;CALML3;ATP1...
6,KEGG_2016,Nitrogen metabolism Homo sapiens hsa00910,0.006899,3/17,CA4;CA7;CA8
7,KEGG_2016,Aldosterone synthesis and secretion Homo sapie...,0.00946,5/81,NR4A1;CALML5;CREB3L3;CALML3;ADCY8
8,KEGG_2016,Rap1 signaling pathway Homo sapiens hsa04015,0.025463,7/211,CALML5;ADORA2A;KIT;CALML3;ADCY8;NGF;MET
9,KEGG_2016,Toll-like receptor signaling pathway Homo sapi...,0.025463,5/106,CXCL11;CXCL9;IFNB1;CCL4L2;CCL4


In [13]:
enr_results["Term"].tolist()

['Cytokine-cytokine receptor interaction Homo sapiens hsa04060',
 'Vascular smooth muscle contraction Homo sapiens hsa04270',
 'Melanogenesis Homo sapiens hsa04916',
 'Salivary secretion Homo sapiens hsa04970',
 'Inflammatory mediator regulation of TRP channels Homo sapiens hsa04750',
 'cAMP signaling pathway Homo sapiens hsa04024',
 'Nitrogen metabolism Homo sapiens hsa00910',
 'Aldosterone synthesis and secretion Homo sapiens hsa04925',
 'Rap1 signaling pathway Homo sapiens hsa04015',
 'Toll-like receptor signaling pathway Homo sapiens hsa04620',
 'cGMP-PKG signaling pathway Homo sapiens hsa04022',
 'Gastric acid secretion Homo sapiens hsa04971',
 'Pertussis Homo sapiens hsa05133',
 'Calcium signaling pathway Homo sapiens hsa04020',
 'Chemokine signaling pathway Homo sapiens hsa04062',
 'TGF-beta signaling pathway Homo sapiens hsa04350']

In [14]:
len(top_genes)

162