In [1]:
import os
import torch

import scanpy as sc
import numpy as np
import pandas as pd
import gseapy as gp

from tqdm import tqdm
from sklearn.cluster import KMeans
from contrastive_vi.model.contrastive_vi import ContrastiveVIModel
from scripts import constants
from scvi._settings import settings

Global seed set to 0
1: package ‘methods’ was built under R version 3.6.1 
2: package ‘datasets’ was built under R version 3.6.1 
3: package ‘utils’ was built under R version 3.6.1 
4: package ‘grDevices’ was built under R version 3.6.1 
5: package ‘graphics’ was built under R version 3.6.1 
6: package ‘stats’ was built under R version 3.6.1 


In [2]:
# Set scvi's global random seed (scvi logs "Global seed set to 0" in response).
settings.seed = 0
# Device handed to torch.load as `map_location` when the saved models are read.
device = "cpu"
# Dataset identifier; used for the split-config lookup and to build data/result paths.
dataset = "zheng_2017"

Global seed set to 0


In [3]:
# Look up how this dataset tells background cells apart from target cells.
split_config = constants.DATASET_SPLIT_LOOKUP[dataset]
split_key = split_config["split_key"]
background_value = split_config["background_value"]

# Seeds identify the per-seed result directories; latent_size selects the
# "latent_<size>" result sub-directory.
seeds, latent_size = constants.DEFAULT_SEEDS, 10

In [4]:
# Read the preprocessed AnnData file for the selected dataset.
adata_path = os.path.join(
    constants.DEFAULT_DATA_PATH,
    f"{dataset}/preprocessed/adata_top_2000_genes.h5ad",
)
adata = sc.read_h5ad(adata_path)

# Register the AnnData object with the model class, reading from the "count" layer.
ContrastiveVIModel.setup_anndata(adata, layer="count")

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Successfully registered anndata object containing [1;36m16856[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


In [5]:
# Target cells are all cells whose split label differs from the background value.
is_target = adata.obs[split_key] != background_value
target_indices = np.where(is_target)[0]
target_adata = adata[target_indices]

In [6]:
# Gene annotation table (no header row): column 0 becomes "ensembl_id" and
# column 1 becomes "gene_symbol".
genes_path = os.path.join(
    constants.DEFAULT_DATA_PATH,
    dataset,
    "aml027_post_transplant_filtered_gene_bc_matrices",
    "filtered_matrices_mex/hg19",
    "genes.tsv",
)
genes = pd.read_table(genes_path, header=None).rename(
    columns={0: "ensembl_id", 1: "gene_symbol"}
)

# Keep only the genes whose Ensembl ID appears in the AnnData variable index.
in_adata = genes["ensembl_id"].isin(adata.var.index)
genes = genes[in_adata]

In [7]:
# Load the saved model and the saved latent representations for every seed.
# NOTE(review): torch.load unpickles the checkpoint, which can execute arbitrary
# code; only load checkpoints from a trusted source.
model_list, latent_rep_list = [], []
for seed in tqdm(seeds):
    seed_dir = os.path.join(
        constants.DEFAULT_RESULTS_PATH,
        f"{dataset}/contrastiveVI/latent_{latent_size}",
        f"{seed}",
    )
    checkpoint_path = os.path.join(seed_dir, "model.ckpt")
    latent_path = os.path.join(seed_dir, "latent_representations.npy")

    model_list.append(torch.load(checkpoint_path, map_location=device))
    latent_rep_list.append(np.load(latent_path))

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:14<00:00, 14.96s/it]


In [8]:
# For every seed: cluster the saved latent representation into two groups and run
# the model's differential expression test between those groups.
de_result_list = []
for seed_index, seed in enumerate(seeds):
    model = model_list[seed_index]
    latent_rep = latent_rep_list[seed_index]

    # Two-way k-means on the latent space defines the groups to compare.
    # NOTE(review): k-means label numbering is arbitrary, so which cluster is "0"
    # and which is "1" may differ between seeds. Signed statistics (e.g. log fold
    # changes) averaged across seeds may therefore partly cancel — confirm that only
    # direction-agnostic statistics such as proba_de are used downstream.
    latent_clusters = KMeans(n_clusters=2, random_state=123).fit(latent_rep).labels_
    cluster_label = f"cluster_{seed}"

    # Work on a copy so the per-seed cluster column is never added to target_adata.
    # NOTE(review): assumes latent_rep rows align one-to-one with the cells of
    # target_adata — confirm against the script that saved the representations.
    tmp_target_adata = target_adata.copy()
    tmp_target_adata.obs[cluster_label] = latent_clusters.astype(str)

    de_result = model.differential_expression(
        adata=tmp_target_adata,
        groupby=cluster_label,
        group1="0",
        group2="1",
        idx1=None,
        idx2=None,
        mode="change",
        delta=0.25,
        batch_size=128,
        all_stats=True,
        batch_correction=False,
        batchid1=None,
        batchid2=None,
        fdr_target=0.05,
        silent=False,
    )

    # Copy the index into a regular column so it can be matched against the Ensembl
    # IDs in `genes`. (The former bare `de_result.reset_index()` call discarded its
    # return value and therefore did nothing; it has been removed.)
    de_result["ensembl_id"] = de_result.index
    # Default inner merge: rows without a matching entry in `genes` are dropped.
    de_result = de_result.merge(genes, on="ensembl_id")
    de_result["seed"] = seed
    de_result_list.append(de_result)

DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [05:05<00:00, 305.60s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:37<00:00, 277.71s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:36<00:00, 276.61s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:34<00:00, 274.65s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:36<00:00, 276.23s/it]


In [9]:
# Stack the per-seed DE tables into one long table; the "seed" column added in the
# loop above records which run each row came from.
de_result = pd.concat(de_result_list)

In [10]:
# Average the numeric DE statistics per gene symbol across seeds and order the
# genes from highest to lowest mean probability of differential expression.
# numeric_only=True is explicit because the table still carries the string column
# "ensembl_id": pandas >= 2.0 raises TypeError when asked to average it, whereas
# older versions silently dropped such columns.
de_result_mean = (
    de_result.groupby("gene_symbol", as_index=False)
    .mean(numeric_only=True)
    .sort_values(by="proba_de", ascending=False)
)

In [11]:
# Genes whose seed-averaged probability of differential expression exceeds 0.95.
is_top_gene = de_result_mean["proba_de"] > 0.95
top_genes = de_result_mean.loc[is_top_gene, "gene_symbol"].tolist()

# Run an Enrichr query for those genes against the KEGG 2016 gene sets.
enr = gp.enrichr(
    gene_list=top_genes,
    gene_sets="KEGG_2016",
    organism="human",
    cutoff=0.05,
)

# Keep only the terms with an adjusted p-value below 0.05.
all_enr_results = enr.results
is_significant = all_enr_results["Adjusted P-value"] < 0.05
enr_results = all_enr_results[is_significant]

In [12]:
# Display only the columns needed to read the enrichment table.
cols = [
    "Gene_set",
    "Term",
    "Adjusted P-value",
    "Overlap",
    "Genes",
]
enr_results.loc[:, cols]

Unnamed: 0,Gene_set,Term,Adjusted P-value,Overlap,Genes
0,KEGG_2016,Hematopoietic cell lineage Homo sapiens hsa04640,8.72654e-10,23/88,HLA-DRB5;CSF1;FLT3;ITGA2B;DNTT;GP1BA;TNF;CD3D;...
1,KEGG_2016,Asthma Homo sapiens hsa05310,3.040355e-08,13/31,IL10;HLA-DRB5;FCER1G;PRG2;RNASE3;TNF;HLA-DMB;H...
2,KEGG_2016,Systemic lupus erythematosus Homo sapiens hsa0...,1.085732e-05,22/135,IL10;C1QA;HIST1H2BM;HLA-DRB5;HIST1H3J;HIST1H4L...
3,KEGG_2016,Type I diabetes mellitus Homo sapiens hsa04940,0.0001245843,11/43,HLA-DRB5;HLA-DMB;GAD1;IL1B;HLA-DPB1;GZMB;HLA-D...
4,KEGG_2016,Allograft rejection Homo sapiens hsa05330,0.0002209117,10/38,IL10;HLA-DRB5;HLA-DMB;HLA-DPB1;GZMB;HLA-DRA;TN...
5,KEGG_2016,Antigen processing and presentation Homo sapie...,0.0002788452,14/77,CD74;HLA-DRB5;HSPA5;IFI30;TNF;CTSS;HLA-DMB;CD8...
6,KEGG_2016,Graft-versus-host disease Homo sapiens hsa05332,0.0003006236,10/41,HLA-DRB5;HLA-DMB;IL1B;HLA-DPB1;GZMB;HLA-DRA;TN...
7,KEGG_2016,Rheumatoid arthritis Homo sapiens hsa05323,0.0003006236,15/90,HLA-DRB5;CSF1;CCL3L1;TNF;HLA-DMB;IL1B;CCL5;CCL...
8,KEGG_2016,Leishmaniasis Homo sapiens hsa05140,0.0004515285,13/73,IL10;HLA-DRB5;NCF2;PTGS2;TNF;NFKBIA;HLA-DMB;IL...
9,KEGG_2016,Cell adhesion molecules (CAMs) Homo sapiens hs...,0.0004515285,19/142,CD274;HLA-DRB5;CD2;CLDN10;HLA-DMB;CD8B;SELL;HL...


In [13]:
# Full, untruncated list of the enriched term names that passed the filter.
enr_results["Term"].tolist()

['Hematopoietic cell lineage Homo sapiens hsa04640',
 'Asthma Homo sapiens hsa05310',
 'Systemic lupus erythematosus Homo sapiens hsa05322',
 'Type I diabetes mellitus Homo sapiens hsa04940',
 'Allograft rejection Homo sapiens hsa05330',
 'Antigen processing and presentation Homo sapiens hsa04612',
 'Graft-versus-host disease Homo sapiens hsa05332',
 'Rheumatoid arthritis Homo sapiens hsa05323',
 'Leishmaniasis Homo sapiens hsa05140',
 'Cell adhesion molecules (CAMs) Homo sapiens hsa04514',
 'Staphylococcus aureus infection Homo sapiens hsa05150',
 'Chagas disease (American trypanosomiasis) Homo sapiens hsa05142',
 'Intestinal immune network for IgA production Homo sapiens hsa04672',
 'NF-kappa B signaling pathway Homo sapiens hsa04064',
 'Viral myocarditis Homo sapiens hsa05416',
 'Tuberculosis Homo sapiens hsa05152',
 'Autoimmune thyroid disease Homo sapiens hsa05320',
 'Inflammatory bowel disease (IBD) Homo sapiens hsa05321',
 'Toxoplasmosis Homo sapiens hsa05145',
 'Glycine, serine

In [14]:
# Number of genes that were submitted to the enrichment query.
len(top_genes)

891