In [1]:
import os
import torch

import scanpy as sc
import numpy as np
import pandas as pd
import gseapy as gp

from tqdm import tqdm
from sklearn.cluster import KMeans
from contrastive_vi.model.contrastive_vi import ContrastiveVIModel
from scripts import constants
from scvi._settings import settings

Global seed set to 0
1: package ‘methods’ was built under R version 3.6.1 
2: package ‘datasets’ was built under R version 3.6.1 
3: package ‘utils’ was built under R version 3.6.1 
4: package ‘grDevices’ was built under R version 3.6.1 
5: package ‘graphics’ was built under R version 3.6.1 
6: package ‘stats’ was built under R version 3.6.1 


In [2]:
settings.seed = 0
device = "cuda:2"
dataset = "zheng_2017"

Global seed set to 0


In [3]:
pathway_enr_fdr = 0.05
expression_delta = 0.15

In [4]:
split_key = constants.DATASET_SPLIT_LOOKUP[dataset]["split_key"]
background_value = constants.DATASET_SPLIT_LOOKUP[dataset]["background_value"]
label_key = constants.DATASET_SPLIT_LOOKUP[dataset]["label_key"]
seeds = constants.DEFAULT_SEEDS
latent_size = 10

In [5]:
adata = sc.read_h5ad(
    os.path.join(
        constants.DEFAULT_DATA_PATH,
        f"{dataset}/preprocessed/adata_top_2000_genes.h5ad",
    )
)
ContrastiveVIModel.setup_anndata(adata, layer="count")

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Successfully registered anndata object containing [1;36m16856[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


In [6]:
target_indices = np.where(adata.obs[split_key] != background_value)[0]
background_indices = np.where(adata.obs[split_key] == background_value)[0]

In [7]:
genes = pd.read_table(
    os.path.join(
        constants.DEFAULT_DATA_PATH,
        dataset,
        "aml027_post_transplant_filtered_gene_bc_matrices",
        "filtered_matrices_mex/hg19",
        "genes.tsv",
    ),
    header=None,
)
genes = genes.rename(columns={0: "ensembl_id", 1: "gene_symbol"})
genes = genes[genes["ensembl_id"].isin(adata.var.index)]

In [8]:
model_list = []
for seed in tqdm(seeds):
    result_dir = os.path.join(
        "/projects/leelab/contrastiveVI/results-fixed-background-size",
        f"{dataset}/contrastiveVI/latent_{latent_size}",
        f"{seed}",
    )
    model_list.append(
        torch.load(
            os.path.join(result_dir, "model.ckpt"),
            map_location=device,
        ),
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:48<00:00,  9.75s/it]


In [9]:
de_result_list = []
enr_result_list = []

for seed_index, seed in enumerate(seeds):
    model = model_list[seed_index]
    
    de_result = model.differential_expression(
        adata=adata,
        groupby=None,
        group1=None,
        group2=None,
        idx1=background_indices,
        idx2=target_indices,
        mode="change",
        delta=expression_delta,
        batch_size=128,
        all_stats=True,
        batch_correction=False,
        batchid1=None,
        batchid2=None,
        fdr_target=0.05,
        silent=False,
        target_idx=target_indices,
    )

    de_result.reset_index()
    de_result["ensembl_id"] = de_result.index
    de_result = de_result.merge(genes, on="ensembl_id")
    de_result["seed"] = seed
    de_result_list.append(de_result)

    top_genes = de_result[de_result["proba_de"] > 0.95]["gene_symbol"].tolist()
    enr = gp.enrichr(
        gene_list=top_genes,
        gene_sets="KEGG_2016",
        organism="human",
        cutoff=pathway_enr_fdr,
    )
    enr_result = enr.results
    enr_result = enr_result[enr_result["Adjusted P-value"] < pathway_enr_fdr]
    enr_result["seed"] = seed
    enr_result_list.append(enr_result)

DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:37<00:00, 37.06s/it]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enr_result["seed"] = seed


DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:35<00:00, 35.40s/it]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enr_result["seed"] = seed


DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:36<00:00, 36.20s/it]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enr_result["seed"] = seed


DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:35<00:00, 35.61s/it]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enr_result["seed"] = seed


DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:35<00:00, 35.95s/it]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enr_result["seed"] = seed


In [10]:
enr_df = pd.concat(enr_result_list)
cols = ["Gene_set", "Term", "Adjusted P-value", "Overlap", "Genes"]
cols += ["seed"]
enr_df = enr_df[cols]

In [11]:
enr_df

Unnamed: 0,Gene_set,Term,Adjusted P-value,Overlap,Genes,seed
0,KEGG_2016,Systemic lupus erythematosus Homo sapiens hsa0...,6.882865e-14,45/135,C1QA;HIST1H2BM;HIST1H2BO;HIST1H3J;HIST1H2BJ;HI...,123
1,KEGG_2016,Hematopoietic cell lineage Homo sapiens hsa04640,5.099962e-08,28/88,GYPA;CSF1;FLT3;ITGA2B;TNF;CD3D;CD38;CD14;CD34;...,123
2,KEGG_2016,Asthma Homo sapiens hsa05310,5.272066e-08,16/31,IL10;HLA-DRB5;FCER1G;PRG2;RNASE3;TNF;HLA-DMA;H...,123
3,KEGG_2016,Antigen processing and presentation Homo sapie...,1.648756e-05,22/77,CD74;HLA-DRB5;HSPA5;HSPA6;IFI30;TNF;CTSS;HLA-D...,123
4,KEGG_2016,Alcoholism Homo sapiens hsa05034,4.610281e-05,36/179,HIST1H2BM;HIST1H2BO;HIST1H3J;HIST1H2BJ;MAOA;HI...,123
...,...,...,...,...,...,...
29,KEGG_2016,Epstein-Barr virus infection Homo sapiens hsa0...,3.864584e-02,28/202,CDKN1A;PIK3R5;MAPK8;MYC;AKT3;CD38;RIPK1;PLCG1;...,999
30,KEGG_2016,Chagas disease (American trypanosomiasis) Homo...,4.047421e-02,17/104,IL10;C1QA;JUN;CCL3L1;TNF;CD3D;TGFBR1;GNAI1;PIK...,999
31,KEGG_2016,Viral carcinogenesis Homo sapiens hsa05203,4.451226e-02,28/205,GTF2A1;HIST1H2BM;CDKN1A;HIST1H2BO;HIST1H2BJ;HI...,999
32,KEGG_2016,Malaria Homo sapiens hsa05144,4.740853e-02,10/49,IL10;GYPA;KLRB1;CD81;GYPB;HGF;IL1B;CD36;HBA1;TNF,999


## Aggregate analysis

In [12]:
de_result = pd.concat(de_result_list)
de_result_mean = (
    de_result.groupby("gene_symbol", as_index=False)
    .mean()
    .sort_values(by="proba_de", ascending=False)
)

In [13]:
de_result_mean["proba_de"].describe()

count    2000.000000
mean        0.963476
std         0.014507
min         0.892560
25%         0.956230
50%         0.966480
75%         0.974170
max         0.987760
Name: proba_de, dtype: float64

In [14]:
top_genes = de_result_mean[de_result_mean["proba_de"] > 0.95]["gene_symbol"].tolist()

enr = gp.enrichr(
    gene_list=top_genes,
    gene_sets="KEGG_2016",
    organism="human",
    cutoff=0.05,
)
enr_results = enr.results
enr_results = enr_results[enr_results["Adjusted P-value"] < 0.05]

In [15]:
enr_results

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,KEGG_2016,Systemic lupus erythematosus Homo sapiens hsa0...,44/135,1.078488e-15,2.987411e-13,0,0,5.380375,185.425044,C1QA;HIST1H2BM;HIST1H2BO;HIST1H3J;HIST1H2BJ;HI...
1,KEGG_2016,Hematopoietic cell lineage Homo sapiens hsa04640,29/88,5.876639e-11,8.139145e-09,0,0,5.429379,127.902319,GYPA;CSF1;FLT3;ITGA2B;TNF;CD3D;CD38;CD36;CD14;...
2,KEGG_2016,Asthma Homo sapiens hsa05310,16/31,5.18526e-10,4.787724e-08,0,0,11.718607,250.544189,IL10;HLA-DRB5;FCER1G;PRG2;RNASE3;TNF;HLA-DMA;H...
3,KEGG_2016,Antigen processing and presentation Homo sapie...,22/77,2.126149e-07,1.472358e-05,0,0,4.400723,67.611754,CD74;HLA-DRB5;HSPA5;HSPA6;IFI30;TNF;CTSS;HLA-D...
4,KEGG_2016,Rheumatoid arthritis Homo sapiens hsa05323,23/90,1.011851e-06,5.081949e-05,0,0,3.776533,52.130243,JUN;HLA-DRB5;CSF1;CCL3L1;IL15;TNF;TNFSF13B;VEG...
5,KEGG_2016,Type I diabetes mellitus Homo sapiens hsa04940,15/43,1.100783e-06,5.081949e-05,0,0,5.877753,80.639766,HLA-DRB5;GAD1;ICA1;GZMB;TNF;HLA-DMA;HLA-DMB;IL...
6,KEGG_2016,Alcoholism Homo sapiens hsa05034,35/179,2.003061e-06,7.926397e-05,0,0,2.682023,35.190377,HIST1H2BM;HIST1H2BO;HIST1H3J;HIST1H2BJ;MAOA;HI...
7,KEGG_2016,Graft-versus-host disease Homo sapiens hsa05332,14/41,3.359729e-06,0.0001163306,0,0,5.685985,71.66416,HLA-DRB5;GZMB;TNF;HLA-DMA;HLA-DMB;IL1B;HLA-DPB...
8,KEGG_2016,Leishmaniasis Homo sapiens hsa05140,19/73,6.477042e-06,0.000199349,0,0,3.864235,46.166965,IL10;JUN;MARCKSL1;HLA-DRB5;NCF2;PTGS2;TNF;NFKB...
9,KEGG_2016,Allograft rejection Homo sapiens hsa05330,13/38,7.340108e-06,0.000203321,0,0,5.699437,67.379637,IL10;HLA-DRB5;GZMB;TNF;HLA-DMA;HLA-DMB;HLA-DPB...


In [16]:
len(top_genes)

1682

Systemic lupus erythematosus is related to AML (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6078549/)