In [1]:
import os
import torch

import scanpy as sc
import numpy as np
import pandas as pd
import gseapy as gp

from tqdm import tqdm
from sklearn.cluster import KMeans
from contrastive_vi.model.contrastive_vi import ContrastiveVIModel
from scripts import constants
from scvi._settings import settings

Global seed set to 0
1: package ‘methods’ was built under R version 3.6.1 
2: package ‘datasets’ was built under R version 3.6.1 
3: package ‘utils’ was built under R version 3.6.1 
4: package ‘grDevices’ was built under R version 3.6.1 
5: package ‘graphics’ was built under R version 3.6.1 
6: package ‘stats’ was built under R version 3.6.1 


In [2]:
# Set scvi's global seed (the cell output reports "Global seed set to 0").
settings.seed = 0
# Prefer the second GPU as originally configured, but degrade gracefully so the
# notebook still runs on hosts with a single GPU or no GPU at all. `device` is
# only used as `map_location` when the checkpoints are loaded.
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    device = "cuda:1"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
# Key into `constants.DATASET_SPLIT_LOOKUP` and into the data/results paths.
dataset = "haber_2017"

Global seed set to 0


In [3]:
# Thresholds used by the differential-expression / enrichment cells.
# `expression_delta` is passed as `delta` to
# `model.differential_expression(mode="change", ...)`.
expression_delta = 0.15
# `pathway_enr_fdr` is used both as the Enrichr `cutoff` and as the upper bound
# on "Adjusted P-value" when filtering enrichment results.
pathway_enr_fdr = 0.05

In [4]:
# Fetch the dataset-specific split configuration once, then unpack the fields.
split_config = constants.DATASET_SPLIT_LOOKUP[dataset]
split_key = split_config["split_key"]
background_value = split_config["background_value"]
label_key = split_config["label_key"]

# One trained model is loaded per seed in `seeds`.
seeds = constants.DEFAULT_SEEDS
# Selects the `latent_<size>` results directory the checkpoints are read from.
latent_size = 10

In [5]:
# Location of the preprocessed AnnData file, relative to the data root.
adata_relpath = f"{dataset}/preprocessed/adata_top_2000_genes_tc.h5ad"
adata = sc.read_h5ad(os.path.join(constants.DEFAULT_DATA_PATH, adata_relpath))

# Register the AnnData object with the model class, reading from the "count" layer.
ContrastiveVIModel.setup_anndata(adata, layer="count")

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Successfully registered anndata object containing [1;36m7721[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


In [6]:
# Positional indices of background cells (split column equals the background
# value) and of target cells (every other cell).
is_background = adata.obs[split_key] == background_value
background_indices = np.where(is_background)[0]
target_indices = np.where(~is_background)[0]

In [7]:
# Directory shared by all seeds for this dataset / latent size.
latent_results_root = os.path.join(
    constants.DEFAULT_RESULTS_PATH,
    f"{dataset}/contrastiveVI/latent_{latent_size}",
)

# Load one saved model per seed, in the same order as `seeds`.
# NOTE(review): `torch.load` unpickles the checkpoint, so only load files from
# a trusted source.
model_list = []
for seed in tqdm(seeds):
    result_dir = os.path.join(latent_results_root, f"{seed}")
    checkpoint_path = os.path.join(result_dir, "model.ckpt")
    model_list.append(torch.load(checkpoint_path, map_location=device))

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:48<00:00,  9.65s/it]


In [8]:
# Per-seed results: one DE table and one filtered enrichment table per model.
de_result_list = []
enr_result_list = []

# `model_list` was built in the same order as `seeds`, so pair them directly.
for seed, model in zip(seeds, model_list):
    # Differential expression between background cells (idx1) and target
    # cells (idx2), using the "change" mode with the configured effect-size
    # threshold `expression_delta`.
    de_result = model.differential_expression(
        adata=adata,
        groupby=None,
        group1=None,
        group2=None,
        idx1=background_indices,
        idx2=target_indices,
        mode="change",
        delta=expression_delta,
        batch_size=128,
        all_stats=True,
        batch_correction=False,
        batchid1=None,
        batchid2=None,
        fdr_target=0.05,
        silent=False,
        target_idx=target_indices,
    )

    # Copy the frame's index into a regular column (presumably gene names --
    # confirm against the model's DE output) and tag every row with the seed
    # so the per-seed tables can be concatenated and grouped later.
    # The former bare `de_result.reset_index()` call discarded its result and
    # had no effect, so it has been removed.
    de_result["gene_symbol"] = de_result.index
    de_result["seed"] = seed
    de_result_list.append(de_result)

    # Genes whose `proba_de` exceeds 0.95 are submitted to Enrichr.
    top_genes = de_result.loc[de_result["proba_de"] > 0.95, "gene_symbol"].tolist()
    enr = gp.enrichr(
        gene_list=top_genes,
        gene_sets="KEGG_2019_Mouse",
        organism="mouse",
        cutoff=pathway_enr_fdr,
    )
    # Keep only significant pathways. `.assign` returns a new frame, which
    # avoids pandas' SettingWithCopyWarning from writing a column onto a
    # filtered slice.
    enr_result = enr.results
    enr_result = enr_result[
        enr_result["Adjusted P-value"] < pathway_enr_fdr
    ].assign(seed=seed)
    enr_result_list.append(enr_result)

DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:32<00:00, 32.57s/it]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enr_result["seed"] = seed


DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:33<00:00, 33.80s/it]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enr_result["seed"] = seed


DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:33<00:00, 33.50s/it]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enr_result["seed"] = seed


DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:33<00:00, 33.24s/it]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enr_result["seed"] = seed


DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:33<00:00, 33.69s/it]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enr_result["seed"] = seed


In [9]:
# Stack the per-seed enrichment tables and keep only the reporting columns,
# with the originating seed as the last column.
cols = ["Gene_set", "Term", "Adjusted P-value", "Overlap", "Genes", "seed"]
enr_df = pd.concat(enr_result_list).loc[:, cols]

In [15]:
# Display the per-seed enrichment table (selected columns plus `seed`).
enr_df

Unnamed: 0,Gene_set,Term,Adjusted P-value,Overlap,Genes,seed
0,KEGG_2019_Mouse,Arachidonic acid metabolism,1.2e-05,22/89,CBR2;PLA2G2F;PLA2G2D;CYP2J5;GPX3;GPX5;PLA2G4C;...,123
1,KEGG_2019_Mouse,Linoleic acid metabolism,0.001942,13/50,PLA2G2F;PLA2G2D;CYP2J5;PLA2G4C;CYP3A11;PLA2G4A...,123
2,KEGG_2019_Mouse,Maturity onset diabetes of the young,0.003203,9/27,NEUROD1;PAX4;SLC2A2;IAPP;BHLHA15;GCK;FOXA3;NEU...,123
3,KEGG_2019_Mouse,Fat digestion and absorption,0.013578,10/40,PLA2G2F;FABP1;PNLIPRP2;SCARB1;FABP2;PLA2G2D;CL...,123
4,KEGG_2019_Mouse,Chemical carcinogenesis,0.020853,16/94,GSTO2;UGT2B36;EPHX1;CYP3A11;CYP3A25;CYP2C29;AL...,123
0,KEGG_2019_Mouse,Arachidonic acid metabolism,0.000787,20/89,CBR2;PLA2G2F;PLA2G2D;CYP2J5;GPX3;PLA2G4C;PLA2G...,42
1,KEGG_2019_Mouse,Linoleic acid metabolism,0.004161,13/50,PLA2G2F;PLA2G2D;CYP2J5;PLA2G4C;CYP3A11;PLA2G4A...,42
2,KEGG_2019_Mouse,Chemical carcinogenesis,0.025383,17/94,GSTM3;GSTO2;UGT2B36;EPHX1;CYP3A11;CYP3A25;CYP2...,42
0,KEGG_2019_Mouse,Arachidonic acid metabolism,0.001652,20/89,CBR2;PLA2G2F;CYP2J5;GPX3;GPX5;PLA2G4A;PLA2G3;C...,789
1,KEGG_2019_Mouse,Insulin secretion,0.022303,17/86,CAMK2B;SNAP25;PRKCB;GPR119;SLC2A2;GCG;CCK;ADCY...,789


## Aggregate analysis

In [11]:
# Pool the per-seed DE tables and average every numeric statistic per gene,
# then rank genes by their mean `proba_de` (highest first).
de_result = pd.concat(de_result_list)
de_result_mean = (
    de_result.groupby("gene_symbol", as_index=False)
    # `numeric_only=True` makes the aggregation explicit: older pandas dropped
    # non-numeric columns silently, while pandas >= 2.0 raises a TypeError if
    # any are present (assumes the DE output may carry such columns -- TODO
    # confirm against the model's DE table).
    .mean(numeric_only=True)
    .sort_values(by="proba_de", ascending=False)
)

In [12]:
# Genes whose seed-averaged `proba_de` exceeds 0.95 (the same cutoff as the
# per-seed analysis) are submitted to Enrichr.
top_genes = de_result_mean.loc[
    de_result_mean["proba_de"] > 0.95, "gene_symbol"
].tolist()

# Use the shared `pathway_enr_fdr` setting instead of a duplicated literal so
# the aggregate analysis cannot drift from the per-seed analysis.
enr = gp.enrichr(
    gene_list=top_genes,
    gene_sets="KEGG_2019_Mouse",
    organism="mouse",
    cutoff=pathway_enr_fdr,
)
enr_results = enr.results
enr_results = enr_results[enr_results["Adjusted P-value"] < pathway_enr_fdr]

In [13]:
# Display the significant pathways from the seed-averaged enrichment analysis.
enr_results

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,KEGG_2019_Mouse,Arachidonic acid metabolism,19/89,2.1e-05,0.006048,0,0,3.437847,36.956407,CBR2;PLA2G2F;PLA2G2D;GPX3;GPX5;PLA2G4C;PLA2G4A...
1,KEGG_2019_Mouse,Linoleic acid metabolism,12/50,0.000217,0.030651,0,0,3.987489,33.629846,PLA2G2F;PLA2G2D;CYP2C55;CYP2C66;PLA2G4C;CYP3A1...
2,KEGG_2019_Mouse,Fat digestion and absorption,10/40,0.000508,0.03605,0,0,4.205093,31.896626,PLA2G2F;FABP1;PNLIPRP2;SCARB1;FABP2;PLA2G2D;CL...
3,KEGG_2019_Mouse,Maturity onset diabetes of the young,8/27,0.000538,0.03605,0,0,5.307615,39.953797,NEUROD1;PAX4;PAX6;BHLHA15;GCK;FOXA3;NEUROG3;FOXA2
4,KEGG_2019_Mouse,Pancreatic secretion,18/105,0.000639,0.03605,0,0,2.616291,19.243633,PLA2G2F;PNLIPRP2;CAR2;PLA2G2D;CTRB1;ATP2A3;ATP...
