In [1]:
import os
import torch

import scanpy as sc
import numpy as np
import pandas as pd
import gseapy as gp

from tqdm import tqdm
from sklearn.cluster import KMeans
from contrastive_vi.model.contrastive_vi import ContrastiveVIModel
from scripts import constants
from scvi._settings import settings

Global seed set to 0
1: package ‘methods’ was built under R version 3.6.1 
2: package ‘datasets’ was built under R version 3.6.1 
3: package ‘utils’ was built under R version 3.6.1 
4: package ‘grDevices’ was built under R version 3.6.1 
5: package ‘graphics’ was built under R version 3.6.1 
6: package ‘stats’ was built under R version 3.6.1 


In [2]:
# Run-level configuration: the dataset to analyze and the device that the
# saved models are mapped onto when they are loaded.
dataset = "mcfarland_2020"
device = "cuda:1"

# Fix the global seed through the scvi settings object.
settings.seed = 0

Global seed set to 0


In [3]:
# FDR threshold for pathway enrichment: passed to `gp.enrichr` as `cutoff` and
# used to filter the returned table on its "Adjusted P-value" column.
pathway_enr_fdr = 0.1

In [4]:
# Dataset-specific metadata (column names / background value), looked up once.
split_config = constants.DATASET_SPLIT_LOOKUP[dataset]
split_key = split_config["split_key"]
background_value = split_config["background_value"]
label_key = split_config["label_key"]

# Latent dimensionality and random seeds of the saved model runs to analyze.
latent_size = 10
seeds = constants.DEFAULT_SEEDS

In [5]:
# Read the preprocessed AnnData file and register its "count" layer with the
# contrastiveVI model class.
adata_path = os.path.join(
    constants.DEFAULT_DATA_PATH,
    f"{dataset}/preprocessed/adata_top_2000_genes_tc.h5ad",
)
adata = sc.read_h5ad(adata_path)
ContrastiveVIModel.setup_anndata(adata, layer="count")

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Successfully registered anndata object containing [1;36m5928[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


In [6]:
# Partition cells on the split column: rows equal to `background_value` are
# background cells, every other row is a target cell.
split_values = adata.obs[split_key]
is_target = split_values != background_value
is_background = split_values == background_value

target_indices = np.where(is_target)[0]
background_indices = np.where(is_background)[0]

target_adata = adata[target_indices]
background_adata = adata[background_indices]

In [7]:
genes_path = os.path.join(
    constants.DEFAULT_DATA_PATH,
    dataset,
    "idasanutlin",
    "Idasanutlin_24hr_expt1",
    "genes.tsv",
)

# The file is read without a header row; name the first two columns, then keep
# only the genes whose Ensembl ID appears in `adata.var.index`.
gene_table = pd.read_table(genes_path, header=None).rename(
    columns={0: "ensembl_id", 1: "gene_symbol"}
)
genes = gene_table[gene_table["ensembl_id"].isin(adata.var.index)]

In [8]:
# Load, for every seed, the saved model checkpoint and its saved latent
# representations. Both lists are ordered like `seeds`.
model_list = []
latent_rep_list = []
for seed in tqdm(seeds):
    result_dir = os.path.join(
        constants.DEFAULT_RESULTS_PATH,
        f"{dataset}/contrastiveVI/latent_{latent_size}",
        f"{seed}",
    )
    model_path = os.path.join(result_dir, "model.ckpt")
    latent_path = os.path.join(result_dir, "latent_representations.npy")

    # NOTE: torch.load unpickles the checkpoint; only load checkpoints you trust.
    model_list.append(torch.load(model_path, map_location=device))
    latent_rep_list.append(np.load(latent_path))

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:45<00:00, 21.20s/it]


In [9]:
# Per-seed results. Each list holds one dict per seed (same order as `seeds`),
# keyed by the cluster id rendered as a string.
de_results = []
enr_results = []
label_proportion_results = []
assigned_label_results = []

for seed_index, seed in enumerate(seeds):
    model = model_list[seed_index]
    latent_rep = latent_rep_list[seed_index]

    # Split the saved latent representations into two clusters; the resulting
    # labels are attached to the target cells below.
    latent_clusters = KMeans(n_clusters=2, random_state=123).fit(latent_rep).labels_
    cluster_label = f"cluster_{seed}"

    # Annotate copies so the shared target/background objects stay unmodified.
    tmp_target_adata = target_adata.copy()
    tmp_target_adata.obs[cluster_label] = latent_clusters.astype(str)
    tmp_background_adata = background_adata.copy()
    tmp_background_adata.obs[cluster_label] = "background"

    seed_de_results = {}
    seed_enr_results = {}
    seed_label_proportions = {}
    seed_assigned_labels = {}
    de_results.append(seed_de_results)
    enr_results.append(seed_enr_results)
    label_proportion_results.append(seed_label_proportions)
    assigned_label_results.append(seed_assigned_labels)

    for cluster in np.unique(latent_clusters):
        cluster_str = f"{cluster}"

        # Target cells belonging to this cluster (computed once, reused below).
        in_cluster = tmp_target_adata.obs[cluster_label] == cluster_str
        cluster_target_adata = tmp_target_adata[in_cluster]

        # Fraction of the cluster's cells carrying each `label_key` value; the
        # cluster is assigned its most frequent label.
        label_proportions = (
            cluster_target_adata.obs[label_key].value_counts() / in_cluster.sum()
        )
        seed_label_proportions[cluster_str] = label_proportions
        seed_assigned_labels[cluster_str] = label_proportions.idxmax()

        # Differential expression: all background cells vs. this cluster.
        cluster_adata = tmp_background_adata.concatenate(cluster_target_adata)

        cluster_de_result = model.differential_expression(
            adata=cluster_adata,
            groupby=cluster_label,
            group1="background",
            group2=cluster_str,
            idx1=None,
            idx2=None,
            mode="change",
            delta=0.25,
            batch_size=128,
            all_stats=True,
            batch_correction=False,
            batchid1=None,
            batchid2=None,
            fdr_target=0.05,
            silent=False,
        )

        # Expose the result index as an "ensembl_id" column so the table can be
        # joined to the gene-symbol table. (The previous bare `reset_index()`
        # call discarded its return value and had no effect, so it is removed.)
        cluster_de_result["ensembl_id"] = cluster_de_result.index
        cluster_de_result = cluster_de_result.merge(genes, on="ensembl_id")
        cluster_de_result["seed"] = seed
        seed_de_results[cluster_str] = cluster_de_result

        # Pathway enrichment on the genes with a DE probability above 0.95.
        top_genes = cluster_de_result.loc[
            cluster_de_result["proba_de"] > 0.95, "gene_symbol"
        ].tolist()
        enr = gp.enrichr(
            gene_list=top_genes,
            gene_sets="KEGG_2016",
            organism="human",
            cutoff=pathway_enr_fdr,
        )
        cluster_enr_result = enr.results
        # Keep only pathways below the FDR threshold.
        cluster_enr_result = cluster_enr_result[
            cluster_enr_result["Adjusted P-value"] < pathway_enr_fdr
        ]
        seed_enr_results[cluster_str] = cluster_enr_result

[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m3934[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:38<00:00, 38.61s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m4825[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:39<00:00, 39.14s/it]




[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m5205[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:38<00:00, 38.41s/it]




[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m3554[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:38<00:00, 38.32s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using



DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:36<00:00, 36.23s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m5296[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:37<00:00, 37.55s/it]




[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m3507[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:36<00:00, 36.92s/it]




[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m5252[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:37<00:00, 37.54s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using



DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:36<00:00, 36.75s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m3670[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:37<00:00, 37.33s/it]


In [10]:
# Stack the per-seed, per-cluster enrichment tables into one frame, annotating
# each row with the cluster id, the label assigned to that cluster, and the seed.
# The seed comes from `seeds` (the list the results were computed over) rather
# than from `constants.DEFAULT_SEEDS`, so the labels stay correct if `seeds`
# is ever changed.
enr_df_list = []
for seed, assigned_labels, enrs in zip(seeds, assigned_label_results, enr_results):
    for cluster, assigned_label in assigned_labels.items():
        # `assign` returns a new frame, so the stored results are not modified.
        enr_df_list.append(
            enrs[cluster].assign(
                assigned_label=assigned_label,
                cluster=cluster,
                seed=seed,
            )
        )
enr_df = pd.concat(enr_df_list)

In [11]:
# Columns kept for display: the enrichment result columns first, then the
# per-cluster annotation columns.
cols = [
    "Gene_set",
    "Term",
    "Adjusted P-value",
    "Overlap",
    "Genes",
    "assigned_label",
    "cluster",
    "seed",
]
enr_df = enr_df[cols]

### Enriched pathways for the cluster associated with idasanutlin-treated TP53-mutant cells vs. DMSO-treated cells.

In [12]:
# Rows from clusters assigned the "Mutation" label, sorted by term name in
# descending order.
is_mutation_cluster = enr_df["assigned_label"] == "Mutation"
enr_df[is_mutation_cluster].sort_values(by="Term", ascending=False)

Unnamed: 0,Gene_set,Term,Adjusted P-value,Overlap,Genes,assigned_label,cluster,seed
18,KEGG_2016,Vascular smooth muscle contraction Homo sapien...,0.088861,2/120,CALML5;CALML3,Mutation,1,46
4,KEGG_2016,Vascular smooth muscle contraction Homo sapien...,0.064789,3/120,CALML5;PLA2G2A;CALML3,Mutation,0,123
2,KEGG_2016,Thyroid hormone synthesis Homo sapiens hsa04918,0.014125,1/71,GPX3,Mutation,0,999
1,KEGG_2016,Salivary secretion Homo sapiens hsa04970,0.033661,3/89,CST1;CALML5;CALML3,Mutation,1,46
3,KEGG_2016,Renin secretion Homo sapiens hsa04924,0.07213,2/64,CALML5;CALML3,Mutation,1,46
0,KEGG_2016,Renin secretion Homo sapiens hsa04924,0.047842,3/64,CALML5;CLCA2;CALML3,Mutation,0,123
10,KEGG_2016,Ras signaling pathway Homo sapiens hsa04014,0.07213,3/227,FGF14;CALML5;CALML3,Mutation,1,46
8,KEGG_2016,Rap1 signaling pathway Homo sapiens hsa04015,0.07213,3/211,FGF14;CALML5;CALML3,Mutation,1,46
2,KEGG_2016,Phototransduction Homo sapiens hsa04744,0.033661,2/27,CALML5;CALML3,Mutation,1,46
2,KEGG_2016,Phototransduction Homo sapiens hsa04744,0.0529,2/27,CALML5;CALML3,Mutation,0,123


### Enriched pathways for the cluster associated with idasanutlin-treated TP53 wild-type cells vs. DMSO-treated cells.

In [13]:
# Rows from clusters assigned the "Wild Type" label, sorted by term name in
# descending order.
is_wild_type_cluster = enr_df["assigned_label"] == "Wild Type"
enr_df[is_wild_type_cluster].sort_values(by="Term", ascending=False)

Unnamed: 0,Gene_set,Term,Adjusted P-value,Overlap,Genes,assigned_label,cluster,seed
0,KEGG_2016,p53 signaling pathway Homo sapiens hsa04115,0.001761,3/69,CDKN1A;MDM2;GTSE1,Wild Type,1,999
0,KEGG_2016,p53 signaling pathway Homo sapiens hsa04115,0.001896,6/69,CDKN1A;CCNB1;TP53I3;SERPINE1;MDM2;FAS,Wild Type,0,789
1,KEGG_2016,p53 signaling pathway Homo sapiens hsa04115,0.022649,4/69,CDKN1A;TP53I3;SERPINE1;MDM2,Wild Type,1,42
12,KEGG_2016,Viral carcinogenesis Homo sapiens hsa05203,0.055491,2/205,CDKN1A;MDM2,Wild Type,1,999
6,KEGG_2016,Vascular smooth muscle contraction Homo sapien...,0.034514,2/120,ACTA2;ACTG2,Wild Type,1,999
0,KEGG_2016,Ubiquitin mediated proteolysis Homo sapiens hs...,0.012323,6/137,CDC20;UBE2C;UBE2S;MDM2;UBE2QL1;BIRC7,Wild Type,1,42
9,KEGG_2016,Transcriptional misregulation in cancer Homo s...,0.055491,2/180,CDKN1A;MDM2,Wild Type,1,999
11,KEGG_2016,Proteoglycans in cancer Homo sapiens hsa05205,0.055491,2/203,CDKN1A;MDM2,Wild Type,1,999
5,KEGG_2016,Prostate cancer Homo sapiens hsa05215,0.024131,2/89,CDKN1A;MDM2,Wild Type,1,999
3,KEGG_2016,Melanoma Homo sapiens hsa05218,0.01962,2/71,CDKN1A;MDM2,Wild Type,1,999
