In [1]:
import os
import torch

import scanpy as sc
import numpy as np
import pandas as pd
import gseapy as gp

from tqdm import tqdm
from sklearn.cluster import KMeans
from contrastive_vi.model.contrastive_vi import ContrastiveVIModel
from scripts import constants
from scvi._settings import settings

Global seed set to 0
1: package ‘methods’ was built under R version 3.6.1 
2: package ‘datasets’ was built under R version 3.6.1 
3: package ‘utils’ was built under R version 3.6.1 
4: package ‘grDevices’ was built under R version 3.6.1 
5: package ‘graphics’ was built under R version 3.6.1 
6: package ‘stats’ was built under R version 3.6.1 


In [2]:
# Fix the scvi global seed so the run is reproducible.
settings.seed = 0

# Preferred accelerator for loading the saved models. Fall back gracefully so the
# notebook still runs on machines that lack this particular CUDA device.
requested_device = "cuda:1"
requested_index = torch.device(requested_device).index or 0
if not torch.cuda.is_available():
    device = "cpu"
elif requested_index >= torch.cuda.device_count():
    device = "cuda:0"
else:
    device = requested_device

# Dataset key used for path construction and for the lookups in `constants`.
dataset = "mcfarland_2020"

Global seed set to 0


In [3]:
# Analysis thresholds used by the differential expression / enrichment cell below.
# expression_delta is passed as `delta` to model.differential_expression;
# pathway_enr_fdr is both the Enrichr cutoff and the adjusted p-value filter.
expression_delta = 0.15
pathway_enr_fdr = 0.05

In [4]:
# Per-dataset column names / values, looked up once from the project constants.
split_config = constants.DATASET_SPLIT_LOOKUP[dataset]
split_key = split_config["split_key"]
background_value = split_config["background_value"]
label_key = split_config["label_key"]

# Seeds of the trained models to analyse, and the latent size they were trained with.
seeds = constants.DEFAULT_SEEDS
latent_size = 10

In [5]:
# Load the preprocessed AnnData object and register its "count" layer with the model class.
adata_path = os.path.join(
    constants.DEFAULT_DATA_PATH,
    f"{dataset}/preprocessed/adata_top_2000_genes_tc.h5ad",
)
adata = sc.read_h5ad(adata_path)
ContrastiveVIModel.setup_anndata(adata, layer="count")

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Successfully registered anndata object containing [1;36m5928[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


In [6]:
# Split cells into target (anything that is not the background value) and background.
# The positional indices are kept because later cells pass `target_indices` on.
is_target = adata.obs[split_key] != background_value
is_background = adata.obs[split_key] == background_value

target_indices = np.flatnonzero(is_target.to_numpy())
background_indices = np.flatnonzero(is_background.to_numpy())

target_adata = adata[target_indices]
background_adata = adata[background_indices]

In [7]:
# Headerless gene table: column 0 is renamed to "ensembl_id", column 1 to "gene_symbol".
genes_path = os.path.join(
    constants.DEFAULT_DATA_PATH,
    dataset,
    "idasanutlin",
    "Idasanutlin_24hr_expt1",
    "genes.tsv",
)
genes = pd.read_table(genes_path, header=None).rename(
    columns={0: "ensembl_id", 1: "gene_symbol"}
)

# Keep only the genes whose id appears in the AnnData variable index.
in_adata = genes["ensembl_id"].isin(adata.var.index)
genes = genes.loc[in_adata]

In [8]:
# Load, for every seed, the trained model checkpoint and its saved latent representations.
model_list, latent_rep_list = [], []
for seed in tqdm(seeds):
    result_dir = os.path.join(
        constants.DEFAULT_RESULTS_PATH,
        f"{dataset}/contrastiveVI/latent_{latent_size}",
        f"{seed}",
    )
    checkpoint_path = os.path.join(result_dir, "model.ckpt")
    latent_path = os.path.join(result_dir, "latent_representations.npy")

    # NOTE: torch.load unpickles the checkpoint, so only load files from a trusted source.
    model_list.append(torch.load(checkpoint_path, map_location=device))
    latent_rep_list.append(np.load(latent_path))

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:22<00:00,  4.43s/it]


In [9]:
# For every seed: cluster the latent representations into two groups, assign each
# cluster the majority label of its cells, run differential expression between that
# cluster and the background cells carrying the same label, and run KEGG enrichment
# on the genes called differentially expressed.
#
# Each result list holds one dict per seed, keyed by the cluster id as a string.
de_results = []
enr_results = []
label_proportion_results = []
assigned_label_results = []

# Minimum `proba_de` for a gene to be included in the enrichment gene list.
de_proba_threshold = 0.95

for seed_index, seed in enumerate(seeds):
    model = model_list[seed_index]
    latent_rep = latent_rep_list[seed_index]
    latent_clusters = KMeans(n_clusters=2, random_state=123).fit(latent_rep).labels_
    cluster_label = f"cluster_{seed}"

    # Work on copies so the shared target/background AnnData objects are not modified.
    # Assigning the cluster labels to the target cells presumes latent_rep has one row
    # per target cell, in the same order -- TODO confirm against the training script.
    tmp_target_adata = target_adata.copy()
    tmp_target_adata.obs[cluster_label] = latent_clusters.astype(str)
    tmp_background_adata = background_adata.copy()
    tmp_background_adata.obs[cluster_label] = "background"

    seed_de_results = {}
    seed_enr_results = {}
    seed_label_proportions = {}
    seed_assigned_labels = {}
    de_results.append(seed_de_results)
    enr_results.append(seed_enr_results)
    label_proportion_results.append(seed_label_proportions)
    assigned_label_results.append(seed_assigned_labels)

    for cluster in np.unique(latent_clusters):
        cluster_str = f"{cluster}"

        # Compute the cluster membership mask once and reuse it below.
        in_cluster = tmp_target_adata.obs[cluster_label] == cluster_str
        cluster_target_adata = tmp_target_adata[in_cluster]

        # Fraction of the cluster's cells carrying each label; the most frequent
        # label becomes the cluster's assigned label.
        label_proportions = (
            cluster_target_adata.obs[label_key].value_counts() / in_cluster.sum()
        )
        seed_label_proportions[cluster_str] = label_proportions
        assigned_label = label_proportions.index[label_proportions.argmax()]
        seed_assigned_labels[cluster_str] = assigned_label

        # Background cells with the assigned label, followed by this cluster's target cells.
        cluster_adata = tmp_background_adata[
            tmp_background_adata.obs[label_key] == assigned_label
        ].concatenate(cluster_target_adata)

        # NOTE(review): `target_indices` are positions in the full `adata`, whereas
        # `adata=` here is the per-cluster subset -- confirm this is what
        # ContrastiveVIModel.differential_expression expects for `target_idx`.
        cluster_de_result = model.differential_expression(
            adata=cluster_adata,
            groupby=cluster_label,
            group1="background",
            group2=cluster_str,
            idx1=None,
            idx2=None,
            mode="change",
            delta=expression_delta,
            batch_size=128,
            all_stats=True,
            batch_correction=False,
            batchid1=None,
            batchid2=None,
            fdr_target=0.05,
            silent=False,
            target_idx=target_indices,
        )

        # The DE result is indexed by Ensembl id; expose it as a column so gene
        # symbols can be merged in. (The previous bare `reset_index()` call discarded
        # its return value and therefore did nothing, so it has been removed.)
        cluster_de_result["ensembl_id"] = cluster_de_result.index
        cluster_de_result = cluster_de_result.merge(genes, on="ensembl_id")
        cluster_de_result["seed"] = seed
        seed_de_results[cluster_str] = cluster_de_result

        is_de = cluster_de_result["proba_de"] > de_proba_threshold
        top_genes = cluster_de_result.loc[is_de, "gene_symbol"].tolist()
        enr = gp.enrichr(
            gene_list=top_genes,
            gene_sets="KEGG_2016",
            organism="human",
            cutoff=pathway_enr_fdr,
        )
        # Keep only pathways whose adjusted p-value is below the FDR threshold.
        cluster_enr_result = enr.results
        cluster_enr_result = cluster_enr_result[
            cluster_enr_result["Adjusted P-value"] < pathway_enr_fdr
        ]
        seed_enr_results[cluster_str] = cluster_enr_result

[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m3091[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:32<00:00, 32.99s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m3982[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:32<00:00, 32.64s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m1565[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:32<00:00, 32.99s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m4363[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:32<00:00, 32.74s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m1475[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:32<00:00, 32.76s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m4453[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:32<00:00, 32.99s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m1519[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:32<00:00, 32.85s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m4409[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:30<00:00, 30.40s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m4246[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:31<00:00, 31.05s/it]
[34mINFO    [0m Input adata not setup with scvi. attempting to transfer anndata setup               
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Registered keys:[1m[[0m[32m'X'[0m, [32m'batch_indices'[0m, [32m'labels'[0m[1m][0m                                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m1682[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              




DE...: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:31<00:00, 31.02s/it]


In [10]:
# Stack the per-seed, per-cluster enrichment tables into a single frame, tagging each
# row with the cluster's assigned label, the cluster id and the seed.
# The seed comes from `seeds` -- the same list the results were built from -- rather
# than from constants.DEFAULT_SEEDS, so the labels stay correct if `seeds` is changed.
enr_df_list = []
for seed, assigned_labels, enrs in zip(seeds, assigned_label_results, enr_results):
    for cluster, assigned_label in assigned_labels.items():
        # assign() returns a new frame, leaving the stored per-cluster result untouched.
        enr_df_list.append(
            enrs[cluster].assign(
                assigned_label=assigned_label,
                cluster=cluster,
                seed=seed,
            )
        )
enr_df = pd.concat(enr_df_list)

In [11]:
# Restrict the enrichment table to the columns that are reported below.
cols = [
    "Gene_set",
    "Term",
    "Adjusted P-value",
    "Overlap",
    "Genes",
    "assigned_label",
    "cluster",
    "seed",
]
enr_df = enr_df[cols]

### Enriched pathways for cluster associated with idasanutlin-treated TP53 mutants vs. DMSO-treated TP53 mutants.

In [12]:
# Enrichment rows from clusters whose assigned label is "Mutation", ordered by term (descending).
is_mutant_cluster = enr_df["assigned_label"] == "Mutation"
mutant_enr_df = enr_df[is_mutant_cluster].sort_values(by="Term", ascending=False)

In [13]:
# Show the mutant-cluster enrichment rows grouped by seed.
mutant_enr_by_seed = mutant_enr_df.sort_values(by="seed")
mutant_enr_by_seed

Unnamed: 0,Gene_set,Term,Adjusted P-value,Overlap,Genes,assigned_label,cluster,seed
2,KEGG_2016,ECM-receptor interaction Homo sapiens hsa04512,0.007212,12/82,COL1A1;COL1A2;LAMB3;COL6A2;COL6A1;TNC;SPP1;FN1...,Mutation,1,42
1,KEGG_2016,Protein digestion and absorption Homo sapiens ...,0.006446,13/90,COL17A1;PRSS1;COL11A1;COL1A1;COL3A1;COL1A2;COL...,Mutation,1,42
0,KEGG_2016,Rheumatoid arthritis Homo sapiens hsa05323,0.006446,13/90,IL11;MMP1;TGFB3;CCL20;MMP3;CXCL1;IL6;CXCL12;CT...,Mutation,1,42
3,KEGG_2016,Amoebiasis Homo sapiens hsa05146,0.009661,13/100,SERPINB3;SERPINB4;SERPINB1;LAMB3;TGFB3;FN1;LAM...,Mutation,1,42
0,KEGG_2016,Rheumatoid arthritis Homo sapiens hsa05323,0.004296,15/90,CXCL6;IL11;HLA-DRB5;MMP1;TGFB3;MMP3;CXCL5;IL6;...,Mutation,1,46
3,KEGG_2016,Cytokine-cytokine receptor interaction Homo sa...,0.019953,26/265,CXCL6;CSF3;IL26;IL24;TNFRSF11B;CXCL13;CXCL14;C...,Mutation,1,46
1,KEGG_2016,Protein digestion and absorption Homo sapiens ...,0.008637,14/90,CELA3A;PRSS1;COL15A1;COL27A1;COL1A1;COL3A1;SLC...,Mutation,1,46
2,KEGG_2016,ECM-receptor interaction Homo sapiens hsa04512,0.008637,13/82,ITGA2;LAMA4;TNC;FN1;LAMC2;THBS2;THBS1;COL1A1;C...,Mutation,1,46
4,KEGG_2016,Cytokine-cytokine receptor interaction Homo sa...,0.01202,30/265,CXCL6;CSF3;IL24;CXCL1;CXCL13;CXCL3;CXCL14;CXCL...,Mutation,0,123
8,KEGG_2016,Complement and coagulation cascades Homo sapie...,0.042364,12/79,SERPINB2;CFH;C1S;PLAU;C1R;SERPINE1;FGG;BDKRB1;...,Mutation,0,123


In [14]:
# Number of (seed, cluster) results in which each pathway term is enriched.
mutant_term_counts = mutant_enr_df["Term"].value_counts()
mutant_term_counts

Rheumatoid arthritis Homo sapiens hsa05323                       6
Protein digestion and absorption Homo sapiens hsa04974           6
Cytokine-cytokine receptor interaction Homo sapiens hsa04060     5
ECM-receptor interaction Homo sapiens hsa04512                   5
Amoebiasis Homo sapiens hsa05146                                 4
Graft-versus-host disease Homo sapiens hsa05332                  2
Complement and coagulation cascades Homo sapiens hsa04610        2
Pertussis Homo sapiens hsa05133                                  2
Chemokine signaling pathway Homo sapiens hsa04062                1
Focal adhesion Homo sapiens hsa04510                             1
Transcriptional misregulation in cancer Homo sapiens hsa05202    1
Malaria Homo sapiens hsa05144                                    1
TNF signaling pathway Homo sapiens hsa04668                      1
PI3K-Akt signaling pathway Homo sapiens hsa04151                 1
Pathways in cancer Homo sapiens hsa05200                      

### Enriched pathways for cluster associated with idasanutlin-treated TP53 wild-types vs. DMSO-treated TP53 wild-types.

In [15]:
# Enrichment rows from clusters whose assigned label is "Wild Type", ordered by term (descending).
is_wildtype_cluster = enr_df["assigned_label"] == "Wild Type"
wildtype_enr_df = enr_df[is_wildtype_cluster].sort_values(by="Term", ascending=False)

In [16]:
# Number of (seed, cluster) results in which each pathway term is enriched.
wildtype_term_counts = wildtype_enr_df["Term"].value_counts()
wildtype_term_counts

Rheumatoid arthritis Homo sapiens hsa05323                                    4
Transcriptional misregulation in cancer Homo sapiens hsa05202                 4
Cytokine-cytokine receptor interaction Homo sapiens hsa04060                  4
Protein digestion and absorption Homo sapiens hsa04974                        4
p53 signaling pathway Homo sapiens hsa04115                                   3
Mineral absorption Homo sapiens hsa04978                                      3
TNF signaling pathway Homo sapiens hsa04668                                   3
AGE-RAGE signaling pathway in diabetic complications Homo sapiens hsa04933    2
Amoebiasis Homo sapiens hsa05146                                              2
Pertussis Homo sapiens hsa05133                                               2
Pathways in cancer Homo sapiens hsa05200                                      2
Cell adhesion molecules (CAMs) Homo sapiens hsa04514                          2
ECM-receptor interaction Homo sapiens hs

The p53 signaling pathway is enriched for idasanutlin-treated TP53 wild-types vs. DMSO-treated TP53 wild-types, but not for the TP53 mutant comparison. This is consistent with the mechanism of action of idasanutlin. For one randomly initialized model, both K-means-assigned clusters are more similar to idasanutlin-treated TP53 mutants than to idasanutlin-treated TP53 wild-types. This is expected given the lower cluster-based metrics of contrastiveVI on this dataset (although they are still higher than those of other methods).