In [1]:
import os
import torch

import scanpy as sc
import numpy as np
import pandas as pd
import gseapy as gp

from tqdm import tqdm
from sklearn.cluster import KMeans
from contrastive_vi.model.contrastive_vi import ContrastiveVIModel
from scripts import constants
from scvi._settings import settings

Global seed set to 0
1: package ‘methods’ was built under R version 3.6.1 
2: package ‘datasets’ was built under R version 3.6.1 
3: package ‘utils’ was built under R version 3.6.1 
4: package ‘grDevices’ was built under R version 3.6.1 
5: package ‘graphics’ was built under R version 3.6.1 
6: package ‘stats’ was built under R version 3.6.1 


## Load data and set up environment

In [2]:
settings.seed = 0
device = "cpu"
dataset = "haber_2017"

Global seed set to 0


In [3]:
split_key = constants.DATASET_SPLIT_LOOKUP[dataset]["split_key"]
background_value = constants.DATASET_SPLIT_LOOKUP[dataset]["background_value"]
seeds = constants.DEFAULT_SEEDS
latent_size = 10

In [4]:
adata = sc.read_h5ad(
    os.path.join(
        constants.DEFAULT_DATA_PATH,
        f"{dataset}/preprocessed/adata_top_2000_genes_tc.h5ad",
    )
)
ContrastiveVIModel.setup_anndata(adata, layer="count")

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"count"[0m[1m][0m                                               
[34mINFO    [0m Successfully registered anndata object containing [1;36m7721[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


In [5]:
target_indices = np.where(adata.obs[split_key] != background_value)[0]
target_adata = adata[target_indices]

In [6]:
model_list = []
latent_rep_list = []
for seed in tqdm(seeds):
    result_dir = os.path.join(
        constants.DEFAULT_RESULTS_PATH,
        f"{dataset}/contrastiveVI/latent_{latent_size}",
        f"{seed}",
    )
    model_list.append(
        torch.load(
            os.path.join(result_dir, "model.ckpt"),
            map_location=device,
        ),
    )
    latent_rep_list.append(
        np.load(os.path.join(result_dir, "latent_representations.npy")),
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:59<00:00, 11.91s/it]


## contrastiveVI differential expression analysis
Compare cells infected with Salmonella vs. cells infected with H. poly.

In [7]:
de_result_list = []
for seed_index, seed in enumerate(seeds):
    model = model_list[seed_index]
    latent_rep = latent_rep_list[seed_index]
    latent_clusters = KMeans(n_clusters=2, random_state=123).fit(latent_rep).labels_
    cluster_label = f"cluster_{seed}"
    
    tmp_target_adata = target_adata.copy()
    tmp_target_adata.obs[cluster_label] = latent_clusters.astype(str)
    
    de_result = model.differential_expression(
        adata=tmp_target_adata,
        groupby=cluster_label,
        group1="0",
        group2="1",
        idx1=None,
        idx2=None,
        mode="change",
        delta=0.25,
        batch_size=128,
        all_stats=True,
        batch_correction=False,
        batchid1=None,
        batchid2=None,
        fdr_target=0.05,
        silent=False,
    )
    
    de_result.reset_index()
    de_result["gene_symbol"] = de_result.index
    de_result["seed"] = seed
    de_result_list.append(de_result)

DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:02<00:00, 242.76s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [03:59<00:00, 239.64s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:01<00:00, 241.64s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:02<00:00, 242.65s/it]
DE...: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [04:03<00:00, 243.82s/it]


In [8]:
de_result = pd.concat(de_result_list)

In [9]:
de_result_mean = (
    de_result.groupby("gene_symbol", as_index=False)
    .mean()
    .sort_values(by="proba_de", ascending=False)
)

## Pathway enrichment analysis with top differentially expressed genes

In [10]:
top_genes = de_result_mean[de_result_mean["proba_de"] > 0.95]["gene_symbol"].tolist()

enr = gp.enrichr(
    gene_list=top_genes,
    gene_sets="KEGG_2019_Mouse",
    organism="mouse",
    cutoff=0.05,
)

enr_results = enr.results
enr_results = enr_results[enr_results["Adjusted P-value"] < 0.05]

In [11]:
cols = ["Gene_set", "Term", "Adjusted P-value", "Overlap", "Genes"]
enr_results[cols]

Unnamed: 0,Gene_set,Term,Adjusted P-value,Overlap,Genes
0,KEGG_2019_Mouse,Fat digestion and absorption,0.023545,5/40,FABP1;FABP2;PLA2G3;APOA1;APOA4
1,KEGG_2019_Mouse,Vitamin digestion and absorption,0.023545,4/24,CUBN;RBP2;APOA1;APOA4
2,KEGG_2019_Mouse,Cholesterol metabolism,0.029304,5/49,APOH;APOC2;APOA1;APOC3;APOA4


In [12]:
enr_results["Term"].tolist()

['Fat digestion and absorption',
 'Vitamin digestion and absorption',
 'Cholesterol metabolism']

In [13]:
len(top_genes)

280

## Check differential expression results against original paper results

In [14]:
original_salmonella_results = pd.read_excel(
    os.path.join(
        constants.DEFAULT_DATA_PATH,
        dataset,
        "41586_2017_BFnature24489_MOESM10_ESM.xlsx",
    ),
    sheet_name="Salmonella",
    header=1,
)

original_salmonella_degs = original_salmonella_results[
    original_salmonella_results["Gene category"] == "global.filtered"
]["Gene"].tolist()
original_salmonella_degs = [
    gene.upper() for gene in original_salmonella_degs
    if type(gene) is str  # Filter out weird rows with date info.
]
original_salmonella_degs = set(original_salmonella_degs)

In [15]:
original_hpoly_results = pd.read_excel(
    os.path.join(
        constants.DEFAULT_DATA_PATH,
        dataset,
        "41586_2017_BFnature24489_MOESM10_ESM.xlsx",
    ),
    sheet_name="H.poly (Day 10)",
    header=1,
)

original_hpoly_degs = original_hpoly_results[
    original_hpoly_results["Gene category"] == "global.filtered"
]["Gene"].tolist()
original_hpoly_degs = [
    gene.upper() for gene in original_hpoly_degs 
    if type(gene) is str  # Filter out weird rows with date info.
]
original_hpoly_degs = set(original_hpoly_degs)

In [16]:
original_unique_degs = (
    original_salmonella_degs.union(original_hpoly_degs)
    - original_salmonella_degs.intersection(original_hpoly_degs)
)

In [17]:
original_salmonella_degs = pd.DataFrame(
    {"gene_symbol": list(original_salmonella_degs), "salmonella_deg": True}
)
original_hpoly_degs = pd.DataFrame(
    {"gene_symbol": list(original_hpoly_degs), "hpoly_deg": True}
)
original_unique_degs = pd.DataFrame(
    {"gene_symbol": list(original_unique_degs), "unique_deg": True}
)

In [18]:
de_result_mean = (
    de_result_mean
    .merge(original_salmonella_degs, on="gene_symbol", how="left")
    .merge(original_hpoly_degs, on="gene_symbol", how="left")
    .merge(original_unique_degs, on="gene_symbol", how="left")
)
de_result_mean[["salmonella_deg", "hpoly_deg", "unique_deg"]] = (
    de_result_mean[["salmonella_deg", "hpoly_deg", "unique_deg"]].fillna(False)
)

In [19]:
de_result_mean[de_result_mean["unique_deg"]]["proba_de"].mean()

0.9165261375661374

In [20]:
de_result_mean[~de_result_mean["unique_deg"]]["proba_de"].mean()

0.9315457979017118

## Check genes associated with enrich pathways against original paper results

In [21]:
enriched_pathway_genes = [
    set(row["Genes"].split(";")) for _, row in enr_results.iterrows()
]
enriched_pathway_genes = set.union(*enriched_pathway_genes)
enriched_pathway_genes = pd.DataFrame({"gene_symbol": list(enriched_pathway_genes)})

In [22]:
enriched_pathway_genes = (
    enriched_pathway_genes
    .merge(original_salmonella_degs, on="gene_symbol", how="left")
    .merge(original_hpoly_degs, on="gene_symbol", how="left")
    .merge(original_unique_degs, on="gene_symbol", how="left")
)
enriched_pathway_genes[["salmonella_deg", "hpoly_deg", "unique_deg"]] = (
    enriched_pathway_genes[["salmonella_deg", "hpoly_deg", "unique_deg"]].fillna(False)
)

In [23]:
enriched_pathway_genes.sort_values("unique_deg")

Unnamed: 0,gene_symbol,salmonella_deg,hpoly_deg,unique_deg
0,PLA2G3,False,False,False
1,FABP2,True,True,False
2,APOC3,True,True,False
4,APOH,False,False,False
3,APOA4,True,False,True
5,APOC2,True,False,True
6,APOA1,True,False,True
7,FABP1,True,False,True
8,CUBN,False,True,True
9,RBP2,True,False,True


### For genes not labeled as uniquely differentially expressed in the original paper, check whether related related genes are uniquely differentially expressed.

In [24]:
# APOC3
print([gene for gene in original_unique_degs["gene_symbol"] if "APOC" in gene])

['APOC2']


In [25]:
# PLA2G3
print([gene for gene in original_unique_degs["gene_symbol"] if "PLA2" in gene])

['PLA2G12B', 'PLA2G4C', 'PLA2G5']


In [26]:
# FABP2
print([gene for gene in original_unique_degs["gene_symbol"] if "FAB" in gene])

['FABP1', 'FABP6']


In [27]:
# APOH
print([gene for gene in original_unique_degs["gene_symbol"] if "APO" in gene])

['APOL10A', 'APOL7A', 'APOC2', 'APOA1', 'APOA4']


APOH is an apolipoprotein like APOC2, APOA1, etc. We can conclude that all the genes identified in the statistically significant enriched pathways are related to the uniquely differentially expressed genes identified in the original paper (Haber et al. 2017). It has been shown that apolipoproteins and, in general, lipid and lipoprotein metabolism are rleated to infection and inflammation (https://pubmed.ncbi.nlm.nih.gov/15102878/). Pathogen infection is also related to cholesterol metobalism (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4219984/) as identified by the pathway enrichment analysis. 