In [1]:
%load_ext autoreload
%autoreload 2
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gseapy as gp
import scanpy as sc

# Load gene expression dataset

In [2]:
adata = sc.datasets.ebi_expression_atlas("E-GEOD-131882",filter_boring=True) # data from SeuratData::ifnb

VBox(children=(  0%|          | 0.00/15.3M [00:00<?, ?B/s],))

VBox(children=(0.00B [00:00, ?B/s],))

In [3]:
adata.obs.head()

Unnamed: 0,Sample Characteristic[individual],Sample Characteristic[sex],Sample Characteristic Ontology Term[sex],Sample Characteristic[age],Sample Characteristic[disease],Sample Characteristic Ontology Term[disease],Sample Characteristic[clinical information],Sample Characteristic[clinical treatment],Factor Value[disease],Factor Value Ontology Term[disease]
SAMN11878523-AAACCTGAGGCCCTCA,Diabetes_3,female,http://purl.obolibrary.org/obo/PATO_0000383,57 year,diabetic nephropathy,http://www.ebi.ac.uk/efo/EFO_0000401,"hemoglobin A1c 9.7 mmol/mol, glomerular filtra...",radical nephrectomy,diabetic nephropathy,http://www.ebi.ac.uk/efo/EFO_0000401
SAMN11878523-AAACCTGAGTCGATAA,Diabetes_3,female,http://purl.obolibrary.org/obo/PATO_0000383,57 year,diabetic nephropathy,http://www.ebi.ac.uk/efo/EFO_0000401,"hemoglobin A1c 9.7 mmol/mol, glomerular filtra...",radical nephrectomy,diabetic nephropathy,http://www.ebi.ac.uk/efo/EFO_0000401
SAMN11878523-AAACCTGAGTTTAGGA,Diabetes_3,female,http://purl.obolibrary.org/obo/PATO_0000383,57 year,diabetic nephropathy,http://www.ebi.ac.uk/efo/EFO_0000401,"hemoglobin A1c 9.7 mmol/mol, glomerular filtra...",radical nephrectomy,diabetic nephropathy,http://www.ebi.ac.uk/efo/EFO_0000401
SAMN11878523-AAACCTGCAGAGTGTG,Diabetes_3,female,http://purl.obolibrary.org/obo/PATO_0000383,57 year,diabetic nephropathy,http://www.ebi.ac.uk/efo/EFO_0000401,"hemoglobin A1c 9.7 mmol/mol, glomerular filtra...",radical nephrectomy,diabetic nephropathy,http://www.ebi.ac.uk/efo/EFO_0000401
SAMN11878523-AAACCTGCAGCCTGTG,Diabetes_3,female,http://purl.obolibrary.org/obo/PATO_0000383,57 year,diabetic nephropathy,http://www.ebi.ac.uk/efo/EFO_0000401,"hemoglobin A1c 9.7 mmol/mol, glomerular filtra...",radical nephrectomy,diabetic nephropathy,http://www.ebi.ac.uk/efo/EFO_0000401


In [4]:
adata.var.head()

ENSG00000000003
ENSG00000000005
ENSG00000000419
ENSG00000000457
ENSG00000000460


# Preprocess data

In [5]:
# preprocessing
print(adata.shape)
sc.pp.filter_cells(adata, min_genes=int(adata.X.shape[1])*0.01)
print(adata.shape)
sc.pp.filter_genes(adata, min_cells=int(adata.X.shape[0])*0.01)
print(adata.shape)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.scale(adata)

adata.obs.groupby('Factor Value[disease]')['Sample Characteristic[individual]'].value_counts()

(28348, 39846)
(28274, 39846)
(28274, 17902)


Factor Value[disease]  Sample Characteristic[individual]
diabetic nephropathy   Diabetes_1                           5621
                       Diabetes_3                           4052
                       Diabetes_2                            243
                       Control_1                               0
                       Control_2                               0
                       Control_3                               0
normal                 Control_1                            7116
                       Control_3                            6575
                       Control_2                            4667
                       Diabetes_1                              0
                       Diabetes_2                              0
                       Diabetes_3                              0
Name: Sample Characteristic[individual], dtype: int64

In [6]:
adata.obs['disease'] = pd.Categorical(adata.obs['Factor Value[disease]'], categories=["diabetic nephropathy", "normal"], ordered=True)
indices = adata.obs.sort_values(['Factor Value[disease]', 'disease']).index
adata = adata[indices,:]

# Focus on the most prevalent cell type

In [7]:
bdata = adata[adata.obs["Sample Characteristic[sex]"] == "male"].copy()# female
bdata

AnnData object with n_obs × n_vars = 17647 × 17902
    obs: 'Sample Characteristic[individual]', 'Sample Characteristic[sex]', 'Sample Characteristic Ontology Term[sex]', 'Sample Characteristic[age]', 'Sample Characteristic[disease]', 'Sample Characteristic Ontology Term[disease]', 'Sample Characteristic[clinical information]', 'Sample Characteristic[clinical treatment]', 'Factor Value[disease]', 'Factor Value Ontology Term[disease]', 'n_genes', 'disease'
    var: 'n_cells', 'mean', 'std'
    uns: 'log1p'

In [8]:
from collections import Counter
Counter(bdata.obs["Factor Value[disease]"])

Counter({'diabetic nephropathy': 5864, 'normal': 11783})

In [9]:
4925/(4925+6025)*100

44.97716894977169

In [10]:
bdata.shape

(17647, 17902)

# Map gene names

In [11]:
mapper = pd.read_csv('data/mart_export.txt',index_col=0).fillna(-1)["Gene name"].to_dict()

In [12]:
new_indexes = []
for i in bdata.var.index:
    if i in mapper and mapper[i]!="":
        new_indexes.append(mapper[i])
    else:
        new_indexes.append(-1)

In [13]:
bdata.var.index = new_indexes
bdata = bdata[:,bdata.var.index != -1]

In [14]:
bdata.shape

(17647, 15481)

In [15]:
bdata.var_names_make_unique()

  utils.warn_names_duplicates("var")


In [16]:
bdata.to_df()

Unnamed: 0,TSPAN6,DPM1,SCYL3,FIRRM,CFH,FUCA2,GCLC,NFYA,STPG1,NIPAL3,...,LSP1P5,RPSA2,LINC03112,LINC02256-1,ZNF496-DT,LINC03063,CTAGE8,XNDC1N-ZNF705EP-ALG1L9P,TALAM1,FAM95A
SAMN11878524-AAACGGGAGACATAAC,-0.146593,-0.359668,-0.383372,-0.281666,-0.189161,-0.199349,1.369002,-0.244033,-0.20523,1.040997,...,-0.139584,0.300692,1.228590,-0.311402,-0.210439,-0.086515,-0.127619,-0.490919,0.532566,-0.157443
SAMN11878524-AAACGGGGTTCCGTCT,-0.146593,-0.359668,1.442294,-0.281666,-0.189161,-0.199349,0.815460,-0.244033,-0.20523,-0.286242,...,-0.139584,1.045291,1.427017,-0.311402,-0.210439,-0.086515,-0.127619,-0.490919,0.509876,-0.157443
SAMN11878524-AAAGTAGTCGCGTTTC,2.392723,-0.359668,-0.383372,-0.281666,-0.189161,-0.199349,-0.337752,1.492125,-0.20523,1.177733,...,-0.139584,1.459011,-0.243747,-0.311402,-0.210439,-0.086515,-0.127619,-0.490919,0.480276,-0.157443
SAMN11878524-AAATGCCGTTGACGTT,-0.146593,-0.359668,-0.383372,-0.281666,-0.189161,-0.199349,0.860431,-0.244033,-0.20523,1.282571,...,5.659317,-0.515680,-0.243747,-0.311402,-0.210439,-0.086515,-0.127619,-0.490919,0.246965,1.858150
SAMN11878524-AACACGTTCGCGCCAA,-0.146593,-0.359668,0.075055,-0.281666,2.690777,-0.199349,0.144064,2.517450,-0.20523,-0.286242,...,-0.139584,-0.515680,2.332839,-0.311402,0.927649,0.356021,-0.127619,1.753608,0.180038,-0.157443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMN11878528-TTTGTCATCACAGGCC,-0.146593,-0.359668,-0.383372,-0.281666,-0.189161,-0.199349,6.969540,-0.244033,-0.20523,-0.286242,...,-0.139584,-0.515680,-0.243747,-0.311402,-0.210439,-0.086515,-0.127619,3.418672,-1.197762,-0.157443
SAMN11878528-TTTGTCATCACCAGGC,-0.146593,-0.359668,-0.383372,-0.281666,-0.189161,-0.199349,2.086680,-0.244033,-0.20523,-0.286242,...,-0.139584,-0.515680,-0.243747,-0.311402,-0.210439,-0.086515,-0.127619,-0.490919,0.950753,-0.157443
SAMN11878528-TTTGTCATCACCGTAA,-0.146593,-0.359668,-0.383372,-0.281666,-0.189161,-0.199349,-0.337752,-0.244033,-0.20523,-0.286242,...,-0.139584,-0.515680,-0.243747,-0.311402,-0.210439,-0.086515,-0.127619,-0.490919,1.354196,-0.157443
SAMN11878528-TTTGTCATCCTACAGA,-0.146593,-0.359668,-0.383372,-0.281666,-0.189161,-0.199349,-0.337752,-0.244033,-0.20523,-0.286242,...,-0.139584,-0.515680,3.242243,-0.311402,-0.210439,-0.086515,-0.127619,-0.490919,1.139663,-0.157443


# Run GSEA

In [17]:
gs = gp.get_library("GO_Biological_Process_2021")


In [18]:
import time
t1 = time.time()
res = gp.gsea(data=bdata.to_df().T,
        gene_sets=gs,
        cls=bdata.obs.disease,
        permutation_num=1000,
        outdir=None,
        threads= 8)
t2=time.time()
print(t2-t1)

418.13077998161316


In [19]:
res.res2d.head(5)

Unnamed: 0,Name,Term,ES,NES,NOM p-val,FDR q-val,FWER p-val,Tag %,Gene %,Lead_genes
0,gsea,T cell receptor signaling pathway (GO:0050852),0.551565,2.378612,0.0,0.0,0.0,69/126,28.94%,PDE4D;PSME4;PTPRC;HLA-DRB5;HLA-DRB1;FBXW11;SPP...
1,gsea,lymphocyte differentiation (GO:0030098),0.642425,2.341168,0.0,0.0,0.0,23/42,22.03%,KLF6;SOX4;ADAM17;PTPRC;RELB;VCAM1;ITGB1;PLCG2;...
2,gsea,regulation of interleukin-2 production (GO:003...,0.688742,2.331517,0.0,0.0,0.0,12/33,9.00%,PDE4D;RUNX1;ANXA1;PTPRC;SPTBN1;PLCG2;PDE4B;EZR...
3,gsea,response to molecule of bacterial origin (GO:0...,0.626404,2.330823,0.0,0.0,0.0,16/44,10.37%,TRIB1;C4B;CYP27B1;CYRIB;ADAM17;FER;RPS6KA3;GCH...
4,gsea,positive regulation of ubiquitin-dependent pro...,0.58516,2.322724,0.0,0.0,0.0,27/70,21.89%,ARIH1;FBXW7;TRIB1;RNF19A;GSK3B;CSNK1A1;SMURF1;...


In [20]:
sum(res.res2d["NOM p-val"]<0.011)/len(gs)*100

14.136559496188267

# Test GSHAPA

In [21]:
gs = gp.get_library("GO_Biological_Process_2021")#KEGG_2021_Human


In [22]:
from gshapa.gshapa import GSHAPA3
from sklearn.ensemble import RandomForestClassifier
import time

In [23]:
t1 = time.time()

# Define parameters
MODEL = RandomForestClassifier(random_state=33,
                               max_features="sqrt",
                               n_estimators=1000,
                               n_jobs=-1)
                               
                               
                               
TEST_SET_SIZE = 0.2

# Build TreeSHAPGSEALite object
tsl = GSHAPA3(MODEL,TEST_SET_SIZE,random_state=33)

# Fit model
tsl.fit(bdata.to_df(), bdata.obs["disease"])

# Explain each gene contribution
X_explain = tsl.X_test[tsl.y_test=="diabetic nephropathy"]
tsl.shap(X_explain)

# Compute pvalues
res2 = tsl.explain_gene_sets(gs,'diabetic nephropathy',n_tests=1000)

t2=time.time()
print(t2-t1)

VBox(children=(  0%|          | 0/6034 [00:00<?, ?it/s],))

273.0234360694885


In [24]:
from sklearn.metrics import roc_auc_score
roc_auc_score(tsl.y_test, tsl.model.predict_proba(tsl.X_test)[:,1])

0.9898257028365199

In [25]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(tsl.y_test, tsl.model.predict(tsl.X_test))

0.8284343565320835

In [26]:
from sklearn.metrics import classification_report
print(classification_report(tsl.y_test, tsl.model.predict(tsl.X_test)))

                      precision    recall  f1-score   support

diabetic nephropathy       1.00      0.66      0.79      1173
              normal       0.85      1.00      0.92      2357

            accuracy                           0.89      3530
           macro avg       0.93      0.83      0.86      3530
        weighted avg       0.90      0.89      0.88      3530



In [27]:
sum(res2["P-values"]<0.011)/len(gs)*100

2.5853496851176665

# Compare both methods

In [28]:
pval_gsea= res.res2d["NOM p-val"]
pval_gsea.index = res.res2d["Term"]
pval_treeshap = res2["P-values"].loc[pval_gsea.index]

pvals = pd.DataFrame({"pval_gsea":pval_gsea,"pval_gshapa":pval_treeshap})

In [29]:
print(pd.crosstab(pvals["pval_gsea"]<0.011, pvals["pval_gshapa"]<0.011).to_latex())

\begin{tabular}{lrr}
\toprule
pval\_gshapa &  False &  True  \\
pval\_gsea &        &        \\
\midrule
False     &   1400 &     29 \\
True      &    823 &     30 \\
\bottomrule
\end{tabular}



In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.


In [30]:
from scipy.stats import fisher_exact
fisher_exact(pd.crosstab(pvals["pval_gsea"]<0.011, pvals["pval_gshapa"]<0.011))

SignificanceResult(statistic=1.7597519587715256, pvalue=0.039951580623442826)

In [31]:
pvals.sort_values("pval_gshapa")[:10]

Unnamed: 0_level_0,pval_gsea,pval_gshapa
Term,Unnamed: 1_level_1,Unnamed: 2_level_1
canonical glycolysis (GO:0061621),0.00578,0.0
hemopoiesis (GO:0030097),0.023438,0.0
negative regulation of neurogenesis (GO:0050768),0.0,0.0
regulation of T cell proliferation (GO:0042129),0.0,0.0
pore complex assembly (GO:0046931),0.0,0.0
glycolytic process through glucose-6-phosphate (GO:0061620),0.001931,0.0
negative regulation of axonogenesis (GO:0050771),0.12069,0.0
negative regulation of amyloid precursor protein catabolic process (GO:1902992),0.107071,0.001
negative regulation of developmental growth (GO:0048640),0.022449,0.001
positive regulation of ERBB signaling pathway (GO:1901186),0.063241,0.001


# Check specific patients

In [32]:
bdata_test = bdata[tsl.X_test.index]
bdata_test_patients = bdata_test[bdata_test.obs["Sample Characteristic[disease]"]=='diabetic nephropathy']
individuals = list(set(bdata_test_patients.obs["Sample Characteristic[individual]"]))

In [33]:
individuals_tests = {}
for i in individuals:
    print(i)
    bdataexplain = bdata_test_patients[bdata_test_patients.obs["Sample Characteristic[individual]"] == i]
    tsl.shap(bdata_explain.to_df())
    res_i = tsl.explain_gene_sets(gs,'diabetic nephropathy',n_tests=1000)
    individuals_tests[i] = res_i
    

Diabetes_1


NameError: name 'bdata_explain' is not defined

In [None]:
p_values_patients = {}
for i in individuals_tests:
    res_i = individuals_tests[i]
    p_values_patients[i] = res_i["corrected P-values"]
p_values_patients = pd.DataFrame(p_values_patients)
scores = -np.log10(p_values_patients+1e-100)

In [None]:
significant_donors = scores[scores.sum(axis=1)>0]
significant_donors.columns = ["Donor "+c.split("_")[-1] for c in significant_donors.columns]

In [None]:
import seaborn as sns
sns.set(font_scale=1.4)
ax = sns.clustermap(significant_donors,figsize=(4,5),yticklabels=True,)
ax.cax.set_visible(False)