Differential Correlation across Ranked Samples
DCARS is a flexible statistical approach which uses local weighted correlations to build a powerful and robust statistical test to identify significant variation in levels of concordance across a ranking of samples. This has the potential to discover biologically informative relationships between genes across a variable of interest, such as survival outcome. Read more about DCARS in Bioinformatics.
# Install the development version from GitHub: # install.packages("devtools") devtools::install_github("shazanfar/DCARS") library(DCARS)
The following example calculates the DCARS test statistic using the skin cutaneous melanoma (SKCM) TCGA dataset, along survival ranking. First it does so for a single gene pair, SKP1 and SKP2, and then EIF3C and EIF5B. Then DCARS selects the top 10 gene pairs after calculating statistics across the STRING network.
data(STRING) data(SKCM) SKCM_rank = t(apply(SKCM,1,rank)) # highly significantly DCARS gene pair: SKP1 and SKP2 # calculates p-value based on permutation DCARS(SKCM_rank,"SKP1","SKP2",plot=TRUE) # extract only the test statistic DCARS(SKCM_rank,"SKP1","SKP2", extractTestStatisticOnly = TRUE) # plot scatterplot split by early, mid, late in the sample ranking plotColouredExpression(SKCM, genepair = c("SKP1","SKP2")) # plot ribbon plot of genes along sample ranking plotOrderedExpression(SKCM, gene = c("SKP1", "SKP2"), facet = FALSE) # not significantly DCARS gene pair: EIF3C and EIF5B # calculates p-value based on permutation DCARS(SKCM_rank,"EIF3C","EIF5B",plot=TRUE) # extract only the test statistic DCARS(SKCM_rank,"EIF3C","EIF5B", extractTestStatisticOnly = TRUE) # build weight matrix W = weightMatrix(ncol(SKCM_rank), type = "triangular", span = 0.5, plot = TRUE) # extract DCARS test statistics # should take about 30 seconds SKCM_stats = DCARSacrossNetwork(SKCM_rank,edgelist = STRING, W = W, extractTestStatisticOnly = TRUE, verbose = FALSE) sort(SKCM_stats,decreasing=TRUE)[1:10]
To calculate p-values associated with these tests quickly, we use permutation tests over a stratified sample of gene pairs
# first take a stratified sample of gene pairs sampleindices = stratifiedSample(SKCM_stats) # What niter is needed for Bonferroni-corrected p-value of 0.05? # Need to ensure that our choice of iterations is above this 'niter' value # i do this by selecting 110% of niter signif = 0.05 bonfSig = signif/length(SKCM_stats) niter = (length(SKCM_stats)*(1/signif)) / length(sampleindices) niter # calculate a large number of permutation statistics from these stratified sample pairs # this should take about 2 minutes permstats = DCARSacrossNetwork(SKCM_rank, edgelist = STRING[sampleindices,], W = W, niter = round(1.1*niter), verbose = TRUE, extractPermutationTestStatistics = TRUE) permstatsVector = unlist(permstats) permpvals = sapply(SKCM_stats, function(x) mean(permstatsVector >= x)) permpvals[permpvals==0] <- min(permpvals[permpvals!=0])/2 # plot DCARS statistic against estimated (unadjusted) p-value # note if there are not enough iterations then the top-most may flatten out plot(SKCM_stats, -log10(permpvals), col = "red", pch = 16) # add bonferroni adjusted line abline(h = -log10(bonfSig), lty = 2, col = "red") # highlight FDR adjusted points permpvals_FDR = p.adjust(permpvals, method = "BH") FDR_sig = permpvals_FDR < 0.05 sum(permpvals_FDR < 0.05) points(SKCM_stats[FDR_sig], -log10(permpvals)[FDR_sig], col = "blue", pch = 16, cex = 1.2) # What are the most significant (FDR < 0.05) edges? SKCM_signif_edges_FDR = STRING[FDR_sig,] SKCM_signif_edges_FDR
Thresholding on unadjusted DCARS p-values (P-value < 0.05) to explore an unweighted network
# extract these significant edges SKCM_signif_edges = STRING[permpvals < 0.05,] # and graph them as a network library(igraph) SKCM_signif_graph = graph.edgelist(SKCM_signif_edges, directed = FALSE) V(SKCM_signif_graph)$size = 0.01 V(SKCM_signif_graph)$label.cex = 0.6 plot(SKCM_signif_graph) # plot network with pathway information overlaid # pathway information downloaded from MSigDB REACTOME pathways plotNetworkPathway(SKCM_signif_graph) # extract network communities and plot cm = walktrap.community(SKCM_signif_graph) plot(cm, SKCM_signif_graph) plot(induced_subgraph(SKCM_signif_graph, V(SKCM_signif_graph)[cm$membership==1])) plot(induced_subgraph(SKCM_signif_graph, V(SKCM_signif_graph)[cm$membership==2])) plot(induced_subgraph(SKCM_signif_graph, V(SKCM_signif_graph)[cm$membership==3])) plot(induced_subgraph(SKCM_signif_graph, V(SKCM_signif_graph)[cm$membership==4]))
Treating DCARS statistics as a network edge weight
library(igraph) SKCM_graph = graph.edgelist(STRING, directed = FALSE) E(SKCM_graph)$weight = SKCM_stats E(SKCM_graph)$pvalweight = -log10(permpvals) # Graph the Ego network associated with SKP1 with plotEgoNetwork(hubnode = c("SKP1"), network = SKCM_graph, weight = "weight") # only display unadjusted P < 0.05 significant pairs plotEgoNetwork(hubnode = c("SKP1"), network = SKCM_graph, weight = "pvalweight", subset = TRUE, thresh = -log10(0.05)) SKCM_meanStrength = strength(SKCM_graph, weights = E(SKCM_graph)$weight)/degree(SKCM_graph) topNodes = names(degree(SKCM_graph))[order(SKCM_meanStrength, decreasing = TRUE)[1:20]] plot(degree(SKCM_graph), SKCM_meanStrength) text(degree(SKCM_graph)[topNodes], SKCM_meanStrength[topNodes], names(degree(SKCM_graph)[topNodes])) library(ggplot2) library(ggrepel) df = data.frame(node = names(degree(SKCM_graph)), degree = degree(SKCM_graph), meanStrength = SKCM_meanStrength, topNodes = (names(degree(SKCM_graph)) %in% topNodes)) g = ggplot(df, aes(x = degree, y = meanStrength)) + geom_point() + geom_text_repel(data = subset(df,topNodes==TRUE), aes(label = node)) + theme_minimal() + NULL g # this shows that the strongest genes are IMPDH1 and SKP2, these genes have the highest mean edge weights
Examine the weighted correlation vector for given genepairs. e.g. we observe a positive association between SKP1 and SKP2, which deteriorates along the sample rankings.
# calculate the weighted correlation vectors across the network SKCM_wcor = t(DCARSacrossNetwork(SKCM_rank, edgelist = STRING, W = W, verbose = FALSE, extractWcorSequenceOnly = TRUE)) plotWCorLine(SKCM_wcor, gene = "SKP1")
Examine the significant weighted correlations in terms of clustered pathways
SKCM_signif_wcor = t(DCARSacrossNetwork(SKCM_rank, edgelist = SKCM_signif_edges, W = W, verbose = FALSE, extractWcorSequenceOnly = TRUE)) plotWcorsClusterPathway(SKCM_signif_wcor,cluster = TRUE)
- Shila Ghazanfar - @shazanfar