/
progenySuppFunc.r
210 lines (195 loc) · 8.95 KB
/
progenySuppFunc.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#' Calculate Progeny Scores with Permutations
#'
#'This function generate a series of scatter plot with marginal distribution
#'(in the form of an arrangeGrob object), for each progeny pathway and
#'sample/contrast. Each scatter plot has progeny weights as x-axis and the gene
#'level stat used to compute progeny score as the y-axis. The marginal
#'distribution of the gene level stats is displayed on the right of the plot
#'to give visual support of the significance of each gene contributing to
#'the progeny pathway score. The green and red colors represent the positive
#'and negative contribution of genes to the progeny pathway, respectively.
#'For each gene contribution, 4 cases are possible, as the combinations of
#'the sign of the gene level stat and the sign of the gene level weight.
#'Positive weight will lead to a positive(green)/negative(red) gene contribution
#'if the gene level stat is positive/negative. Negative weight will lead to
#'a negative(red)/positive(green) gene contribution if the gene level stat
#'is positive/negative.
#'
#'@param df an n*m data frame, where n is the number of omic features (genes).
#'m isn't really important, as long as at least one column corresponds to a
#'sample or contrast statistic. One of the columns should correspond to the gene
#'symbols.
#'@param weight_matrix A progeny coefficient matrix. the first column should be
#'the identifiers of the omic features, and should be coherent with
#'the identifiers provided in df.
#'@param dfID an integer corresponding to the column number of
#'the gene identifiers of df.
#'@param weightID an integer corresponding to the column number of the gene
#'identifiers of the weight matrix.
#'@param statName The name of the stat used, to be displayed on the plot
#'@param verbose Logical indicating whether we want to have the messages
#'indicating the different computed weights.
#'@importFrom stats ecdf
#'@import ggplot2
#'@import ggrepel
#'@import gridExtra
#'@return The function returns a list of list of arrangeGrob objects.
#'The first level list elements correspond to samples/contrasts.
#'The second level correspond to pathways.
#'The plots can be saved in a pdf format using the saveProgenyPlots function.
#'@examples
#' # use example gene expression matrix
#'
#' gene_expression <- read.csv(system.file("extdata",
#' "human_input.csv", package = "progeny"))
#'
#' # getting a model matrix with 100 top significant genes and converting to df
#' weight_matrix <- getModel("Human", top=100)
#' weight_matrix <- data.frame(names = row.names(weight_matrix),
#' row.names = NULL, weight_matrix)
#'
#' #use progenyScatter function
#' plots <- progenyScatter(gene_expression, weight_matrix)
#'@export
progenyScatter <- function(df,weight_matrix,dfID = 1, weightID = 1,
statName = "gene stats", verbose = FALSE) {
weight <- color <- ID <- NULL
plot_list_contrasts <- list(0)
for (i in 2:length(df[1,])) {
plot_list_pathways <- list(0)
for (j in 2:length(weight_matrix[1,])) {
sub_df <- df[,c(dfID,i)]
pathway_weights <- weight_matrix[,c(weightID,j)]
names(sub_df) <- c("ID","stat")
minstat <- min(sub_df$stat)
maxstat <- max(sub_df$stat)
histo <- ggplot(sub_df, aes(x = stat, fill = "blue")) +
geom_density() + coord_flip() +
scale_fill_manual(values = c("#00c5ff")) +
xlim(minstat, maxstat) + theme_minimal() +
theme(legend.position = "none", axis.text.x = element_blank(),
axis.ticks.x = element_blank(), axis.title.y = element_blank(),
axis.text.y = element_blank(), axis.ticks.y = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
names(pathway_weights) <- c("ID","weight")
pathway_weights <- pathway_weights[pathway_weights$weight != 0,]
percentile <- ecdf(sub_df$stat)
sub_df <- merge(sub_df,pathway_weights,by = "ID")
sub_df$color <- "3"
sub_df[(sub_df$weight > 0 & sub_df$stat > 0),"color"] <- "1"
sub_df[(sub_df$weight > 0 & sub_df$stat < 0),"color"] <- "2"
sub_df[(sub_df$weight < 0 & sub_df$stat > 0),"color"] <- "2"
sub_df[(sub_df$weight < 0 & sub_df$stat < 0),"color"] <- "1"
sub_df[(percentile(sub_df$stat) < .95 &
percentile(sub_df$stat) > .05),1] <- NA
if (verbose){
message(paste("weights of ",names(weight_matrix)[j], sep = ""))
}
title <- paste("weights of ",names(weight_matrix)[j], sep = "")
scatterplot <- ggplot(sub_df, aes(x = weight, y = stat,
color = color)) + geom_point() +
scale_colour_manual(values = c("red","royalblue3","grey")) +
geom_label_repel(aes(label = ID)) +
ylim(minstat, maxstat) + theme_minimal() +
theme(legend.position = "none") +
geom_vline(xintercept = 0, linetype = 'dotted') +
geom_hline(yintercept = 0, linetype = 'dotted') +
labs(x = title, y = statName)
lay <- t(as.matrix(c(1,1,1,1,2)))
gg <- arrangeGrob(scatterplot, histo, nrow = 1, ncol = 2,
layout_matrix = lay)
plot_list_pathways[[j-1]] <- gg
}
names(plot_list_pathways) <- names(weight_matrix[,-weightID])
plot_list_contrasts[[i-1]] <- plot_list_pathways
}
return(plot_list_contrasts)
}
#'Function to save Progeny plots
#'
#'This function is designed to save the plots (in pdf format) of a nested
#'(2 level) list of arrangeGrob objects, such as the one returned by
#'the progenyScatter function.
#'
#'@param plots a list of list of arrangeGrob object (such as the one returned
#'by the progenyScatter function.).The first level list elements correspond
#'to samples/contrasts. The second level corresponds to pathways.
#'The plots can be saved in a pdf format using the saveProgenyPlots function.
#'@param contrast_names a vector of the same length as the first level of
#'the plot list corresponding to the names of each sample/contrast
#'@param dirpath the path to the directory where the plots should be saved
#'@import ggplot2
#'@examples
#' #create plots using progneyScatter function
#' gene_expression <- read.csv(system.file("extdata",
#' "human_input.csv", package = "progeny"))
#'
#' # getting a weight_matrix
#' weight_matrix <- getModel("Human", top=100)
#' weight_matrix <- data.frame(names = row.names(weight_matrix),
#' row.names = NULL, weight_matrix)
#' plots <- progenyScatter(gene_expression, weight_matrix)
#'
#' #create a list with contrast names
#' contrast_names <- names(gene_expression[2:ncol(gene_expression)])
#'
#' #assign a path to store your plots
#' dirpath <- "./progeny_plots/"
#'
#' # save it
#' # saveProgenyPlots(plots, contrast_names, dirpath)
#' @return This function produces the pdf files of plots taken from the
#' progenyScatter function
#'@export
saveProgenyPlots <- function(plots, contrast_names, dirpath) {
i <- 1
for (condition in plots) {
dirname <- paste(dirpath,contrast_names[i], sep = "")
dir.create(dirname, recursive = TRUE, showWarnings = FALSE)
j <- 1
for (pathway in condition) {
filename <- paste(dirname,names(condition)[j],sep = "/")
filename <- paste(filename,".pdf",sep = "")
ggsave(filename, pathway,device = "pdf", dpi = 300)
j <- j+1
}
i <- i+1
}
}
#'Returns the Progeny Model
#'
#'This function is designed for getting a model matrix with top significant
#'genes for each pathway
#'
#'@param organism "Human" or "Mouse" taken from the main function's argument.
#'Default to "Human"
#'@param top Desired top number of genes for each pathway according to their
#'significance(p.value). Default to 100
#'@examples #getting a model matrix according to the desired top n significant
#'model <- getModel("Human", top=100)
#'@return This function returns model matrix according to the top n significant
#'@importFrom dplyr group_by top_n ungroup select
#'@importFrom tidyr spread %>%
#'@export
getModel <- function(organism = "Human", top= 100) {
pathway <- p.value <- weight <- NULL
if (organism == "Human") {
full_model <- get("model_human_full", envir = .GlobalEnv)
} else if (organism == "Mouse") {
full_model <- get("model_mouse_full", envir = .GlobalEnv)
} else {
stop("Wrong organism name. Please specify 'Human' or 'Mouse'.")
}
if (!(is.numeric(top)) || top < 1){
stop("perm should be an integer value")
}
model <- full_model %>%
dplyr::group_by(pathway) %>%
dplyr::top_n(top, wt = -p.value) %>%
dplyr::ungroup(pathway) %>%
dplyr::select(-p.value) %>%
tidyr::spread(pathway, weight, fill=0) %>%
data.frame(row.names = 1, check.names = FALSE, stringsAsFactors = FALSE)
return(model)
}