In [22]:
library('SGL')
library('grpregOverlap')
library('MASS')
library('glmnet')
library('parallel')

In [23]:
# sprintf() template for the simulated regression data sets; %i is the data set index.
dataFileBase <- "/homes/gws/psturm/simulatedData/regressionData/df%i.csv"

In [24]:
# Path of the data set with index 0, used for the single-run exploration below.
dataFile <- sprintf(fmt = dataFileBase, 0)

In [25]:
# Problem dimensions used to label and slice the transposed table.
numPheno <- 100
numGenes <- 1000
numSamples <- 100

# read.csv() already yields a data.frame, so no further coercion is needed.
df <- read.csv(dataFile, header = TRUE)

# Transpose, then label the columns: phenotypes p1..pN first, genes g1..gM after.
# NOTE(review): assumes the CSV has one row per phenotype/gene -- confirm.
df_t <- as.data.frame(t(df))
names(df_t) <- c(
    sprintf('p%d', seq_len(numPheno)),
    sprintf('g%d', seq_len(numGenes))
)

# Keep only the first numSamples rows of the transposed table.
data_mat <- df_t[seq_len(numSamples), ]

# Group label of every gene: the gene_group column without its leading
# numPheno (phenotype) entries.
group_index <- unlist(df[-seq_len(numPheno), 'gene_group'])

# Response is the first phenotype; predictors are all gene columns.
y <- data_mat[['p1']]
x <- data.matrix(data_mat[, sprintf('g%d', seq_len(numGenes))])

In [26]:
# Reference coefficients for the genes: coeff0 without its leading numPheno entries.
true_y_coeff <- df$coeff0[-seq_len(numPheno)]
# Positions of the genes whose reference coefficient is non-zero.
phenotype_genes <- which(true_y_coeff != 0)

In [27]:
# Cumulative number of reference hits among the selected coefficients.
#
# coeffs:          numeric vector of fitted coefficients (one per feature).
# reference_index: positions of the features regarded as true hits.
#
# The non-zero coefficients are ranked by decreasing absolute value; element k
# of the result is how many of the top-k ranked features are in
# reference_index. The result has length zero when no coefficient is non-zero.
getCounts <- function(coeffs, reference_index) {
    nonzero_idx <- which(coeffs != 0)
    ranking <- order(abs(coeffs[nonzero_idx]), decreasing = TRUE)
    ranked_idx <- nonzero_idx[ranking]
    is_hit <- ranked_idx %in% reference_index
    cumsum(is_hit)
}

In [28]:
# Elastic net fit at one fixed penalty strength (equal L1/L2 mix).
elasticnet <- glmnet(
    x = x,
    y = y,
    alpha = 0.5,
    lambda = 0.0160776944350486
)
# Coefficients at the fitted lambda as a plain matrix; the first entry
# (the intercept in glmnet's coefficient output) is dropped.
coef1 <- as.matrix(
    coef(elasticnet, s = elasticnet$lambda)
)[-1]
# Cumulative reference-hit counts for the elastic-net selection.
total_count <- getCounts(coef1, phenotype_genes)

In [29]:
#total_count <- if (length(total_count) < numGenes) c(total_count, integer(numGenes - length(total_count)) + total_count[length(total_count)]) else total_count

In [30]:
# Sparse group lasso on the same design, grouped by group_index, at a single
# fixed penalty value.
sparse_gl <- SGL(
    data = list(x = x, y = y),
    index = group_index,
    type = "linear",
    alpha = 0.5,
    lambda = 0.01
)
# Fitted coefficients of the sparse group lasso model.
coef_sgl <- sparse_gl[['beta']]

In [31]:
# Build the group list for grpregOverlap: one character vector of gene column
# names ('g<k>') per distinct group id found in group_index.
group_df <- lapply(unique(group_index), function(o, gi) {
    # Select the genes belonging to the current group id `o`. Comparing against
    # the literal 0 here would give every group the same members.
    paste0('g', which(gi == o))
},
group_index)
# Overlapping group lasso at a single fixed lambda.
overlap_gl <- grpregOverlap(x, y, group_df, penalty="grLasso", alpha=1, lambda=0.1)
# Drop the first entry of the coefficient matrix (the intercept) so positions
# line up with the gene columns.
coef_overlap <- as.matrix(overlap_gl$beta)[-1]

In [None]:
# Highest replicate index; data sets are numbered 0..numReps (should be 99,
# i.e. 100 replicates).
numReps <- 99
# For every replicate data set, fit the three penalised regressions and record
# the cumulative reference-hit counts (see getCounts) of each method.
# Returns a list with one element per replicate; each element is a list with
# entries elastic_net, sparse_gl and overlap_gl.
cumulative_overlap_counts <- lapply(0:numReps, function(i) {
    overlap_counts <- list()

    # Load replicate i. Using a constant index here would make every
    # replicate read the same file and produce identical results.
    dataFile <- sprintf(dataFileBase, i)
    df <- read.csv(dataFile, header=TRUE)
    df_t <- as.data.frame(t(df))
    colnames(df_t) <- c(paste0('p', 1:numPheno), paste0('g', 1:numGenes))
    data_mat <- df_t[1:numSamples, ]

    # Group label of every gene, and the matching list of gene-name vectors
    # (one per distinct group id) required by grpregOverlap.
    group_index <- unlist(df['gene_group'][-(1:numPheno), ])
    group_df <- lapply(unique(group_index), function(o, gi) {
        # Members of the current group id `o`.
        paste0('g', which(gi == o))
    }, group_index)

    # Response is the first phenotype; predictors are all gene columns.
    y <- data_mat$p1
    x <- data.matrix(data_mat[, paste0('g', 1:numGenes)])

    # Reference hits: genes with a non-zero entry in coeff0.
    # NOTE(review): coeff0 is assumed to be the true coefficient column for
    # phenotype p1 in every replicate file -- confirm against the simulator.
    true_y_coeff <- df$coeff0[-(1:numPheno)]
    phenotype_genes <- which(true_y_coeff != 0)

#   ELASTIC NET
    elasticnet <- glmnet(x, y, alpha = 0.5, lambda = 0.0160776944350486)
    # [-1] drops the intercept entry.
    coef_enet <- as.matrix(coef(elasticnet, s=elasticnet$lambda))[-1]
    overlap_counts$elastic_net <- getCounts(coef_enet, phenotype_genes)

#   SPARSE GROUP LASSO
    sparse_gl <- SGL(data=list(x=x, y=y), index=group_index, lambda = 0.01, type="linear", alpha=0.5)
    coef_sgl  <- sparse_gl$beta
    overlap_counts$sparse_gl <- getCounts(coef_sgl, phenotype_genes)

#   OVERLAPPING GROUP LASSO
    overlap_gl <- grpregOverlap(x, y, group_df, penalty="grLasso", alpha=1, lambda=0.1)
    # [-1] drops the intercept entry.
    coef_overlap <- as.matrix(overlap_gl$beta)[-1]
    overlap_counts$overlap_gl <- getCounts(coef_overlap, phenotype_genes)

    overlap_counts
})

In [None]:
# Collect one method's per-run count vectors into a data frame.
#
# counts: list with one element per run; each element is a list that holds a
#         count vector under the entry `name`.
# name:   entry to extract from every run (e.g. 'elastic_net').
#
# Returns a data frame with one column per run, named run1, run2, ...
# Runs shorter than the longest one are padded at the end by repeating their
# own final value (a cumulative count stays flat once selection stops); a run
# with no values at all is padded with zeros.
df_from_counts <- function(counts, name) {
    per_run <- lapply(counts, `[[`, name)
    maxlength <- max(lengths(per_run))
    padded <- lapply(per_run, function(o) {
        deficit <- maxlength - length(o)
        if (deficit <= 0) {
            return(o)
        }
        # Pad with this run's own last value, never with another vector.
        last_value <- if (length(o) > 0) o[length(o)] else 0L
        c(o, rep(last_value, deficit))
    })
    result <- as.data.frame(do.call('cbind', padded))
    colnames(result) <- paste0('run', seq_along(result))
    result
}

In [None]:
# One data frame per method: rows are ranks, columns are runs.
elastic_net_df <- df_from_counts(counts = cumulative_overlap_counts, name = "elastic_net")
sparse_gl_df <- df_from_counts(counts = cumulative_overlap_counts, name = "sparse_gl")
overlap_gl_df <- df_from_counts(counts = cumulative_overlap_counts, name = "overlap_gl")

In [None]:
# Write each method's table to its own CSV file (paths are relative to the
# working directory), without row names.
baseSaveDir <- "DataFrames/mvRegression/%s"
invisible(Map(
    function(frame, file_name) {
        write.csv(frame, file = sprintf(baseSaveDir, file_name), row.names = FALSE)
    },
    list(elastic_net_df, sparse_gl_df, overlap_gl_df),
    c("elastic_net.csv", "sparse_gl.csv", "overlap_gl.csv")
))