# Armadillo ASE analysis (200219)
Simple model (common p for each quad for each SNP)

In [171]:
require(viridis)
require(ggplot2)
require(dplyr)
require(parallel)

set.seed(42)

In [172]:
load("data/exprs_all.Rdata")
load("data/metadata.Rdata")
#source("data/useful.r")
load("data/ase_ratios.test_train.Rdata")
#load("data/gene_annotations_v0.95.Rdata")
#load("data/armadillo.helper.Rdata")

In [173]:
# Variables 
n_quads = 5
n_times = 3 
n_qt = n_quads * n_times
n_q = 4 
n_samp = n_quads * n_q
r_samp = 1:n_samp 

# Labels 
quad = as.numeric(pData$Quad)
sex = as.numeric(pData$Sex)
lane = as.numeric(pData$Batches)
tlab = c("t1", "t2", "t3")
quads = unique(substr(pData$ID, 1,4))

labels = paste(sapply(1:n_quads, function(i) rep(quads[i],n_times)), tlab  ) 
sexlabels = unlist(lapply(1:n_quads, function(i) rep(unique(cbind(pData$Quad, pData$Sex) )[i,2],n_times)) ) 
timelabels = rep(1:n_times, n_quads)
quadlabels = unlist(lapply(1:n_quads, function(i) rep(unique(cbind(pData$Quad, pData$Sex) )[i,1],n_times))   ) 


In [174]:
exprs.all.filt = lapply( 1:5, function(i) exprs.all[[i]][!is.na(exprs.all[[i]][,1]),]) 
ratios = list() 
for(j in 1:5){ 
  X.temp = (exprs.all.filt[[j]][,7:18]) 
  nj = dim(X.temp)[1]/2
  ratios[[j]] = cbind((X.temp[(1:nj),1:4] / (X.temp[(1:nj)+nj,1:4] +X.temp[(1:nj),1:4] ) ),
                      (X.temp[(1:nj),5:8] / (X.temp[(1:nj)+nj,5:8] +X.temp[(1:nj),5:8] ) ),
                      (X.temp[(1:nj),9:12] / (X.temp[(1:nj)+nj,9:12] +X.temp[(1:nj),9:12] ) ))
  
}
density.plot = lapply(1:5, function(i) density( ratios[[i]][!is.na(ratios[[i]])] )  )




In [175]:
ase_cov = list()
for (i in 1:5) {
    X.temp = exprs.all.filt[[i]][,7:18]
    nj = dim(X.temp)[1]/2
    allele_a = X.temp[1:nj,]
    allele_b = X.temp[1:nj+nj,]
    ase_cov[[i]] = lapply(1:12, function(j) {
        df = data.frame(major=allele_a[,j], minor=allele_b[,j])
        df = cbind(df, cov=rowSums(df))
        df = cbind(df, ratio=allele_a[,j]/df$cov)
        return(df)
    })
}

summarize.one.pair.data.simple <- function(data) {
    tdf <- data %>% group_by(gene) %>% summarise(
       mean_cov = mean(cov),
       cor = min(cor),
       scor = min(scor),
       prob = sum(prob, na.rm=TRUE),
       pvalue  = sum(pvalue, na.rm=TRUE),
       prob_fixed = sum(prob_fixed, na.rm=TRUE),
        pvalue_fixed = sum(pvalue_fixed, na.rm=TRUE),
        p = min(p),
        p_fixed = min(p_fixed)
    )
    tdf <- cbind(tdf, exp_pvalue = exp(tdf$pvalue))
    tdf <- cbind(tdf, exp_pvalue_fixed = exp(tdf$pvalue_fixed))
    tdf <- cbind(tdf, exp_prob=exp(tdf$prob))
    tdf <- cbind(tdf, exp_prob_fixed=exp(tdf$prob_fixed))
    return(tdf)
}
summarize.one.data.simple <- function(data) {
    data[,'gene'] = as.numeric(as.character(data[,'gene']))
    tdf <- data %>% group_by(gene) %>% summarise(
       mean_cov = mean(cov),
       prob = sum(prob, na.rm=TRUE),
       pvalue  = sum(pvalue, na.rm=TRUE),
       prob_fixed = sum(prob_fixed, na.rm=TRUE),
       pvalue_fixed = sum(pvalue_fixed, na.rm=TRUE),
       p = min(p),
       p_fixed = min(p_fixed)      
    )
    return(tdf)
}

In [176]:
estimate.p.for.quad <- function(ase_cov, ident, time, g) {
    n = sum(unlist(lapply(1:4, function(x){return(ase_cov[[ident]][[time*4+x]][g, 3])})), na.rm=T)
    ny = sum(unlist(lapply(1:4, function(x){return(ase_cov[[ident]][[time*4+x]][g, 1])})), na.rm=T)
    return(ny/n)
}
estimate.p.for.quad.fixed <- function(ase_cov, ident, time, g) {
    return(mean(estimate.p.for.each(ase_cov, ident, time, g), na.rm=T))
}
estimate.p.for.each <- function(ase_cov, ident, time, g) {
    return(unlist(lapply(1:4, function(x){return(ase_cov[[ident]][[time*4+x]][g,4])})))
}

In [177]:
compute.prob <- function(major, cov, p) {
    return(unlist(sapply(1:4, function(x) {return(dbinom(major[x], cov[x], p, log=TRUE))})))
}
compute.pvalue <- function(major, cov, p) {
    return(unlist(sapply(1:4, function(x) {
                #if (cov[x] == 0) return(1);
                if (cov[x] == 0) return(0.5);
                if (major[x]/cov[x] < p) return(pbinom(major[x], cov[x], p, log=TRUE, lower.tail=TRUE))
                else return(pbinom(major[x], cov[x], p, log=TRUE, lower.tail=FALSE))
    })))
}


In [178]:
# add excess_prob independently
compute.simple.pvalue <- function(ase_cov, ident, time, gene_set=NULL, verbose=FALSE) {
    if (is.null(gene_set))
        gene_set = 1:dim(ase_cov[[ident]][[1]][1])[1]
    df <- do.call(rbind, lapply(gene_set, function(g) {
        x_cord <- unlist(lapply(1:4, function(x){return(ase_cov[[ident]][[time*4+x]][g,4])}))
        x_major <- unlist(lapply(1:4, function(x){return(ase_cov[[ident]][[time*4+x]][g,1])}))
        x_cov <- unlist(lapply(1:4, function(x){return(ase_cov[[ident]][[time*4+x]][g,3])}))
        if (sum(is.nan(x_cord)) >= 2)
            return(NULL)
        p <- estimate.p.for.quad(ase_cov, ident, time, g)
        p_fixed <- estimate.p.for.quad.fixed(ase_cov, ident, time, g)
        prob = compute.prob(x_major, x_cov, p)
        pvalue = compute.pvalue(x_major, x_cov, p)
        p_fixed <- estimate.p.for.quad.fixed(ase_cov, ident, time, g)
        prob_fixed = compute.prob(x_major, x_cov, p_fixed)
        pvalue_fixed = compute.pvalue(x_major, x_cov, p_fixed)
#         x_cov <- x_cov[order(x_cord)]
#         x_cord <- x_cord[order(x_cord)]
        suppressWarnings({
            temp = data.frame(x=x_cord, gene=g, cov=x_cov, 
                              prob=prob, pvalue=pvalue, prob_fixed=prob_fixed, pvalue_fixed=pvalue_fixed, p=rep(p, 4), p_fixed=rep(p_fixed, 4))
            temp = temp[order(x_cord),]
        })
        return(temp)
    }))
    stopifnot(typeof(df) != 'character')
    return(df)
}
compute.simple.pvalue.parallel <- function(ident) {
    result <- list()
    gene_set = sample(1:dim(ase_cov[[ident]][[1]][1])[1], 2000)
    gene_set = c(941, 583, 3602)
    result <- mclapply(0:2, function(x) {return(compute.simple.pvalue(ase_cov, ident, x))}, mc.cores=1)
    return(result)
}


In [179]:
rdf <- mclapply(1:5, compute.simple.pvalue.parallel, mc.cores=5)
saveRDS(rdf, 'simple_independent_result.rds')
# print(compute.simple.pvalue.parallel(1))
print(names(rdf))
print(length(rdf))
print(length(rdf[[1]]))
print(head(rdf[[1]][[1]]))
print(head(rdf[[5]][[1]]))

NULL
[1] 5
[1] 3
           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
3  0.2222222    1   9 -2.341022 -2.0618795  -2.164130   -1.8613140 0.4666667
2  0.4444444    1   9 -1.355322 -0.5420633  -1.346407   -0.4600742 0.4666667
4  0.5333333    1  15 -1.727874 -1.5201483  -1.825002   -1.7535983 0.4666667
1  0.5833333    1  12 -1.803462 -1.9959498  -1.931558   -2.2424771 0.4666667
11 0.3181818    2  22 -2.295162 -1.6101182  -2.305712   -1.6240151 0.4302326
41 0.3809524    2  21 -1.836914 -0.8890645  -1.841369   -0.8983318 0.4302326
     p_fixed
3  0.4458333
2  0.4458333
4  0.4458333
1  0.4458333
11 0.4312771
41 0.4312771
             x gene  cov      prob     pvalue prob_fixed pvalue_fixed
2  0.000000000    1  932 -2.852931 -2.8529311  -2.475721   -2.4757215
4  0.000000000    1  679 -2.078477 -2.0784766  -1.803664   -1.8036640
3  0.003436426    1  873 -1.514084 -1.2772353  -1.586807   -1.5910511
1  0.007174888    1 1115 -4.209643 -4.7908547  -4.894508   -5.6771184
21 

In [166]:
for (i in 1:5) {
    for (j in 1:3) {
        print(head(rdf[[i]][[j]]))
    }
}


           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
3  0.2222222    1   9 -2.341022 -2.0618795  -2.164130   -1.8613140 0.4666667
2  0.4444444    1   9 -1.355322 -0.5420633  -1.346407   -0.4600742 0.4666667
4  0.5333333    1  15 -1.727874 -0.2467698  -1.825002   -0.1901317 0.4666667
1  0.5833333    1  12 -1.803462 -0.1460489  -1.931558   -0.1122678 0.4666667
11 0.3181818    2  22 -2.295162 -1.6101182  -2.305712   -1.6240151 0.4302326
41 0.3809524    2  21 -1.836914 -0.8890645  -1.841369   -0.8983318 0.4302326
     p_fixed
3  0.4458333
2  0.4458333
4  0.4458333
1  0.4458333
11 0.4312771
41 0.4312771
           x gene cov      prob      pvalue prob_fixed pvalue_fixed         p
2  0.3181818    1  22 -3.447371 -2.98114939  -4.417393  -4.04339153 0.5147059
4  0.4000000    1  20 -2.245480 -1.55472488  -2.801312  -2.25184154 0.5147059
1  0.7692308    1  13 -3.154605 -0.01480632  -2.559613  -0.03436101 0.5147059
3  0.7692308    1  13 -3.154605 -0.01480632  -2.559613  -0

In [180]:
extract.gene.info <- function(genes, ident) {
    #print(head(exprs.all.filt[[ident]]))
    return(exprs.all.filt[[ident]][genes,1:5])
}

In [181]:
plot_histogram <- function(data, x, header, quant_value=-1) {
    png(paste0(header, '.png'))
    g <- ggplot(data, aes_string(x=x))+geom_histogram()+theme_bw()
    if (quant_value > 0) {
        q <- quantile(data[,x], quant_value)
        print(q)
        g <- g + geom_vline(xintercept=q, color='red', size=2) +
        geom_text(label = paste(quant_value, ':', q), x=q, y=-50)
    }
    plot(g) 
    dev.off()
}

In [182]:
all_df <- NULL
sig_df <- NULL
for (i in 1:5) {
    for (j in 1:3) {
        print(head(rdf[[i]][[j]]))
        tdf <- summarize.one.data.simple(rdf[[i]][[j]])
        new_df <- data.frame(gene=tdf$gene, prob=tdf$prob, pvalue=tdf$pvalue, prob_fixed=tdf$prob_fixed, pvalue_fixed=tdf$pvalue_fixed, mean_cov=tdf$mean_cov)
        for (val in c('prob', 'pvalue', 'pvalue_fixed', 'prob_fixed')) {
            if (any(val == c('prob', 'prob_fixed')))
                new_df[new_df[,val] < -50, val] = -50
            else
                new_df[new_df[,val] < -10, val] = -10 
        }
        plot_histogram(new_df, 'prob', paste0('hist_prob_each_null_', i, '_', j), quant_value=0.01)
        plot_histogram(new_df, 'pvalue', paste0('hist_pvalue_each_null_', i, '_', j), quant_value=0.01)
        new_df[,'pvalue'] = exp(tdf[,'pvalue'])
        plot_histogram(new_df, 'pvalue', paste0('hist_pvalue_each_null_', i, '_', j, '_exp'))
        tdf <- cbind(tdf, extract.gene.info(tdf$gene, i))
        tdf[,'ident'] = i
        tdf[,'time'] = j
        if (is.null(all_df)) all_df <- tdf
        else all_df <- rbind(all_df, tdf)
        new_sdf <- subset(new_df, new_df$pvalue < quantile(new_df$pvalue, 0.01))
        new_sdf <- cbind(new_sdf, extract.gene.info(new_sdf$gene, i))
        new_sdf <- cbind(new_sdf, data.frame(ident=rep(i, dim(new_sdf)[1]), time=rep(j, dim(new_sdf)[1])))
        if (is.null(sig_df)) sig_df <- new_sdf
        else sig_df <- rbind(sig_df, new_sdf)
            
    }
}

           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
3  0.2222222    1   9 -2.341022 -2.0618795  -2.164130   -1.8613140 0.4666667
2  0.4444444    1   9 -1.355322 -0.5420633  -1.346407   -0.4600742 0.4666667
4  0.5333333    1  15 -1.727874 -1.5201483  -1.825002   -1.7535983 0.4666667
1  0.5833333    1  12 -1.803462 -1.9959498  -1.931558   -2.2424771 0.4666667
11 0.3181818    2  22 -2.295162 -1.6101182  -2.305712   -1.6240151 0.4302326
41 0.3809524    2  21 -1.836914 -0.8890645  -1.841369   -0.8983318 0.4302326
     p_fixed
3  0.4458333
2  0.4458333
4  0.4458333
1  0.4458333
11 0.4312771
41 0.4312771
       1% 
-16.32892 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
2  0.3181818    1  22 -3.447371 -2.9811494  -4.417393    -4.043392 0.5147059
4  0.4000000    1  20 -2.245480 -1.5547249  -2.801312    -2.251842 0.5147059
1  0.7692308    1  13 -3.154605 -4.2200953  -2.559613    -3.387964 0.5147059
3  0.7692308    1  13 -3.154605 -4.2200953  -2.559613    -3.387964 0.5147059
21 0.2272727    2  22 -5.220454 -4.9310733  -6.478870    -6.241880 0.5061728
41 0.4666667    2  30 -2.026239 -0.9132975  -2.384143    -1.513907 0.5061728
     p_fixed
2  0.5641608
4  0.5641608
1  0.5641608
3  0.5641608
21 0.5532468
41 0.5532468
       1% 
-16.65352 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
3  0.3333333    1  12 -2.602680 -2.2236312  -2.680292   -2.3125222 0.5526316
1  0.4285714    1   7 -1.441334 -0.9515665  -1.467263   -0.9914065 0.5526316
4  0.7272727    1  11 -2.051683 -2.6961940  -1.997081   -2.6055910 0.5526316
2  0.7500000    1   8 -1.834923 -2.7327864  -1.789937   -2.6548506 0.5526316
41 0.3500000    2  20 -1.807569 -0.8999534  -1.832847   -0.9490956 0.4025974
21 0.3750000    2  16 -1.622353 -0.6566439  -1.633442   -0.6923252 0.4025974
     p_fixed
3  0.5597944
1  0.5597944
4  0.5597944
2  0.5597944
41 0.4081019
21 0.4081019
       1% 
-15.43038 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
2  0.4841270    1 126 -3.357472 -2.0067758  -3.245784   -1.8531903 0.5372263
1  0.5297297    1 185 -2.856451 -0.8045504  -2.839283   -0.7046047 0.5372263
3  0.5572917    1 192 -3.005079 -1.3289216  -3.079355   -1.4831153 0.5372263
4  0.5604396    1 182 -3.019603 -1.4206445  -3.099982   -1.5771072 0.5372263
21 0.4796748    2 123 -3.150529 -1.7332837  -3.038069   -1.5661860 0.5255172
11 0.5133690    2 187 -2.897625 -0.9232330  -2.859994   -0.7898201 0.5255172
     p_fixed
2  0.5328970
1  0.5328970
3  0.5328970
4  0.5328970
21 0.5202376
11 0.5202376
       1% 
-16.29455 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
4  0.4862385    1 109 -2.837769 -1.3332593  -2.951253   -1.5175344 0.5210526
1  0.5238095    1 105 -2.555624 -0.8225874  -2.557459   -0.6828132 0.5210526
2  0.5299145    1 117 -2.625652 -0.9452607  -2.608235   -0.8072670 0.5210526
3  0.5714286    1  49 -2.416912 -1.6181775  -2.354194   -1.4876392 0.5210526
21 0.4642857    2 112 -3.324694 -2.0338727  -3.444164   -2.1914224 0.5217391
31 0.5348837    2  43 -2.124703 -0.9848472  -2.116315   -0.9264095 0.5217391
     p_fixed
4  0.5278478
1  0.5278478
2  0.5278478
3  0.5278478
21 0.5261970
31 0.5261970
       1% 
-16.88029 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
3  0.4821429    1  56 -2.512775 -1.2982305  -2.501559   -1.2794711 0.5312500
1  0.5098039    1  51 -2.243443 -0.8374370  -2.239044   -0.8240923 0.5312500
2  0.5500000    1  60 -2.314591 -1.0844179  -2.319354   -1.1017869 0.5312500
4  0.5789474    1  57 -2.501049 -1.6257113  -2.512375   -1.6477053 0.5312500
11 0.4117647    2  51 -2.705482 -1.6636656  -2.619324   -1.5403366 0.4831933
31 0.4615385    2  52 -2.252178 -0.8399208  -2.228309   -0.7611362 0.4831933
     p_fixed
3  0.5302235
1  0.5302235
2  0.5302235
4  0.5302235
11 0.4770234
31 0.4770234
       1% 
-16.26572 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
4  0.5000000    1  72 -2.572720 -1.2046148  -2.562770   -1.1864119 0.5376884
2  0.5140187    1 107 -2.684489 -1.0610952  -2.675271   -1.0407552 0.5376884
1  0.5533333    1 150 -2.801057 -1.1357215  -2.810026   -1.1612021 0.5376884
3  0.5797101    1  69 -2.580141 -1.5786514  -2.591023   -1.6000075 0.5376884
41 0.4805195    2  77 -2.703434 -1.3754821  -2.648903   -1.2842935 0.5248756
21 0.5096154    2 104 -2.598723 -0.8796165  -2.575773   -0.8016926 0.5248756
     p_fixed
4  0.5367655
2  0.5367655
1  0.5367655
3  0.5367655
41 0.5206955
21 0.5206955
       1% 
-16.60202 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
2  0.4444444    1  27 -2.429437 -1.6376130  -2.175801    -1.270692 0.5454545
1  0.4615385    1  26 -2.228767 -1.3737024  -2.032134    -1.053751 0.5454545
4  0.5312500    1  32 -1.977554 -0.6846932  -1.974510    -0.977307 0.5454545
3  0.6379310    1  58 -3.239089 -2.8142119  -3.896390    -3.689295 0.5454545
21 0.3055556    2  36 -4.423615 -3.8959803  -4.343671    -3.808354 0.4883721
41 0.4090909    2  44 -2.664173 -1.6934616  -2.622267    -1.635201 0.4883721
     p_fixed
2  0.5187910
1  0.5187910
4  0.5187910
3  0.5187910
21 0.4853119
41 0.4853119
       1% 
-15.85913 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



           x gene cov      prob    pvalue prob_fixed pvalue_fixed         p
2  0.4512195    1  82 -3.678249 -2.652726  -3.520898    -2.459672 0.5384615
3  0.4747475    1  99 -3.330071 -2.112410  -3.193046    -1.929261 0.5384615
1  0.6000000    1  95 -3.215439 -2.350681  -3.354435    -2.551735 0.5384615
4  0.6052632    1 114 -3.607340 -2.763339  -3.787775    -3.006509 0.5384615
41 0.4787234    2  94 -2.651805 -1.119089  -2.672069    -1.158897 0.5072115
31 0.4806202    2 129 -2.839351 -1.194392  -2.865366    -1.243123 0.5072115
     p_fixed
2  0.5328075
3  0.5328075
1  0.5328075
4  0.5328075
41 0.5090439
31 0.5090439
       1% 
-15.69983 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
3  0.4531250    1  64 -3.420593 -2.4625179  -3.291199   -2.3018918 0.5463576
2  0.5373134    1  67 -2.340148 -0.7171945  -2.330756   -0.6451400 0.5463576
4  0.5652174    1  92 -2.547072 -1.1366240  -2.591550   -1.2600432 0.5463576
1  0.6075949    1  79 -2.993992 -2.1765658  -3.107009   -2.3478203 0.5463576
41 0.5000000    2 100 -3.046262 -1.7183533  -3.242597   -1.9911834 0.5506329
31 0.5466667    2  75 -2.385916 -0.6594024  -2.408319   -0.7857091 0.5506329
     p_fixed
3  0.5408127
2  0.5408127
4  0.5408127
1  0.5408127
41 0.5594424
31 0.5594424
       1% 
-16.22136 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



           x gene cov      prob    pvalue prob_fixed pvalue_fixed         p
2  0.2500000    1  20 -3.134498 -2.696680  -3.308600    -2.891428 0.4387755
3  0.3611111    1  36 -2.433709 -1.507770  -2.567885    -1.694024 0.4387755
4  0.5882353    1  17 -2.405614 -2.672445  -2.295989    -2.496411 0.4387755
1  0.6000000    1  25 -3.132932 -3.372238  -2.958541    -3.127999 0.4387755
21 0.3703704    2  27 -4.643244 -4.212594  -5.121762    -4.726339 0.5963303
31 0.5142857    2  35 -2.491324 -1.579606  -2.731736    -1.905558 0.5963303
     p_fixed
2  0.4498366
3  0.4498366
4  0.4498366
1  0.4498366
21 0.6143458
31 0.6143458
       1% 
-16.03198 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



           x gene cov      prob     pvalue prob_fixed pvalue_fixed         p
4  0.4444444    1  18 -1.946621 -1.1738892  -1.854406   -1.0200934 0.5306122
2  0.4838710    1  31 -2.085982 -1.0066651  -2.007467   -0.8288151 0.5306122
3  0.5000000    1  14 -1.589450 -0.6676469  -1.568828   -0.5753549 0.5306122
1  0.6285714    1  35 -2.661331 -2.3989310  -2.905425   -2.7637708 0.5306122
11 0.4722222    2  36 -2.113594 -0.9219849  -2.128824   -0.9559860 0.5076923
21 0.4893617    2  47 -2.187548 -0.7813771  -2.198197   -0.8161256 0.5076923
     p_fixed
4  0.5142217
2  0.5142217
3  0.5142217
1  0.5142217
11 0.5105573
21 0.5105573
       1% 
-15.47152 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



             x gene  cov      prob     pvalue prob_fixed pvalue_fixed
2  0.000000000    1  932 -2.852931 -2.8529311  -2.475721   -2.4757215
4  0.000000000    1  679 -2.078477 -2.0784766  -1.803664   -1.8036640
3  0.003436426    1  873 -1.514084 -1.2772353  -1.586807   -1.5910511
1  0.007174888    1 1115 -4.209643 -4.7908547  -4.894508   -5.6771184
21 0.000000000    2  965 -2.321883 -2.3218832  -2.285634   -2.2856345
41 0.001408451    2  710 -1.171610 -0.7112377  -1.160693   -0.6944801
             p     p_fixed
2  0.003056405 0.002652829
4  0.003056405 0.002652829
3  0.003056405 0.002652829
1  0.003056405 0.002652829
21 0.002403204 0.002365730
41 0.002403204 0.002365730
       1% 
-16.36286 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



             x gene cov       prob     pvalue prob_fixed pvalue_fixed
1  0.000000000    1 534 -0.2796544 -0.2796544 -0.3028069   -0.3028069
2  0.000000000    1 505 -0.2644671 -0.2644671 -0.2863624   -0.2863624
3  0.000000000    1 430 -0.2251898 -0.2251898 -0.2438333   -0.2438333
4  0.002267574    1 441 -1.6962405 -3.7788202 -1.6357982   -3.6322118
31 0.000000000    2 436 -0.6648032 -0.6648032 -0.6006761   -0.6006761
41 0.000000000    2 451 -0.6876748 -0.6876748 -0.6213416   -0.6213416
              p      p_fixed
1  0.0005235602 0.0005668934
2  0.0005235602 0.0005668934
3  0.0005235602 0.0005668934
4  0.0005235602 0.0005668934
31 0.0015236160 0.0013767489
41 0.0015236160 0.0013767489
       1% 
-16.07205 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



             x gene cov       prob     pvalue prob_fixed pvalue_fixed
3  0.000000000    1 378 -0.5392298 -0.5392298 -0.5498367   -0.5498367
4  0.000000000    1 332 -0.4736092 -0.4736092 -0.4829254   -0.4829254
2  0.002659574    1 376 -1.1585819 -2.2906032 -1.1496391   -2.2583409
1  0.003154574    1 317 -1.2451038 -2.5787446 -1.2345054   -2.5454746
11 0.000000000    2 327 -1.1240987 -1.1240987 -1.0960119   -1.0960119
21 0.002577320    2 388 -1.0440469 -0.4853152 -1.0360672   -0.4664067
             p     p_fixed
3  0.001425517 0.001453537
4  0.001425517 0.001453537
2  0.001425517 0.001453537
1  0.001425517 0.001453537
11 0.003431709 0.003346108
21 0.003431709 0.003346108
       1% 
-15.86158 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



 1% 
-10 


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



In [183]:
print(head(all_df))
print(head(sig_df))

    gene mean_cov       prob     pvalue prob_fixed pvalue_fixed         p
175    1    11.25  -7.227680  -6.120041  -7.267098    -6.317464 0.4666667
176    2    21.50  -8.529938  -6.369567  -8.530129    -6.362432 0.4302326
177    3    17.50 -10.969050 -10.680789 -10.975048   -10.740385 0.4285714
179    4    15.00  -6.652393  -3.799885  -6.654387    -3.833605 0.4833333
181    5    13.25  -7.455729  -6.183900  -7.466073    -6.250732 0.4716981
182    6    17.50  -8.670934  -7.544426  -8.688785    -7.772065 0.5714286
      p_fixed          ensemblID           type name     chrm    pos ident time
175 0.4458333 ENSDNOG00000042290 protein_coding  SYK JH563979 766458     1    1
176 0.4312771 ENSDNOG00000042290 protein_coding  SYK JH563979 766677     1    1
177 0.4221014 ENSDNOG00000042290 protein_coding  SYK JH563979 766712     1    1
179 0.4874084 ENSDNOG00000042290 protein_coding  SYK JH563979 766874     1    1
181 0.4618437 ENSDNOG00000042290 protein_coding  SYK JH563979 767289     1    1
18

In [184]:
print(head(sig_df))


saveRDS(all_df, 'simple_independent_all_summary.rds')
saveRDS(sig_df, 'simple_independent_sig_summary.rds')

    gene      prob       pvalue prob_fixed pvalue_fixed mean_cov
34    34 -13.18620 1.239248e-06  -13.18660          -10    18.75
95    95 -16.11601 7.742660e-07  -16.14406          -10    54.25
398  398 -12.91890 9.006745e-07  -12.92004          -10    12.50
491  491   0.00000 0.000000e+00    0.00000          -10    30.00
522  522 -16.17796 3.527600e-08  -16.17965          -10    14.25
628  628 -19.36105 8.053221e-08  -19.40842          -10    93.00
             ensemblID           type    name     chrm    pos ident time
34  ENSDNOG00000042290 protein_coding     SYK JH563979 769371     1    1
95  ENSDNOG00000042290 protein_coding     SYK JH563979 780421     1    1
398 ENSDNOG00000045844 protein_coding  FAM49B JH564091 896260     1    1
491 ENSDNOG00000051200        lincRNA         JH564093  73771     1    1
522 ENSDNOG00000049566 protein_coding    CD69 JH564093 252291     1    1
628 ENSDNOG00000035200 protein_coding CLEC12A JH564093 412505     1    1


In [128]:
sdf <- readRDS('simple_t0_result.rds')
print(head(sdf))
print(head(rdf))
# png(paste0('hist_p.png'))
# g <- ggplot(new_df, aes(x=value, color=label))+geom_histogram(position = "dodge", fill="white", alpha=0.4)+theme_bw()
# plot(g)
# dev.off()
# pdf <- new_df
# for (cnum in c(10, 25, 50, 75, 100)) {
#     pdf <- rbind(pdf, data.frame(value=rbinom(dim(all_df)[1], cnum, 0.5)/cnum, label=rep(paste0('p_simu_', cnum), dim(all_df)[1])))
# }
# pdf <- subset(pdf, pdf$label != 'p_fixed')
# png(paste0('hist_p_sim.png'))
# g <- ggplot(pdf, aes(x=value, color=label))+geom_density(alpha=1, adjust=5)+theme_bw()
# plot(g)
# dev.off()
# print(subset(pdf, pdf$label == "p"))
# print(whats)
# return()
        
        
# #         png(paste0('hist_cor_', i, '_', j, '.png'))
# #         g <- ggplot(tdf, aes(x=cor))+geom_histogram()+theme_bw()
# #         plot(g)
# #         dev.off()
# #         png(paste0('density_cor_cov_', i, '_', j, '.png'))
# #         g <- ggplot(tdf, aes(x=mean_cov, y=cor) ) +
# #           geom_bin2d(bins = 50) +
# #           scale_fill_continuous(type = "viridis") +
# #           theme_bw()
# #         plot(g)
# #         dev.off()
# #         png(paste0('density_cor_cov_log', i, '_', j, '.png'))
# #         g <- ggplot(tdf, aes(x=mean_cov, y=cor) ) +
# #           geom_bin2d(bins = 50) + scale_x_continuous(trans='log10') +
# #           scale_fill_continuous(type = "viridis") +
# #           theme_bw()
# #         plot(g)
# #         dev.off()
#         if (is.null(all_df)) all_df <- tdf
#         else all_df <- rbind(all_df, tdf)
#     }
# }
# print(head(all_df))
# png(paste0('hist_cor_all.png'))
# g <- ggplot(all_df, aes(x=cor))+geom_histogram()+theme_bw()
# plot(g)
# dev.off()
# png(paste0('hist_scor_all.png'))
# g <- ggplot(all_df, aes(x=scor))+geom_histogram()+theme_bw()
# plot(g)
# dev.off()
# new_df <- data.frame(value=all_df$p, label=rep('p', dim(all_df)[1]))
# new_df <- rbind(new_df, data.frame(value=all_df$p_fixed, label=rep('p_fixed', dim(all_df)[1])))
# png(paste0('hist_p.png'))
# g <- ggplot(new_df, aes(x=value, color=label))+geom_histogram(position = "dodge", fill="white", alpha=0.4)+theme_bw()
# plot(g)
# dev.off()
# pdf <- new_df
# for (cnum in c(10, 25, 50, 75, 100)) {
#     pdf <- rbind(pdf, data.frame(value=rbinom(dim(all_df)[1], cnum, 0.5)/cnum, label=rep(paste0('p_simu_', cnum), dim(all_df)[1])))
# }
# pdf <- subset(pdf, pdf$label != 'p_fixed')
# png(paste0('hist_p_sim.png'))
# g <- ggplot(pdf, aes(x=value, color=label))+geom_density(alpha=1, adjust=5)+theme_bw()
# plot(g)
# dev.off()
# print(subset(pdf, pdf$label == "p"))
# print(whats)
# return()
# new_df <- data.frame(prob=all_df$prob, pvalue=all_df$pvalue, prob_fixed=all_df$prob_fixed, pvalue_fixed=all_df$pvalue_fixed, scor=all_df$cor, cor=all_df$cor, mean_cov=all_df$mean_cov)
# for (val in c('prob', 'pvalue', 'pvalue_fixed', 'prob_fixed')) {
#     new_df[new_df[,val] < -50, val] = -50
# }
# png(paste0('hist_prob_all.png'))
# g <- ggplot(new_df, aes(x=prob))+geom_histogram()+theme_bw()
# plot(g)
# dev.off()
# png(paste0('hist_pvalue_all.png'))
# g <- ggplot(new_df, aes(x=pvalue))+geom_histogram()+theme_bw()
# plot(g)
# dev.off()


# for (cor_value in c('cor', 'scor')) {
#     if (cor_value == "cor")
#         bins = 200
#     else
#         bins = 50
#     for (prob_value in c('prob', 'pvalue', 'pvalue_fixed', 'prob_fixed')) {
#         png(paste0('density_', cor_value, '_', prob_value, '_all.png'))
#         g <- ggplot(all_df, aes_string(x=cor_value, y=prob_value) ) +
#         geom_bin2d(bins = bins) +
#         scale_fill_continuous(type = "viridis") +
#         theme_bw()
#         plot(g)
#         dev.off()
#         png(paste0('scatter_', cor_value, '_', prob_value, '_all.png'))
#         g <- ggplot(all_df, aes_string(x=cor_value, y=prob_value) ) +
#           geom_point(alpha=0.4) +
#           theme_bw()
#         plot(g)
#         dev.off()
#         png(paste0('scatter_', cor_value, '_', prob_value, '_exp_all.png'))
#         g <- ggplot(all_df, aes_string(x=cor_value, y=paste0("exp_", prob_value)) ) +
#           geom_point(alpha=0.4) +
#           theme_bw()
#         plot(g)
#         dev.off()
#         png(paste0('density_', cor_value, '_', prob_value, '_exp_all.png'))
#         g <- ggplot(all_df, aes_string(x=cor_value, y=paste0("exp_", prob_value))) +
#         geom_bin2d(bins = bins) +
#         scale_fill_continuous(type = "viridis") + ylim(0, 1) + 
#         theme_bw()
#         plot(g)
#         dev.off()
#         png(paste0('density_', cor_value, '_', prob_value, '_min_all.png'))
#         g <- ggplot(new_df, aes_string(x=cor_value, y=paste0(prob_value))) +
#         geom_bin2d(bins = bins) +
#         scale_fill_continuous(type = "viridis") + 
#         theme_bw()
#         plot(g)
#         dev.off()
#         png(paste0('density_cov_log_', prob_value, '_min_all.png'))
#         g <- ggplot(new_df, aes(x=mean_cov, y=prob) ) +
#           geom_bin2d(bins = 200) + scale_x_continuous(trans='log10') + 
#           scale_fill_continuous(type = "viridis") +
#           theme_bw()
#         plot(g)
#         dev.off()
#     }
# }
# # png(paste0('density_cor_cov_all.png'))
# # g <- ggplot(all_df, aes(x=mean_cov, y=cor) ) +
# #   geom_bin2d(bins = 50) +
# #   scale_fill_continuous(type = "viridis") +
# #   theme_bw()
# # plot(g)
# # dev.off()
# # png(paste0('density_scor_cov_all.png'))
# # g <- ggplot(all_df, aes(x=mean_cov, y=scor) ) +
# #   geom_bin2d() +
# #   scale_fill_continuous(type = "viridis") +
# #   theme_bw()
# # plot(g)
# # dev.off()
# # png(paste0('density_cor_cov_log_all.png'))
# # g <- ggplot(all_df, aes(x=mean_cov, y=cor) ) +
# #   geom_bin2d(bins = 50) + scale_x_continuous(trans='log10') +
# #   scale_fill_continuous(type = "viridis") +
# #   theme_bw()
# # plot(g)
# # dev.off()
# # png(paste0('density_scor_cov_log_all.png'))
# # g <- ggplot(all_df, aes(x=mean_cov, y=scor) ) +
# #   geom_bin2d() + scale_x_continuous(trans='log10') +
# #   scale_fill_continuous(type = "viridis") +
# #   theme_bw()
# # plot(g)
# # dev.off()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

