## Prepare count files

Install the `rasqualTools` package following the instructions [here](https://github.com/kauralasoo/rasqual/tree/master/rasqualTools),
using `devtools::install_github("kauralasoo/rasqual/rasqualTools")`. This has to be done in an R console opened in a real terminal window. I installed it in the conda env `testother`.

In [19]:
library(rasqualTools)
library(tibble)
library(dplyr)

In [2]:
# Load the peak-by-sample ATAC-seq count table: peak coordinates and name
# first, then one read-count column per NSC sample.
countfile <- "/project2/xinhe/ATAC-seq_10252018/NSC/newNSC_peaks.20_samples.counts.txt"
countm <- read.table(
    file = countfile,
    header = TRUE,
    stringsAsFactors = FALSE
)
head(countm)

chr,start,end,peak_name,NSC_02,NSC_03,NSC_04,NSC_05,NSC_06,NSC_07,⋯,NSC_12,NSC_13,NSC_14,NSC_15,NSC_16,NSC_17,NSC_18,NSC_19,NSC_20,NSC_21
chr1,9908,10714,newNSC1,664,1353,583,758,612,995,⋯,1365,640,739,358,770,202,788,347,567,608
chr1,19907,20551,newNSC2,57,79,22,72,91,81,⋯,68,53,89,23,49,39,68,40,89,47
chr1,22587,22860,newNSC3,32,31,14,23,32,24,⋯,18,14,28,0,26,9,28,0,34,20
chr1,28683,29970,newNSC4,270,206,385,322,297,284,⋯,285,263,318,382,282,205,214,456,311,237
chr1,31228,31456,newNSC5,28,19,18,23,36,28,⋯,36,14,31,29,47,1,24,39,21,22
chr1,34710,35096,newNSC6,39,31,17,28,24,36,⋯,27,22,35,11,17,20,33,17,46,12


In [3]:
# Keep only the per-sample count columns (column 5 onwards) and label each
# row with its peak name.
counts_matrix <- countm[, 5:ncol(countm)]
rownames(counts_matrix) <- countm[["peak_name"]]
head(counts_matrix)

Unnamed: 0,NSC_02,NSC_03,NSC_04,NSC_05,NSC_06,NSC_07,NSC_08,NSC_09,NSC_10,NSC_11,NSC_12,NSC_13,NSC_14,NSC_15,NSC_16,NSC_17,NSC_18,NSC_19,NSC_20,NSC_21
newNSC1,664,1353,583,758,612,995,640,557,988,879,1365,640,739,358,770,202,788,347,567,608
newNSC2,57,79,22,72,91,81,72,46,64,46,68,53,89,23,49,39,68,40,89,47
newNSC3,32,31,14,23,32,24,18,25,37,32,18,14,28,0,26,9,28,0,34,20
newNSC4,270,206,385,322,297,284,312,231,158,296,285,263,318,382,282,205,214,456,311,237
newNSC5,28,19,18,23,36,28,33,19,34,34,36,14,31,29,47,1,24,39,21,22
newNSC6,39,31,17,28,24,36,35,46,33,40,27,22,35,11,17,20,33,17,46,12


In [4]:
# Write the NSC count matrix to ../datarun/NSC with the "atac20" suffix.
saveRasqualMatrices(
    list(cellTypeNSC = counts_matrix),
    "../datarun/NSC",
    file_suffix = "atac20"
)

[1] "../datarun//cellTypeNSC.atac20.txt"


In [5]:
# Size factor file: compute per-sample offsets from the count matrix
# (gc_correct = FALSE) and save them alongside the counts.
size_factors <- rasqualCalculateSampleOffsets(counts_matrix, gc_correct = FALSE)
saveRasqualMatrices(
    list(cellTypeNSC = size_factors),
    "../datarun/NSC",
    file_suffix = "size_factors"
)

[1] "../datarun//cellTypeNSC.size_factors.txt"


In [27]:
# Count SNP number per feature and generate batch run script.
#
# Build the feature table used for SNP counting: one row per peak with the
# columns gene_id, chr, strand, exon_starts, exon_ends. Every peak gets
# strand 1L, and the start/end coordinates are stored as character strings.
# as_tibble() replaces the deprecated as_data_frame().
gene_data <- countm %>%
    dplyr::select(gene_id = peak_name, chr, exon_starts = start, exon_ends = end) %>%
    as_tibble() %>%
    add_column(strand = 1L, .before = "exon_starts") %>%
    mutate(
        exon_starts = as.character(exon_starts),
        exon_ends = as.character(exon_ends)
    )

In [28]:
# Preview the first five features.
print(gene_data[seq_len(5), ])

# A tibble: 5 x 5
  gene_id chr   strand exon_starts exon_ends
  <chr>   <chr>  <int> <chr>       <chr>    
1 newNSC1 chr1       1 9908        10714    
2 newNSC2 chr1       1 19907       20551    
3 newNSC3 chr1       1 22587       22860    
4 newNSC4 chr1       1 28683       29970    
5 newNSC5 chr1       1 31228       31456    


In [53]:
# Generate the RASQUAL batch scripts for one chunk of features on one
# chromosome.
#
# The SNPs that tabix reports between the smallest exon_starts and the largest
# exon_ends of `gdf` are listed into a temporary file, SNPs are counted per
# feature with countSnpsOverlapingExons() (cis_window = 2e3), and two shell
# scripts are written into `outdir`:
#   mainrun_<chrom>_<outtag>.sh  one rasqual call per feature, appending to
#                                mainrun_<chrom>_<outtag>.result.txt
#   permrun_<chrom>_<outtag>.sh  the same call with " -r -t", run 1000 times
#                                per feature, appending selected columns to
#                                permrun_<chrom>_<outtag>.permres.txt
#
# Args:
#   chrom:  chromosome name; used in the VCF file name and the tabix region.
#   gdf:    feature table; exon_starts and exon_ends are read here, the whole
#           table is handed to countSnpsOverlapingExons().
#   vcfdir: directory containing <chrom>.hg38withchr.AS.vcf.gz.
#   outdir: directory the scripts are written to; the generated commands also
#           read cellTypeNSC.atac20.bin / cellTypeNSC.size_factors.bin there.
#   outtag: label appended to the script and result file names.
#
# Returns:
#   The data frame returned by countSnpsOverlapingExons(), or 0 when the SNP
#   list could not be read (e.g. tabix produced no lines); in that case no
#   .sh file is written.
gen_command <- function(chrom, gdf, vcfdir, outdir, outtag) {
    qstart <- min(as.integer(gdf$exon_starts))
    qend <- max(as.integer(gdf$exon_ends))
    vcf_file <- paste0(vcfdir, "/", chrom, ".hg38withchr.AS.vcf.gz")

    tmp_file <- tempfile(pattern = paste(chrom, qstart, qend, sep = "_"),
                         tmpdir = tempdir(), fileext = ".snplist")
    # Remove the temporary SNP list on every exit path, including the early
    # return below and any error raised later.
    on.exit(unlink(tmp_file), add = TRUE)

    system(paste0("tabix ", vcf_file, " ", chrom, ":", qstart, "-", qend,
                  "| cut -f 1-3> ", tmp_file))

    # read.table() raises an error on an empty or missing file; treat that as
    # "no SNPs in this region".
    snpdf <- tryCatch(
        read.table(tmp_file, header = FALSE, stringsAsFactors = FALSE),
        error = function(e) NULL
    )
    if (is.null(snpdf)) {
        print(paste0("No line for ", chrom, outtag, ". No output .sh file."))
        return(0)
    }

    snpdf <- as_tibble(snpdf)
    colnames(snpdf) <- c("chr", "pos", "snp_id")
    snp_counts <- countSnpsOverlapingExons(gdf, snpdf, cis_window = 2e3)

    cmd_mainfn <- paste0(outdir, "/", "mainrun_", chrom, "_", outtag)
    cmd_permfn <- paste0(outdir, "/", "permrun_", chrom, "_", outtag)

    # Register the close right after each open so the connections are released
    # even if writing fails part-way through.
    mainbashf <- file(paste0(cmd_mainfn, ".sh"), "w")
    on.exit(close(mainbashf), add = TRUE)
    permbashf <- file(paste0(cmd_permfn, ".sh"), "w")
    on.exit(close(permbashf), add = TRUE)

    # Each script starts by deleting the result file its commands append to.
    writeLines(paste0("rm ", cmd_mainfn, ".result.txt"), mainbashf)
    writeLines(paste0("rm ", cmd_permfn, ".permres.txt"), permbashf)

    # Guard against an empty table: paste0() would otherwise still produce one
    # malformed command line.
    if (nrow(snp_counts) > 0) {
        # One command per feature, built in a single vectorised paste0().
        # NOTE(review): the feature name is passed to both -j and -f; the
        # rasqual README appears to describe -j as the feature's row number in
        # the count table - confirm this is intended.
        cmd_main <- paste0(
            "tabix ", vcf_file, " ", chrom, ":",
            snp_counts$range_start, "-", snp_counts$range_end,
            "|/home/simingz/run_rasqual/rasqual/bin/rasqual -y ",
            outdir, "/cellTypeNSC.atac20.bin -k ",
            outdir, "/cellTypeNSC.size_factors.bin -n 20 -j ",
            snp_counts$gene_id,
            " -l ", snp_counts$cis_snp_count,
            " -m ", snp_counts$feature_snp_count,
            " -s ", snp_counts$exon_starts,
            " -e ", snp_counts$exon_ends,
            " -f ", snp_counts$gene_id, " -z"
        )
        # Keep lead SNP only, select column.
        cmd_perm <- paste0(cmd_main, " -r -t |  cut -f 1,10-12,25")

        writeLines(paste0(cmd_main, " >> ", cmd_mainfn, ".result.txt"),
                   mainbashf)
        writeLines(paste0("for i in `seq 1 1000`; do ", cmd_perm, " >> ",
                          cmd_permfn, ".permres.txt; done"),
                   permbashf)
    }

    snp_counts
}

In [55]:
# Batch settings: maximum number of features per generated script, and the
# locations of the phased VCFs and of the run directory.
nlinesperjob <- 220
vcfdir <- "/home/simingz/run_rasqual/phased_data"
datarundir <- "/home/simingz/run_rasqual/datarun/NSC"

# Group the features by chromosome and leave out chrX / chrY.
gene_data_bychr <- split(gene_data, gene_data$chr)
gene_data_bychr <- gene_data_bychr[setdiff(names(gene_data_bychr), c("chrX", "chrY"))]

# For every chromosome, cut its features into consecutive chunks of at most
# nlinesperjob rows and generate one pair of scripts per chunk, tagged
# NSC1, NSC2, ...
for (chrom in names(gene_data_bychr)) {
    n <- nrow(gene_data_bychr[[chrom]])
    r <- as.integer(ceiling(seq_len(n) / nlinesperjob))
    chromlist <- split(gene_data_bychr[[chrom]], r)
    for (tag in names(chromlist)) {
        genedf <- chromlist[[tag]]
        gen_command(chrom, genedf, vcfdir, datarundir, paste0("NSC", tag))
    }
}

[1] "No line for chr1NSC43. No output .sh file."
[1] "No line for chr1NSC44. No output .sh file."
[1] "No line for chr1NSC45. No output .sh file."
[1] "No line for chr1NSC46. No output .sh file."
[1] "No line for chr1NSC47. No output .sh file."
[1] "No line for chr1NSC48. No output .sh file."
[1] "No line for chr1NSC49. No output .sh file."
[1] "No line for chr10NSC12. No output .sh file."
[1] "No line for chr10NSC13. No output .sh file."
[1] "No line for chr13NSC1. No output .sh file."
[1] "No line for chr13NSC2. No output .sh file."
[1] "No line for chr13NSC3. No output .sh file."
[1] "No line for chr13NSC4. No output .sh file."
[1] "No line for chr17NSC9. No output .sh file."
[1] "No line for chr20NSC9. No output .sh file."
[1] "No line for chr20NSC10. No output .sh file."
[1] "No line for chr21NSC1. No output .sh file."
[1] "No line for chr9NSC15. No output .sh file."
