<h2>Create BAM files</h2>



<h3>Load Metadata</h3>

In [1]:
# Build a lookup table from experiment accession to its parsed metadata record
# (target label and every other field stored in the per-experiment JSON file).
metadataDir = "/homes/gws/slund1/projects/genomic-structure-learning/data/metadata"
metadata = Dict()
for fname in readdir(metadataDir)
    # the do-block form closes the file handle once parsing is finished
    record = open("$metadataDir/$fname") do io
        JSON.parse(readall(io))
    end
    metadata[record["accession"]] = record
end

<h3>Download the FASTQ Files for all controls</h3>

In [3]:
# Download the gzipped FASTQ file of every control experiment into dataRoot.
# Files that already exist in dataRoot are skipped, so the cell can be re-run.
# Reads the global `metadata` dictionary and defines `currDir` / `dataRoot`
# for later cells.
currDir = pwd()
dataRoot = "/scratch2/slund1/ENCSR000BQS_notebook"
cd(dataRoot)
try
    for id in keys(metadata)
        obj = metadata[id]
        # only experiments whose target label mentions "control" are fetched
        if ismatch(r"[Cc]ontrol", obj["target"]["label"])
            for file in obj["files"]
                asc = file["accession"]
                fastq = "$asc.fastq.gz"
                if file["file_format"] == "fastq" && !isfile(fastq)
                    try
                        println("Downloading ", asc, "...")
                        run(`wget -q https://www.encodeproject.org/files/$asc/@@download/$asc.fastq.gz`)
                    catch e
                        # show the cause; wget runs with -q, so the exception
                        # is the only diagnostic available
                        println("Error! ", e, "\n")
                        # a failed transfer can leave a truncated file behind;
                        # remove it so the isfile() guard above does not treat
                        # it as a finished download on the next run
                        isfile(fastq) && rm(fastq)
                    end
                end
            end
        end
    end
finally
    # always return to the starting directory, even if the loop throws
    cd(currDir)
end

Downloading ENCFF000OSI...
Error!

Downloading ENCFF000OCZ...
Error!

Downloading ENCFF000ODA...
Error!

Downloading ENCFF000POE...
Error!

Downloading ENCFF000POJ...
Error!



<h3>Create Sorted BAM Files</h3>

In [15]:
# Align a gzipped FASTQ file with bowtie2 and write a sorted BAM file.
#
# Arguments:
#   gzippedFastqFile - path of the input reads (expected to end in ".fastq.gz")
#   outBamFile       - path of the sorted BAM file to produce (ends in ".bam")
# Keyword arguments (defaults reproduce the previously hard-coded values):
#   bowtie2Index     - index passed to `bowtie2 -x`
#   alignThreads     - value passed to `bowtie2 -p`
#   sortThreads      - value passed to `samtools sort -@`
#
# An intermediate "<root>_unsorted.bam" is written next to the input file and
# is removed afterwards, including when one of the external commands fails.
# `run` throws if a command fails; that exception propagates to the caller.
function map_file(gzippedFastqFile, outBamFile;
                  bowtie2Index="/homes/gws/slund1/projects/genomic-structure-learning/data/bowtie/hg38",
                  alignThreads=20,
                  sortThreads=10)
    # escape both dots and anchor at the end so only a trailing extension is
    # stripped, never a look-alike sequence in the middle of the path
    rootName = replace(gzippedFastqFile, r"\.fastq\.gz$", "")
    unsortedBam = "$(rootName)_unsorted.bam"
    try
        run(
            `zcat $gzippedFastqFile` |>
            `bowtie2 -p $alignThreads -x $bowtie2Index -U -` |>
            `samtools view -bS -` |>
            unsortedBam
        )
        outPrefix = replace(outBamFile, r"\.bam$", "") # samtools will add the .bam automatically
        # options go before the positional arguments, as in the samtools usage
        run(`samtools sort -@ $sortThreads $unsortedBam $outPrefix`)
    finally
        # do not leave the (potentially very large) intermediate file behind
        isfile(unsortedBam) && rm(unsortedBam)
    end
    return nothing
end

# Map every downloaded control FASTQ file to a sorted BAM file in dataRoot.
# Uses `metadata`, `dataRoot`, `currDir` and `map_file` defined in earlier cells.
# FASTQ files that are missing, or that already have a BAM file, are skipped.
cd(dataRoot)
try
    for id in keys(metadata)
        obj = metadata[id]
        # only experiments whose target label mentions "control" are mapped
        if ismatch(r"[Cc]ontrol", obj["target"]["label"])
            for file in obj["files"]
                asc = file["accession"]
                # absolute paths throughout, so the existence checks look at
                # exactly the files that are handed to map_file
                fastqPath = "$dataRoot/$asc.fastq.gz"
                bamPath = "$dataRoot/$asc.bam"
                if file["file_format"] == "fastq" && isfile(fastqPath) && !isfile(bamPath)
                    println("Mapping $asc...")
                    map_file(fastqPath, bamPath)
                end
            end
        end
    end
finally
    # map_file throws when an external command fails; restore the working
    # directory in that case too
    cd(currDir)
end