<h2>Create LibSVM files</h2>

Here we create LibSVM-format files that can be used as input for running XGBoost.

In [12]:
using PureSeq

In [13]:
# Stream a sparse LibSVM-style text encoding of binned read data to `stream`.
#
# Every output line starts with the integer target value of one position,
# followed by a space-terminated `column:value` pair for each control column
# that is non-zero at that position.
#
# Arguments:
#   stream       - writable IO that receives the text
#   targetFile   - path handed to BinnedReader to supply the label column
#   controlFiles - paths handed to BinnedReader (each wrapped in a ContextMap)
#                  to supply the feature columns
# Keywords:
#   contextSize  - passed as both context arguments of ContextMap
#   blockSize    - block length requested from denseblocks; also the number
#                  of rows written for each block
#   blockLimit   - stop once more than this many blocks have been read
function write_libsvm_data(stream, targetFile, controlFiles; contextSize=4, blockSize=1000, blockLimit=Inf)

    # one context-expanded reader per control file, then the plain target reader
    controlReaders = [ContextMap(BinnedReader(controlFiles[k]), contextSize, contextSize) for k in 1:length(controlFiles)]
    targetReader = BinnedReader(targetFile)

    # progress is reported about every tenth of the expected number of blocks
    genomeBlocks = int(sum(ReferenceContigs_hg38.sizes)/blockSize)
    printInterval = max(1, int(min(blockLimit, genomeBlocks)/10))

    # walk the target and control data in lock-step, one dense block at a time
    targetBlocks = denseblocks([targetReader], blockSize)
    controlBlocks = denseblocks(controlReaders, blockSize, loop=true)
    blocksSeen = 0
    for (labels, features) in zip(targetBlocks, controlBlocks)
        blocksSeen += 1
        blocksSeen > blockLimit && break

        numFeatures = size(features, 2)
        for row in 1:blockSize
            # label first, then only the non-zero features of this row
            print(stream, int(labels[row]), " ")
            for col in 1:numFeatures
                value = features[row, col]
                value == 0.0 && continue
                print(stream, col, ":", int(value), " ")
            end
            println(stream)
        end

        if blocksSeen % printInterval == 0
            println("Processed $(blocksSeen*blockSize) positions...")
        end
    end
end

write_libsvm_data (generic function with 1 method)

In [16]:
# Load the metadata JSON of every experiment and index the parsed records by
# accession, so target labels and other fields can be looked up later.
metadataDir = "/homes/gws/slund1/projects/genomic-structure-learning/data/metadata"
metadata = Dict()
for name in readdir(metadataDir)
    record = open("$metadataDir/$name") do io
        JSON.parse(readall(io))
    end
    metadata[record["accession"]] = record
end

In [18]:
# Accessions of all experiments whose biosample is GM12878.
gm12878Experiments = filter(o -> "GM12878" == o["biosample_term_name"], values(metadata))
ids = collect(map(o -> o["accession"], gm12878Experiments));

In [24]:
# Peek at one metadata record to see which fields an experiment carries.
first(values(metadata))

Dict{String,Any} with 41 entries:
  "system_slims"         => {}
  "alternate_accessions" => {}
  "lab"                  => ["postal_code"=>"94305-5120","state"=>"CA","name"=>…
  "run_type"             => "Single-ended"
  "documents"            => {}
  "visualize_ucsc"       => "http://genome.ucsc.edu/cgi-bin/hgHubConnect?hgHub_…
  "date_created"         => "2014-02-12T22:36:47.854769+00:00"
  "award"                => ["name"=>"U54HG004558","status"=>"disabled","rfa"=>…
  "biosample_type"       => "immortalized cell line"
  "related_files"        => {}
  "biosample_synonyms"   => {" K562 cell","K-562","K-562 cell"}
  "aliases"              => {}
  "month_released"       => "May, 2012"
  "assembly"             => {"hg19"}
  "status"               => "released"
  "accession"            => "ENCSR000EGG"
  "developmental_slims"  => {}
  "files"                => {["lab"=>"/labs/michael-snyder/","alternate_accessi…
  "dbxrefs"              => {"UCSC-ENCODE-hg19:wgEncodeEH002814","GEO:GSM93

In [28]:
# Collect the fastq file accessions of all non-control GM12878 experiments
# for which a binned .bam file exists, leaving out ENCFF000NYK.
targetIds = ASCIIString[]
for record in values(metadata)
    # keep GM12878 experiments whose target label does not mention "control"
    "GM12878" == record["biosample_term_name"] || continue
    ismatch(r"[Cc]ontrol", record["target"]["label"]) && continue

    for entry in record["files"]
        accession = entry["accession"]
        entry["file_format"] == "fastq" || continue  # && isfile("$accession.fastq.gz")

        binnedPath = "/scratch2/slund1/pure-seq/binned_data/$accession.bam"
        if isfile(binnedPath) && accession != "ENCFF000NYK"
            push!(targetIds, accession)
            println("Mapped $accession...")
        end
    end
end

Mapped ENCFF000WGW...
Mapped ENCFF000WGX...
Mapped ENCFF000NWT...
Mapped ENCFF000NWV...
Mapped ENCFF000NWE...
Mapped ENCFF000NWG...
Mapped ENCFF002BFM...
Mapped ENCFF002BFN...
Mapped ENCFF002EJZ...
Mapped ENCFF002EKA...
Mapped ENCFF002EKB...
Mapped ENCFF002EKC...
Mapped ENCFF001HHX...
Mapped ENCFF001HIA...
Mapped ENCFF000OFK...
Mapped ENCFF000OFM...
Mapped ENCFF002EDB...
Mapped ENCFF002EDC...
Mapped ENCFF000NWM...
Mapped ENCFF000NWN...
Mapped ENCFF000WDW...
Mapped ENCFF000WDY...
Mapped ENCFF000WHO...
Mapped ENCFF000WHR...
Mapped ENCFF002BFQ...
Mapped ENCFF002BFR...
Mapped ENCFF002EBQ...
Mapped ENCFF002EBR...
Mapped ENCFF000AUC...
Mapped ENCFF000AUD...
Mapped ENCFF000VSY...
Mapped ENCFF000VTI...
Mapped ENCFF002EKI...
Mapped ENCFF002EKJ...
Mapped ENCFF002EKM...
Mapped ENCFF002EKT...
Mapped ENCFF000VSS...
Mapped ENCFF000VSV...
Mapped ENCFF000OEH...
Mapped ENCFF000OEJ...
Mapped ENCFF000VUU...
Mapped ENCFF000VUW...
Mapped ENCFF002DOM...
Mapped ENCFF002DOQ...
Mapped ENCFF002EHO...
Mapped ENC

In [29]:
# Number of file accessions collected in `targetIds`.
length(targetIds)

119

In [21]:
# Print every experiment accession in `ids` that has a binned .bam file
# stored under that same name.
for accession in ids
    isfile("/scratch2/slund1/pure-seq/binned_data/$accession.bam") && println(accession)
end

In [20]:
# Display the experiment accessions held in `ids`.
ids

167-element Array{Any,1}:
 "ENCSR000DZL"
 "ENCSR000BGY"
 "ENCSR000BRU"
 "ENCSR501DKS"
 "ENCSR000BGJ"
 "ENCSR597VGC"
 "ENCSR000DRZ"
 "ENCSR000BGE"
 "ENCSR459FTB"
 "ENCSR000BGC"
 "ENCSR000DYU"
 "ENCSR000DNQ"
 "ENCSR016UEH"
 ⋮            
 "ENCSR000DYW"
 "ENCSR000DZF"
 "ENCSR000EYX"
 "ENCSR009MBP"
 "ENCSR769ZTN"
 "ENCSR000EYV"
 "ENCSR900XDB"
 "ENCSR000DYP"
 "ENCSR000DZM"
 "ENCSR000DZE"
 "ENCSR000DYX"
 "ENCSR553NUG"

In [31]:
# Accession of the file used as the prediction target.
targetId = "ENCFF000NYK"
# Named sets of control file accessions, keyed by a label used to select a set.
# The sets loaded with readdlm are flattened to vectors with vec.
# NOTE(review): the integer keys presumably give the number of controls in
# each list — confirm against the text files.
controlIds = [
    35 => vec(readdlm("/scratch2/slund1/pure-seq/controlIds_35.txt")),
    100 => vec(readdlm("/scratch2/slund1/pure-seq/controlIds_100.txt")),
    442 => vec(readdlm("/scratch2/slund1/pure-seq/controlIds_442.txt")),
    "REST-NYK" => ["ENCFF000NYY", "ENCFF000NZA", "ENCFF000NYN"],
    "NYK_119" => vec(readdlm("/scratch2/slund1/pure-seq/targetIds_NYK_119.txt"))
]
# Directory holding the binned data files referenced by accession.
root = "/scratch2/slund1/pure-seq/binned_data";

In [15]:
# Write the forward-read (.fbin100) LibSVM file for the selected control set.
contextSize = 4
numControls = "NYK_119"  # key into controlIds (alternative noted here: 35)
outputPath = "/scratch2/slund1/pure-seq/forwardReads_c$(numControls)_b$(contextSize)_a$(contextSize).libsvm"
open(outputPath, "w") do f
    write_libsvm_data(
        f,
        "$root/$targetId.bam.fbin100",
        ["$root/$cid.bam.fbin100" for cid in controlIds[numControls]],
        contextSize=contextSize
    )
end
# Reverse-read (.rbin100) variant, currently disabled:
# open("/scratch2/slund1/pure-seq/reverseReads_c$(numControls)_b$(contextSize)_a$(contextSize).libsvm", "w") do f
#     write_libsvm_data(
#         f,
#         "$root/$targetId.bam.rbin100",
#         ["$root/$cid.bam.rbin100" for cid in controlIds[numControls]],
#         contextSize=contextSize
#     )
# end

In [None]:
# Write the forward-read (.fbin100) LibSVM file for the 452-control set.
# NOTE(review): controlIds452 is not defined in any cell visible here — confirm
# where it comes from before re-running.
open("/scratch2/slund1/pure-seq/forwardReads_c452_b4_a4.libsvm", "w") do f
    write_libsvm_data(
        f,
        "$root/$targetId.bam.fbin100",
        ["$root/$cid.bam.fbin100" for cid in controlIds452],
        blockLimit=Inf, blockSize=1000,
        contextSize=4
    )
end

In [20]:
# Write the reverse-read (.rbin100) LibSVM file for the 452-control set.
# NOTE(review): controlIds452 is not defined in any cell visible here — confirm
# where it comes from before re-running.
open("/scratch2/slund1/pure-seq/reverseReads_c452_b4_a4.libsvm", "w") do f
    write_libsvm_data(
        f,
        "$root/$targetId.bam.rbin100",
        ["$root/$cid.bam.rbin100" for cid in controlIds452],
        blockLimit=Inf, blockSize=1000,
        contextSize=4
    )
end

LoadError: interrupt
while loading In[20], in expression starting on line 1