In [53]:
@everywhere using PureSeq

In [2]:
# Fit ridge-regularized least-squares weights that predict the binned target
# signal from the binned control signals (plus a constant column), streaming
# over the genome one dense block at a time.
#
# Arguments:
#   targetFile   - path of the target BAM file
#   controlFiles - collection of control BAM file paths
#   blockSize    - number of positions per dense block
#   blockLimit   - stop after this many blocks (Inf = until the iterator ends)
#
# Returns (betas, sseTest): the final weights, and for every processed block the
# sum of squared errors obtained with the weights fitted on all *earlier* blocks.
function train_against_controls(targetFile, controlFiles; blockSize=100000, blockLimit=Inf)

    # build our control feature maps
    readers = [BinningMap(BamReader(f, false, ReferenceContigs_hg38), 1000) for f in controlFiles]

    # our target reader
    target = BinningMap(BamReader(targetFile, false, ReferenceContigs_hg38), 1000)

    # we want to compute XtX and Xty block by block
    P = length(controlFiles) + 1
    XtX = zeros(P,P)
    Xty = zeros(P)

    # small ridge term that keeps the normal equations solvable before any
    # (or when too little) data has been accumulated
    ridge = 0.00001*eye(P)

    # report progress roughly ten times per run; clamp to at least 1 so a small
    # blockLimit cannot turn the modulus below into a division by zero
    totalBlocks = int(sum(ReferenceContigs_hg38.sizes)/blockSize)
    printInterval = max(1, int(min(blockLimit, totalBlocks)/10))

    # use denseblocks to iterate over blocks of the target and control data
    count = 0
    sseTest = Float64[]
    targetBlocks = denseblocks([target], blockSize)
    controlBlocks = denseblocks(readers, blockSize, constantColumn=true, loop=true)
    for (y, X) in zip(targetBlocks, controlBlocks)
        count += 1
        if count > blockLimit break end

        # compute the betas for this data up to this point and get our test error on the next block
        # (solve the linear system directly instead of forming an explicit inverse)
        betas = (XtX + ridge) \ Xty
        push!(sseTest, sum((y - X*betas).^2))

        # update our estimates
        XtX .+= X'*X
        Xty .+= X'*y

        if count % printInterval == 0
            println("Processed $(count*blockSize) positions...")
        end
    end
    betas = (XtX + ridge) \ Xty
    betas,sseTest
end

train_against_controls (generic function with 1 method)

In [54]:
# Directory holding the BAM files. Defined on every process (@everywhere)
# because "$root/..." paths are also evaluated inside the @parallel loop below.
@everywhere root = "/scratch2/slund1/ENCSR000BQS_notebook"
# Accession of the target experiment.
targetId = "ENCFF000NYK"
# Accessions of the seven control experiments used as regression features.
controlIds7 = [
    "ENCFF000ODO", # designated control
    "ENCFF000ODV", # another GM12878 control (rep1)
    "ENCFF000ODZ", # another GM12878 control (rep2)
    "ENCFF000OEB", # another GM12878 control (rep3)
    "ENCFF000NGK",
    "ENCFF000NGM",
    "ENCFF000NGN"
]
# Example invocation (disabled): train on the first 40 blocks only.
# @time betas7,sseTest7 = train_against_controls(
#     "$root/$targetId.bam",
#     ["$root/$id.bam" for id in controlIds7],
#     blockLimit=40
# )

7-element Array{ASCIIString,1}:
 "ENCFF000ODO"
 "ENCFF000ODV"
 "ENCFF000ODZ"
 "ENCFF000OEB"
 "ENCFF000NGK"
 "ENCFF000NGM"
 "ENCFF000NGN"

In [55]:
# Stream a BAM file through a BinningMap and dump every bin it yields to a flat
# binary file next to the input, named "<bamFile>.<f|r>bin<binSize>"
# ("r" when useReverseReads is true, "f" otherwise).
#
# Each record is two consecutive UInt32 values: the bin position, then the bin value.
# Defined with @everywhere so worker processes can call it.
@everywhere function convert_to_binned(bamFile, binSize, useReverseReads)
    bm = BinningMap(BamReader(bamFile, useReverseReads, ReferenceContigs_hg38), binSize)
    outPath = bamFile*"."*(useReverseReads ? "r" : "f")*"bin$binSize"

    # the do-block form closes the output file even if reading or writing throws
    open(outPath, "w") do out
        while !eof(bm)
            write(out, uint32(bm.position))
            write(out, uint32(bm.value))
            advance!(bm)
        end
    end
    # NOTE(review): the BamReader behind `bm` is never closed here — confirm
    # whether PureSeq expects an explicit close.
    nothing
end

In [56]:
# Accessions of every BAM file to convert to the binned on-disk format; each is
# resolved to "$root/$id.bam" by the conversion loop that follows.
idsToMap = [
    "ENCFF000NYK", # target
    "ENCFF000ODO", # designated control
    "ENCFF000ODV", # another GM12878 control (rep1)
    "ENCFF000ODZ", # another GM12878 control (rep2)
    "ENCFF000OEB", # another GM12878 control (rep3)
    "ENCFF000NGK",
    "ENCFF000NGM",
    "ENCFF000NGN",
    "ENCFF000QET",
    "ENCFF000QEU",
    "ENCFF000QFL",
    "ENCFF000QFS",
    "ENCFF000RCB",
    "ENCFF000RCC",
    "ENCFF000RCF",
    "ENCFF000RPT",
    "ENCFF000SAZ",
    "ENCFF000VPI",
    "ENCFF000VPK",
    "ENCFF000WIQ",
    "ENCFF000WIX",
    "ENCFF000WPT",
    "ENCFF000WPV",
    "ENCFF000XOO",
    "ENCFF000XRH",
    "ENCFF000XRI",
    "ENCFF000XTE",
    "ENCFF000YPF",
    "ENCFF000YPM",
    "ENCFF000YRC",
    "ENCFF000YRN",
    "ENCFF000ZSI",
    "ENCFF000ZVJ",
    "ENCFF000ZVV",
    "ENCFF001HAV",
    "ENCFF001HGV"
]

# Convert both strands of every file at 100bp bins, spread across the workers.
# A reducer-less @parallel loop returns immediately; @sync makes the cell wait
# until every conversion has finished, so later cells never read half-written files.
@sync @parallel for id in idsToMap
    convert_to_binned("$root/$id.bam", 100, false)
    convert_to_binned("$root/$id.bam", 100, true)
end

In [24]:
# Time one forward-strand conversion of the target file at 100bp bins.
@time convert_to_binned("$root/$targetId.bam", 100, false)

elapsed time: 33.6072523 seconds (1634475420 bytes allocated, 4.29% gc time)


In [49]:
# Scan a binned file (a sequence of records of two UInt32 values each) and
# return the number of records it contains.
#
# `binSize` is accepted for interface compatibility but is not used by the scan.
function read_binned(fileName, binSize)
    pair = zeros(Uint32, 2)   # reusable buffer for one two-value record
    count = 0

    # the do-block form closes the file even if a read throws
    # (e.g. on a truncated trailing record)
    open(fileName) do f
        while !eof(f)
            read!(f, pair)
            count += 1
        end
    end

    # hand the tally back to the caller instead of discarding it
    count
end

read_binned (generic function with 1 method)

In [50]:
# Time a full scan of the forward-strand 100bp binned file of the target.
@time read_binned("$root/ENCFF000NYK.bam.fbin100", 100)

5475880
elapsed time: 0.203038833 seconds (594768 bytes allocated)


In [None]:
# Cursor over a binned file: holds the open stream plus the most recently
# loaded bin.
type BinnedReader
    fileStream::IOStream   # open handle on the binned file (concrete type keeps field access type-stable)
    position::Int64        # position of the current bin; values <= 0 are treated as end-of-data by eof
    value::Float64         # value of the current bin
end

# Open `fileName` and return a BinnedReader already advanced onto its first bin.
#
# Accepts any string type (not only ASCIIString) so paths containing non-ASCII
# characters work as well.
function BinnedReader(fileName::String)
    f = open(fileName)
    br = BinnedReader(f, 0, 0.0)
    try
        advance!(br)
    catch err
        # do not leak the file handle when loading the first bin fails
        close(f)
        rethrow(err)
    end
    br
end
# Reader interface for BinnedReader. `close`, `position` and `eof` are Base
# functions, so they are extended explicitly rather than shadowed or rejected
# as un-imported method definitions.

# Close the underlying file (the field is `fileStream`; there is no `reader` field).
Base.close(fm::BinnedReader) = close(fm.fileStream)

# Value of the current bin.
# NOTE(review): if PureSeq exports its own `value`, this likely needs to be
# `PureSeq.value` to extend it — confirm.
value(fm::BinnedReader) = fm.value

# Position of the current bin.
Base.position(fm::BinnedReader) = fm.position

# A non-positive position marks the end of the data.
Base.eof(fm::BinnedReader) = fm.position <= 0

# Load the next (position, value) record from the binned file into `fm`.
#
# Records are two consecutive UInt32 values, the layout written by
# convert_to_binned. When the file is exhausted the position is set to -1,
# which eof(::BinnedReader) reports as end-of-data.
#
# Defined as a method of PureSeq.advance! so that the `advance!` already in
# scope from PureSeq gains BinnedReader support instead of being shadowed.
function PureSeq.advance!(fm::BinnedReader)
    if Base.eof(fm.fileStream)
        fm.position = -1
        fm.value = 0.0
        return nothing
    end

    # field assignment converts the UInt32 values to Int64 / Float64
    fm.position = read(fm.fileStream, Uint32)
    fm.value = read(fm.fileStream, Uint32)
    nothing
end