In [None]:
using PureSeq
using DataStructures
using Gadfly
using GZip

In [4]:
type Poisson_regression
    #array of weights
    w
    #w0 constnat
    w_0::Float64
    #learning rate
    eta::Float64
    #decay rate for the learning rate 
    alpha::Float64
    #number of samples parsed through (will be incremnted automatically)
    n::Int64
    #number of features
    d::Int64
end

In [5]:
#constructor for convenience
function poisson_regression(;eta::Float64=0.0001, alpha::Float64=1.00, d::Int64=0)
    if d != 0
        model = Poisson_regression(zeros(d), 0.0, eta, alpha, 0, d)
    else
        model = Poisson_regression(nothing, 0.0, eta, alpha, 0, d)
    end
    
    #return the model
    model
end 
    

poisson_regression (generic function with 1 method)

In [6]:
#prediction using n examles (nxd matrix)
function predict(model::Poisson_regression, x::Array{Float64,2})
    linear_prediction = x*model.w+model.w_0
    prediction = exp(linear_prediction)
end

predict (generic function with 1 method)

In [7]:
#prediction using 1xd array 
function predict(model::Poisson_regression, x::Array{Float64,1})
    linear_prediction = x*model.w+model.w_0
    prediction = exp(linear_prediction)
end

predict (generic function with 2 methods)

In [12]:
#takes in nxd batch data as an input, conducts stochastic gradient descent
function fit(model::Poisson_regression, y::Array{Float64, 1}, x::Array{Float64, 2})
    #checking if y and x match in size
    if length(y)!=size(x)[1]
        return nothing
    end 
    
    #initiating weight array if necessary
    if model.d == 0
        model.d = size(x)[2]
        model.w = zeros(model.d)
    end
    
    #updating info (right now its just the number of examples parsed)
    model.n += length(y)
    num_data = length(y)
    
    #making prediction
    prediction = predict(model, x)
    
    #updating w_0
    model.w_0 = model.w_0 + model.eta*(sum(y-prediction, 1)[1]*1.0/num_data)
    
    #updating w
    model.w = model.w + model.eta*((transpose(x)*(y-prediction))/num_data)
    
    model
end 

fit (generic function with 1 method)

In [9]:
#Dense block object
type DenseBlocks
    readers::Array{Any}
    blockSize::Int64
end

In [10]:
#Dense block iterator
type DenseBlockIterator
    readers::Array{Any}
    blockSize::Int64
    blockWidth::Int64
    block::Array{Float64,2}
    offset::Int64
    done::Bool
    constantColumn::Bool
end

function denseblocks(readers, blockSize::Int64; constantColumn=false)
    #width is automatically generated by the size of readers array.
    blockWidth = constantColumn ? length(readers) + 1 : length(readers)
    DenseBlockIterator(readers, blockSize, blockWidth, zeros(Float64, blockSize, blockWidth), 0, false, constantColumn)
end

Base.start(it::DenseBlockIterator) = 0
Base.done(it::DenseBlockIterator, nil) = it.done

function Base.next(it::DenseBlockIterator, nil)
    it.done = true
    
    if it.constantColumn
        it.block[:,1:end-1] = 0 #set it back to 0 if needed 
    else
        it.block[:,:] = 0
    end

    # Fill in the block
    for i in 1:length(readers)
        reader = readers[i]

        #why do we have offset not it.offset here? (and also it.blockSize)
        while !reader.done && reader.position <= it.offset + it.blockSize
            #we want to log transform the reader.value using log(0.01+value) temporarily
            it.block[reader.position - it.offset, i] += 1 #not reader.value right?
            advance!(reader)
            it.done = false
        end
    end

    # See if we are really done or just found a blank block
    if it.done
        it.done = it.done && target.done
        for i in 1:length(readers)
            it.done = it.done && readers[i].done
        end
    end
    
    # update the offset
    it.offset += it.blockSize
    

    #log transform the block

    it.block[:,1], log(it.block[:,2:end] + 0.001)
end

next (generic function with 109 methods)

In [15]:
#this is a psuedo main function now

#making an array of bamfiles
#path of where the bam files are stored
root = "/scratch/hiranumn"

#reading in bam files
#target
target = BamReader("$root/ENCSR000AHE/ENCFF000QQG.bam", false, ReferenceContigs_hg38)
#controls 
c1 = BamReader("$root/ENCSR000AHE/ENCFF000QQG.bam", false, ReferenceContigs_hg38)
c2 = BamReader("$root/ENCSR000BVS/ENCFF000OXP.bam", false, ReferenceContigs_hg38)
readers = [target, c1, c2]

#create a dense block object for controls 
blocksize = 100000
db = denseblocks(readers, blocksize)

#create a dense block object for target

#creating a poission regressor
model = poisson_regression(eta=0.01)

#fit with poisson regression
itr = 1
itrlimit = 5000

while itr < itrlimit
    y, x = next(db, nil)
    model = fit(model, y, x)
    itr += 1
    if itr%100==0
        println("Iteration ", itr, " complete..")
        println("cur_weight", model.w)
    end
end 

println(blocksize*itrlimit," parsed.")

model 

    



Iteration 100 complete..
cur_weight[0.33703293703788434,0.309553597480865]
Iteration 200 complete..
cur_weight[0.3942148194910703,0.33785501505933657]
Iteration 300 complete..
cur_weight[0.43165249871283123,0.34520914931075497]
Iteration 400 complete..
cur_weight[0.4606544286076338,0.34764138397322475]
Iteration 500 complete..
cur_weight[0.48473605752518756,0.3468814751948659]
Iteration 600 complete..
cur_weight[0.5054484046119716,0.346744356946704]
Iteration 700 complete..
cur_weight[0.5234765498873708,0.3486239987076311]
Iteration 800 complete..
cur_weight[0.5391214268239045,0.35298945699526296]
Iteration 900 complete..
cur_weight[0.553050261605561,0.35278639099626724]
Iteration 1000 complete..
cur_weight[0.5659288864717692,0.3507264004445265]
Iteration 1100 complete..
cur_weight[0.578128790032471,0.3299439742785494]
Iteration 1200 complete..
cur_weight[0.5926118270270444,0.2887579845572454]
Iteration 1300 complete..
cur_weight[0.606404864629834,0.3007064702031472]
Iteration 1400 com

Poisson_regression([0.980274,0.0494543],-0.010645128901871245,0.01,1.0,499900000,2)