## Tools for building Henderson's Mixed Model Equations

Use of Gibbs sampler to compute posterior mean of effects 


In [1]:
include ("../../PedModule.jl/src/PedModule.jl")
using DataFrames


Use "include(" instead.
  likely near /Users/rohan/Box Sync/GitHub/PedModule.jl/src/PedModule.jl:189
  likely near /Users/rohan/Box Sync/GitHub/PedModule.jl/src/PedModule.jl:209


In [2]:
df = readtable("MTData", separator = ' ')

Unnamed: 0,y1,y2,y3,trt
1,1.0,2.0,1.2,1
2,1.1,,3.1,1
3,0.9,1.9,,2
4,1.2,1.7,1.9,2


In [3]:
randn()

-0.5318231998421711

In [12]:
function Gibbs(A,x,b,nIter;outFreq=100)
    n = size(x,1)
    xMean = zeros(n)
    for iter = 1:nIter
        if iter%outFreq==0
            println("at sample: ",iter)
        end
        for i=1:n
            cVarInv = 1.0/A[i,i]
            cMean   = cVarInv*(b[i] - A[:,i]'x)[1,1] + x[i]
            x[i]    = randn()*sqrt(cVarInv) + cMean 
        end
        xMean += (x - xMean)/iter
    end
    return xMean
end


function GaussSeidel(A,x,b;tol=0.000001)
    n = size(x,1)
    for i=1:n
        x[i] = ((b[i] - A[:,i]'x)/A[i,i])[1,1] + x[i]
    end
    diff = sum((A*x-b).^2)
    iter = 0
    while ((diff/n > tol) & (iter<1000))
        iter += 1
        for i=1:n
            x[i] = ((b[i] - A[:,i]'x)/A[i,i])[1,1] + x[i]
        end
        diff = sum((A*x-b).^2)
        println(iter," ",diff/n)
    end
    return x
end

function Jacobi(A,x,b,p;tol=0.000001)
    D       = diag(A)
    res     = A*x
    resid   = b-res
    tempSol = resid./D
    diff    = sum(resid.^2)
    n    = size(A,1)
    iter = 0
    while ((diff/n > tol) & (iter<1000))
        iter += 1
        x = p*tempSol + (1-p)*x
        res     = A*x
        resid   = b-res
        tempSol = resid./D + x
        diff    = sum(resid.^2)
        println(iter," ",diff/n)
    end
    return x
end


type TermStrVal
    str::AbstractString
    value::Float64
end

type TermLvlVal
    level::AbstractString
    value::Float64
end

type ModelTerm 
    iModel::Int64
    trmStr::AbstractString
    nFactors::Int64
    factors::Array{Symbol,1}
    str::Array{AbstractString,1}                    # used to store the data for this term as strings
    val::Array{Float64,1}
    startPos::Int64                         # start pos in HMME
    nLevels::Int64                           
    X::SparseMatrixCSC{Float64,Int64}
    names::Array{Any,1}
end
type MME
    modelVec::Array{AbstractString,1}
    modelTerms::Array{ModelTerm,1}
    modelTermDict::Dict{AbstractString,ModelTerm}
    lhsVec::Array{Symbol,1}
    covVec::Array{Symbol,1}
    pedTrmVec::Array{AbstractString,1}
    mmeLhs
    mmeRhs
    ped
    Gi::Array{Float64,2}
    R::Array{Float64,2}
    Ai
    mmePos::Int64
end

type ResVar
    R0::Array{Float64,2}
    RiDict::Dict{BitArray{1},Array{Float64,2}}
end   

function mkDict(a)
    aUnique = unique(a)
    d = Dict()
    names = Array(Any,size(aUnique,1))
    for (i,s) in enumerate(aUnique)
        names[i] = s
        d[s] = i
    end
    return d,names
end

function getRi(resVar::ResVar,sel::BitArray{1})
    if haskey(resVar.RiDict,sel)
        return resVar.RiDict[sel]
    end
    n = size(resVar.R0,1)
    RZ = zeros(n,n)
    RZ[sel,sel] = inv(resVar.R0[sel,sel])
    resVar.RiDict[sel] = RZ
    return RZ
end

function getTerm(trmStr,m)
    trm = ModelTerm(m,string(m)*":"*trmStr,0,[],[],[],0,0,spzeros(0,0),[])
    factorVec = split(trmStr,"*")
    trm.nFactors = length(factorVec)
    trm.factors = [symbol(strip(f)) for f in factorVec]
    return trm
end

function initMME(models::AbstractString,R::Array{Float64,2})
    # returns an MME object for muilding the mme corresponding 
    # to the input string
    if models==""
        println("modelEquation is empty\n")
        return
    end
    modelVec = split(models,[';','\n'],keep=false)
    nModels  = size(modelVec,1)
    lhsVec   = Symbol[]
    modelTerms = ModelTerm[]
    dict = Dict{AbstractString,ModelTerm}()
    for (m,model) = enumerate(modelVec)
        lhsRhs = split(model,"=")
        lhsVec = [lhsVec;symbol(strip(lhsRhs[1]))]
        rhs = strip(lhsRhs[2])
        rhsVec = split(rhs,"+")    
        mTrms = [getTerm(strip(trmStr),m) for trmStr in rhsVec]
        modelTerms = [modelTerms; mTrms]
        for (i,trm) = enumerate(modelTerms) 
            dict[trm.trmStr] = modelTerms[i]
        end 
    end
    return MME(modelVec,modelTerms,dict,lhsVec,[],[],0,0,0,Array(Float64,1,1),R,0,1)
end 

function getData(trm::ModelTerm,df::DataFrame,mme::MME)
    nObs = size(df,1)
    trm.str = Array(AbstractString,nObs)
    trm.val = Array(Float64,nObs)
    if(trm.factors[1] == :intercept)                     # from Melanie's HW
        str = fill(string(trm.factors[1]),nObs)
        val = fill(1.0,nObs)
    else
        myDf = df[trm.factors]
        if trm.factors[1] in mme.covVec
            str = fill(string(trm.factors[1]),nObs)
            val = df[trm.factors[1]]
        else
            str = [string(i) for i in df[trm.factors[1]]]
            val = fill(1.0,nObs)
        end
        for i=2:trm.nFactors
            if trm.factors[i] in mme.covVec
                str = str .* fill("*"*string(trm.factors[i]),nObs)
                val = val .* df[trm.factors[i]]
            else
                str = str .* fill("*",nObs) .* [string(j) for j in df[trm.factors[i]]]
                val = val .* fill(1.0,nObs)
            end
        end
    end
    trm.str = str
    trm.val = val
end

getFactor1(str) = [strip(i) for i in split(str,"*")][1]

function getX(trm,mme::MME)
    pedSize = 0
    nObs  = size(trm.str,1)
    if trm.trmStr in mme.pedTrmVec
        trm.names   = PedModule.getIDs(mme.ped)
        trm.nLevels = length(mme.ped.idMap)
        xj = int([mme.ped.idMap[getFactor1(i)].seqID for i in trm.str])
    else
        dict,trm.names  = mkDict(trm.str)
        trm.nLevels     = length(dict)
        xj    = round(Int64,[dict[i] for i in trm.str]) 
    end
    xi    = (trm.iModel-1)*nObs + collect(1:nObs)
    xv    = trm.val
    if mme.ped!=0
        pedSize = length(mme.ped.idMap)
        if trm.trmStr in mme.pedTrmVec
            # This is to ensure the X matrix for 
            # additive effect has the correct number of columns
            ii = 1         # adding a zero to
            jj = pedSize   # the last column in row 1
            vv = [0.0]
            xi = [xi;ii]
            xj = [xj;jj]
            xv = [xv;vv]
        end
    end 
    #make sure X has nObs*nModels rows
    nModels = size(mme.lhsVec,1)
    xi = [xi;1;nObs*nModels]
    xj = [xj;1;1]
    xv = [xv;0;0]
    trm.X = sparse(xi,xj,xv)
    trm.startPos = mme.mmePos
    mme.mmePos  += trm.nLevels
end

function mkRi(mme::MME,df::DataFrame)
    resVar = ResVar(mme.R,Dict())
    tstMsng = !isna(df[mme.lhsVec[1]])
    for i=2:size(mme.lhsVec,1)
        tstMsng = [tstMsng !isna(df[mme.lhsVec[i]])]
    end
    n    = size(tstMsng,2)
    nObs = size(tstMsng,1)
    ii = Array(Int64,nObs*n^2)
    jj = Array(Int64,nObs*n^2)
    vv = Array(Float64,nObs*n^2)
    pos = 1
    for i=1:size(tstMsng,1)
        sel = reshape(tstMsng[i,:],n)
        Ri  = getRi(resVar,sel)
        for ti=1:n
            tii = (ti-1)*nObs + i
            for tj=1:n
                tjj = (tj-1)*nObs + i
                ii[pos] = tii
                jj[pos] = tjj
                vv[pos] = Ri[ti,tj]
                pos += 1
            end
        end         
    end
    return sparse(ii,jj,vv)
end

function getMME(mme::MME, df::DataFrame)
    mme.mmePos = 1
    for trm in mme.modelTerms
        getData(trm,df,mme)
        getX(trm,mme)
    end
    n   = size(mme.modelTerms,1)
    trm = mme.modelTerms[1]
    X   = trm.X
    for i=2:n
        trm = mme.modelTerms[i]
        X = [X trm.X]
    end
    y = convert(Array,df[mme.lhsVec[1]],0.0)
    for i=2:size(mme.lhsVec,1)
        y    = [y, convert(Array,df[mme.lhsVec[i]],0.0)] 
    end
    N  = size(y,1)
    ii = 1:N
    jj = fill(1,N)
    vv = y
    ySparse = sparse(ii,jj,vv)
    nObs = size(df,1)
    Ri = mkRi(mme,df)
    mme.mmeLhs = X'Ri*X
    mme.mmeRhs = X'Ri*ySparse
    if mme.ped != 0
        ii,jj,vv = PedModule.HAi(mme.ped)
        HAi = sparse(ii,jj,vv)
        mme.Ai = HAi'HAi
        addA(mme::MME)
    end   
end

function getNames(mme)
    names = Array(String,0)
    for trm in mme.modelTerms
        for name in trm.names
            push!(names,trm.trmStr*": "*name)
        end
    end
    return names
end  

function covList(mme::MME, covStr::AbstractString)
    covVec = split(covStr," ",false) 
    mme.covVec = [symbol(i) for i in covVec]
    nothing
end

function getSolJ(mme::MME, df::DataFrame)
    if size(mme.mmeRhs)==() 
        getMME(mme,df)
    end
    p = size(mme.mmeRhs,1)
    return [getNames(mme) Jacobi(mme.mmeLhs,fill(0.0,p),mme.mmeRhs,0.3,tol=0.000001)]
end

function getSolG(mme::MME, df::DataFrame)
    if size(mme.mmeRhs)==() 
        getMME(mme,df)
    end
    p = size(mme.mmeRhs,1)
    return [getNames(mme) GaussSeidel(mme.mmeLhs,fill(0.0,p),mme.mmeRhs,tol=0.000001)]
end

function setAsRandom(mme::MME,randomStr::AbstractString,ped::PedModule.Pedigree, G::Array{Float64,2})
    pedTrmVec = split(randomStr," ",keep=false)
    res = []
    for trm in pedTrmVec
        for (m,model) = enumerate(mme.modelVec)
            strVec  = split(model,['=','+'])
            strpVec = [strip(i) for i in strVec]
            if trm in strpVec
                res = [res;string(m)*":"*trm]
            end
        end
    end
    mme.pedTrmVec = res
    mme.ped = ped
    mme.Gi = inv(G)
    nothing
end

function addA(mme::MME)
    pedTrmVec = mme.pedTrmVec
    for (i,trmi) = enumerate(pedTrmVec)
        pedTrmi  = mme.modelTermDict[trmi]
        startPosi  = pedTrmi.startPos
        endPosi    = startPosi + pedTrmi.nLevels - 1
        for (j,trmj) = enumerate(pedTrmVec)
            pedTrmj  = mme.modelTermDict[trmj]
            startPosj  = pedTrmj.startPos
            endPosj    = startPosj + pedTrmj.nLevels - 1  
            mme.mmeLhs[startPosi:endPosi,startPosj:endPosj] = 
            mme.mmeLhs[startPosi:endPosi,startPosj:endPosj] + mme.Ai*mme.Gi[i,j] 
        end
    end
end

addA (generic function with 1 method)

### One-Way Model 

In [13]:
R0 = Array(Float64,1,1)
R0[1,1] = 1.0
models = "y1 = intercept + trt"
mme = initMME(models,R0);

In [14]:
getSolG(mme, df)

  likely near In[14]:1
  likely near In[14]:1
  likely near In[14]:1


3x2 Array{Any,2}:
 "1:intercept: intercept"  1.05
 "1:trt: 1"                0.0 
 "1:trt: 2"                0.0 

In [15]:
sol = GaussSeidel(mme.mmeLhs,fill(0.0,3),mme.mmeRhs,tol=0.000001)
mme.mmeLhs*sol

3-element Array{Float64,1}:
 4.2
 2.1
 2.1

In [16]:
full(mme.mmeLhs)

3x3 Array{Float64,2}:
 4.0  2.0  2.0
 2.0  2.0  0.0
 2.0  0.0  2.0

In [17]:
mme.mmeRhs

3x1 sparse matrix with 3 Float64 entries:
	[1, 1]  =  4.2
	[2, 1]  =  2.1
	[3, 1]  =  2.1

In [18]:
nIter = 500
solGibbs = Gibbs(mme.mmeLhs,sol,mme.mmeRhs,nIter)
mme.mmeLhs*solGibbs

at sample: 100

3-element Array{Float64,1}:
 4.21441
 2.06226
 2.15215


at sample: 200
at sample: 300
at sample: 400
at sample: 500


In [19]:
nIter = 5000
solGibbs = Gibbs(mme.mmeLhs,sol,mme.mmeRhs,nIter)
mme.mmeLhs*solGibbs

at sample: 100

3-element Array{Float64,1}:
 4.14195
 2.07304
 2.06891


at sample: 200
at sample: 300
at sample: 400
at sample: 500
at sample: 600
at sample: 700
at sample: 800
at sample: 900
at sample: 1000
at sample: 1100
at sample: 1200
at sample: 1300
at sample: 1400
at sample: 1500
at sample: 1600
at sample: 1700
at sample: 1800
at sample: 1900
at sample: 2000
at sample: 2100
at sample: 2200
at sample: 2300
at sample: 2400
at sample: 2500
at sample: 2600
at sample: 2700
at sample: 2800
at sample: 2900
at sample: 3000
at sample: 3100
at sample: 3200
at sample: 3300
at sample: 3400
at sample: 3500
at sample: 3600
at sample: 3700
at sample: 3800
at sample: 3900
at sample: 4000
at sample: 4100
at sample: 4200
at sample: 4300
at sample: 4400
at sample: 4500
at sample: 4600
at sample: 4700
at sample: 4800
at sample: 4900
at sample: 5000


In [20]:
nIter = 50000
solGibbs = Gibbs(mme.mmeLhs,sol,mme.mmeRhs,nIter)
mme.mmeLhs*solGibbs

at sample: 100

3-element Array{Float64,1}:
 4.20914
 2.10865
 2.10049


at sample: 200
at sample: 300
at sample: 400
at sample: 500
at sample: 600
at sample: 700
at sample: 800
at sample: 900
at sample: 1000
at sample: 1100
at sample: 1200
at sample: 1300
at sample: 1400
at sample: 1500
at sample: 1600
at sample: 1700
at sample: 1800
at sample: 1900
at sample: 2000
at sample: 2100
at sample: 2200
at sample: 2300
at sample: 2400
at sample: 2500
at sample: 2600
at sample: 2700
at sample: 2800
at sample: 2900
at sample: 3000
at sample: 3100
at sample: 3200
at sample: 3300
at sample: 3400
at sample: 3500
at sample: 3600
at sample: 3700
at sample: 3800
at sample: 3900
at sample: 4000
at sample: 4100
at sample: 4200
at sample: 4300
at sample: 4400
at sample: 4500
at sample: 4600
at sample: 4700
at sample: 4800
at sample: 4900
at sample: 5000
at sample: 5100
at sample: 5200
at sample: 5300
at sample: 5400
at sample: 5500
at sample: 5600
at sample: 5700
at sample: 5800
at sample: 5900
at sample: 6000
at sample: 6100
at sample: 6200
at sample: 6300
at sample: 6400

In [21]:
ped = PedModule.mkPed("sim.ped");

In [22]:
ped.idMap

Dict{Any,Any} with 4200 entries:
  "10467" => PedModule.PedNode(3,"10040","10069",0.0)
  "11346" => PedModule.PedNode(6,"10009","10086",0.0)
  "15267" => PedModule.PedNode(13,"10269","10276",0.0)
  "10251" => PedModule.PedNode(15,"10046","10069",0.0)
  "10600" => PedModule.PedNode(18,"10015","10083",0.0)
  "15312" => PedModule.PedNode(25,"10311","10323",0.0)
  "15124" => PedModule.PedNode(29,"10122","10126",0.0)
  "16868" => PedModule.PedNode(35,"11949","11957",0.0)
  "10249" => PedModule.PedNode(38,"10038","10064",0.0)
  "11309" => PedModule.PedNode(41,"10013","10087",0.0)
  "11404" => PedModule.PedNode(44,"10017","10059",0.0)
  "11961" => PedModule.PedNode(45,"10013","10097",0.0)
  "15767" => PedModule.PedNode(51,"10794","10801",0.0)
  "15270" => PedModule.PedNode(54,"10269","10279",0.0)
  "16846" => PedModule.PedNode(59,"11928","11934",0.0)
  "15711" => PedModule.PedNode(65,"10731","10742",0.0)
  "10388" => PedModule.PedNode(67,"10023","10066",0.0)
  "15283" => PedModule.PedNode(74,

In [23]:
df2 = readtable("sim.phe", separator = ' ', names = [:Animal,:y1,:y2])

Unnamed: 0,Animal,y1,y2
1,10102,816.683,904.881
2,10103,838.391,895.255
3,10104,860.443,943.387
4,10105,473.299,827.87
5,10106,631.668,827.87
6,10107,680.528,847.123
7,10108,780.235,818.244
8,10109,962.951,895.255
9,10110,691.823,789.364
10,10111,420.898,847.123


In [24]:
models = "y1 = intercept + Animal;
          y2 = intercept + Animal"
R = [10 2; 2 1.0]
mme = initMME(models,R)
G0 = [5 1;1 1.0]
setAsRandom(mme,"Animal", ped,G0)


In [26]:
getMME(mme,df2)
p = size(mme.mmeLhs,1)



8402

 in depwarn at deprecated.jl:73
 in int at deprecated.jl:276
 in getX at In[12]:195
 in getMME at In[12]:259
 in include_string at loading.jl:266
 in execute_request_0x535c5df2 at /Users/rohan/.julia/v0.4/IJulia/src/execute_request.jl:177
 in eventloop at /Users/rohan/.julia/v0.4/IJulia/src/IJulia.jl:141
 in anonymous at task.jl:447
while loading In[26], in expression starting on line 1
 in depwarn at deprecated.jl:73
 in vect at abstractarray.jl:32
 in getMME at In[12]:270
 in include_string at loading.jl:266
 in execute_request_0x535c5df2 at /Users/rohan/.julia/v0.4/IJulia/src/execute_request.jl:177
 in eventloop at /Users/rohan/.julia/v0.4/IJulia/src/IJulia.jl:141
 in anonymous at task.jl:447
while loading In[26], in expression starting on line 1


In [27]:
sol = GaussSeidel(mme.mmeLhs,fill(0.0,p),mme.mmeRhs,tol=0.000001)
nothing

1 9.35653850010938e6
2 1.498145111538499e6
3 240291.01023671633
4 38688.13273444056
5 6299.250706668958
6 1064.4339792897754
7 203.60856781332572
8 54.1827858705337
9 23.59441047327077
10 14.46725706310057
11 10.12917838213495
12 7.395065296640927
13 5.477828326400974
14 4.084689040973578
15 3.0593299288979128
16 2.300737126771161
17 1.738223374429457
18 1.3206586052019962
19 1.0105096337809147
20 0.7800340021865547
21 0.6086604596345528
22 0.4811148978165983
23 0.38605236390383074
24 0.315048010977928
25 0.261848822207481
26 0.22181723062857428
27 0.19151692591675293
28 0.1684043752996519
29 0.15059908846299322
30 0.13671257308964224
31 0.12572105100701417
32 0.11687079820971702
33 0.1096077959361424
34 0.10352549417329235
35 0.0983260674891208
36 0.0937916979372287
37 0.0897633310580548
38 0.08612497041842879
39 0.08279209207984785
40 0.07970311281688093
41 0.0768131186960571
42 0.07408926815593504
43 0.07150743042476106
44 0.06904973540480762
45 0.06670279484466682
46 0.064456415546

In [28]:
nIter = 500
solGibbs = Gibbs(mme.mmeLhs,sol,mme.mmeRhs,nIter)
[mme.mmeLhs*solGibbs mme.mmeRhs]

at sample: 100
at sample: 200
at sample: 300
at sample: 400
at sample: 500

8402x2 Array{Float64,2}:
   -5.84109e5    -5.8411e5
   -0.0431949     0.0     
   -0.0418085     0.0     
 -123.268      -123.285   
   -0.017898      0.0     
   -0.0105783     0.0     
 -166.627      -166.581   
    0.051062      0.0     
    0.0482699     0.0     
 -175.686      -175.919   
   -0.0284482     0.0     
   -0.0478879     0.0     
 -208.23       -208.198   
    ⋮                     
  991.704       991.65    
 1148.32       1148.4     
 1128.84       1128.9     
 1181.41       1181.43    
 1009.35       1009.41    
 1156.02       1156.2     
 1075.46       1075.4     
 1250.43       1250.42    
 1214.82       1214.78    
 1096.17       1096.2     
 1320.46       1320.34    
 1109.36       1109.33    


