## Tools for building Henderson's Mixed Model Equations

This notebook shows how Henderson's mixed model equations (HMME) can be built from a data set and a model string.


In [1]:
using PedModule

In [2]:
# Relative path of the pedigree file that is printed and then read by PedModule.mkPed below.
FILE = "../../data/small.ped";

In [3]:
;cat $FILE

S1 0 0
D1 0 0
O1 S1 D1
O2 S1 D1
O3 S1 D1


In [4]:
# Build the pedigree object from the file at FILE; the trailing `;` suppresses the cell output.
ped = PedModule.mkPed(FILE);

In [5]:
# Display the pedigree's lookup table from animal ID to its PedNode record.
ped.idMap

Dict{Any,Any} with 5 entries:
  "O1" => PedModule.PedNode(3,"S1","D1",0.0)
  "S1" => PedModule.PedNode(1,"0","0",0.0)
  "O3" => PedModule.PedNode(4,"S1","D1",0.0)
  "D1" => PedModule.PedNode(2,"0","0",0.0)
  "O2" => PedModule.PedNode(5,"S1","D1",0.0)

In [6]:
# HAi returns three vectors; getMME later passes such a triple to `sparse(ii,jj,vv)`
# and forms HAi'HAi from the result.
ii,jj,vv = PedModule.HAi(ped);

### A small data set to test the tools

In [7]:
# Raw columns for the small test data set: one record for each of four animals.
A = [1,1,1,1]                      # intercept column (all ones)
B = ["S1","D1","O1","O3"]          # animal IDs; O2 is in the pedigree file but has no record here
y = [100.0, 50.0, 150.0, 40.0];    # observations

In [8]:
using DataFrames

In [9]:
# Assemble the columns into a data frame with one row per animal.
df1 = DataFrame(intercept=A,Animal=B,y=y)

Unnamed: 0,intercept,Animal,y
1,1,S1,100.0
2,1,D1,50.0
3,1,O1,150.0
4,1,O3,40.0


### Data with repeated measures

In [10]:
# Three records per animal (ages 1, 2, 3) with random observations.
Animal = repeat(B,inner=[3])       # each ID in B repeated three times in a row
Age = repmat([1,2,3],4)            # the age pattern 1,2,3 tiled once per animal
# NOTE(review): `intercept` is not added to df2 below; getData fills the intercept term itself.
intercept = ones(12,1)
# y is drawn with randn, so the values change on every run.
df2 = DataFrame(Animal=Animal,Age=Age,y=randn(12))

Unnamed: 0,Animal,Age,y
1,S1,1,0.2647053315154632
2,S1,2,-0.0731311110739266
3,S1,3,0.0425940328250158
4,D1,1,1.5332740260431432
5,D1,2,1.4164743253084766
6,D1,3,-0.7860008547136742
7,O1,1,-0.2858582188035473
8,O1,2,-0.2046883311927751
9,O1,3,0.8209338615854505
10,O3,1,-0.8070041989165463


### Data with maternal effect

In [11]:
# Rows 3-4 of df1 (the offspring O1 and O3) joined column-wise with a `mat` column
# that names D1 for both records.
df3 = [df1[3:4,:]  DataFrame(mat = ["D1","D1"])]

Unnamed: 0,intercept,Animal,y,mat
1,1,O1,150.0,D1
2,1,O3,40.0,D1


In [1]:
include("../../Module/solver.jl");

### <font color="red">The types and functions below are the same as (or very similar to) the earlier versions.</font>

In [27]:
# Map the distinct values of `a` to consecutive integer codes.
#
# Arguments:
#   a - any collection accepted by `unique` (e.g. the level strings of a model term)
# Returns the tuple `(d, names)`:
#   d     - Dict mapping each distinct value to its 1-based position in `names`
#   names - Array{Any,1} holding the distinct values in the order `unique` returns them
function mkDict(a)
    # A typed comprehension replaces the uninitialised buffer plus manual copy loop and
    # keeps the element type Any that callers store in ModelTerm.names.
    names = Any[s for s in unique(a)]
    d = Dict()
    for (i,s) in enumerate(names)
        d[s] = i
    end
    return d,names
end

# Build the labels of all equations in the order of mme.modelTerms.
#
# Each label has the form "<trmStr>: <level name>", e.g. "Animal: S1".
# Arguments:
#   mme - object with a `modelTerms` collection whose items have `trmStr` and `names`
# Returns an Array{AbstractString,1} with one label per level of every term.
function getNames(mme)
    names = AbstractString[]
    for trm in mme.modelTerms
        for name in trm.names
            # `string` also accepts level names that are not strings (names is Array{Any,1}).
            push!(names,string(trm.trmStr,": ",name))
        end
    end
    return names
end

# Create the ModelTerm for one term of the model equation.
#
# `trmStr` is split on "*"; each piece is stripped and turned into a symbol, so
# "Animal*Age" yields the factors [:Animal, :Age]. All data-dependent fields
# (str, val, startPos, nLevels, X, names) start out empty or zero.
function getTerm(trmStr)
    factorStrs = split(trmStr,"*")
    factorSyms = Symbol[symbol(strip(f)) for f in factorStrs]
    return ModelTerm(trmStr,length(factorStrs),factorSyms,[],[],0,0,spzeros(0,0),[])
end

# Parse a model equation such as "y = intercept + Age + Animal*Age" into an MME object.
#
# The text left of "=" becomes the response symbol; the text right of it is split on "+"
# and each piece is turned into a ModelTerm by getTerm. The terms are stored both as a
# vector (model order) and in a dictionary keyed by the stripped term string.
#
# Raises an error when the equation is empty or contains no "=".
function initMME(modelEquation::AbstractString)
    if modelEquation==""
        error("modelEquation is empty\n")
    end
    lhsRhs = split(modelEquation,"=")
    # Without this check a missing "=" surfaced as a bare BoundsError on lhsRhs[2].
    if length(lhsRhs) < 2
        error("modelEquation has no \"=\": " * modelEquation * "\n")
    end
    lhs = symbol(strip(lhsRhs[1]))
    rhs = strip(lhsRhs[2])
    rhsVec = split(rhs,"+")
    modelTerms = [getTerm(strip(trmStr)) for trmStr in rhsVec]
    dict = Dict{AbstractString,ModelTerm}()
    for trm in modelTerms
        dict[trm.trmStr] = trm
    end
    # Placeholders: empty covVec and pedTrmVec, 0 for mmeLhs/mmeRhs/ped/Ai,
    # an uninitialised 1x1 Gi, and mmePos starting at 1.
    return MME(modelEquation,modelTerms,dict,lhs,[],[],0,0,0,Array(Float64,1,1),0,1)
end

# Fill trm.str (level label per observation) and trm.val (numeric value per observation)
# for one model term from the data frame.
#
# * A term whose first factor is :intercept gets the label "intercept" and the value 1.0
#   for every row; no column of that name is read from `df`.
# * A factor listed in mme.covVec is a covariate: its label is the factor name and its
#   column values multiply into trm.val.
# * Any other factor is a class factor: its label is the row's column value as a string
#   and it contributes 1.0.
# Labels of several factors are joined with " x ".
#
# The unused sub-frame, the arrays that were allocated and then immediately overwritten,
# and the multiplication by a vector of ones have been removed.
function getData(trm,df::DataFrame,mme::MME)
    nObs = size(df,1)

    if trm.factors[1] == :intercept
        trm.str = fill(string(trm.factors[1]),nObs)
        trm.val = fill(1.0,nObs)
        return
    end

    if trm.factors[1] in mme.covVec
        str = fill(string(trm.factors[1]),nObs)
        val = df[trm.factors[1]]
    else
        str = [string(i) for i in df[trm.factors[1]]]
        val = fill(1.0,nObs)
    end
    for i=2:trm.nFactors
        if trm.factors[i] in mme.covVec
            str = str .* fill(" x "*string(trm.factors[i]),nObs)
            val = val .* df[trm.factors[i]]
        else
            # A class factor only extends the label; it leaves val unchanged.
            str = str .* fill(" x ",nObs) .* [string(j) for j in df[trm.factors[i]]]
        end
    end
    trm.str = str
    trm.val = val
end

# Declare which factors are covariates.
#
# `covStr` is a space-separated list of factor names (e.g. "Age"); empty pieces are
# dropped and the rest are stored as symbols in mme.covVec. Returns nothing.
function covList(mme::MME, covStr::AbstractString)
    mme.covVec = [symbol(tok) for tok in split(covStr," ",keep=false)]
    return nothing
end

# Solve the mixed model equations with `Jacobi` and return the solutions next to
# their equation labels as a two-column array.
#
# `Jacobi` is not defined in this block (presumably it comes from the included solver.jl).
# It is called with a zero start vector, 0.3 as its fourth positional argument and
# tol=0.000001.
function getSolJ(mme::MME, df::DataFrame)
    # mmeRhs is still the scalar placeholder (size `()`) until getMME has been run.
    size(mme.mmeRhs)==() && getMME(mme,df)
    nEq      = size(mme.mmeRhs,1)
    start    = fill(0.0,nEq)
    solution = Jacobi(mme.mmeLhs,start,mme.mmeRhs,0.3,tol=0.000001)
    return [getNames(mme) solution]
end

# Solve the mixed model equations with `GaussSeidel` and return the solutions next to
# their equation labels as a two-column array.
#
# `GaussSeidel` is not defined in this block (presumably it comes from the included
# solver.jl). It is called with a zero start vector and tol=0.000001.
function getSolG(mme::MME, df::DataFrame)
    # mmeRhs is still the scalar placeholder (size `()`) until getMME has been run.
    size(mme.mmeRhs)==() && getMME(mme,df)
    nEq      = size(mme.mmeRhs,1)
    start    = fill(0.0,nEq)
    solution = GaussSeidel(mme.mmeLhs,start,mme.mmeRhs,tol=0.000001)
    return [getNames(mme) solution]
end

getSolG (generic function with 1 method)

### Types

In [26]:
#type TermStrVal
#    str::AbstractString
#    value::Float64
#end

#type TermLvlVal
#    level::AbstractString
#    value::Float64
#end

# One term of the model equation (e.g. "Animal" or "Animal*Age").
# getTerm creates it with empty data fields; getData fills str/val and getX fills
# startPos, nLevels, X and names.
type ModelTerm 
    trmStr::AbstractString                  # term text as passed to getTerm
    nFactors::Int64                         # number of "*"-separated factors in trmStr
    factors::Array{Symbol,1}                # the factors as symbols
    str::Array{AbstractString,1}            # used to store the data for this term as strings
    val::Array{Float64,1}                   # numeric value of the term for each observation
    startPos::Int64                         # start pos in HMME
    nLevels::Int64                          # number of levels (columns of X) for this term
    X::SparseMatrixCSC{Float64,Int64}       # sparse matrix built by getX, one row per observation
    names::Array{Any,1}                     # level names, in column order of X
end

# State needed to build and solve the mixed model equations for one model.
# initMME creates it; covList, setAsRandom and getMME fill in the remaining fields.
type MME
    modelEquation::AbstractString                  # the model string as given to initMME
    modelTerms::Array{ModelTerm,1}                 # terms in the order they appear in the equation
    modelTermDict::Dict{AbstractString,ModelTerm}  # the same terms, keyed by their trmStr
    lhs::Symbol                                    # name of the response column
    covVec::Array{Symbol,1}                        # factors treated as covariates (set by covList)
    pedTrmVec::Array{AbstractString,1}             # terms given to setAsRandom
    mmeLhs                                         # X'X plus the contributions added by addA; 0 until getMME runs
    mmeRhs                                         # X'y; 0 until getMME runs
    ped                                            # pedigree object; 0 until setAsRandom is called
    Gi::Array{Float64,2}                           # inverse of the G matrix passed to setAsRandom
    Ai                                             # HAi'HAi, computed in getMME when a pedigree is set
    mmePos::Int64                                  # next free equation position; getX advances it by nLevels
end

### Functions

In [15]:
# NOTE(review): this repeats, field for field, the ModelTerm definition given in the
# "Types" section earlier in the file.
# One term of the model equation; getData fills str/val, getX fills the remaining data fields.
type ModelTerm 
    trmStr::AbstractString                  # term text, e.g. "Animal*Age"
    nFactors::Int64                         # number of "*"-separated factors
    factors::Array{Symbol,1}                # the factors as symbols
    str::Array{AbstractString,1}            # used to store the data for this term as strings
    val::Array{Float64,1}                   # numeric value of the term for each observation
    startPos::Int64                         # start pos in HMME
    nLevels::Int64                          # number of levels of the term
    X::SparseMatrixCSC{Float64,Int64}       # sparse matrix assigned in getX
    names::Array{Any,1}                     # level names assigned in getX
end

In [16]:
# Return the first factor level of a term label, e.g. "S1 x Age" -> "S1".
# getData joins factor labels with " x ", so the label is split on that exact separator.
# Splitting on the bare letter "x" truncated any ID that contains an "x" ("Rex" -> "Re").
# NOTE(review): this assumes the animal is the first factor of the term; a term written
# as Age*Animal would need a different lookup.
getFactor1(str) = strip(split(str," x ")[1])

# Build the sparse matrix trm.X for one model term and reserve its block of equations.
#
# Rows are observations. For a term listed in mme.pedTrmVec the columns are all animals
# of the pedigree (column = seqID), so animals without records still get a column.
# For any other term the columns are the distinct labels in trm.str, numbered by mkDict.
#
# Side effects: sets trm.names, trm.nLevels, trm.X and trm.startPos, and advances
# mme.mmePos by trm.nLevels.
function getX(trm::ModelTerm,mme::MME)
    nObs  = size(trm.str,1)
    if trm.trmStr in mme.pedTrmVec #"Animal"
        trm.names   = PedModule.getIDs(mme.ped)
        trm.nLevels = length(mme.ped.idMap)
        xj = round(Int64,[mme.ped.idMap[getFactor1(i)].seqID for i in trm.str]) #column index
    else
        dict,trm.names  = mkDict(trm.str)
        trm.nLevels     = length(dict)
        xj    =  round(Int64,[dict[i] for i in trm.str])
    end
    xi    = 1:nObs  #row index
    xv    = trm.val #value
    # Passing the dimensions explicitly gives X exactly nObs rows and nLevels columns,
    # replacing the earlier trick of appending a zero entry in the last pedigree column.
    trm.X = sparse(xi,xj,xv,nObs,trm.nLevels)
    trm.startPos = mme.mmePos
    mme.mmePos  += trm.nLevels
end

getX (generic function with 1 method)

In [17]:
# Build the left- and right-hand sides of the mixed model equations for `df`.
#
# Steps:
#   1. getData and getX fill every term's labels, values and X matrix.
#   2. The X matrices are concatenated column-wise in model order.
#   3. mme.mmeLhs = X'X and mme.mmeRhs = X'y.
#   4. If a pedigree has been set (mme.ped != 0), mme.Ai = HAi'HAi is formed from the
#      triplets returned by PedModule.HAi and addA adds it into mmeLhs.
function getMME(mme::MME, df::DataFrame)
    # initMME starts mmePos at 1 and getX advances it; resetting it here keeps the
    # start positions correct when getMME is called again on the same object.
    mme.mmePos = 1
    for trm in mme.modelTerms
        getData(trm,df,mme)
        getX(trm,mme)
    end
    n   = size(mme.modelTerms,1)
    X   = mme.modelTerms[1].X
    for i=2:n
        X = [X mme.modelTerms[i].X]
    end
    y    = df[mme.lhs]
    nObs = size(y,1)
    ii = 1:nObs
    jj = fill(1,nObs)
    vv = y
    nRowsX = size(X,1)
    if nRowsX > nObs
        # Pad y with a zero in row nRowsX so it has as many rows as X.
        # `;` concatenates; the comma form `[ii,nRowsX]` is deprecated for this purpose.
        ii = [ii;nRowsX]
        jj = [jj;1]
        vv = [vv;0.0]
    end
    ySparse = sparse(ii,jj,vv)
    mme.mmeLhs = X'X
    mme.mmeRhs = X'ySparse
    if mme.ped != 0
        # NOTE(review): HAi is presumably a factor of the inverse relationship matrix,
        # so that HAi'HAi is that inverse - confirm against PedModule.
        ii,jj,vv = PedModule.HAi(mme.ped)
        HAi = sparse(ii,jj,vv)
        mme.Ai = HAi'HAi
        addA(mme)
    end
end

getMME (generic function with 1 method)

In [18]:
# Declare the pedigree-based random terms of the model.
#
# Arguments:
#   mme       - model object created by initMME
#   randomStr - space-separated term strings, written exactly as in the model equation
#               (e.g. "Animal Animal*Age")
#   ped       - pedigree object
#   G         - square matrix with one row and column per term in randomStr;
#               its inverse is stored in mme.Gi
#
# Raises an error when a term is not part of the model or when G does not have one row
# and column per term. All checks and inv(G) run before `mme` is modified, so a failure
# leaves it untouched. Returns nothing.
function setAsRandom(mme::MME,randomStr::AbstractString,ped::PedModule.Pedigree, G::Array{Float64,2})
    pedTrmVec = split(randomStr," ",keep=false)
    nTrm = length(pedTrmVec)
    for trmStr in pedTrmVec
        # addA looks each term up in modelTermDict; fail here with a clear message instead.
        if !haskey(mme.modelTermDict,trmStr)
            error("random term \"",trmStr,"\" is not a term of the model\n")
        end
    end
    # addA indexes Gi[i,j] by term position, so G must be nTrm x nTrm.
    if size(G) != (nTrm,nTrm)
        error("G must be ",nTrm," x ",nTrm," to match the random terms \"",randomStr,"\"\n")
    end
    Gi = inv(G)
    mme.pedTrmVec = pedTrmVec
    mme.ped = ped
    mme.Gi = Gi
    nothing
end

setAsRandom (generic function with 1 method)

In [28]:
# Add the pedigree contribution to the left-hand side.
#
# For every ordered pair (i, j) of terms in mme.pedTrmVec, the block of mmeLhs spanned by
# the equations of term i (rows) and term j (columns) is increased by Ai * Gi[i,j].
function addA(mme::MME)
    randomTerms = mme.pedTrmVec
    for (i,namei) in enumerate(randomTerms)
        termi = mme.modelTermDict[namei]
        rows  = termi.startPos:(termi.startPos + termi.nLevels - 1)
        for (j,namej) in enumerate(randomTerms)
            termj = mme.modelTermDict[namej]
            cols  = termj.startPos:(termj.startPos + termj.nLevels - 1)
            mme.mmeLhs[rows,cols] += mme.Ai*mme.Gi[i,j]
        end
    end
end

addA (generic function with 1 method)

In [29]:
# Repeated-measures model: intercept, Age, Animal and the Animal-by-Age term.
mme = initMME("y = intercept + Age + Animal + Animal*Age")
# Age is declared a covariate, so getData uses its column values instead of level labels.
covList(mme,"Age")
nothing

In [30]:
# Only "Animal" is a pedigree term here; G is the 1x1 matrix [1.0].
setAsRandom(mme,"Animal",ped,reshape([1.0],1,1))

In [31]:
# Build the equations from df2 (on first use) and solve them with GaussSeidel.
resG = getSolG(mme,df2);

10 

In [32]:
# Same model, now with two pedigree terms: Animal and Animal*Age.
mme = initMME("y = intercept + Age + Animal + Animal*Age")
covList(mme,"Age")
# 2x2 G, one row and column per pedigree term, in the order they are listed below.
G = [1 0.1; 0.1 1.0]
setAsRandom(mme,"Animal Animal*Age",ped,G)
resG = getSolG(mme,df2);

0.004596883334625979
20 0.0014040115488734484
30 0.0004776419930430202
40 0.00016327131964199662
50 5.5822785736460476e-5
60 1.9086077557632178e-5
70 6.525622408268345e-6
80 2.2311419048512457e-6
10 

In [33]:
# Same model string, but covList stays commented out: mme3.covVec remains empty, so
# getData treats Age as a class factor rather than a covariate.
mme3 = initMME("y = intercept + Age + Animal + Animal*Age")
#covList(mme3,"Age")
# NOTE(review): G is assigned but not used in this cell; setAsRandom receives a 1x1 matrix.
G = [1 0.1; 0.1 1.0]
setAsRandom(mme3,"Animal",ped,reshape([1.0],1,1))
resG = getSolG(mme3,df2);
# Dense copy of the left-hand side rounded to one decimal; the `;` suppresses its display.
round(full(mme3.mmeLhs),1);

0.004787113947862691
20 0.0018128013119535448
30 0.0009656596691756127
40 0.0006126023610504133
50 0.00040426931598860726
60 0.0002588318063222115
70 0.00015731883389153348
80 9.05082202408139e-5
90 4.940725467547827e-5
100 2.5670866153581953e-5
110 1.2725249477744462e-5
120 6.025643921210171e-6
130 2.7255643056254705e-6
140 1.1762468905424163e-6
10 

In [34]:
# Maternal-effect model on df3: both Animal and mat are pedigree terms.
mme = initMME("y = intercept + Animal + mat")
# 2x2 G, one row and column per pedigree term (Animal, mat).
G = [1 0.1; 0.1 1.0]
setAsRandom(mme,"Animal mat",ped,G)
# No trailing `;`, so the label/solution array is displayed.
resG = getSolG(mme,df3)

11x2 Array{Any,2}:
 "intercept: intercept"   95.0        
 "Animal: S1"              0.0        
 "Animal: D1"              0.0        
 "Animal: O1"             18.3333     
 "Animal: O3"            -18.3333     
 "Animal: O2"              0.0        
 "mat: S1"                 1.75859e-16
 "mat: D1"                 0.0        
 "mat: O1"                 1.83333    
 "mat: O3"                -1.83333    
 "mat: O2"                 8.79297e-17

4.5966603254437086e-5
20 1.1878899122117194e-6
