## Tools for building Henderson's Mixed Model Equations

Here we will see how Henderson's mixed model equations (HMME) can be built from a data set and a model string. 

We will start by building the "OLS" part of the equations, i.e. the left-hand side $X'X$ and the right-hand side $X'y$. 

In [5]:
using DataFrames

In [6]:
#functions below are same as before
# Index the distinct values of `a`.
#
# Returns the tuple (d, names):
#   names : Array{Any,1} with the unique values of `a`, in the order `unique` yields them
#   d     : dictionary mapping each unique value to its position in `names`
function mkDict(a)
    # A typed comprehension replaces the uninitialized `Array(Any, n)` buffer that was
    # filled element by element; the result is the same and is valid on any Julia version.
    names = Any[s for s in unique(a)]
    # Values are always positions in `names`, so the value type is pinned to Int.
    d = Dict{Any,Int}()
    for (i, s) in enumerate(names)
        d[s] = i
    end
    return d, names
end

mkDict (generic function with 1 method)

### A small data set to test the tools

In [7]:
# Three class variables for a six-record test data set; position i in each vector
# belongs to record i. Note that age is given as strings, not numbers.
sex   = ["s","s","s","d","d","d"]
breed = ["Angus","Angus","Hereford","Hereford","Angus","Angus"]
age   = ["40","38","38","38","40","40"];

In [8]:
# Assemble the test data; the response y is six standard-normal draws rounded to
# 3 decimals, so its values change every time this cell is run.
df1 = DataFrame(sex=sex,breed=breed,age=age,y=round(randn(6),3))

Unnamed: 0,sex,breed,age,y
1,s,Angus,40,-0.14
2,s,Angus,38,-0.179
3,s,Hereford,38,-0.295
4,d,Hereford,38,-1.177
5,d,Angus,40,1.192
6,d,Angus,40,0.157


### Types and functions for MME

In [9]:
# One term of the right-hand side of the model equation (e.g. "A" or "A*B"),
# together with the per-observation labels and incidence matrix derived from it.
# getTerm fills trmStr, nFactors and factors; getData fills data; getX fills X and names.
type ModelTerm 
    trmStr::AbstractString    #"A*B"          #"A"
    nFactors::Int64           # 2             # 1
    factors::Array{Symbol,1}  # :A, :B        # :A
    data::Array{AbstractString,1}      # one level label per observation, e.g. "A1 x B1"
    X::SparseMatrixCSC{Float64,Int64}  # incidence matrix: row = observation, column = level
    names::Array{Any,1}                # distinct level labels; names[j] labels column j of X
end

In [10]:
# A parsed model equation plus the equations built from it.
# initMME creates the object with mmeLhs and mmeRhs set to 0; getMME overwrites them.
type MME
    modelEquation::AbstractString   #"y = A + A*B"
    modelTerms::Array{ModelTerm,1}  #[modelTerm("A") , modelTerm("A*B")]
    lhs::Symbol                     #:y 
    mmeLhs                          # X'X once getMME has run (untyped: starts as 0)
    mmeRhs                          # X'y once getMME has run (untyped: starts as 0)
end

In [11]:
# Parse one right-hand-side term such as "A" or "A*B" into a ModelTerm.
#
# The term string is split on "*"; each piece, stripped of surrounding whitespace,
# becomes one factor symbol. Only trmStr, nFactors and factors are filled in here;
# data, X and names start out empty and are populated later by getData and getX.
function getTerm(trmStr) #"A*B" 
    # The earlier `length(trmStr)==1` shortcut tested the number of characters, not the
    # number of factors. Splitting already handles a single factor of any name length
    # (split("A","*") gives one piece), so every term takes the same path.
    factorVec = split(trmStr,"*")                           #"A" ,"B"
    factors   = Symbol[symbol(strip(f)) for f in factorVec] #:A, :B
    return ModelTerm(trmStr,length(factors),factors,[],spzeros(0,0),[])
end

getTerm (generic function with 1 method)

In [12]:
# Parse a model equation string into an MME object.
#
# modelEquation has the form "y = A + A*B": the response name, one "=", then terms
# separated by "+". Each term is handed to getTerm. The returned MME holds the
# original string, the parsed terms and the response symbol; mmeLhs and mmeRhs are
# set to 0 until getMME fills them in.
#
# Raises an error if the string is blank, does not contain exactly one "=", or has
# nothing on either side of the "=".
function initMME(modelEquation::AbstractString)  # "y = A + A*B"
    if isempty(strip(modelEquation))
        error("modelEquation is empty\n")
    end
    lhsRhs = split(modelEquation,"=") # "y", "A+A*B"
    # Without this check a missing "=" surfaced as a BoundsError on lhsRhs[2], and
    # anything after a second "=" was silently dropped from the model.
    if length(lhsRhs) != 2
        error("modelEquation must contain exactly one \"=\": ", modelEquation, "\n")
    end
    lhsStr = strip(lhsRhs[1])         # "y"
    rhs    = strip(lhsRhs[2])         #"A+A*B" 
    if isempty(lhsStr) || isempty(rhs)
        error("modelEquation needs a response and at least one term: ", modelEquation, "\n")
    end
    lhs = symbol(lhsStr)              # :y
    rhsVec = split(rhs,"+")           #"A", "A*B"
    modelTerms = [getTerm(strip(trmStr)) for trmStr in rhsVec]
    return MME(modelEquation,modelTerms,lhs,0,0) #returns an MME object
end 

initMME (generic function with 1 method)

In [13]:
# Fill trm.data with one level label per row of df.
# For a single-factor term the label is that factor's value as a string; for an
# interaction the factor values are joined with " x " in the order of trm.factors.
# Returns nothing; the result is stored on trm.
function getData(trm::ModelTerm,df::DataFrame) # modelterm("A*B")
    nRows = size(df,1)
    trm.data = Array(AbstractString,nRows)
    factorCols = df[trm.factors] #df[:A,:B]
    for row in 1:nRows #example: A1 B1
        levels = [string(factorCols[row,col]) for col in 1:trm.nFactors] #"A1", "B1"
        trm.data[row] = join(levels," x ") #"A1 x B1"
    end
    return nothing
end #data: ["A1 x B1", "A1 x B2", "A1 x B1".... ]

getData (generic function with 1 method)

In [14]:
# Build the incidence matrix of a term from its per-observation labels.
# Stores the distinct labels in trm.names and a sparse 0/1 matrix in trm.X with
# a 1.0 in row i, column j when observation i carries label names[j].
# The value of the final assignment (the matrix) is what the function returns.
function getX(trm)
    levelIndex, trm.names = mkDict(trm.data)
    rows = 1:size(trm.data,1)                                 # 1:nObs
    cols = Int64[levelIndex[label] for label in trm.data]     # column of each observation
    trm.X = sparse(rows,cols,1.0)
end

getX (generic function with 1 method)

In [15]:
# Build the equations for mme from the data in df.
# Every model term gets its labels (getData) and incidence matrix (getX); the
# term matrices are then placed side by side, in model order, to form X.
# Stores X'X in mme.mmeLhs and X'y in mme.mmeRhs, where y is the column of df
# named by mme.lhs. Returns nothing.
function getMME(mme::MME, df::DataFrame)
    terms = mme.modelTerms #[modelTerm("A*B"), modelTerm("A")]
    for term in terms
        getData(term,df)
        getX(term)
    end
    # Start from the first term's matrix and append the remaining ones column-wise.
    X = terms[1].X
    for k in 2:size(terms,1)
        X = hcat(X,terms[k].X)
    end
    y = convert(Array,df[mme.lhs]) #df[:y]
    Xt = X'
    mme.mmeLhs = Xt*X
    mme.mmeRhs = Xt*y
    return nothing
end

getMME (generic function with 1 method)

In [16]:
# Collect a label for every equation, in the same order as the columns of X:
# terms in model order and, within a term, levels in the order of trm.names.
# Each label has the form "<term string>: <level label>".
function getNames(mme)
    labels = AbstractString[]
    for term in mme.modelTerms, level in term.names
        push!(labels,term.trmStr*": "*level)
    end
    return labels
end    

getNames (generic function with 1 method)

### run it

In [17]:
df1 # display the test data again before building the equations

Unnamed: 0,sex,breed,age,y
1,s,Angus,40,-0.14
2,s,Angus,38,-0.179
3,s,Hereford,38,-0.295
4,d,Hereford,38,-1.177
5,d,Angus,40,1.192
6,d,Angus,40,0.157


In [18]:
# Parse the model: response y, terms sex, breed and the sex*breed*age interaction.
# Only the model structure is set up here; mmeLhs and mmeRhs are 0 until getMME runs.
mme = initMME("y = sex + breed + sex*breed*age");

In [19]:
# Build each term's incidence matrix from df1 and store X'X and X'y in mme.
getMME(mme,df1)

In [20]:
# Show each equation's label next to its right-hand-side value.
[getNames(mme) mme.mmeRhs]

9x2 Array{Any,2}:
 "sex: s"                            -0.614
 "sex: d"                             0.172
 "breed: Angus"                       1.03 
 "breed: Hereford"                   -1.472
 "sex*breed*age: s x Angus x 40"     -0.14 
 "sex*breed*age: s x Angus x 38"     -0.179
 "sex*breed*age: s x Hereford x 38"  -0.295
 "sex*breed*age: d x Hereford x 38"  -1.177
 "sex*breed*age: d x Angus x 40"      1.349