# Simulating phenotypes

- Mock example with very small sample size and number of SNPs
- Simple example using 2 components (genetic and noise), and bi-allelic SNPs
- Allele frequencies sampled per SNP (the `simGenotype` default): 0.1, 0.2, 0.4

In [168]:
# load libraries
library(data.table)

In [345]:
# Simulate a genotype matrix of bi-allelic SNPs.
#
# Each SNP is assigned one allele frequency drawn (with replacement) from
# `frequencies`; every sample's genotype at that SNP is then a Binomial(2, freq)
# draw, i.e. an allele count of 0, 1 or 2.
#
# Args:
#   N: number of samples (rows), at least 1.
#   nSNP: number of SNPs (columns), at least 1.
#   frequencies: candidate allele frequencies to assign to SNPs.
#
# Returns:
#   An N x nSNP integer matrix with row names "ID_<k>" and column names
#   "SNP_<k>", where <k> is zero-padded to the number of digits of N / nSNP.
simGenotype <- function(N = 50, nSNP = 30, frequencies = c(0.1, 0.2, 0.4)) {
    stopifnot(N >= 1, nSNP >= 1, length(frequencies) >= 1)

    # format(..., scientific = FALSE) counts the real number of digits;
    # as.character() renders round numbers such as 1e5 as "1e+05".
    idFormat <- paste0("%0", nchar(format(N, scientific = FALSE)), "d")
    snpFormat <- paste0("%0", nchar(format(nSNP, scientific = FALSE)), "d")

    samples <- paste0("ID_", sprintf(idFormat, seq_len(N)))
    snps <- paste0("SNP_", sprintf(snpFormat, seq_len(nSNP)))

    # Index through sample.int() so a single frequency is never interpreted
    # by sample() as the range 1:x.
    freq <- frequencies[sample.int(length(frequencies), nSNP, replace = TRUE)]

    # Fill an explicit matrix column by column (one SNP per column) so the
    # result keeps two dimensions even when N or nSNP is 1.
    matrix(
        rbinom(N * nSNP, 2, rep(freq, each = N)),
        nrow = N,
        ncol = nSNP,
        dimnames = list(samples, snps)
    )
}


# Draw per-SNP effect sizes for a genotype matrix.
#
# `causalSNP` columns of `genotype` are chosen at random (without
# replacement) and given effects drawn from Normal(mBeta, sdBeta); every
# other SNP gets an effect of 0.
#
# Args:
#   genotype: matrix whose column names identify the SNPs.
#   causalSNP: number of SNPs with a non-zero effect.
#   mBeta, sdBeta: mean and standard deviation of the causal effects.
#
# Returns:
#   A numeric vector with one entry per column of `genotype`, named by SNP.
geneticEffects <- function(genotype, causalSNP = 10, mBeta = 0, sdBeta = 1) {
    snpNames <- colnames(genotype)

    # Causal SNPs are drawn before the effect sizes; keeping this order
    # keeps results reproducible under a fixed seed.
    causal <- sample(snpNames, causalSNP, replace = FALSE)
    causalBetas <- rnorm(causalSNP, mBeta, sdBeta)

    effects <- setNames(numeric(length(snpNames)), snpNames)
    # Effects are assigned in column order of the genotype matrix.
    effects[snpNames %in% causal] <- causalBetas
    effects
}


# Randomly pair the samples of a genotype matrix into couples.
#
# Sample IDs (row names of `genotype`) are shuffled and paired off two at a
# time. With an odd number of samples, the one left over stays unpaired.
#
# Args:
#   genotype: matrix with samples in rows (row names = sample IDs) and SNPs
#     in columns (column names = SNP names).
#
# Returns:
#   A list with one data.table per couple. Each table has one row per SNP
#   and two columns named after the two members of the couple; SNP names are
#   stored as the table's row names. An empty list is returned when fewer
#   than two samples are available.
createCouples <- function(genotype) {
    ids <- rownames(genotype)
    nPairs <- length(ids) %/% 2
    if (nPairs == 0) {
        return(list())
    }

    # One shuffle followed by consecutive pairing gives uniformly random
    # couples without growing a matrix inside a loop.
    shuffled <- sample(ids)
    pairs <- matrix(shuffled[seq_len(2 * nPairs)], ncol = 2, byrow = TRUE)

    snps <- colnames(genotype)
    lapply(seq_len(nPairs), function(i) {
        # drop = FALSE keeps a 2 x nSNP matrix even when there is one SNP.
        couple <- data.table(t(genotype[pairs[i, ], , drop = FALSE]))
        rownames(couple) <- snps
        couple
    })
}

# Simulate offspring genotypes for a list of couples.
#
# For every couple, `nkids` children are generated. At each SNP the child's
# genotype (0, 1 or 2) is drawn by getKidGenotype() from the Mendelian
# transmission probabilities for the two parental genotypes.
#
# Args:
#   couples: list of two-column tables as produced by createCouples(): one
#     row per SNP, one column per parent (column names = parent IDs), SNP
#     names as row names.
#   nkids: number of children per couple.
#
# Returns:
#   An unnamed list of two elements:
#     [[1]] integer matrix of offspring genotypes, one row per kid (row
#           names "ID_1", "ID_2", ...) and one column per SNP;
#     [[2]] data.table with columns `father`, `mother`, `kid`, where father
#           and mother are the first and second column names of the couple.
reproduce <- function(couples, nkids = 2) {

    # Mendelian rules: rows are the unordered parental genotype pair,
    # columns the probability of the child carrying 0, 1 or 2 alleles.
    m <- matrix(
        c(1,    0,    0,
          0.5,  0.5,  0,
          0,    1,    0,
          0.25, 0.50, 0.25,
          0,    0.50, 0.5,
          0,    0,    1),
        nrow = 6, ncol = 3, byrow = TRUE,
        dimnames = list(
            c("00", "01", "02", "11", "12", "22"),
            c("0", "1", "2")
        )
    )

    nCouples <- length(couples)
    nTotal <- nCouples * nkids
    snps <- if (nCouples > 0) rownames(couples[[1]]) else character(0)

    # sprintf("%d") on an integer sequence never switches to scientific
    # notation and yields character(0) when there are no kids.
    kidIds <- sprintf("ID_%d", seq_len(nTotal))

    # Preallocate: one row per kid, columns in the couples' own SNP order, so
    # no merge (and no key re-sorting) is needed to assemble the result.
    kk <- matrix(
        NA_integer_,
        nrow = nTotal, ncol = length(snps),
        dimnames = list(kidIds, snps)
    )
    father <- character(nTotal)
    mother <- character(nTotal)

    j <- 0L
    for (i in seq_len(nCouples)) {
        parents <- as.matrix(couples[[i]])
        parentIds <- colnames(parents)
        # seq_len() yields no iterations when nkids is 0.
        for (ii in seq_len(nkids)) {
            j <- j + 1L
            kk[j, ] <- apply(parents, 1, getKidGenotype, matrix = m)
            father[j] <- parentIds[1]
            mother[j] <- parentIds[2]
        }
    }

    family <- data.table(father = father, mother = mother, kid = kidIds)
    list(kk, family)
}


# function to get offspring genotype
# Draw one offspring genotype at a single SNP.
#
# Args:
#   values: the two parental genotypes at the SNP.
#   matrix: transmission-probability matrix whose row names are the
#     concatenated parental genotypes (e.g. "01") and whose three columns
#     are the probabilities of the child genotypes 0, 1 and 2.
#
# Returns:
#   A single value sampled from 0:2 using the matching row as weights.
getKidGenotype <- function(values, matrix) {
    # The row names list each parental pair in one order only, so look the
    # pair up in both orders.
    forward <- paste0(values, collapse = "")
    backward <- paste0(rev(values), collapse = "")
    pattern <- paste0(forward, "|", backward)

    isMatch <- grepl(pattern, rownames(matrix))
    transmission <- as.vector(matrix[isMatch, ])
    sample(0:2, size = 1, prob = transmission)
}

# Genetic component of the offspring trait.
#
# Args:
#   kids: list whose first element is the offspring genotype matrix
#     (kids in rows, SNPs in columns).
#   ge: vector of per-SNP effects, one per genotype column.
#
# Returns:
#   The matrix product of the genotype matrix and `ge` (one row per kid).
simKidTrait <- function(kids, ge) {
    kidGenotypes <- kids[[1]]
    kidGenotypes %*% ge
}

# Draw one Normal(mNoise, sdNoise) noise value per row of `genotype`.
#
# Args:
#   genotype: matrix with one row per sample; only its row count is used.
#   mNoise, sdNoise: mean and standard deviation of the noise.
#
# Returns:
#   A numeric vector of length nrow(genotype).
getNoiseComponent <- function(genotype, mNoise = 0, sdNoise = 1) {
    rnorm(nrow(genotype), mean = mNoise, sd = sdNoise)
}

# Rescale a component so that its sample variance equals `prop`.
#
# Args:
#   component: numeric vector or matrix; flattened to a plain vector.
#   prop: target variance.
#
# Returns:
#   The flattened component multiplied by sqrt(prop / var(component)).
rescaleVar <- function(component, prop) {
    values <- as.vector(component)
    values * sqrt(prop / var(values))
}

# Multiplier that rescales a component to a target variance.
#
# Args:
#   component: numeric vector or matrix; flattened to a plain vector.
#   prop: target variance.
#
# Returns:
#   sqrt(prop / var(component)); multiplying the component by this value
#   gives it a sample variance of `prop`.
scalingFactor <- function(component, prop) {
    sqrt(prop / var(as.vector(component)))
}


In [361]:
# Target share of trait variance for the genetic component; the noise
# component takes the remainder.
gen = 0.3
noise = 1 - gen

# Parent generation: 1000 samples x 50 SNPs. geneticEffects() is called with
# its defaults, so 10 SNPs receive a non-zero effect.
genotype = simGenotype(1000, 50)
ge = geneticEffects(genotype)
parentGComp = genotype %*% ge
parentNComp = getNoiseComponent(genotype)

# Scaling factors are estimated on the parents so that the scaled genetic and
# noise components have sample variances `gen` and `noise`.
pgsf = scalingFactor(parentGComp, gen)
pnsf = scalingFactor(parentNComp, noise)
parentTrait = parentGComp * pgsf + parentNComp * pnsf


In [379]:
# Parent traits as a table keyed by sample ID; parentTrait is a one-column
# matrix, so its single column is extracted.
dparentTrait = data.table(id = rownames(parentTrait), trait = parentTrait[, 1])

In [372]:
# Randomly pair the parents, then simulate offspring with reproduce()'s
# default of 2 kids per couple.
couples = createCouples(genotype)
kids = reproduce(couples)

In [373]:
# Offspring trait: same SNP effects (ge) and the scaling factors estimated on
# the parents (pgsf, pnsf), with fresh noise draws for the kids.
kidGComp = kids[[1]] %*%  ge
kidNComp = getNoiseComponent(kids[[1]])
kidTrait = kidGComp * pgsf + kidNComp * pnsf

In [409]:
# Offspring traits as a table keyed by kid ID.
dkidTrait = data.table(id = rownames(kidTrait), trait = kidTrait[, 1])

In [410]:
# Preview the first rows of the offspring trait table.
head(dkidTrait)

id,trait
<chr>,<dbl>
ID_1,0.10013859
ID_2,0.25888747
ID_3,0.7573589
ID_4,0.01589459
ID_5,1.18199695
ID_6,0.32450443


In [411]:
# Second element returned by reproduce(): the father / mother / kid table.
family = kids[[2]]

In [412]:
# Attach each kid's father and mother IDs.
dkidTrait = merge(dkidTrait, family, by.x = "id", by.y = "kid")

In [413]:
# Add the father's and mother's trait values as tfather / tmother; all.x = TRUE
# keeps every kid row even if a parent ID has no match.
dkidTrait = merge(dkidTrait, dparentTrait[, .(id, tfather = trait)], by.x = "father", by.y = "id", all.x = TRUE)
dkidTrait = merge(dkidTrait, dparentTrait[, .(id, tmother = trait)], by.x = "mother", by.y = "id", all.x = TRUE)

In [415]:
# Correlation matrix of the offspring trait with each parent's trait.
# "father" and "mother" are simply the first and second member of each
# randomly formed couple.
cor(dkidTrait[, .(trait, tfather, tmother)])

Unnamed: 0,trait,tfather,tmother
trait,1.0,0.14539667,0.12722615
tfather,0.1453967,1.0,0.03631034
tmother,0.1272261,0.03631034,1.0


In [349]:
# Share of parent trait variance carried by the scaled genetic and noise
# components. The two shares need not sum to exactly 1, because the sample
# covariance between the components also contributes to var(parentTrait).
var(parentGComp * pgsf) /var(parentTrait)
var(parentNComp * pnsf) /var(parentTrait)

0
0.3005956


0
0.7013897


In [343]:
# Same variance shares for the offspring. The scaling factors were estimated
# on the parents, so these shares are not forced to hit the gen / noise targets.
var(kidGComp * pgsf) /var(kidTrait)
var(kidNComp * pnsf) /var(kidTrait)

0
0.289476


0
0.7550101
