# Simulating phenotypes

- Mock example with very small sample size and number of SNPs
- Simple example using 2 components (genetic and noise), and bi-allelic SNPs
- Allele frequencies sampled for each SNP: 0.1, 0.2, 0.4 (the default of `simGenotype`)

In [42]:
# load libraries
library(data.table)

# Functions

In [43]:
# Simulate a genotype matrix of allele counts (0, 1 or 2) for bi-allelic SNPs.
#
# Arguments:
#   N            number of individuals (rows); must be >= 1
#   nSNP         number of SNPs (columns); must be >= 1
#   frequencies  candidate allele frequencies; every SNP is assigned one of
#                them at random (with replacement)
#
# Returns an N x nSNP integer matrix. Rows are named "ID_<k>" and columns
# "SNP_<k>", where <k> is zero-padded to the number of digits of N / nSNP.
simGenotype <- function(N = 50, nSNP = 30, frequencies = c(0.1, 0.2, 0.4)) {
    stopifnot(
        length(N) == 1, N >= 1,
        length(nSNP) == 1, nSNP >= 1,
        length(frequencies) >= 1
    )

    # sprintf("%d") never switches to scientific notation, unlike
    # as.character() (as.character(1e5) is "1e+05"), so the padding width is
    # always the true number of digits.
    sps <- paste0("%0", nchar(sprintf("%d", as.integer(N))), "d")
    spsn <- paste0("%0", nchar(sprintf("%d", as.integer(nSNP))), "d")

    samples <- paste0("ID_", sprintf(sps, seq_len(N)))
    snps <- paste0("SNP_", sprintf(spsn, seq_len(nSNP)))

    # One allele frequency per SNP.
    freq <- sample(frequencies, nSNP, replace = TRUE)

    # Each genotype is Binomial(2, freq). vapply() fixes the result type and
    # matrix() keeps two dimensions even when N == 1, where sapply() would
    # collapse the result to a plain vector.
    X <- matrix(
        vapply(seq_len(nSNP), function(x) rbinom(N, 2, freq[x]), integer(N)),
        nrow = N, ncol = nSNP,
        dimnames = list(samples, snps)
    )
    X
}


# Assign an effect size to every SNP of a genotype matrix.
#
# Arguments:
#   genotype   matrix whose column names identify the SNPs
#   causalSNP  number of SNPs that receive a non-zero effect
#   mBeta      mean of the normal distribution the effects are drawn from
#   sdBeta     standard deviation of that distribution
#
# Returns a numeric vector named by SNP, in column order: normal draws for
# the randomly chosen causal SNPs and 0 for all others.
geneticEffects <- function(genotype, causalSNP = 10, mBeta = 0, sdBeta = 1) {
    snp_names <- colnames(genotype)

    # Choose the causal SNPs first, then draw their effects.
    causal <- sample(snp_names, causalSNP, replace = FALSE)
    causal_betas <- rnorm(causalSNP, mBeta, sdBeta)

    # Effects are filled in column order of the genotype matrix.
    is_causal <- snp_names %in% causal
    effects <- numeric(length(snp_names))
    effects[is_causal] <- causal_betas
    names(effects) <- snp_names
    effects
}


# Randomly pair the individuals of a genotype matrix into couples.
#
# Arguments:
#   genotype  matrix with individuals in rows (row names are the IDs) and
#             SNPs in columns (column names are the SNP names)
#
# Returns a list with one data.table per couple: SNPs in rows, one column per
# partner (named by the partner's ID), and the SNP names stored as row names.
# With an odd number of individuals one randomly chosen person stays unpaired
# (a warning is issued); with fewer than two individuals the list is empty.
createCouples <- function(genotype) {
    ids <- rownames(genotype)
    n_pairs <- length(ids) %/% 2
    if (n_pairs == 0) {
        return(list())
    }
    if (length(ids) %% 2 == 1) {
        warning("odd number of individuals: one is left without a partner",
                call. = FALSE)
    }

    # A single shuffle gives a uniformly random pairing: consecutive entries
    # of the shuffled IDs form a couple. This avoids drawing and removing one
    # ID at a time and growing the pair matrix with rbind().
    shuffled <- sample(ids)
    pairs <- matrix(shuffled[seq_len(2 * n_pairs)], ncol = 2, byrow = TRUE)

    snps <- colnames(genotype)
    lapply(seq_len(n_pairs), function(i) {
        # drop = FALSE keeps a 2 x nSNP matrix even when there is one SNP.
        couple <- data.table(t(genotype[pairs[i, ], , drop = FALSE]))
        rownames(couple) <- snps
        couple
    })
}


# Simulate the offspring genotypes of a list of couples.
#
# Arguments:
#   couples  list of data.tables, one per couple, with SNPs in rows and the
#            two partners in columns (column names are the partners' IDs);
#            SNP names are read from the row names of the first couple
#   nkids    number of children per couple
#
# Returns an unnamed list of two elements:
#   [[1]] integer matrix of children (rows, named "ID_<j>") by SNPs (columns)
#   [[2]] data.table with columns father, mother, kid; the first column of a
#         couple is recorded as the father, the second as the mother
# Children are numbered consecutively, couple by couple.
reproduce <- function(couples, nkids = 2) {

    # Mendelian rules: one row per unordered pair of parental genotypes, one
    # column per offspring genotype; entries are transmission probabilities.
    m <- matrix(
        c(1,    0,    0,
          0.5,  0.5,  0,
          0,    1,    0,
          0.25, 0.50, 0.25,
          0,    0.50, 0.5,
          0,    0,    1),
        nrow = 6, ncol = 3, byrow = TRUE
    )
    colnames(m) <- c(0, 1, 2)
    rownames(m) <- c("00", "01", "02", "11", "12", "22")

    n_couples <- length(couples)

    # Nothing to simulate: return empty results of the usual shape.
    if (n_couples == 0 || nkids < 1) {
        n_snps <- if (n_couples > 0) nrow(couples[[1]]) else 0
        return(list(
            matrix(integer(0), nrow = 0, ncol = n_snps),
            data.table(father = character(0), mother = character(0),
                       kid = character(0))
        ))
    }

    snps <- rownames(couples[[1]])
    n_total <- n_couples * nkids
    kid_ids <- paste0("ID_", seq_len(n_total))

    # Preallocate the results. Writing each child straight into its own row
    # keeps the SNP columns in their original order and cannot overwrite a
    # child, unlike keying a list by paste0(i, ii) (couple 1 / kid 11 and
    # couple 11 / kid 1 share the key "111") and merging the pieces by SNP.
    kk <- matrix(NA_integer_, nrow = n_total, ncol = length(snps),
                 dimnames = list(kid_ids, snps))
    father <- character(n_total)
    mother <- character(n_total)

    j <- 1
    for (i in seq_along(couples)) {
        fids <- colnames(couples[[i]])
        parents <- as.matrix(couples[[i]])
        for (ii in seq_len(nkids)) {
            # One draw per SNP from the row of m matching the parents.
            kk[j, ] <- apply(parents, 1, getKidGenotype, matrix = m)
            father[j] <- fids[1]
            mother[j] <- fids[2]
            j <- j + 1
        }
    }

    family <- data.table(father = father, mother = mother, kid = kid_ids)
    list(kk, family)
}


# Draw one offspring genotype (0, 1 or 2) for a single SNP.
#
# Arguments:
#   values  the two parental genotypes at the SNP
#   matrix  transmission table whose row names are parental genotype pairs
#           (e.g. "01") and whose three columns give the probabilities of
#           offspring genotype 0, 1 and 2
#
# Returns a single integer sampled with the probabilities of the matching row.
getKidGenotype <- function(values, matrix) {
    # The table lists each pair in one order only, so look up both orders.
    forward <- paste0(values, collapse = "")
    backward <- paste0(rev(values), collapse = "")
    pattern <- paste0(forward, "|", backward)

    row_index <- grep(pattern, rownames(matrix))
    probabilities <- as.vector(matrix[row_index, ])
    sample(0:2, size = 1, prob = probabilities)
}


# Genetic trait component of the children.
#
# Arguments:
#   kids  list whose first element is the children-by-SNP genotype matrix
#   ge    vector of per-SNP effects
#
# Returns the matrix product of the genotype matrix and the effects.
simKidTrait <- function(kids, ge) {
    kid_genotypes <- kids[[1]]
    kid_genotypes %*% ge
}


# Draw one normally distributed noise value per row of a genotype matrix.
#
# Arguments:
#   genotype  matrix with one row per individual
#   mNoise    mean of the noise
#   sdNoise   standard deviation of the noise
#
# Returns a numeric vector of length nrow(genotype).
getNoiseComponent <- function(genotype, mNoise = 0, sdNoise = 1) {
    rnorm(nrow(genotype), mean = mNoise, sd = sdNoise)
}


# Rescale a component so that its sample variance equals `prop`.
#
# Arguments:
#   component  numeric vector or matrix (flattened to a vector)
#   prop       target variance
#
# Returns the flattened component multiplied by sqrt(prop / var(component)).
rescaleVar <- function(component, prop) {
    values <- as.vector(component)
    values * sqrt(prop / var(values))
}


# Factor by which a component must be multiplied for its sample variance to
# equal `prop`.
#
# Arguments:
#   component  numeric vector or matrix (flattened to a vector)
#   prop       target variance
#
# Returns the scalar sqrt(prop / var(component)).
scalingFactor <- function(component, prop) {
    current_var <- var(as.vector(component))
    sqrt(prop / current_var)
}

# Initial example
- 1000 people
- 300 SNPs
- 100 causal SNPs

In [84]:
# Target share of the trait variance for the genetic component; the noise
# component gets the remainder.
gen <- 0.3
noise <- 1 - gen

# Parent generation: genotypes, per-SNP effects and the two raw components.
genotype <- simGenotype(N = 1000, nSNP = 300)
ge <- geneticEffects(genotype, causalSNP = 100)
parentGComp <- genotype %*% ge
parentNComp <- getNoiseComponent(genotype)

# Scaling factors that bring each component to its target variance share.
pgsf <- scalingFactor(parentGComp, prop = gen)
pnsf <- scalingFactor(parentNComp, prop = noise)

# Trait = scaled genetic component + scaled noise component.
parentTrait <- pgsf * parentGComp + pnsf * parentNComp
dparentTrait <- data.table(
    id = rownames(parentTrait),
    trait = parentTrait[, 1]
)

In [85]:
head(genotype)

Unnamed: 0,SNP_001,SNP_002,SNP_003,SNP_004,SNP_005,SNP_006,SNP_007,SNP_008,SNP_009,SNP_010,⋯,SNP_291,SNP_292,SNP_293,SNP_294,SNP_295,SNP_296,SNP_297,SNP_298,SNP_299,SNP_300
ID_0001,0,0,0,1,1,0,0,0,0,1,⋯,0,1,0,0,0,0,0,0,0,1
ID_0002,1,0,0,1,0,1,0,0,0,1,⋯,0,0,1,0,0,0,0,0,0,2
ID_0003,1,0,0,1,0,1,0,0,0,1,⋯,1,1,0,0,1,0,0,0,2,0
ID_0004,1,1,0,1,1,0,0,0,1,1,⋯,0,0,0,1,0,0,0,0,1,1
ID_0005,2,1,0,2,1,2,1,0,1,0,⋯,1,1,0,0,0,1,0,0,1,0
ID_0006,0,1,1,1,0,0,0,0,0,1,⋯,0,2,0,0,0,0,1,0,1,1


In [86]:
# Pair the parents at random and simulate two children per couple
# (not efficient code).
couples <- createCouples(genotype)
kids <- reproduce(couples, nkids = 2)

In [87]:
# Children's trait: same SNP effects and scaling factors as the parents,
# with freshly drawn noise.
kidGComp <- simKidTrait(kids, ge)
kidNComp <- getNoiseComponent(kids[[1]])
kidTrait <- pgsf * kidGComp + pnsf * kidNComp


In [88]:
head(kids[[1]])

Unnamed: 0,SNP_001,SNP_002,SNP_003,SNP_004,SNP_005,SNP_006,SNP_007,SNP_008,SNP_009,SNP_010,⋯,SNP_291,SNP_292,SNP_293,SNP_294,SNP_295,SNP_296,SNP_297,SNP_298,SNP_299,SNP_300
ID_1,0,0,0,0,2,1,1,0,0,0,⋯,1,0,1,0,0,1,0,0,1,1
ID_2,0,0,0,1,2,1,1,1,0,0,⋯,1,1,1,0,1,0,1,0,1,0
ID_3,1,0,0,0,0,0,0,0,0,1,⋯,1,1,1,0,0,0,0,0,2,0
ID_4,1,0,0,0,1,0,1,0,0,0,⋯,0,1,1,0,0,0,0,0,1,1
ID_5,1,0,0,2,1,2,0,1,0,0,⋯,1,1,0,0,0,1,1,0,1,0
ID_6,1,1,0,0,1,2,0,2,1,0,⋯,1,2,0,2,0,1,0,0,1,1


In [89]:
# One row per child: own trait, parents' IDs and the parents' trait values.
dkidTrait <- data.table(id = rownames(kidTrait), trait = kidTrait[, 1])
family <- kids[[2]]
dkidTrait <- merge(dkidTrait, family, by.x = "id", by.y = "kid")
dkidTrait <- merge(
    dkidTrait,
    dparentTrait[, list(id, tfather = trait)],
    by.x = "father", by.y = "id", all.x = TRUE
)
dkidTrait <- merge(
    dkidTrait,
    dparentTrait[, list(id, tmother = trait)],
    by.x = "mother", by.y = "id", all.x = TRUE
)

In [90]:
# correlation between the child's, father's and mother's phenotypes
cor(dkidTrait[, list(trait, tfather, tmother)])

Unnamed: 0,trait,tfather,tmother
trait,1.0,0.18313253,0.15145904
tfather,0.1831325,1.0,0.01713349
tmother,0.151459,0.01713349,1.0


In [91]:
# variance composition parents: share of the trait variance carried by the
# scaled genetic and noise components
paste0(
    "Genetic part: ",
    round(var(pgsf * parentGComp[, 1]) / var(parentTrait[, 1]), digits = 2)
)
paste0(
    "Noise part: ",
    round(var(pnsf * parentNComp) / var(parentTrait[, 1]), digits = 2)
)

In [92]:
# variance composition kids: share of the trait variance carried by the
# scaled genetic and noise components
paste0(
    "Genetic part: ",
    round(var(pgsf * kidGComp[, 1]) / var(kidTrait[, 1]), digits = 2)
)
paste0(
    "Noise part: ",
    round(var(pnsf * kidNComp) / var(kidTrait[, 1]), digits = 2)
)