# Simulating phenotypes

- Mock example with very small sample size and number of SNPs
- Simple example using 2 components (genetic and noise), and bi-allelic SNPs
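- Allele frequencies sampled per SNP: 0.1, 0.2, 0.3 in the PhenotypeSimulator example; note that the `simGenotype()` default below uses 0.1, 0.2, 0.4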

In [168]:
# load libraries
library(data.table)
library(PhenotypeSimulator)

In [344]:
# simulate simple bi-allelic genotypes 
# 100 people, 30 SNP
# allele frequencies 0.1, 0.2, 0.3, we need to think about better values for this
# nsim = 100
# genotypes = simulateGenotypes(N = nsim, NrSNP = 30, 
#         frequencies = c(0.1, 0.2, 0.3), verbose = FALSE)
# names(genotypes)
# ids = genotypes$id_samples

In [345]:
# Simulate a bi-allelic genotype matrix.
#
# Args:
#   N:           number of individuals (rows); must be >= 1.
#   nSNP:        number of SNPs (columns); must be >= 1.
#   frequencies: candidate allele frequencies; each SNP gets one of them,
#                drawn with replacement.
#
# Returns:
#   An N x nSNP integer matrix of allele counts (0, 1 or 2), with row names
#   "ID_<zero-padded index>" and column names "SNP_<zero-padded index>".
simGenotype <- function(N = 50, nSNP = 30, frequencies = c(0.1, 0.2, 0.4)) {
  stopifnot(N >= 1, nSNP >= 1, length(frequencies) >= 1)

  # Zero-pad labels to the width of the largest index. format(scientific = FALSE)
  # is used because as.character(1e5) is "1e+05", which yields the wrong width.
  pad_label <- function(prefix, n) {
    width <- nchar(format(n, scientific = FALSE))
    paste0(prefix, sprintf(paste0("%0", width, "d"), seq_len(n)))
  }
  samples <- pad_label("ID_", N)
  snps <- pad_label("SNP_", nSNP)

  # One allele frequency per SNP.
  freq <- sample(frequencies, nSNP, replace = TRUE)

  # Draw all genotypes in one vectorised call, column by column, and shape the
  # result explicitly so that N = 1 or nSNP = 1 still gives a matrix.
  draws <- rbinom(N * nSNP, size = 2, prob = rep(freq, each = N))
  matrix(draws, nrow = N, ncol = nSNP, dimnames = list(samples, snps))
}


# Draw per-SNP genetic effect sizes.
#
# `causalSNP` column names of `genotype` are picked at random without
# replacement and receive effects drawn from Normal(mBeta, sdBeta); every
# other SNP gets an effect of 0. Effects are assigned to the causal SNPs in
# column order. Returns a numeric vector named by colnames(genotype).
geneticEffects <- function(genotype, causalSNP = 10, mBeta = 0, sdBeta = 1) {
  snp_names <- colnames(genotype)

  # Keep the draw order: causal SNPs first, then their effect sizes.
  chosen <- sample(snp_names, causalSNP, replace = FALSE)
  draws <- rnorm(causalSNP, mBeta, sdBeta)

  out <- numeric(length(snp_names))
  out[snp_names %in% chosen] <- draws
  names(out) <- snp_names
  out
}


# Randomly pair individuals into couples.
#
# Args:
#   genotype: matrix of allele counts with individuals in rows (row names are
#             the individual IDs) and SNPs in columns.
#
# Returns:
#   An unnamed list with one data.table per couple. Each table has one row per
#   SNP and two columns named after the two partners' IDs. With an odd number
#   of individuals one randomly chosen individual is left unpaired; with fewer
#   than two individuals an empty list is returned.
createCouples <- function(genotype) {
  ids <- rownames(genotype)
  if (is.null(ids)) {
    stop("genotype must have row names identifying individuals", call. = FALSE)
  }

  n_pairs <- length(ids) %/% 2L
  if (n_pairs == 0L) {
    return(list())
  }

  # A single shuffle followed by consecutive pairing gives a uniformly random
  # matching and does not depend on any variable outside this function.
  shuffled <- sample(ids)
  pairs <- matrix(shuffled[seq_len(2L * n_pairs)], ncol = 2L, byrow = TRUE)

  snps <- colnames(genotype)
  lapply(seq_len(n_pairs), function(i) {
    # drop = FALSE keeps the matrix shape when there is only one SNP.
    couple <- data.table(t(genotype[pairs[i, ], , drop = FALSE]))
    # reproduce() reads the SNP names back from these row names.
    rownames(couple) <- snps
    couple
  })
}

# Generate offspring genotypes for a list of couples.
#
# Args:
#   couples: list as returned by createCouples(); each element has one row per
#            SNP and two columns (the partners) holding allele counts 0/1/2.
#            SNP names are read from the row names of the first couple.
#   nkids:   number of children per couple; must be >= 1.
#
# Returns:
#   list(genotypes, family) where
#     genotypes: integer matrix, one row per child (row names "ID_1", "ID_2",
#                ...) and one column per SNP, in the couples' SNP order;
#     family:    data.table with columns father, mother, kid. "father" is the
#                first column of the couple and "mother" the second.
#   Child IDs restart at "ID_1" on every call, so they are only unique within
#   one generation.
reproduce <- function(couples, nkids = 2) {
  stopifnot(length(couples) > 0, nkids >= 1)

  # Mendelian transmission table: rows are the unordered parental genotype
  # pair, columns the probability of the child carrying 0, 1 or 2 copies.
  m <- matrix(
    c(1,    0,    0,
      0.5,  0.5,  0,
      0,    1,    0,
      0.25, 0.50, 0.25,
      0,    0.50, 0.5,
      0,    0,    1),
    nrow = 6, ncol = 3, byrow = TRUE,
    dimnames = list(c("00", "01", "02", "11", "12", "22"), c("0", "1", "2"))
  )

  snps <- rownames(couples[[1]])
  n_snps <- nrow(couples[[1]])
  n_kids <- length(couples) * nkids
  kid_ids <- paste0("ID_", seq_len(n_kids))

  # Preallocate the result and fill it row by row. This keeps SNP columns in
  # their original order (no merge/re-sort) and avoids list keys that collide
  # when nkids >= 10.
  genotypes <- matrix(NA_integer_, nrow = n_kids, ncol = n_snps,
                      dimnames = list(kid_ids, snps))
  father <- character(n_kids)
  mother <- character(n_kids)

  j <- 0L
  for (i in seq_along(couples)) {
    parent_ids <- colnames(couples[[i]])
    parents <- as.matrix(couples[[i]])  # SNP x 2 matrix of allele counts
    for (ii in seq_len(nkids)) {
      j <- j + 1L
      # One independent draw per SNP from the Mendelian table.
      genotypes[j, ] <- apply(parents, 1, getKidGenotype, matrix = m)
      father[j] <- parent_ids[1]
      mother[j] <- parent_ids[2]
    }
  }

  family <- data.table(father = father, mother = mother, kid = kid_ids)
  list(genotypes, family)
}


# Sample one offspring genotype (0, 1 or 2) for a single SNP.
#
# `values` holds the two parental allele counts; `matrix` is the Mendelian
# table whose row names are the parental genotype pairs ("00", "01", ...) and
# whose three columns are the probabilities of the child carrying 0, 1 or 2.
# The pair is looked up in either order, so c(0, 1) and c(1, 0) hit row "01".
getKidGenotype <- function(values, matrix) {
  forward <- paste0(values, collapse = "")
  backward <- paste0(rev(values), collapse = "")
  labels <- rownames(matrix)
  hit <- which(grepl(forward, labels) | grepl(backward, labels))
  sample(0:2, size = 1, prob = as.vector(matrix[hit, ]))
}

# Genetic component of the children's trait: the first element of `kids`
# (the genotype matrix) multiplied by the effect vector `ge`.
simKidTrait <- function(kids, ge) {
  kid_genotypes <- kids[[1]]
  kid_genotypes %*% ge
}

# Draw one Normal(mNoise, sdNoise) noise value per row of `genotype`.
getNoiseComponent <- function(genotype, mNoise = 0, sdNoise = 1) {
  rnorm(nrow(genotype), mean = mNoise, sd = sdNoise)
}

# Rescale `component` so that its sample variance equals `prop`.
# The input is flattened to a plain vector; the rescaled vector is returned.
rescaleVar <- function(component, prop) {
  x <- as.vector(component)
  x * sqrt(prop / var(x))
}

# Multiplier that brings the sample variance of `component` to `prop`,
# i.e. sqrt(prop / var(component)) after flattening the input to a vector.
scalingFactor <- function(component, prop) {
  sqrt(prop / var(as.vector(component)))
}


In [361]:
# Target variance shares of the phenotype: genetic vs. noise (they sum to 1).
gen = 0.3
noise = 1 - gen

# Founder generation: 1000 individuals and 50 SNPs. The SNP effects `ge` are
# drawn once here and reused for the later generations below.
genotype = simGenotype(1000, 50)
ge = geneticEffects(genotype)
parentGComp = genotype %*% ge
parentNComp = getNoiseComponent(genotype)

# Scaling factors that bring each component's sample variance to its target
# share; the trait is the sum of the two scaled components.
pgsf = scalingFactor(parentGComp, gen)
pnsf = scalingFactor(parentNComp, noise)
parentTrait = parentGComp * pgsf + parentNComp * pnsf


In [379]:
# Parent phenotypes as a table: id from the row names of parentTrait, trait
# from its first column.
dparentTrait = data.table(id = rownames(parentTrait), trait = parentTrait[, 1])

In [372]:
# Pair the founders at random and generate their children (default nkids = 2).
# kids[[1]] is the children's genotype matrix, kids[[2]] the family table.
couples = createCouples(genotype)
kids = reproduce(couples)

In [373]:
# Children's trait: same SNP effects (ge) and the scaling factors estimated on
# the parents (pgsf, pnsf), with a fresh noise draw.
kidGComp = kids[[1]] %*%  ge
kidNComp = getNoiseComponent(kids[[1]])
kidTrait = kidGComp * pgsf + kidNComp * pnsf

In [377]:
# Children's phenotypes as a table: id from the row names of kidTrait, trait
# from its first column.
dkidTrait = data.table(id = rownames(kidTrait), trait = kidTrait[, 1])

In [382]:
# Father / mother / kid ID table returned as the second element by reproduce().
family = kids[[2]]

In [349]:
# Realised variance shares of the scaled genetic and noise components in the
# parent trait. They need not sum to exactly 1: any sample covariance between
# the two components also contributes to var(parentTrait).
var(parentGComp * pgsf) /var(parentTrait)
var(parentNComp * pnsf) /var(parentTrait)

0
0.3005956


0
0.7013897


In [343]:
# Same check for the children. The scaling factors were estimated on the
# parents, so these shares are not forced to match gen / noise.
var(kidGComp * pgsf) /var(kidTrait)
var(kidNComp * pnsf) /var(kidTrait)

0
0.289476


0
0.7550101


In [352]:
# Treat the current children as the parent generation for the next round.
# This overwrites parentGComp / parentNComp / parentTrait; pgsf and pnsf are
# still the factors estimated on the founders.
parentGComp = kids[[1]] %*% ge
parentNComp = getNoiseComponent(kids[[1]])
parentTrait = parentGComp * pgsf + parentNComp * pnsf
var(parentGComp * pgsf) /var(parentTrait)
var(parentNComp * pnsf) /var(parentTrait)

0
0.3317153


0
0.6863526


In [353]:
# Next generation: pair the previous children at random and overwrite `kids`
# with their offspring.
couples = createCouples(kids[[1]])
kids = reproduce(couples)

In [355]:
# Trait of the new generation, again using the founders' effects and scaling
# factors with a fresh noise draw.
kidGComp = kids[[1]] %*%  ge
kidNComp = getNoiseComponent(kids[[1]])
kidTrait = kidGComp * pgsf + kidNComp * pnsf

In [356]:
# Realised genetic and noise variance shares in the new generation's trait.
var(kidGComp * pgsf) /var(kidTrait)
var(kidNComp * pnsf) /var(kidTrait)

0
0.3428243


0
0.7055084


In [358]:
# Show the father / mother / kid table of the latest generation.
kids[[2]]

father,mother,kid
<chr>,<chr>,<chr>
ID_502,ID_208,ID_1
ID_502,ID_208,ID_2
ID_135,ID_371,ID_3
ID_135,ID_371,ID_4
ID_13,ID_74,ID_5
ID_13,ID_74,ID_6
ID_213,ID_384,ID_7
ID_213,ID_384,ID_8
ID_984,ID_6,ID_9
ID_984,ID_6,ID_10


In [10]:
# to simulate the phenotype we need to define other factors
# in this case I am just adding noise, but we can add effect of covariates, culture, etc.
# I am only using independent effect because we are using only one trait.
# NOTE(review): `nsim` is only assigned in a commented-out cell above, so this
# cell depends on a value left over from an earlier session — confirm before
# re-running. noiseBgEffects() is presumably provided by PhenotypeSimulator.
noiseBg = noiseBgEffects(N = nsim, P = 1, share = FALSE)
t(noiseBg$cov_independent)


0
0.5476672


In [11]:
# to define the phenotype we set parameters on the variance composition of the phenotype

# parameters (pretty simple example)
genVar = 0.3 # genetic variability
noiseVar <- 1 - genVar # noise variability 

# rescale phenotype components so that variance composition match parameters above
# NOTE(review): `genFixed` is not defined in any cell visible here, and
# rescaleVariance() is presumably the PhenotypeSimulator helper — confirm.
genFixed_independent_scaled = rescaleVariance(genFixed$independent, genVar)
noiseBg_independent_scaled <- rescaleVariance(noiseBg$independent, noiseVar)

# total variance proportions have to add up to 1
# (compared with a numeric tolerance: exact `==` on doubles can fail for
# proportions that are not exactly representable in binary)
total <-  noiseVar + genVar
isTRUE(all.equal(total, 1))

In [12]:
# Manual version of the variance rescaling: scale the genetic component so the
# mean of the diagonal of its variance matrix equals the target share.
# NOTE(review): the target is hard-coded as 0.4 here, while genVar is 0.3 in
# the cell above — confirm which value is intended.
var_component <- var(genFixed$independent)
mean_var <- mean(diag(var_component))
scale_factor <- sqrt(0.4/mean_var)
component_scaled <- genFixed$independent * scale_factor

In [13]:
# Print the scale factor computed above (same hard-coded 0.4 target).
sqrt(0.4/mean_var)

In [62]:
# Scaled vs. unscaled noise component for individuals 30 and 40.
noiseBg_independent_scaled$component[c(30, 40)]
noiseBg$independent[c(30, 40)]

In [270]:
# Scaled vs. unscaled genetic component for individuals 30 and 40.
genFixed_independent_scaled$component[c(30, 40)]
genFixed$independent[c(30, 40)]

In [237]:
# combine components into final phenotype
# Y1 <- genFixed_independent_scaled$component + noiseBg_independent_scaled$component
# Y2 <- genFixed$independent + noiseBg$independent
# summary(Y1)
# summary(Y2)
# hist(Y1)
# hist(Y2)


In [64]:
# variance composition
# Share of each component in the scaled phenotype (Y1) and in the unscaled
# phenotype (Y2).
# NOTE(review): Y1 and Y2 are only assigned in the commented-out cell above,
# so this cell relies on values from an earlier session.
var(genFixed_independent_scaled$component)/var(Y1)
var(noiseBg_independent_scaled$component)/var(Y1)

var(genFixed$independent)/var(Y2)
var(noiseBg$independent)/var(Y2)

Unnamed: 0,Trait_1
Trait_1,0.2756157


Unnamed: 0,Trait_1
Trait_1,0.6431034


Unnamed: 0,Trait_1
Trait_1,0.7866207


Unnamed: 0,Trait_1
Trait_1,0.2012685


# Reproduction?

Let's assume individual 30 mates with individual 40:

In [198]:
# Sample one offspring genotype (0, 1 or 2) for a single SNP.
# `values` are the two parental allele counts; `matrix` is the Mendelian table
# with parental pairs ("00", "01", ...) as row names and the child's genotype
# probabilities in its three columns. The pair is matched in either order.
getGenotype <- function(values, matrix) {
  pair_fwd <- paste0(values, collapse = "")
  pair_rev <- paste0(rev(values), collapse = "")
  row_labels <- rownames(matrix)
  row_hit <- which(grepl(pair_fwd, row_labels) | grepl(pair_rev, row_labels))
  sample(0:2, size = 1, prob = as.vector(matrix[row_hit, ]))
}

# getGenotype(2, 2, m)

In [214]:
# Earlier draft of reproduce(): generates `nkids` children per couple and
# returns list(kk, family), where kk is the merged data.table (a `snp` column
# plus one column per child) and family lists father, mother and kid IDs.
# Unlike the version defined earlier in this file, kk is NOT converted to a
# numeric kids-by-SNP matrix, so it still contains the character `snp` column.
# NOTE(review): calls getKidGenotype(), not the getGenotype() defined just above.
reproduce = function(couples, nkids = 2) {

    # mendelian rules
    # rows: unordered parental genotype pair; columns: probability that the
    # child carries 0, 1 or 2 copies
    m = matrix(c(1, 0, 0, 0.5, 0.5, 0, 0, 1, 0, 0.25, 0.50, 0.25, 0, 0.50, 0.5, 0, 0, 1), 
        nrow = 6, ncol = 3, byrow  = TRUE
    )       
    colnames(m) = c(0, 1, 2)
    rownames(m) = c("00", "01", "02", "11", "12", "22")
    kids = list()
    
    family = data.table()
    # running child counter used to build the "ID_<j>" names
    j = 1

    for (i in seq_along(couples)) {
        for (ii in 1:nkids) {
            temp = couples[[i]]
            # column names of the couple table are the two parents' IDs
            fids = colnames(temp)
            # one draw per SNP row from the Mendelian table
            temp$k = temp[, apply(.SD, 1, getKidGenotype, matrix = m)]
            temp = temp[, .(k)]
            setnames(temp, "k", paste0("ID_", j))
            temp[, snp := rownames(couples[[1]])]
            # NOTE(review): the key paste0(i, ii) is not unique once nkids >= 10
            # (e.g. i = 1, ii = 11 and i = 11, ii = 1 both give "111")
            kids[[paste0(i, ii)]] = temp

            family = rbind(family, data.table(father = fids[1], mother = fids[2], kid = paste0("ID_", j)))
            j = j+1
        }
}
    # join all children into one table on the SNP name
    kk = Reduce(function(...) merge(...,  by = "snp"), kids)
    return(list(kk, family))

}



In [215]:
# Run the draft reproduce() defined in the cell above on the current couples.
kids = reproduce(couples)


In [229]:
# Attempt to compute the genetic component from the draft output. It fails
# (see the error below), presumably because kids[[1]] still holds the
# character `snp` column, so the transposed matrix is not numeric.
sk = t(kids[[1]])
as.matrix(sk) %*% as.vector(ge)

ERROR: Error in as.matrix(sk) %*% as.vector(ge): requires numeric/complex matrix/vector arguments


In [234]:
# Second attempt, dropping the first row of the transposed table; it still
# fails with the same non-numeric error (see below).
as.matrix(sk)[-1, ] %*% as.vector(ge)

ERROR: Error in as.matrix(sk)[-1, ] %*% as.vector(ge): requires numeric/complex matrix/vector arguments


In [224]:
# Scratch attempt to pull the causal SNP rows out of `kk`.
# NOTE(review): `kk` only exists inside reproduce() and `causalSNPs` is not
# defined in any cell visible here; the cell errors as shown below.
matrix(kk[snp %in% colnames(causalSNPs), ], byrow = TRUE, nrow = 1, 10) 

ERROR: Error in matrix(kk[snp %in% colnames(causalSNPs), ], byrow = TRUE, nrow = 1, : object 'kk' not found


In [None]:
# Unexecuted sketch: combine a genetic and a noise value weighted by their
# variance shares.
# NOTE(review): `genV` and `noiseV` are not defined in any cell visible here.
newY <- genV * genVar + noiseV * (1-genVar)
newY

In [None]:
# parents phenotype
# NOTE(review): `Y` is not defined in any cell visible here; 30 and 40 are the
# two individuals assumed to mate in the section above.
Y[c(30, 40)]