In [2]:
#using Pkg
#Pkg.add(PackageSpec(name="JWAS",rev="master"))

In [3]:
#Pkg.develop(PackageSpec(path="/home/jovyan/rohan/Box Sync/JWAS.jl"))
#Pkg.free("JWAS")
#Pkg.add("StatsPlots")

In [4]:
using DataFrames              # package for working with data sets
using JWAS                    # package for Bayesian regression analyses, including BayesB and BayesCπ        
using JWAS:misc               # utility functions
using Distributions       
using Plots                   # package for plotting 
using LinearAlgebra,Statistics,Random,DelimitedFiles, DataFrames

### Input marker and phenotype data

In [5]:
function readMatBin(fileName)
    genStr = open(fileName)
    n = read(genStr,Int64)
    p = read(genStr,Int64)
    M = zeros(n,p)
    for j in 1:p
        for i in 1:n
            M[i,j] = read(genStr,Float64)
        end
    end
    close(genStr)
    return M
end

function removeCols!(M,cols)
    return M[:, [!(i in cols) for i=1:size(M,2)]]
end

removeCols! (generic function with 1 method)

In [6]:
posQTL  = Int64.(vec(readdlm("posQTL.csv")))
beta    = readdlm("beta.csv")
M = readMatBin("genotypes.bin");

In [7]:
n,p = size(M)
simData  = readtable("phenotypes.csv",header=false,names=[:y])# reading in the simulated phenotypes into a data frame
phenData = DataFrame(id=1:n, y=simData[:y])
first(phenData,5)

│   caller = ip:0x0
└ @ Core :-1


Unnamed: 0_level_0,id,y
Unnamed: 0_level_1,Int64,Float64⍰
1,1,-3.56863
2,2,1.73437
3,3,2.31795
4,4,-0.264018
5,5,-3.13096


In [8]:
phenTrain = phenData[1001:end,:]
first(phenTrain,5)

Unnamed: 0_level_0,id,y
Unnamed: 0_level_1,Int64,Float64⍰
1,1001,-6.22029
2,1002,-0.952557
3,1003,-9.66847
4,1004,-0.959437
5,1005,1.48486


In [9]:
resVar = var(simData[:y])/2
genVar = resVar

12.83523324152048

### Run BayesC$\pi$ using JWAS

In [10]:
ids = string.(1:size(M,1))                     # ids in genotype file are sequential numbers 1...n
model  = build_model("y = intercept",resVar)   # give model (except for marker part)
add_genotypes(model,M,genVar,header=false,rowID=ids,G_is_marker_variance=false);

21834 markers on 2000 individuals were added.


In [11]:
?runMCMC

search: [0m[1mr[22m[0m[1mu[22m[0m[1mn[22m[0m[1mM[22m[0m[1mC[22m[0m[1mM[22m[0m[1mC[22m Ze[0m[1mr[22moMeanF[0m[1mu[22mll[0m[1mN[22mor[0m[1mm[22mal[0m[1mC[22manon



```
runMCMC(model::MME,df::DataFrame;
        chain_length=1000,starting_value=false,burnin = 0,
        output_samples_frequency = 0,output_samples_file="MCMC_samples",
        printout_model_info=true,printout_frequency=100,
        methods="conventional (no markers)",Pi=0.0,estimatePi=false,
        single_step_analysis= false,pedigree = false,
        missing_phenotypes=false,constraint=false,
        update_priors_frequency::Int64=0,
        outputEBV=true,output_PEV=false,output_heritability=false)
```

**Run MCMC for Bayesian Linear Mixed Models with or without estimation of variance components.**

  * Available **methods** include "conventional (no markers)", "RR-BLUP", "BayesB", "BayesC", "Bayesian Lasso", and "GBLUP".
  * Single step analysis is allowed if **single*step*analysis** = `true` and **pedigree** is provided.
  * The **starting_value** can be provided as a vector of numbers for all location parameteres and marker effects, defaulting to `0.0`s.
  * The first **burnin** iterations are discarded at the beginning of a MCMC chain of length **chain_length**.
  * Save MCMC samples every **output*samples*frequency** iterations, defaulting to `false`, to files **output*samples*file**, defaulting to `MCMC_samples.txt`. MCMC samples for hyperparametes (variance componets) and marker effects are saved by default if **output*samples*frequency** is provided. MCMC samples for location parametes can be saved using `output_MCMC_samples()`. Note that saving MCMC samples too frequently slows down the computation.
  * In Bayesian variable selection methods, **Pi** for single-trait analyses is a number; **Pi** for multi-trait analyses is a dictionary such as `Pi=Dict([1.0; 1.0]=>0.7,[1.0; 0.0]=>0.1,[0.0; 1.0]=>0.1,[0.0; 0.0]=>0.1)`, defaulting to `all markers have effects (0.0)` in single-trait analysis and `all markers have effects on all traits` in multi-trait analysis. **Pi** is estimated if **estimatePi** = true
  * In multi-trait analysis, **missing_phenotypes**, defaulting to `true`, and **constraint** variance components, defaulting to `false`, are allowed. If **constraint**=true, constrain residual covariances between traits to be zeros.
  * Print out the model information in REPL if `printout_model_info=true`; print out the monte carlo mean in REPL with **printout_frequency**, defaulting to `false`.
  * Individual estimted breeding values (EBVs) are returned if **outputEBV**=`true`, defaulting to `true`. Heritability and genetic variances are returned if **output_heritability**=`true`, defaulting to `false`. Note that estimation of heritability is computaionally intensive.


In [None]:
MCMCFileNAME = "MCMCSamples"                  # place to put samples of marker effects
                                              # marker effect is set to zero if that locus is not in model
out=runMCMC(model, phenTrain,                 # tell JWAS to run analysis, for given model and data 
    Pi=0.99,                                  # intial value of π
    estimatePi=true,
    chain_length=6000 ,                       # length of chain
    printout_frequency=5000,                  # how often to show progress of analysis 
    printout_model_info=true,                 # tell JWAS to show the options used in this analysis
    methods="BayesC",                         # tell JWAS to run a BayesC analysis
    output_samples_frequency=20,              # how often to output sampled quantities
    output_samples_file=MCMCFileNAME,         # file name to output sampled marker effects
    output_PEV=true
);

In [13]:
keys(out)

UndefVarError: UndefVarError: out not defined

In [14]:
out["EBV_y"]

UndefVarError: UndefVarError: out not defined

In [15]:
res = GWAS("MCMCSamples_marker_effects_y.txt";header=true)

21834×2 Array{Any,2}:
 "1"      0.0       
 "2"      0.0       
 "3"      0.0       
 "4"      0.0       
 "5"      0.0       
 "6"      0.0       
 "7"      0.0       
 "8"      0.0145631 
 "9"      0.0       
 "10"     0.0       
 "11"     0.00485437
 "12"     0.00485437
 "13"     0.0       
 ⋮                  
 "21823"  0.0       
 "21824"  0.0       
 "21825"  0.0       
 "21826"  0.00485437
 "21827"  0.0       
 "21828"  0.0       
 "21829"  0.0       
 "21830"  0.0       
 "21831"  0.0       
 "21832"  0.0       
 "21833"  0.0       
 "21834"  0.0       

In [16]:
[res[posQTL,:] beta  out["Posterior mean of marker effects"][posQTL,2]]

UndefVarError: UndefVarError: out not defined

In [17]:
winVar = GWAS("MCMCSamples_marker_effects_y.txt",model.output_genotypes;header=true,window_size=100,threshold=0.001)

Compute the posterior probability of association of the genomic window that explains more than 0.001 of the total genetic variance


InterruptException: InterruptException:

In [18]:
sum(winVar[:prGenVar])

UndefVarError: UndefVarError: winVar not defined

In [19]:
sortPosQTL = sort(posQTL);

In [20]:
PPA = 0.34
bigPPA = winVar[PPA .<= winVar[:WPPA],: ]

lowPos  = [findlast(sortPosQTL .<= row[1]) for row in eachrow(bigPPA)] 
highPos = [findfirst(sortPosQTL .>= row[2]) for row in eachrow(bigPPA)]   
wPos = [findfirst(bigPPA[i,1] .<= sortPosQTL .< bigPPA[i,2]) for i=1:size(bigPPA,1) ]

lowQTL  = [i == nothing ? 0 : sortPosQTL[i] for i in lowPos]
highQTL = [i == nothing ? 0 : sortPosQTL[i] for i in highPos]
wQTL    = [i == nothing ? 0 : sortPosQTL[i] for i in wPos]

res = DataFrame(
    wStart = bigPPA[:wStart],
    wEnd = bigPPA[:wEnd],
    wQTL = wQTL,
    oQTL = min.(bigPPA[:wStart]-lowQTL,highQTL-bigPPA[:wEnd]),
    prVar  = bigPPA[:prGenVar],
    WPPA   = bigPPA[:WPPA],
    PPA_t = bigPPA[:PPA_t]
    )

UndefVarError: UndefVarError: winVar not defined

In [21]:
(1 - 8/28)*100

71.42857142857143

In [22]:
(1 - 1/15)*100

93.33333333333333

In [23]:
model.output_genotypes

2000×21834 Array{Float64,2}:
  0.323   1.404  -0.53   0.3835   0.383  …   0.898   0.497   0.4105   0.497
  0.323  -0.596  -0.53   0.3835   0.383     -0.102  -0.503   0.4105  -0.503
  0.323  -0.596  -0.53  -0.6165  -0.617      0.898   0.497   0.4105   0.497
 -0.677   0.404  -0.53   1.3835   1.383     -1.102  -1.503   0.4105  -1.503
 -0.677  -0.596   0.47   0.3835   0.383     -0.102   0.497  -0.5895   0.497
  1.323   0.404  -0.53  -0.6165  -0.617  …   0.898   0.497   0.4105   0.497
  0.323   1.404  -0.53   0.3835   0.383     -0.102   0.497  -0.5895   0.497
 -0.677   0.404  -0.53   1.3835   1.383     -1.102  -1.503   0.4105  -1.503
  0.323   0.404   0.47  -0.6165  -0.617      0.898   0.497   0.4105   0.497
  0.323  -0.596  -0.53  -0.6165  -0.617     -1.102  -0.503  -0.5895  -0.503
 -0.677  -0.596   0.47  -0.6165  -0.617  …  -0.102  -0.503   0.4105  -0.503
  0.323   0.404   0.47   0.3835   0.383     -1.102  -1.503   0.4105  -1.503
  0.323  -0.596   0.47  -0.6165  -0.617     -0.102  -0.503 

In [24]:
phenTrain[:y]

1000-element Array{Union{Missing, Float64},1}:
  -6.220285141426304  
  -0.9525568780170219 
  -9.668466774764997  
  -0.9594367446317245 
   1.4848637800225264 
  -7.302888576481342  
   3.2188372011450603 
  -9.409562338846678  
  -4.003042652155053  
  -1.4304634147079014 
   9.253674434757643  
  -2.4338561689814897 
  -2.454711549587484  
   ⋮                  
  -1.83251201576066   
   3.876437664717496  
  -9.125557120536078  
   6.1298551561385475 
  -4.337044216196259  
   0.35987596278692513
  -9.39921782570251   
 -15.05420785285153   
   2.0131166980489286 
 -10.210672438397063  
  -4.918261403552513  
 -14.978572207906616  

In [25]:
W,y,Va,Ve = M[1:1000,:],phenTrain[:y],0.07,13.3
#function t_test(M,y,Va,Ve)
    V = W*W'*Va + I*Ve
    Vi = inv(V)

    n,k = size(W)
    X = [ones(n) zeros(n)]  

1000×2 Array{Float64,2}:
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 ⋮       
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0

In [None]:
    tcalc = zeros(k)
    for i=1:k
        X[:,2] = W[:,i]
        lhsi   = inv(X'Vi*X)
        rhs = X'Vi*y
        betaHat = lhsi*rhs
        tcalc[i] = betaHat[2]/sqrt(lhsi[2,2])
    end 

In [None]:
pvals = ccdf.(Normal(0,1), tcalc[posQTL])
pvals[pvals .< 0.05]

In [None]:
[pvals posQTL]