## Across race prediction - CV2

In [44]:
.libPaths(c('/anaconda2/lib/R/library',.libPaths()))

library(rrBLUP)
library(ggplot2)
library(BGLR)

#' Read a VCF file into a data frame
#'
#' Reads the file as text, blanks out the meta-information lines, strips the
#' leading "#" from the "#CHROM" header line and hands the result to
#' read.table().
#'
#' @param file Path (or connection) passed to readLines().
#' @param special.char Regular expression marking the start of a
#'   meta-information line (default "##").
#' @param ... Further arguments forwarded to read.table(), e.g. header = TRUE.
#' @return The data frame returned by read.table().
my.read.vcf <- function(file, special.char="##", ...) {
  vcf.lines <- readLines(file)

  # Only lines that START with the marker are meta-information. The pattern is
  # anchored so a data line that merely contains "##" is not truncated.
  # Lines are blanked rather than dropped so the line count (and therefore any
  # skip= passed through ...) is unchanged; read.table() skips blank lines by
  # default.
  is.meta <- grepl(paste0("^", special.char), vcf.lines)
  vcf.lines[is.meta] <- ""

  # read.table() treats "#" as a comment character by default, which would
  # swallow the header line; remove it from the column-header row only.
  vcf.lines <- sub("^#CHROM", "CHROM", vcf.lines)

  read.table(..., text = paste(vcf.lines, collapse = "\n"))
}

### Make the CV fold data frame: for each replicate, sample five equal-sized folds (one per race), with a combined size approximately equal to the number of individuals in a single race

In [49]:
# Build cvf: one column per replicate (V1, V2, ...), each holding the row
# numbers of Y for a random draw of the same number of individuals from every
# cluster.
setwd("/Users/ssapkot/Documents/Experiments/SAP_GS_PopStr")
Y <- read.csv('data/BLUEs_pheno_all.csv', header = TRUE, row.names = 1)

cv.n.clusters <- 5      # clusters are coded 1..5 in Y$Cluster
cv.n.per.cluster <- 13  # individuals drawn from each cluster
cv.n.reps <- 100        # number of independent random draws

# Zero-column data frame with one row per sampled individual; columns are
# appended below and are auto-named V1, V2, ... by the data.frame assignment.
cvf <- data.frame(matrix("", nrow = cv.n.clusters * cv.n.per.cluster, ncol = 0))

# Row number of every individual, used as its ID when subsetting X and Y later
Y$SN <- seq_len(nrow(Y))

for (j in seq_len(cv.n.reps)) {

  c.list <- vector("list", cv.n.clusters)

  for (i in seq_len(cv.n.clusters)) {
    # which() drops individuals whose Cluster is NA; logical row indexing
    # would turn them into all-NA rows and NA IDs could then be sampled.
    members <- Y$SN[which(Y$Cluster == i)]
    if (length(members) < cv.n.per.cluster) {
      stop("Cluster ", i, " has only ", length(members),
           " individuals; need at least ", cv.n.per.cluster, call. = FALSE)
    }
    # Index through sample.int(): sample(x, n) would silently draw from 1:x
    # if a cluster had a single member. Same random stream as sample(members, n).
    c.list[[i]] <- members[sample.int(length(members), cv.n.per.cluster)]
  }
  # Cluster 1 draws first, then cluster 2, ... in one column for this replicate
  cvf[j] <- unlist(c.list)
}
nrow(cvf)

In [5]:
# Read the genotype calls. as.is = TRUE keeps character columns as character;
# the former stringsAsFactors = TRUE was overridden by it and has been dropped.
GBS <- my.read.vcf(file = "data/SAP_all_taxa.vcf", header = TRUE, as.is = TRUE)

# Index of the FORMAT column; the columns after it are used as genotype calls.
# Exact matching (instead of a substring grep) plus the guard makes sure this
# is a single index, because it is later used as the end of 1:f.column.
f.column <- which(colnames(GBS) == "FORMAT")
if (length(f.column) != 1L) {
  stop("Expected exactly one FORMAT column in the VCF header, found ",
       length(f.column), call. = FALSE)
}

## Convert one marker's genotype calls to -1 / 0 / 1 coding.
##
## x: vector of calls for a single marker. "H" is a heterozygous call and "N"
##    (or NA) a missing call; every other distinct value is treated as an
##    allele.
## Returns a numeric vector the same length as x:
##   -1 for the first allele encountered, 1 for the second,
##    0 for "H" (and for any further alleles), NA for "N" / NA.
parse.GBS <- function(x) {
  # Alleles in order of first appearance. NA is excluded so a missing call
  # cannot occupy an allele slot and push a real allele into the 0 class.
  observed <- unique(x[!is.na(x)])
  alleles <- setdiff(observed, c("H", "N"))

  y <- rep(0, length(x))
  # which() ignores the NA comparisons produced when a marker has fewer than
  # two alleles (alleles[2] is then NA).
  y[which(x == alleles[1])] <- -1
  y[which(x == alleles[2])] <- 1
  y[is.na(x) | x == "N"] <- NA
  y
}

# Code every marker (one row of GBS) with parse.GBS, ignoring the leading
# columns up to and including FORMAT. apply() over rows returns each marker's
# result as a column, so X has one row per remaining GBS column (presumably
# one per individual -- confirm) and one column per marker.
X <- apply(X = GBS[, -(1:f.column)], MARGIN = 1, FUN = parse.GBS)

# Show the top-left 5 x 5 corner of the coded matrix
X[seq_len(5), seq_len(5)]

0,1,2,3,4
-1,-1,-1,-1,-1
-1,-1,-1,-1,-1
-1,-1,-1,-1,1
-1,-1,-1,-1,-1
-1,-1,-1,-1,-1


In [6]:
# Preview the first rows of the phenotype table Y
head(Y)

Unnamed: 0,Subpopulation,Cluster,Race,Origin,DTA,PH,GN,GW,GY,FLH,PL,BL,SN
PI152651,Caudatum,4,0,,66,146.61,1286,27.32,43.95,97.0,14.33333,57.95,1
PI17548,Kafir,2,0,,66,214.06,1167,15.62,26.66,156.83333,22.83333,83.41667,2
PI24969,Durra,3,0,,80,182.06,1319,29.92,50.86,162.0,13.33333,41.83333,3
PI329435,Mixed,1,0,,80,95.5,1388,15.68,30.47,65.83333,26.0,72.58333,4
PI329440,Kafir,2,0,,69,93.61,2141,17.61,48.35,58.0,23.83333,67.16667,5
PI34911,Caudatum,4,0,,87,179.67,1587,25.5,52.56,150.0,14.33333,39.75,6


In [51]:
# Across-cluster prediction accuracy.
# For every trait column and every resampled set of individuals in cvf, the
# phenotypes of one cluster are masked, predicted with kin.blup() from the
# remaining individuals, and compared with the observed values.
# One CSV per trait is written to the working directory: 4 rows
# (corr, var_x, var_y, cov_xy) by one column per replicate.
setwd("/Users/ssapkot/Documents/Experiments/SAP_GS_PopStr/Results/Prediction_Results/WR_AR_SameTPsize/AR_Pred_Accu/")
Total_accuracy <- vector("list", 5)  # not used below; kept for later cells

pred.trait.cols <- 5:12    # columns of Y holding the traits to predict
pred.n.reps <- 100         # replicates stored in cvf as V1 ... V100
pred.target.cluster <- 1   # cluster whose phenotypes are masked and predicted
                           # (output file names carry the label "Mixed")
pred.stat.names <- c("corr", "var_x", "var_y", "cov_xy")

# The sampled individuals and their relationship matrix depend only on the
# replicate, not on the trait, so they are computed once here instead of once
# per trait.
pred.fold.ids <- vector("list", pred.n.reps)
pred.fold.A <- vector("list", pred.n.reps)
for (i in seq_len(pred.n.reps)) {
  # Sort the sampled row numbers so that genotypes and phenotypes are
  # subsetted in the same order.
  Z <- sort(cvf[, paste0("V", i)])
  A <- A.mat(X[Z, ])
  rownames(A) <- seq_len(nrow(A))  # genotype labels matched by kin.blup (gid)
  pred.fold.ids[[i]] <- Z
  pred.fold.A[[i]] <- A
}

for (j in pred.trait.cols) {

  col <- names(Y)[j]  # trait name, used in the output file name

  # Preallocated result matrix. The column names are all "result" to keep the
  # CSV header identical to what cbind()-ing a vector called `result` produced.
  Total_Result <- matrix(
    NA_real_,
    nrow = length(pred.stat.names), ncol = pred.n.reps,
    dimnames = list(pred.stat.names, rep("result", pred.n.reps))
  )

  for (i in seq_len(pred.n.reps)) {
    A <- pred.fold.A[[i]]
    P <- Y[pred.fold.ids[[i]], ]
    y <- P[, j]

    # Testing set: individuals of the target cluster; everything else trains
    tst <- which(P$Cluster == pred.target.cluster)
    yNA <- y
    yNA[tst] <- NA  # mask phenotypes of the validation set

    # Trait values and genotype labels, in the same order as the rows of A
    df <- data.frame(y = yNA, gid = seq_len(nrow(A)))

    # optional kin.blup parameters: fixed effects, gaussian kernel, covariates
    rrblup <- kin.blup(df, K = A, geno = "gid", pheno = "y")

    # NOTE(review): predictions are picked by position, which assumes
    # rrblup$pred is ordered like the rows of A -- confirm against kin.blup.
    y.obs <- y[tst]
    y.pred <- rrblup$pred[tst]

    Total_Result[, i] <- c(
      cor(y.obs, y.pred, use = "complete.obs"),
      var(y.pred, use = "complete.obs"),
      var(y.obs, use = "complete.obs"),
      cov(y.obs, y.pred, use = "complete.obs")
    )
  }

  write.csv(Total_Result, file = paste0("AR_Corr-Cov_Mixed_", col, ".csv"))
}

In [None]:
### When writing the file, it would be useful to include the race and trait information,
### and also to make a separate file/data frame for the means and standard deviations.
