# Kinship matrix

In [None]:
# Load libraries
library(data.table)  # Fast VCF file reading
library(AGHmatrix)   # Kinship matrix calculation

In [None]:
# Read VCF file, skipping header lines until the column names line (#CHROM) - file can be downloaded
vcf <- fread("quinoa_551accessions_genomic_prediction.vcf", skip = "#CHROM", sep = "\t")

# The first 9 columns are VCF metadata; every column after that is one sample.
# Fail early when there are no sample columns: indexing with 10:ncol(vcf)
# would otherwise count backwards and silently pick the wrong columns.
if (ncol(vcf) < 10L) {
  stop("VCF has only ", ncol(vcf), " columns; expected 9 metadata columns ",
       "followed by at least one sample column", call. = FALSE)
}

# Extract sample IDs from columns 10 onwards
samples <- colnames(vcf)[-seq_len(9L)]
# Extract genotype calls into a matrix (one row per SNP, one column per sample)
snp_matrix <- as.matrix(vcf[, ..samples])

### recode genotypes to numeric format
# Convert diploid genotype calls to ALT-allele dosage (0, 1, 2),
# handling unphased (/) and phased (|) genotypes.
gt_calls <- as.vector(snp_matrix)

# When the FORMAT column lists more than GT, each sample field looks like
# "0/1:35:99". GT is the first colon-separated subfield, so keep only that part.
# Skipped entirely for GT-only files to avoid an unnecessary pass over the data.
if ("FORMAT" %in% colnames(vcf) && any(vcf[["FORMAT"]] != "GT", na.rm = TRUE)) {
  gt_calls <- sub(":.*$", "", gt_calls)
}

# Explicit lookup: position in known_calls -> dosage in call_dosage.
# Any call without an entry (missing "./.", multi-allelic, malformed) becomes NA
# directly, instead of relying on as.numeric() coercion and its warning.
known_calls <- c("0/0", "0|0", "0/1", "1/0", "0|1", "1|0", "1/1", "1|1")
call_dosage <- c(0, 0, 1, 1, 1, 1, 2, 2)
dosage <- call_dosage[match(gt_calls, known_calls)]

# rename column one in vcf to "CHROM" (it was read as "#CHROM")
colnames(vcf)[1] <- "CHROM"

# Rebuild the numeric SNP-by-sample matrix;
# rows get unique SNP identifiers "chromosome_position", columns the sample IDs
snp_matrix <- matrix(
  dosage,
  nrow = nrow(snp_matrix),
  ncol = ncol(snp_matrix),
  dimnames = list(paste0(vcf$CHROM, "_", vcf$POS), samples)
)

### Create kinship matrix using VanRaden method
# AGHmatrix takes samples as rows and SNPs as columns, so flip the
# SNP-by-sample matrix before passing it on
snp_matrix <- t(snp_matrix)

# Additive relationship matrix (VanRaden) for a diploid;
# missing-data threshold set to our 20% maximum instead of 50%
kinship_matrix_V2 <- Gmatrix(
  snp_matrix,
  ploidy = 2,
  method = "VanRaden",
  thresh.missing = 0.2
)

# Store the kinship matrix as RData so later steps can load() it
save(list = "kinship_matrix_V2",
     file = "kinship_matrix_VanRaden_auspak_maxmissing20.RData")

# PCA

Instead of a kinship matrix, the machine learning approach uses principal components (PCs) computed from the same VCF file.

In [None]:
# Load library
library(SNPRelate)  # VCF to GDS conversion and PCA

### convert VCF to GDS for PCA
# SNPRelate requires filepath to a VCF file - file can be downloaded
vcf.fn <- "quinoa_551accessions_genomic_prediction.vcf"

# Convert VCF to GDS format.
# A failed conversion is re-raised with context rather than only printed:
# carrying on would make the subsequent snpgdsOpen() call fail on a missing
# file, or read a stale "quinoa_auspak.gds" left over from an earlier run.
tryCatch(
  snpgdsVCF2GDS(vcf.fn, "quinoa_auspak.gds",
                method = "copy.num.of.ref",
                verbose = TRUE),
  error = function(e) {
    stop("VCF to GDS conversion failed: ", conditionMessage(e), call. = FALSE)
  }
)

# open gds file for reading and PCA
pca_genofile <- snpgdsOpen("quinoa_auspak.gds")

# Run the summary and the PCA while the file is open, then always close the
# handle (also when the PCA errors). An unclosed handle keeps the GDS file
# open for the rest of the R session, which can block re-opening or
# re-creating it when this cell is run again.
# NOTE: pca_genofile is no longer usable after this statement.
pca_quinoa <- tryCatch({
  # check summary statistics: number of samples, SNPs, chromosomes
  snpgdsSummary(pca_genofile)

  ##### perform PCA
  snpgdsPCA(pca_genofile,
            autosome.only = FALSE,  # include all chromosomes, not just autosomes
            num.thread = 20,        # 20 CPU threads; adjust to the cores available
            verbose = TRUE)         # show progress
}, finally = snpgdsClose(pca_genofile))

### check variance explained by each PC
# check variance proportion (%)
pc.percent <- pca_quinoa$varprop * 100
round(pc.percent, 2)

### extract all returned PCs
# eigenvect has one column per eigenvector that snpgdsPCA() returned; how many
# that is depends on its eigen.cnt argument, so this is not necessarily every
# possible PC.
pc_scores <- pca_quinoa$eigenvect
n_pcs <- ncol(pc_scores)

# create data frame with sample IDs and one column per PC (PC1, PC2, ...);
# seq_len() keeps the naming correct even if no eigenvectors were returned
quinoa_PCs <- data.frame(
  sample.id = pca_quinoa$sample.id,
  setNames(as.data.frame(pc_scores), paste0("PC", seq_len(n_pcs))),
  stringsAsFactors = FALSE
)


In [None]:
# write the sample IDs and PCs to a csv file (no quoting, no row-name column)
# FALSE is spelled out: the shorthand F is an ordinary variable that can be reassigned
write.csv(quinoa_PCs, "quinoa_551_AUSPAK_PCs.csv", quote = FALSE, row.names = FALSE)