## Use package bigsnpr to compute a PCA and remove outliers using the robust Mahalanobis distance

In [89]:
library(bigsnpr)

## Do some QC with PLINK through bigsnpr

In [None]:
# Paths for the QC step: the PLINK executable and the UK Biobank exome .bed file.
plink <- "/home/dc2325/software/plink"
bedfile <- "/gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c1_b0_v1.bed"
# `prefix` is derived from the .bed path with bigsnpr's sub_bed()
# (presumably the same path without its ".bed" extension).
prefix <- sub_bed(bedfile)

In [None]:
# Point `bedfile`/`prefix` at the example fileset shipped in bigsnpr's
# "extdata" directory.
# NOTE(review): this overwrites the `bedfile` and `prefix` set in the previous
# cell, so a QC call that follows runs on the example data, not the UKB file.
bedfile <- system.file(
  "extdata", "example.bed",
  package = "bigsnpr"
)
prefix <- sub_bed(bedfile)

In [None]:
# Run PLINK quality control through bigsnpr on the fileset at `prefix`.
#
# The output prefix is built from `prefix`: `prefix.in` is only the name of an
# argument of snp_plinkQC(), not a variable in this scope, so referring to it
# in the call fails with "object 'prefix.in' not found".
snp_plinkQC(
  plink.path = plink,
  prefix.in = prefix,
  file.type = "--bfile",
  prefix.out = paste0(prefix, "_QC"),
  maf = 0.01,           # minor allele frequency threshold
  geno = 0.1,           # per-variant missingness threshold
  mind = 0.1,           # per-sample missingness threshold
  hwe = 1e-50,          # Hardy-Weinberg test p-value threshold
  autosome.only = TRUE,
  extra.options = "",
  verbose = TRUE
)

## Calculate relatedness using PLINK 2 (KING)

In [98]:
# download_1000G() and download_plink2() are both given the directory "data";
# their return values are used below as the input .bed path and the PLINK 2
# executable path.
bedfile <- download_1000G("data")
plink2 <- download_plink2("data")

# With make.bed = FALSE the helper returns the table of pairs whose kinship
# passes `thr.king` instead of writing a new fileset.
# NOTE(review): snp_plinkKINGQC_2() is defined in a later cell of this
# notebook; that cell has to be run first.
rel <- snp_plinkKINGQC_2(
  bedfile.in = bedfile,
  plink2.path = plink2,
  make.bed = FALSE,
  thr.king = 2^-4.5,
  ncores = 2
)
str(rel)

'data.frame':	31 obs. of  8 variables:
 $ FID1   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ IID1   : chr  "HG00120" "HG00240" "HG00542" "HG00595" ...
 $ FID2   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ IID2   : chr  "HG00116" "HG00238" "HG00475" "HG00584" ...
 $ NSNP   : int  1664852 1664852 1664852 1664852 1664852 1664852 1664852 1664852 1664852 1664852 ...
 $ HETHET : num  0.111 0.1105 0.1024 0.101 0.0992 ...
 $ IBS0   : num  0.0333 0.0367 0.0302 0.037 0.0367 ...
 $ KINSHIP: num  0.0821 0.068 0.0854 0.0541 0.0535 ...


In [97]:
# Relatedness QC with PLINK 2's KING options.
#
# Arguments:
#   plink2.path   Path to the PLINK 2 executable.
#   bedfile.in    Path to the input .bed file.
#   bedfile.out   Path of the .bed file to write when `make.bed = TRUE`;
#                 defaults to "<input prefix>_norel.bed".
#   thr.king      Kinship threshold passed to --king-cutoff (make.bed = TRUE)
#                 or --king-table-filter (make.bed = FALSE).
#   make.bed      If TRUE, write a new fileset; if FALSE, return the table of
#                 pairs read from the ".kin0" file PLINK writes.
#   ncores        Passed to PLINK's --threads and to the file reader.
#   extra.options Extra text appended verbatim to the PLINK command line.
#   verbose       Forwarded to bigsnpr:::system_verbose().
#
# Returns:
#   `bedfile.out` when `make.bed = TRUE`, otherwise a data frame of pairs
#   with any leading "#" stripped from the column names.
#
# Errors:
#   If the executable does not report "PLINK v2", or if `bedfile.out` already
#   exists. Both checks use base R only, so they work even though bigsnpr's
#   internal helpers are not visible from the global environment.
snp_plinkKINGQC_2 <- function(plink2.path, bedfile.in, bedfile.out = NULL,
                              thr.king = 2^-3.5, make.bed = TRUE, ncores = 1,
                              extra.options = "", verbose = TRUE) {
  v <- system(paste(plink2.path, "--version"), intern = TRUE)
  # Only the first output line is tested; `if` needs a length-one condition,
  # and the output may be empty or span several lines.
  if (length(v) == 0L || !startsWith(v[1], "PLINK v2")) {
    stop(
      sprintf(
        "This requires PLINK v2; got '%s' instead.",
        paste(v, collapse = " ")
      ),
      call. = FALSE
    )
  }

  prefix.in <- sub_bed(bedfile.in)

  if (make.bed) {
    if (is.null(bedfile.out)) {
      bedfile.out <- paste0(prefix.in, "_norel.bed")
    }
    # Refuse to overwrite an existing fileset.
    if (file.exists(bedfile.out)) {
      stop(sprintf("File '%s' already exists.", bedfile.out), call. = FALSE)
    }
    bigsnpr:::system_verbose(
      paste(
        plink2.path, "--bfile", prefix.in,
        "--make-bed --king-cutoff", thr.king,
        "--out", sub_bed(bedfile.out),
        "--threads", ncores, extra.options
      ),
      verbose = verbose
    )
    bedfile.out
  } else {
    prefix.out <- tempfile()
    # The table is fully read into memory below, so the temporary PLINK
    # outputs can be removed when the function exits.
    on.exit(unlink(paste0(prefix.out, c(".kin0", ".log"))), add = TRUE)
    bigsnpr:::system_verbose(
      paste(
        plink2.path, "--bfile", prefix.in,
        "--make-king-table --king-table-filter", thr.king,
        "--out", prefix.out,
        "--threads", ncores, extra.options
      ),
      verbose = verbose
    )
    rel_df <- bigreadr::fread2(
      paste0(prefix.out, ".kin0"),
      header = TRUE, nThread = ncores
    )
    # Drop the leading "#" from the first header field (e.g. "#FID1").
    names(rel_df) <- sub("^#", "", names(rel_df))
    rel_df
  }
}

In [94]:
# Display the source of bigsnpr's non-exported `system_verbose` helper.
getFromNamespace("system_verbose", "bigsnpr")

## Calculate PCA

### 1. Remove related individuals from calculations

In [10]:
# Read the UK Biobank relatedness table (one row per pair of individuals) and
# take a first look at it.
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
rel <- read.table(
  "/gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb_rel_a32285_s488244.dat",
  header = TRUE
)
head(rel)
dim(rel)
summary(rel$Kinship)

Unnamed: 0_level_0,ID1,ID2,HetHet,IBS0,Kinship
Unnamed: 0_level_1,<int>,<int>,<dbl>,<dbl>,<dbl>
1,1000019,4020065,0.067,0.0,0.2487
2,1000035,3287226,0.047,0.0138,0.0714
3,1000054,1291619,0.051,0.0102,0.1154
4,1000170,4380447,0.044,0.0146,0.0543
5,1000224,2776529,0.078,0.0042,0.2583
6,1000291,3042243,0.048,0.0125,0.0851


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-1.0000  0.0566  0.0699  0.1183  0.2254  0.4999 

In [14]:
# Read the exome .fam file (no header line) and label its six columns.
fam <- setNames(
  read.table("/gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_s200631.fam"),
  c("FID", "IID", "father", "mother", "sex", "pheno")
)
head(fam)

Unnamed: 0_level_0,FID,IID,father,mother,sex,pheno
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>
1,1434748,1434748,0,0,2,-9
2,5523981,5523981,0,0,1,-9
3,5023838,5023838,0,0,2,-9
4,4023729,4023729,0,0,1,-9
5,4442146,4442146,0,0,2,-9
6,5654789,5654789,0,0,2,-9


In [22]:
# Row positions in `fam` of every individual that appears in a related pair.
# An individual can occur in several pairs, so IDs are de-duplicated first.
ind.rel <- match(unique(c(rel$ID1, rel$ID2)), fam$IID)
# match() returns NA for IDs that are not present in `fam`; such entries would
# make negative subsetting fail, so they are dropped.
ind.rel <- ind.rel[!is.na(ind.rel)]
length(ind.rel)
head(ind.rel)
# Rows of `fam` not involved in any related pair can then be obtained with:
# ind.norel <- setdiff(seq_len(nrow(fam)), ind.rel)

In [13]:
library(dplyr)
# Keep the pairs in which at least one member is listed in `exomed_ind$IID`.
# NOTE(review): `exomed_ind` is not defined in any visible cell; presumably it
# holds the exome-sequenced individuals (like `fam`). Confirm before re-running.
related <- filter(
  rel,
  ID1 %in% exomed_ind$IID | ID2 %in% exomed_ind$IID
)
head(related)
dim(related)

Unnamed: 0_level_0,ID1,ID2,HetHet,IBS0,Kinship
Unnamed: 0_level_1,<int>,<int>,<dbl>,<dbl>,<dbl>
1,1000019,4020065,0.067,0.0,0.2487
2,1000035,3287226,0.047,0.0138,0.0714
3,1000054,1291619,0.051,0.0102,0.1154
4,1000224,2776529,0.078,0.0042,0.2583
5,1000291,3042243,0.048,0.0125,0.0851
6,1000516,1748770,0.08,0.0046,0.2611


In [3]:
# Read the KING kinship table written by PLINK 2 and name its columns.
# `header = FALSE` is spelled out in full (`F` can be reassigned).
# NOTE(review): the file's header line presumably starts with "#", which
# read.table() skips through its default `comment.char = "#"`; confirm if the
# first data row ever looks like column names.
king <- read.table(
  "/home/dc2325/scratch60/pca/genotypes21_22.kin0",
  header = FALSE
)
colnames(king) <- c("FID1", "ID1", "FID2", "ID2", "NSNP", "HETHET", "IBS0", "KINSHIP")
head(king)

Unnamed: 0_level_0,FID1,ID1,FID2,ID2,NSNP,HETHET,IBS0,KINSHIP
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>
1,1000022,1000022,1000019,1000019,16507,0.0687587,0.0383474,-0.0284726
2,1000035,1000035,1000019,1000019,16526,0.0743677,0.0418129,-0.0390717
3,1000035,1000035,1000022,1000022,16476,0.0759286,0.0399976,-0.0165766
4,1000046,1000046,1000019,1000019,16547,0.0710099,0.038859,-0.0296362
5,1000046,1000046,1000022,1000022,16497,0.0741347,0.0338849,0.0111442
6,1000046,1000046,1000035,1000035,16516,0.0730806,0.0377815,-0.00886376


In [20]:
library(dplyr)
# Keep only the pairs whose kinship coefficient exceeds 0.055 and show them.
rel <- filter(king, KINSHIP > 0.055)
rel

FID1,ID1,FID2,ID2,NSNP,HETHET,IBS0,KINSHIP
<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>
1001937,1001937,1000210,1000210,16534,0.0835249,0.0273376,0.059556
1001937,1001937,1000584,1000584,16515,0.0822283,0.026703,0.063195
1002650,1002650,1001941,1001941,16480,0.082585,0.0284587,0.0573131
1004369,1004369,1000141,1000141,16529,0.0814326,0.0278904,0.0550962
1004369,1004369,1000584,1000584,16504,0.083192,0.0289021,0.0558927
1004560,1004560,1004369,1004369,16538,0.0838675,0.0283589,0.0592283


In [31]:
# Sorted, de-duplicated IDs of everyone appearing on either side of a pair.
rel_1 <- sort(unique(c(rel$ID1, rel$ID2)))
rel_1
#FID <- sort(rel_1)

In [30]:
# Two-column FID/IID table of the related individuals.
# The columns are built directly from `rel_1` (the sorted unique IDs): a free
# variable `FID` is not assigned in any visible cell, so data.frame(FID)
# fails with "object 'FID' not found" on a fresh run.
rel_3 <- data.frame(FID = rel_1, IID = rel_1)
head(rel_3)

Unnamed: 0_level_0,FID,IID
Unnamed: 0_level_1,<int>,<int>
1,1000141,1000141
2,1000210,1000210
3,1000584,1000584
4,1001937,1001937
5,1001941,1001941
6,1002650,1002650


In [60]:
# Pruned, merged fileset used as input for the PCA.
bedfile <- "/home/dc2325/scratch60/pca/asian_ancestry/ukb23155_s200631.filtered.merged.prune.bed"

In [None]:
# Create the bed object for `bedfile`, then display it.
obj.bed <- bed(bedfile)
obj.bed

In [None]:
# Display the definition of the `bed` function that is called on `bedfile`
# in the previous cell (inspection only; nothing is assigned).
bed

In [None]:

# Rows of `obj.bed` that belong to any related pair in `rel`.
ind.rel <- local({
  # A KING table read with its own header has IID1/IID2 columns; the table
  # built from the .kin0 file in this notebook names them ID1/ID2. Asking for
  # a missing column with `$` silently yields NULL, so the columns are
  # resolved explicitly and checked.
  id_cols <- c("IID1", "IID2")
  if (!all(id_cols %in% names(rel))) {
    id_cols <- c("ID1", "ID2")
  }
  stopifnot(all(id_cols %in% names(rel)))

  ids <- unique(unlist(rel[id_cols], use.names = FALSE))
  idx <- match(ids, obj.bed$fam$sample.ID)
  # Pairs may involve samples that are not in this fileset (NA from match()).
  idx[!is.na(idx)]
})

# setdiff() instead of `[-ind.rel]`: with an empty `ind.rel`, negative
# subsetting would select no rows at all rather than all of them.
ind.norel <- setdiff(rows_along(obj.bed), ind.rel)

obj.svd <- bed_autoSVD(
  obj.bed,
  ind.row = ind.norel,
  k = 20,
  ncores = nb_cores()
)