In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [43]:
# Input/output locations and dataset versions.

# Version tags substituted into the `%s` placeholders of the templates below.
old_v <- '20200313'      # version of the existing sqc file that is read
master_v <- '20200517'   # version of the master phe file that is read
new_v <- '20200522'      # version used for the PCA directory and all outputs

# Sample QC root directory and the name templates used beneath it.
sqc_d <- '/oak/stanford/groups/mrivas/ukbb24983/sqc'
dir_n <- 'population_stratification_w24983_%s'
sqc_f <- 'ukb24983_master_sqc.%s.phe'
covar <- 'ukb24983_GWAS_covar.%s.phe'

# Full-path template of the master phenotype file.
master_phe_f <- '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/master_phe/master.%s.phe'


In [12]:
# Load project-local helper functions; the relative path is resolved against
# the current working directory.
# NOTE(review): read_eigenvec(), used below, is presumably defined in this
# file -- confirm.
source('sample_qc_functions.R')


In [15]:
# Load the new-version PCA results through the project helper read_eigenvec()
# (defined outside this view), passing the `pca` directory and the two
# population labels.
eigenvec_df <- read_eigenvec(
    file.path(sqc_d, sprintf(dir_n, new_v), 'pca'),
    c('white_british', 'others')
)


In [16]:
# Inspect the column names of the newly loaded table.
colnames(eigenvec_df)


In [17]:
# Number of rows per population label in the new table.
count(eigenvec_df, population)


population,n
<chr>,<int>
others,28551
white_british,337138


In [22]:
# Read the existing (old-version) sample QC table.
sqc_df <- fread(
    file.path(sqc_d, sprintf(dir_n, old_v), sprintf(sqc_f, old_v))
)


In [23]:
# Number of rows per population label in the old sample QC table.
count(sqc_df, population)

population,n
<chr>,<int>
african,6497
e_asian,1154
e_asian_outlier,618
non_british_white,24905
s_asian,7885
s_asian_outlier,77
white_british,337138
,110103


## Replace the (local) PCs in the white British (WB) and "others" populations with the new ones

In [31]:
# Rows of the old sample QC table whose (FID, IID) pair does NOT appear in the
# new eigenvector table; these rows keep their existing values.
#
# `sort_order` records each row's position in `sqc_df` so the table can later
# be re-assembled in the original order. row_number() is used instead of
# `1:n()`, which would yield c(1, 0) on a zero-row table.
#
# anti_join() on both keys is the exact complement of the
# inner_join(by = c('FID', 'IID')) used to build the updated rows, so every
# input row ends up in exactly one of the two pieces. Two independent `%in%`
# tests on FID and IID would also drop rows that match on only one of the keys.
sqc_unchanged <- sqc_df %>%
    mutate(sort_order = row_number()) %>%
    anti_join(eigenvec_df, by = c('FID', 'IID'))


In [32]:
# Sanity check: rows x columns of the untouched part.
dim(sqc_unchanged)

In [33]:
# Rows of the old sample QC table that have a matching (FID, IID) pair in the
# new eigenvector table. The old PC1..PC40 and `population` columns are dropped
# first, so the columns of those names in the result come from `eigenvec_df`.
#
# `sort_order` records each row's position in `sqc_df` so the original order
# can be restored after the pieces are recombined. row_number() is used instead
# of `1:n()`, which would yield c(1, 0) on a zero-row table.
sqc_updated <- sqc_df %>%
    mutate(sort_order = row_number()) %>%
    select(-all_of(paste0('PC', 1:40)), -population) %>%
    inner_join(eigenvec_df, by = c('FID', 'IID'))


In [34]:
# Sanity check: rows x columns of the updated part.
dim(sqc_updated)


In [36]:
# Stack the updated and unchanged rows, restore the original row order via
# `sort_order`, then drop that helper column and put the columns back in the
# same order as `sqc_df`.
sqc_new <- bind_rows(sqc_updated, sqc_unchanged) %>%
    arrange(sort_order) %>%
    select(-sort_order) %>%
    select(all_of(colnames(sqc_df)))


In [37]:
# Sanity check: rows x columns of the recombined table.
dim(sqc_new)


## write the new sqc file and GWAS covar file

In [38]:
# Write the UPDATED sample QC table to the new-version directory as a
# tab-separated file, with the first column renamed to `#FID` and missing
# values written as "NA".
#
# The table written must be `sqc_new` (the old table with the PCs and
# population replaced from `eigenvec_df`); writing `sqc_df` here would just
# copy the old-version contents under the new version name.
sqc_new %>%
    rename('#FID' = 'FID') %>%
    fwrite(
        file.path(sqc_d, sprintf(dir_n, new_v), sprintf(sqc_f, new_v)),
        sep = '\t', na = "NA", quote = FALSE
    )


In [53]:
# Covariate table: the sample IDs plus the covariate columns selected below,
# including PC1..PC40 and Global_PC1..Global_PC40.
#
# Built from `sqc_new` rather than `sqc_df` so that the covariate file (and
# anything joined from `covar_df` afterwards) carries the replaced PCs and
# population instead of the old-version values.
# all_of() makes the character-vector selections explicit and strict: a missing
# column is an error.
covar_df <- sqc_new %>%
    select(
        FID, IID,
        population, split,
        age, age0, age1, age2, age3,
        sex, BMI, N_CNV, LEN_CNV,
        Array,
        all_of(paste0('PC', 1:40)),
        all_of(paste0('Global_PC', 1:40))
    )

In [54]:
# Write the covariate table to the new-version directory as a tab-separated
# file, with the first column renamed to `#FID` and missing values as "NA".
covar_df %>%
    rename('#FID' = 'FID') %>%
    fwrite(
        file.path(sqc_d, sprintf(dir_n, new_v), sprintf(covar, new_v)),
        sep = '\t',
        na = "NA",
        quote = FALSE
    )


## update the master phe file

In [45]:
# Read the current master phenotype file, forcing both ID columns to be read
# as character.
master_phe_df <- fread(
    sprintf(master_phe_f, master_v),
    colClasses = c(FID = 'character', IID = 'character')
)


In [56]:
# Refresh the covariate columns of the master phenotype table.
#
# Steps:
#   1. Tag each row with its position (`sort_order`). row_number() is used
#      instead of `1:n()`, which would yield c(1, 0) on a zero-row table.
#   2. Drop every column that also exists in `covar_df`, except the ID keys.
#   3. Left-join `covar_df` back on (FID, IID). Its IDs are cast to character
#      so the key types match the master table, which is read with character
#      IDs.
#   4. Restore the original row order and the original column order; the final
#      select() also drops the `sort_order` helper column.
master_phe_updated_df <- master_phe_df %>%
    mutate(sort_order = row_number()) %>%
    select(-all_of(setdiff(colnames(covar_df), c('FID', 'IID')))) %>%
    left_join(
        covar_df %>%
            mutate(
                FID = as.character(FID),
                IID = as.character(IID)
            ),
        by = c('FID', 'IID')
    ) %>%
    arrange(sort_order) %>%
    select(all_of(colnames(master_phe_df)))


In [57]:
# Print the dimensions of the table before and after the update so the two can
# be compared.
print(dim(master_phe_df))
print(dim(master_phe_updated_df))

[1] 516770   3562
[1] 516770   3562


In [58]:
# Write the refreshed master phenotype table under the new version tag as a
# tab-separated file, with the first column renamed to `#FID` and missing
# values as "NA".
master_phe_updated_df %>%
    rename('#FID' = 'FID') %>%
    fwrite(
        sprintf(master_phe_f, new_v),
        sep = '\t',
        na = "NA",
        quote = FALSE
    )
