# Step 6: finalize the population assignment and save it in master SQC file, GWAS covar file, and keep files

In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


## Step 6-0) Read the file names

In [6]:
source('0_file_names.R')


In [7]:
# Build the `file_names` object whose fields (e.g. file_names$master_sqc_file)
# give the input/output locations used by the rest of this notebook.
# get_file_names() is not defined in this section; presumably it comes from
# the sourced '0_file_names.R' -- confirm.
file_names <- get_file_names()

## Step 6-1) Read the input files

- The `sqc` (sample quality control) file from UK Biobank lists samples in the same order as the fam file of the genotyping array.


In [4]:
source('../sample_qc_functions.R')

In [8]:
# Load the master sample-QC table using the paths in `file_names`.
# read_master_sqc() is not defined in this section; presumably it comes from
# the sourced '../sample_qc_functions.R' -- confirm.
master_sqc_df <- read_master_sqc(file_names)


### Let's rename PC1-PC40 to Global_PC1-Global_PC40

In [9]:
# Prefix every column whose name starts with 'PC' with 'Global_', so these
# columns cannot collide with the population-specific PC1..PC40 columns that
# are joined onto this table later.
# The rename is done in one vectorized ifelse() over all column names, so the
# result is a plain character vector (not a list built element by element).
master_sqc_df_colnames <- colnames(master_sqc_df)
master_sqc_df_colnames <- ifelse(
    startsWith(master_sqc_df_colnames, 'PC'),
    paste0('Global_', master_sqc_df_colnames),
    master_sqc_df_colnames
)

colnames(master_sqc_df) <- master_sqc_df_colnames

## Step 6-2) Read the population-specific PCs

In [10]:
# Read PC1..PC40 from each population's eigenvec file and stack the tables,
# tagging every row with the population it came from.
# The first four populations are read from file_names$pop_refinement_pca,
# 's_asian' is read from file_names$pop_specific_pca.
pop_specific_PCs_df <- Map(
    function(pca_dir, pop_name){
        eigenvec_path <- file.path(pca_dir, sprintf('ukb24983_%s.eigenvec', pop_name))
        eigenvec_df <- fread(
            eigenvec_path,
            select=c('#FID', 'IID', paste0('PC', 1:40)),
            colClasses=c('#FID'='character', 'IID'='character')
        )
        # drop the leading '#' from the ID column name and record the population
        eigenvec_df <- rename(eigenvec_df, 'FID' = '#FID')
        mutate(eigenvec_df, population = pop_name)
    },
    c(
        rep(file_names$pop_refinement_pca, 4),
        file_names$pop_specific_pca
    ),
    c('white_british', 'non_british_white', 'african', 'e_asian', 's_asian')
) %>%
unname() %>%
bind_rows()


In [11]:
# Number of individuals with population-specific PCs, largest population first
arrange(count(pop_specific_PCs_df, population), desc(n))

population,n
<chr>,<int>
white_british,337129
non_british_white,24905
s_asian,7831
african,6497
e_asian,1704


## Step 6-3) Read "semi-related" population

In [12]:
# Read the (FID, IID) list of the "semi-related" individuals as character
# columns and label every row with population = 'related'.
related_df <- mutate(
    fread(
        file_names$semi_related_file,
        col.names=c('FID', 'IID'),
        colClasses = 'character'
    ),
    population = 'related'
)


In [13]:
# Rows x columns of the semi-related table
dim(related_df)

## Step 6-4) Join the data frames and finalize the population assignment

In [14]:
# List the columns of the master SQC table
colnames(master_sqc_df)

In [15]:
# Tabulate the father/mother columns (they are dropped in the next step)
count(master_sqc_df, father, mother)

father,mother,n
<int>,<int>,<int>
0,0,488377


In [16]:
# Finalize the population label of every sample in the master SQC table.
#
#   1. attach the population-specific PCs (and their population label);
#   2. label the samples listed in `related_df` as 'related';
#   3. samples failing `pass_QC_filter` become 'DO_NOT_PASS_SQC'; the remaining
#      unlabeled samples with `pass_filter` become 'others'; everything else
#      keeps its label (possibly NA).
#
# A sample is treated as related only if its (FID, IID) *pair* appears in
# `related_df`. Testing FID and IID against two independent sets would also
# accept a sample whose FID matches one related row and whose IID matches
# another.
related_keys <- paste(related_df$FID, related_df$IID, sep = '\t')

master_sqc_refined_pop_df <- master_sqc_df %>%
select(-father, -mother) %>%
# IDs are compared and joined as character, matching the other tables
mutate(FID = as.character(FID), IID = as.character(IID)) %>%
left_join(
    pop_specific_PCs_df, by=c('FID', 'IID')
) %>%
mutate(
    population = if_else(
        paste(FID, IID, sep = '\t') %in% related_keys,
        'related',
        population
    ),
    # NOTE(review): two different flags are used here (`pass_QC_filter` and
    # `pass_filter`); both are columns of the master SQC table that are not
    # defined in this section -- confirm that the distinction is intended.
    population = if_else(
        pass_QC_filter,
        if_else((is.na(population) & pass_filter), 'others', population),
        'DO_NOT_PASS_SQC'
    )
)


### The number of individuals in each population

In [17]:
# Size of each finalized population, largest first
arrange(count(master_sqc_refined_pop_df, population), desc(n))

population,n
<chr>,<int>
white_british,337129
related,44632
,35092
others,28656
non_british_white,24905
s_asian,7831
african,6497
DO_NOT_PASS_SQC,1931
e_asian,1704


#### Here, the "NA" population consists of the related individuals who passed sample QC but are not included in our "semi-related" population.

In [18]:
# Per-population summary table; show_population_counts() is defined outside this section
show_population_counts(master_sqc_refined_pop_df)

population,UKBB,UKBL,w_exome,wo_exome,n
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
white_british,300094,37035,137920,199209,337129
related,39873,4759,18100,26532,44632
others,26594,2062,12049,16607,28656
non_british_white,22406,2499,10432,14473,24905
s_asian,7831,0,3541,4290,7831
african,6497,0,2716,3781,6497
DO_NOT_PASS_SQC,1805,126,707,1224,1931
e_asian,1704,0,725,979,1704


### The joint frequency of our population definition and the self-reported ancestry

In [16]:
# Cross-tabulate the finalized population against the self-reported ancestry
# label (f21000_top_label): one row per population, one column per label,
# with 0 for combinations that do not occur. Samples without a population
# label are left out.
population_label_counts <- master_sqc_refined_pop_df %>%
filter(!is.na(population)) %>%
count(population, f21000_top_label)

spread(population_label_counts, key = f21000_top_label, value = n, fill = 0)


population,Asian or Asian British,Black or Black British,Chinese,Do not know,Mixed,Other ethnic group,Prefer not to answer,White,<NA>
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
african,0,6305,0,8,0,0,120,0,64
DO_NOT_PASS_SQC,68,12,1,0,5,7,5,1710,123
e_asian,279,0,1367,4,0,0,20,0,34
non_british_white,0,0,0,0,0,0,0,24895,10
others,676,680,52,152,2539,4007,1084,18661,805
related,386,281,44,8,123,162,134,43387,107
s_asian,7617,0,0,17,0,0,83,0,114
white_british,0,0,0,0,0,0,0,336987,142


## Step 6-5) Prepare and join GWAS covariates and training/val/test split info

In [19]:
# Read the sample ID and year of birth (fields 1 and 11 of the tab-delimited
# file file_names$covar_yob_tab) and rename them to IID / BirthYear.
# The path is tilde-expanded and shell-quoted before it is placed in the
# command line, so a path containing spaces or shell metacharacters is passed
# to `cut` as a single argument.
yob_df <- fread(
    cmd=paste(
        'cut -f1,11',
        shQuote(path.expand(file_names$covar_yob_tab))
    ),
    colClasses=c('character', 'numeric')
) %>%
rename(
    'IID' = 'f.eid',
    'BirthYear' = 'f.34.0.0'
)


In [21]:
# CNV covariates (N_CNV, LEN_CNV) per sample; IDs are kept as character
CNV_df <- file_names$covar_CNV_f %>%
fread(
    select=c('FID', 'IID', 'N_CNV', 'LEN_CNV'),
    colClasses=c('FID'='character', 'IID'='character')
)

# Read the train/valid/test sample lists for white_british and stack them into
# one (FID, IID, split) table. The file suffix 'valid' is recorded as 'val'
# in the `split` column; 'train' and 'test' are kept as they are.
split_labels <- c('train' = 'train', 'valid' = 'val', 'test' = 'test')

split_df <- names(split_labels) %>%
lapply(function(file_suffix){
    split_file <- file.path(
        file_names$covar_split_dir,
        sprintf('ukb24983_white_british_%s.phe', file_suffix)
    )
    split_file %>%
    fread(col.names=c('FID', 'IID'), colClasses='character') %>%
    mutate(split = split_labels[[file_suffix]])
}) %>%
bind_rows()


In [22]:
# Assemble the final master table: the refined population assignment joined
# with the GWAS covariates (birth year, BMI, age columns, CNV burden) and the
# train/val/test split.
#
# Derived columns:
#   age   : 2017 - BirthYear; samples without a birth year get
#           2017 - median(BirthYear) computed over yob_df
#   sex   : 1 for 'male', 0 for 'female', NA otherwise (the original string
#           column is kept as `sex_str`)
#   Array : 1 if genotyping_array is 'UKBB', otherwise 0
#   split : train/val/test for white_british; for every other population the
#           population label itself
master_sqc_all <- master_sqc_refined_pop_df %>%
left_join(yob_df, by='IID') %>%
left_join(
    fread(
        file_names$covar_bmi_f, data.table=FALSE, sep='\t',
        col.names=c('FID', 'IID', 'BMI'),
        colClasses = c('character', 'character', 'numeric')
    ),
    by=c('FID', 'IID')
) %>%
left_join(
    fread(
        file_names$covar_age_f, data.table=FALSE, sep='\t',
        colClasses=c('IID'='character')
    ),
    by='IID'
) %>%
left_join(CNV_df, by=c('FID', 'IID')) %>%
rename('sex_str' = 'sex') %>%
mutate(
    age = if_else(
        is.na(BirthYear),
        2017 - median(yob_df$BirthYear, na.rm = TRUE),
        2017 - BirthYear
    ),
    # An explicit typed NA is used for "neither male nor female": if_else()
    # requires `false` to be a vector of the same type as `true`, and a NULL
    # there is not accepted by current dplyr releases.
    sex   = if_else(
        sex_str == 'male',
        1,
        if_else(sex_str == 'female', 0, NA_real_)
    ),
    Array = if_else(genotyping_array == 'UKBB', 1, 0)
) %>%
left_join(split_df, by=c('FID', 'IID')) %>%
mutate(
    split = if_else(population == 'white_british', split, population)
)


In [23]:
# Sample counts per (split, population) combination, largest first
arrange(count(master_sqc_all, split, population), desc(n))


split,population,n
<chr>,<chr>,<int>
train,white_british,235991
test,white_british,67425
related,related,44632
,,35092
val,white_british,33713
others,others,28656
non_british_white,non_british_white,24905
s_asian,s_asian,7831
african,african,6497
DO_NOT_PASS_SQC,DO_NOT_PASS_SQC,1931


In [24]:
# List the columns of the assembled master table
colnames(master_sqc_all)

## Step 6-6) write the results into files

### master SQC file

In [25]:
# Write the master SQC table as a tab-delimited file; the first column is
# written as '#FID', missing values as "NA", and fields are not quoted.
fwrite(
    rename(master_sqc_all, '#FID' = 'FID'),
    file = file_names$master_sqc_file,
    sep = '\t',
    na = "NA",
    quote = FALSE
)


### keep file for each population

In this version, we don't support the exclude phe file.

In [26]:
# Write one header-less, tab-delimited keep file (FID, IID) per population,
# next to the master SQC file, named ukb24983_<population>.phe.
keep_file_dir <- dirname(file_names$master_sqc_file)

for (pop in c('white_british', 'non_british_white', 'african', 's_asian', 'e_asian', 'related', 'others')){
    keep_df <- master_sqc_all %>%
    filter(population == pop) %>%
    select(FID, IID)

    fwrite(
        keep_df,
        file.path(keep_file_dir, sprintf('ukb24983_%s.phe', pop)),
        sep='\t',
        col.names = FALSE
    )

    # The matching exclude file is intentionally not written in this version:
#     master_sqc_all %>% filter(population != pop) %>% select(FID, IID) %>%
#     fwrite(file.path(dirname(file_names$master_sqc_file), paste0('ukb24983_', pop, '.exclude.phe')), sep='\t', col.names = F)
}


### GWAS covar file

In [27]:
# Keep only the columns that go into the GWAS covariate file, in this order:
# IDs, population/split, age columns, sex/BMI/CNV, array indicator,
# population-specific PCs (PC1..PC40) and global PCs (Global_PC1..Global_PC40).
GWAS_covar_df <- select(
    master_sqc_all,
    c('FID', 'IID'),
    c('population', 'split'),
    c('age', paste0('age', 0:3)),
    c('sex', 'BMI', 'N_CNV', 'LEN_CNV'),
    'Array',
    paste0('PC', 1:40),
    paste0('Global_PC', 1:40)
)


In [28]:
# Write the GWAS covariate table as a tab-delimited file; the first column is
# written as '#FID', missing values as "NA", and fields are not quoted.
fwrite(
    rename(GWAS_covar_df, '#FID' = 'FID'),
    file = file_names$GWAS_covar_file,
    sep = '\t',
    na = "NA",
    quote = FALSE
)
