# Sample QC and population definition
#### Yosuke Tanigawa (ytanigaw@stanford.edu)
#### Last update: 2020/3/13

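We define the following 5 populations in UK Biobank: White British (`white_british`), Non-British White (`non_british_white`), South Asian (`s_asian`), African (`african`), and East Asian (`e_asian`). Please see `README.md` file for more details.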


In [1]:
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))


In [2]:
# Paths of the input files and the output prefix used in this notebook.
# Later cells access the entries as file_names$<name>; the whole list is also
# passed to read_master_sqc().
file_names <- list(
    sqc_file           = '/oak/stanford/groups/mrivas/ukbb24983/sqc/download/ukb_sqc_v2.txt',
    sqc_colnames       = '/oak/stanford/groups/mrivas/ukbb24983/sqc/download/ukb_sqc_v2.fields.txt',
    fam_array          = '/oak/stanford/groups/mrivas/ukbb24983/fam/ukb2498_cal_v2_s488370.fam',
    fam_exome          = '/oak/stanford/groups/mrivas/ukbb24983/exome/pgen/spb/data/ukb_exm_spb.fam',
    remove_csv_file    = '/oak/stanford/groups/mrivas/ukbb24983/sqc/w24983_20200204.csv',
    coding1001_tsv     = '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/extras/self_reported_ethnicity/misc/coding1001.tsv',
    extracted_phe_file = '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/extras/self_reported_ethnicity/phe/ukb2007183_ukb40831_f21000.phe',
    extracted_tsv_file = '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/extras/self_reported_ethnicity/misc/ukb2007183_ukb40831_f21000.tsv',
    # PCA result directories read with read_eigenvec():
    # pop_refinement_pca in Step 5, pop_specific_pca in Step 8
    pop_refinement_pca = '/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification_w24983_20190805/pca',
    pop_specific_pca   = '/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification_w24983_20190809/pca',
    # prefix of every figure written with ggsave() in this notebook
    out_figs_prefix    = 'figs/sample_qc_v3.2',
    # the source of covariate
    # (read in Step 9: birth year, BMI, age at assessment, CNV burden, and the
    # train/valid/test split files for white_british)
    covar_yob_tab      = '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/2007183/40831/download/ukb40831.tab',
    covar_bmi_f        = '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/9796/24611/phe/INI21001.phe',
    covar_age_f        = '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/extras/time_phenotypes/misc/age_assess.phe',
    covar_CNV_f        = '/oak/stanford/groups/mrivas/ukbb24983/sqc/ukb24983_GWAS_covar_withCNV.phe',
    covar_split_dir    = '/oak/stanford/groups/mrivas/projects/degas-risk/population-split'
)


## Step 1) Read the input files

- The `sqc` (sample quality control) file from UK Biobank lists the samples in the same order as the array fam file


In [3]:
source('sample_qc_functions.R')

In [4]:
master_sqc_df <- read_master_sqc(file_names)


### Characterize some counts

#### QC filter

In [5]:
# Tabulate the putative sex-chromosome aneuploidy flag
count(master_sqc_df, putative_sex_chromosome_aneuploidy)


putative_sex_chromosome_aneuploidy,n
<int>,<int>
0,487725
1,652


In [6]:
# Tabulate the heterozygosity / missingness outlier flag
count(master_sqc_df, het_missing_outliers)


het_missing_outliers,n
<int>,<int>
0,487409
1,968


In [7]:
# Tabulate the excess-relatives flag
count(master_sqc_df, excess_relatives)


excess_relatives,n
<int>,<int>
0,488189
1,188


In [8]:
# Tabulate the used-in-PCA-calculation flag
count(master_sqc_df, used_in_pca_calculation)


used_in_pca_calculation,n
<int>,<int>
0,81158
1,407219


#### Summary (number of individuals by self-reported ethnicity)

- `n` is the total number of individuals
- `n_QC` is the number of individuals who passed the 4 QC filters above
- `n_QC_PCA` is the number of **unrelated** individuals who passed the 4 QC filters above


In [9]:
# Per-ethnicity counts (n, n_QC, n_QC_PCA) of the master sample-QC table
show_counts_for_self_reported_ethnicity(master_sqc_df)

f21000_top_label,f21000_sub_label,f21000,n,n_QC,n_QC_PCA
<chr>,<chr>,<int>,<int>,<int>,<int>
Do not know,Do not know,-1.0,200,200,181
Prefer not to answer,Prefer not to answer,-3.0,1531,1526,1307
White,White,1.0,525,522,441
White,British,1001.0,430740,429195,355081
White,Irish,1002.0,12576,12515,10492
White,Any other white background,1003.0,15632,15542,14539
Mixed,Mixed,2.0,46,46,42
Mixed,White and Black Caribbean,2001.0,594,593,527
Mixed,White and Black African,2002.0,397,396,362
Mixed,White and Asian,2003.0,795,794,719


### Save PCA plots

In [10]:
# PCA plot built by plot_pca_self_reported(), plus a copy of it split into one
# panel per top-level self-reported ethnicity label (3 panels per row)
plot_pca_all <- plot_pca_self_reported(master_sqc_df)
plot_pca_top_label <- plot_pca_all +
    facet_wrap(vars(f21000_top_label), ncol = 3)


In [11]:
# Save both self-reported-ethnicity PCA figures under out_figs_prefix.
# Each list name is the sprintf() template of the file name: (prefix, extension).
# Use c('png', 'pdf') in the outer loop to also write PDF versions.
pca_figures <- list(
    '%s.PCA.self.reported.ethnicity.%s'       = plot_pca_all,
    '%s.PCA.self.reported.ethnicity.facet.%s' = plot_pca_top_label
)
for(ext in 'png'){
    for(name_template in names(pca_figures)){
        ggsave(
            sprintf(name_template, file_names$out_figs_prefix, ext),
            pca_figures[[name_template]]
        )
    }
}

Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image


## Step 2) define the population groups based on thresholds on PC1 and PC2

In [12]:
# Attach the initial population assignment to the master sample-QC table
master_sqc_pop_df <- define_populations(master_sqc_df)


In [13]:
# Distinct, non-missing population labels in order of first appearance
pops <- master_sqc_pop_df %>%
filter(!is.na(population)) %>%
distinct(population) %>%
pull(population)


### The number of individuals in the initial population assignment

In [14]:
# Per-population counts for the initial assignment
show_population_counts(master_sqc_pop_df)

population,UKBB,UKBL,w_exome,wo_exome,n
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
white_british,300102,37036,34393,302745,337138
non_british_white,22406,2499,2694,22211,24905
s_asian,7962,0,893,7069,7962
african,6497,0,847,5650,6497
e_asian,1772,0,187,1585,1772


## Step 3) Write the initial population definition to phe files
Every time this notebook is run, we should save the new phenotype (`.phe`) files to a new directory that has the date in the name. We have kept the root directory as `/oak/stanford/groups/mrivas/ukbb24983/sqc/` on Sherlock, and the name of the directory as `population_stratification_w24983_YYYYMMDD`.

### It is imperative you rename the directory below, OR ELSE ALL THE PREVIOUS FILES GET OVERWRITTEN. I've commented it out in the GitHub version because of this.


```
out_d <- '/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification_w24983_20200313'
```

### This time (version 3.2), we used the population-specific PCA from the previous iteration (20190805). So, we commented out the code below.

In [16]:
# for (pop in pops){
#     print(pop)
    
#     master_sqc_pop_df %>% filter(population == pop) %>%
#     select(FID, IID) %>% 
#     fwrite(file.path(out_d, 'before_refinement', paste0('ukb24983_', pop, '.phe')), sep='\t', col.names = F)    
# }


## Step 4) Run PCA for each population group (except White British)

Please check `sample_qc_v3.PCA.sh`

## Step 5) Read the results from PCA

In [17]:
eigenvec_df <- read_eigenvec(file_names$pop_refinement_pca, pops)


## Step 6) Population definition refinement based on the local PCs

- African: No update
- Non-British White: No update
- South Asian: filtering out some individuals
  - `-0.02 <= PC1 <= 0.03`, `-0.05 <= PC2 <= 0.02`
- East Asian: filtering out some individuals
  - `-0.01 <= PC1 <= 0.02`, `-0.02 <= PC2 <= 0`

In [18]:
plot_local_PCs <- list()

In [19]:
# Local-PC thresholds for the african group: PC1 range c(-0.02, 0.05) and
# PC2 range c(-0.02, 0.06) (the two vectors are read as x-axis bounds then
# y-axis bounds -- confirm against apply_threshold()).
# The returned object is kept in plot_local_PCs and later passed to ggsave().
plot_local_PCs[['african']] <- eigenvec_df %>% 
apply_threshold(
    x_axis = 'PC1', y_axis = 'PC2', 
    pop = 'african', 
    c(-0.02, 0.05), c(-0.02, 0.06)
)


[1] "Number of individuals: 6498 (before filter) --> 6498 (after filter)"


In [20]:
# Local-PC thresholds for the non_british_white group: PC1 range c(-0.03, 0.01)
# and PC2 range c(-0.01, 0.03) (the two vectors are read as x-axis bounds then
# y-axis bounds -- confirm against apply_threshold()).
# The returned object is kept in plot_local_PCs and later passed to ggsave().
plot_local_PCs[['non_british_white']] <- eigenvec_df %>% 
apply_threshold(
    x_axis = 'PC1', y_axis = 'PC2', 
    pop = 'non_british_white', 
    c(-0.03, 0.01), c(-0.01, 0.03)
)


[1] "Number of individuals: 24909 (before filter) --> 24909 (after filter)"


In [21]:
# Local-PC thresholds for the s_asian group: PC1 range c(-0.02, 0.03) and
# PC2 range c(-0.05, 0.02) (the two vectors are read as x-axis bounds then
# y-axis bounds -- confirm against apply_threshold()).
# The returned object is kept in plot_local_PCs and later passed to ggsave().
plot_local_PCs[['s_asian']] <- eigenvec_df %>% 
apply_threshold(
    x_axis = 'PC1', y_axis = 'PC2',
    pop = 's_asian',
    c(-0.02, 0.03), c(-0.05, 0.02)
)


[1] "Number of individuals: 7962 (before filter) --> 7885 (after filter)"


In [22]:
# Local-PC thresholds for the e_asian group: PC1 range c(-0.01, 0.02) and
# PC2 range c(-0.02, 0) (the two vectors are read as x-axis bounds then
# y-axis bounds -- confirm against apply_threshold()).
# The returned object is kept in plot_local_PCs and later passed to ggsave().
plot_local_PCs[['e_asian']] <- eigenvec_df %>% 
apply_threshold(
    x_axis = 'PC1', y_axis = 'PC2',
    pop = 'e_asian', 
    c(-0.01, 0.02), c(-0.02, 0)
)


[1] "Number of individuals: 1772 (before filter) --> 1154 (after filter)"


### Save the local PC plots

In [23]:
# Save every plot stored in plot_local_PCs, one file per population name.
# Use c('png', 'pdf') in the loop to also write PDF versions.
for(ext in 'png'){
    iwalk(
        plot_local_PCs,
        function(local_plot, pop_name){
            ggsave(
                sprintf('%s.local.PCA.%s.%s', file_names$out_figs_prefix, pop_name, ext),
                local_plot
            )
        }
    )
}


Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image


### Refine the population definition

In [24]:
# Refine the initial population assignment using the local PCs in eigenvec_df
master_sqc_pop_ref_df <- population_def_refinement(master_sqc_pop_df, eigenvec_df)


### The number of individuals in each population

In [25]:
# Per-population counts after the refinement
show_population_counts(master_sqc_pop_ref_df)

population,UKBB,UKBL,w_exome,wo_exome,n
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
white_british,300102,37036,34393,302745,337138
non_british_white,22406,2499,2694,22211,24905
s_asian,7885,0,885,7000,7885
african,6497,0,847,5650,6497
e_asian,1154,0,135,1019,1154
e_asian_outlier,618,0,52,566,618
s_asian_outlier,77,0,8,69,77


#### The joint frequency of our population definition and the self-reported ancestry

In [26]:
# Cross-tabulate the refined population label against the top-level
# self-reported ethnicity. Rows with a missing or empty population and the
# two *_outlier groups are left out; absent combinations are filled with 0.
master_sqc_pop_ref_df %>%
filter(
    !is.na(population),
    !(population %in% c('', 's_asian_outlier', 'e_asian_outlier'))
) %>%
count(population, f21000_top_label) %>%
spread(key = f21000_top_label, value = n, fill = 0)


population,Asian or Asian British,Black or Black British,Chinese,Do not know,Prefer not to answer,White,<NA>
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
african,0,6305,0,8,120,0,64
e_asian,22,0,1114,1,4,0,13
non_british_white,0,0,0,0,0,24895,10
s_asian,7664,0,0,19,87,0,115
white_british,0,0,0,0,0,336995,143


### Plot the global PCs in each population group

In [27]:
# Save the global PC scatter plots for the refined populations:
# PC1 vs PC2 (passed through plot_pca_add_threshold()) and then each
# consecutive pair PC2 vs PC3 ... PC9 vs PC10.
# Use c('png', 'pdf') in the outer loop to also write PDF versions.
for(ext in 'png'){
    pc1_pc2_fig <- plot_pca_add_threshold(
        plot_pca_population(master_sqc_pop_ref_df, 'PC1', 'PC2')
    )
    ggsave(
        sprintf('%s.PC1.vs.PC2.%s', file_names$out_figs_prefix, ext),
        pc1_pc2_fig
    )

    for(pcx in 2:9){
        x_pc <- sprintf('PC%d', pcx)
        y_pc <- sprintf('PC%d', pcx+1)
        pair_fig <- plot_pca_population(master_sqc_pop_ref_df, x_pc, y_pc)
        ggsave(
            sprintf('%s.PC%d.vs.PC%d.%s', file_names$out_figs_prefix, pcx, pcx+1, ext),
            pair_fig
        )
    }
}


Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image


## Step 7) Write the final results to `.phe` files

In [28]:
# For each population, write two tab-separated, header-less (FID, IID) lists
# into `out_d`:
#   ukb24983_<pop>.phe          individuals assigned to <pop>
#   ukb24983_<pop>.exclude.phe  every other individual
# `out_d` is not set in this notebook on purpose (see Step 3); define it first.
for (pop in pops){
    print(pop)
    
    master_sqc_pop_ref_df %>% 
    filter(population == pop) %>%
    select(FID, IID) %>% 
    fwrite(
        file.path(out_d, paste0('ukb24983_', pop, '.phe')), 
        sep='\t', col.names = FALSE
    )

    # `population != pop` is NA for individuals without a population
    # assignment, and filter() drops rows whose condition is NA. Keep those
    # individuals explicitly so that this list is the exact complement of the
    # keep-list written above.
    master_sqc_pop_ref_df %>% 
    filter(is.na(population) | population != pop) %>%
    select(FID, IID) %>% 
    fwrite(
        file.path(out_d, paste0('ukb24983_', pop, '.exclude.phe')), 
        sep='\t', col.names = FALSE
    )    
}


[1] "white_british"
[1] "e_asian"
[1] "non_british_white"
[1] "s_asian"
[1] "african"


## Step 8) Re-run PCA for the refined population

Please check `sample_qc_v3.PCA.sh`

In [29]:
pop_specific_pcs_df <- read_eigenvec(file_names$pop_specific_pca, pops)

### Plot the local PCs in each population group

In [30]:
# Save the population-specific PC plots for each consecutive pair
# PC1 vs PC2 ... PC9 vs PC10.
# Use c('png', 'pdf') in the outer loop to also write PDF versions.
for(ext in 'png'){
    for(pcx in 1:9){
        x_pc <- sprintf('PC%d', pcx)
        y_pc <- sprintf('PC%d', pcx+1)
        local_pc_fig <- plot_local_pc(pop_specific_pcs_df, master_sqc_pop_ref_df, x_pc, y_pc)
        ggsave(
            sprintf('%s.local.PC%d.vs.PC%d.%s', file_names$out_figs_prefix, pcx, pcx+1, ext),
            local_pc_fig
        )
    }
}


Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image


## Step 9) Prepare GWAS covariate file

In [31]:
# Birth-year table: individual ID (`f.eid` -> IID) and `f.34.0.0` -> BirthYear.
# The two columns are selected by name rather than by position (the previous
# `cut -f1,11` silently depended on f.34.0.0 being the 11th column), and no
# shell command is built from the file path.
yob_df <- fread(
    file_names$covar_yob_tab,
    select = c('f.eid', 'f.34.0.0')
) %>%
rename(
    'IID' = 'f.eid',
    'BirthYear' = 'f.34.0.0'
)


In [32]:
# CNV covariates: read only the ID columns and the two CNV columns
CNV_df <- file_names$covar_CNV_f %>%
fread(select = c('FID', 'IID', 'N_CNV', 'LEN_CNV'))

In [33]:
# Train / validation / test membership of the white_british individuals.
# One (FID, IID) file is read per split; the file-name tag 'valid' is stored
# as 'val' in the `split` column.
split_df <- c('train', 'valid', 'test') %>%
map(function(split_tag){
    split_phe <- file.path(
        file_names$covar_split_dir,
        sprintf('ukb24983_white_british_%s.phe', split_tag)
    )
    fread(split_phe, col.names=c('FID', 'IID')) %>%
    mutate(split = split_tag)
}) %>%
bind_rows() %>%
mutate(split = recode(split, 'valid' = 'val'))

In [34]:
# Assemble the master table: refined population assignment + covariates.
#   - BirthYear from yob_df
#   - the existing PC* columns are renamed with replace_colnames() (they are
#     selected later as Global_PC1..40)
#   - population-specific PC1..40: white_british reuses the PC columns of
#     master_sqc_pop_ref_df, the other populations take theirs from
#     pop_specific_pcs_df; the `population` column of that table is kept as
#     `population_before_refinement`
#   - BMI, the columns of the age file, CNV covariates, and the train/val/test
#     split
master_sqc_all <- master_sqc_pop_ref_df %>%
left_join(
    yob_df, by='IID'
) %>%
replace_colnames('^PC', 'Global_PC') %>%
left_join(
    master_sqc_pop_ref_df %>% 
    select(FID, IID, population, paste0('PC', 1:40)) %>%
    filter(population == 'white_british') %>%
    bind_rows(pop_specific_pcs_df) %>% 
    rename('population_before_refinement' = 'population'),
    by=c('FID', 'IID')
) %>%
left_join(
    fread(file_names$covar_bmi_f, data.table=FALSE, sep='\t', col.names=c('FID', 'IID', 'BMI')),
    by=c('FID', 'IID')
) %>%
left_join(
    fread(file_names$covar_age_f, data.table=FALSE, sep='\t'), by='IID'
) %>%
left_join(
    CNV_df,
    by=c('FID', 'IID')
) %>% 
rename('sex_str' = 'sex') %>%
mutate(
    # age = 2017 - year of birth; a missing birth year is replaced by the
    # median birth year across yob_df
    age = if_else(
        is.na(BirthYear),
        2017 - median(yob_df$BirthYear, na.rm = TRUE),
        2017 - BirthYear        
    ),
    # male -> 1, female -> 0, anything else (including NA) -> NA.
    # case_when() yields NA for unmatched rows; the previous
    # if_else(..., 0, NULL) only worked on dplyr versions that silently
    # accepted `false = NULL`.
    sex   = case_when(
        sex_str == 'male'   ~ 1,
        sex_str == 'female' ~ 0
    ),
    Array = if_else(genotyping_array == 'UKBB', 1, 0)
) %>%
left_join(
    split_df, 
    by=c('FID', 'IID')
)


In [35]:
# List the columns of the assembled master table
colnames(master_sqc_all)


In [36]:
# Write the full master table (all columns) as an unquoted tab-separated file,
# with missing values written as "NA"
fwrite(
    master_sqc_all,
    file.path(out_d, 'ukb24983_master_sqc.20200313.phe'),
    sep = '\t',
    na = 'NA',
    quote = FALSE
)


In [37]:
# GWAS covariate table: IDs, population, split, age columns, sex, BMI,
# CNV covariates, array indicator, then PC1..40 and Global_PC1..40
GWAS_covar_df <- select(
    master_sqc_all,
    FID, IID,
    population, split,
    age, age0, age1, age2, age3,
    sex, BMI, N_CNV, LEN_CNV,
    Array,
    sprintf('PC%d', 1:40),
    sprintf('Global_PC%d', 1:40)
)

In [38]:
# Write the GWAS covariate table as an unquoted tab-separated file, with
# missing values written as "NA"
fwrite(
    GWAS_covar_df,
    file.path(out_d, 'ukb24983_GWAS_covar.20200313.phe'),
    sep = '\t',
    na = 'NA',
    quote = FALSE
)
