In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [3]:
# input
master_phe_f <- '/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/master_phe/master.20200522.phe'
missing_df <- 'missing_pop_GBE.minN100.20200627-110724.tsv' %>% fread() %>% rename('population'='#population')
related_f <- '/oak/stanford/groups/mrivas/ukbb24983/sqc/relatedness_20200514/semi_related.fam'

# output
gwas_redo_idx <- '20200627-gwas-additional-pops-redo.job.index.20200627-110724.tsv'


In [4]:
master_phe_df <- fread(
    master_phe_f,
    select=c('#FID','IID','population', missing_df %>% pull(GBE_ID) %>% unique()),
    colClasses=c('#FID'='character', 'IID'='character')
) %>%
rename('FID'='#FID')


In [5]:
related_df <- fread(related_f, head=F, colClasses='character')
colnames(related_df) <- c('FID', 'IID')


In [6]:
master_phe_head0_df <- fread(
    master_phe_f, nrows=0,
    colClasses=c('#FID'='character', 'IID'='character')
) %>%
rename('FID'='#FID')


In [7]:
master_phe_df %>%
left_join(
    related_df %>% mutate(related = 'related'),
    by=c('FID', 'IID') 
) %>%
mutate(
    population = if_else(is.na(related), population, 'related'),
#     population = if_else(population %in% c('e_asian_outlier', 's_asian_outlier'), 'others', population)
) -> master_phe_rel_df


In [8]:
master_phe_rel_df %>% count(population)

population,n
<chr>,<int>
african,6497
e_asian,1154
e_asian_outlier,618
non_british_white,24905
others,28467
related,44632
s_asian,7885
s_asian_outlier,77
white_british,337138
,65397


In [9]:
master_phe_col_idx <- data.frame(
    GBE_ID=colnames(master_phe_head0_df),
    col_idx=1:ncol(master_phe_head0_df),
    stringsAsFactors=F
)


In [10]:
master_phe_rel_df %>%
drop_na(population)%>%
select(-FID, -IID) %>%
gather(GBE_ID, val, -population) %>%
drop_na(val) %>%
filter(val != -9) %>% 
unique() %>%
mutate(GBE_CAT=str_replace_all(GBE_ID, '[0-9]', '')) %>%
filter(!GBE_CAT %in% c('INI','QT_FC')) %>%
count(population, GBE_ID) -> binary_traits_level_count


In [11]:
binary_traits_level_count %>% count(population)

population,n
<chr>,<int>
african,125
e_asian,124
e_asian_outlier,122
non_british_white,125
others,125
related,126
s_asian,125
s_asian_outlier,120
white_british,125


In [65]:
missing_df %>%
left_join(master_phe_col_idx, by='GBE_ID') %>%
left_join(binary_traits_level_count, by=c('population', 'GBE_ID')) %>%
filter(is.na(n) | (n > 1)) %>%
select(population, GBE_ID, col_idx, N_pop, GBE_NAME) -> missing_filtered_df 


In [66]:
missing_filtered_df  %>% count(population) %>% arrange(-n)

population,n
<chr>,<int>
others,219
related,112
e_asian,90
white_british,59
non_british_white,32
african,15
s_asian,13


In [67]:
missing_filtered_df %>%
rename('#population' = 'population') %>%
fwrite(gwas_redo_idx, sep='\t', na = "NA", quote=F)


In [68]:
missing_filtered_df %>%
filter(population == 'others') %>%
head()

Unnamed: 0_level_0,population,GBE_ID,col_idx,N_pop,GBE_NAME
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<chr>
1,others,BIN_FC20022506,895,354,Tobacco_smoking_current_vs_past
2,others,HC678,901,131,TTE_deficiency_of_other_b_group_vitamins
3,others,HC868,897,159,TTE_iridocyclitis
4,others,INI100150,2609,10886,"Drinking_water_intake_(Diet,_24hr_recall,_Online_follow-up)"
5,others,INI100210,277,2814,"Pure_fruit/vegetable_juice_intake_(Diet,_24hr_recall,_Online_follow-up)"
6,others,INI10030630,278,24267,Apolipoprotein_A
