In [1]:
import pandas as pd
import os
from functools import reduce

In [18]:
import numpy as np

In [2]:
# We import the SQC files from the OAK space
with open('/oak/stanford/groups/mrivas/ukbb24983/sqc/download/ukb_sqc_v2.fields.txt') as f:
    sqc_columns = [x for x in f.read().splitlines() if len(x) > 0]

In [3]:
sqc = pd.read_csv(
    '/oak/stanford/groups/mrivas/ukbb24983/sqc/download/ukb_sqc_v2.txt',
    sep='\s+', names = sqc_columns
)
# We get an idea of what is inside the SQC files by looking at the shape and columns
print(sqc.shape)
print(sqc.columns)

(488377, 89)
Index(['affymetrix_field_1', 'affymetrix_field_2', 'genotyping_array', 'Batch',
       'Plate_Name', 'Well', 'Cluster_CR', 'dQC', 'Internal_Pico_ng_uL',
       'Submitted_Gender', 'Inferred_Gender', 'X_intensity', 'Y_intensity',
       'Submitted_Plate_Name', 'Submitted_Well', 'sample_qc_missing_rate',
       'heterozygosity', 'heterozygosity_pc_corrected', 'het_missing_outliers',
       'putative_sex_chromosome_aneuploidy', 'in_kinship_table',
       'excluded_from_kinship_inference', 'excess_relatives',
       'in_white_British_ancestry_subset', 'used_in_pca_calculation', 'PC1',
       'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11',
       'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20',
       'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29',
       'PC30', 'PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37', 'PC38',
       'PC39', 'PC40', 'in_Phasing_Input_chr1', 'in_Phasing_Input_chr2',
       'in

In [4]:
genotype_fam_file = '/oak/stanford/groups/mrivas/ukbb24983/fam/ukb2498_cal_v2_s488370.fam'
genotype_fam = pd.read_csv(
    genotype_fam_file, sep='\s+', 
    names = ['FID', 'IID', 'father', 'mother', 'sex_1=male_2=female_0=unkonwn', 'batch']
)
print(genotype_fam.shape)
genotype_fam.head()

(488377, 6)


Unnamed: 0,FID,IID,father,mother,sex_1=male_2=female_0=unkonwn,batch
0,2502845,2502845,0,0,1,Batch_b001
1,2314965,2314965,0,0,2,Batch_b001
2,1142584,1142584,0,0,2,Batch_b001
3,3665122,3665122,0,0,2,Batch_b001
4,4377492,4377492,0,0,2,Batch_b001


In [5]:
exclude_sqc = set(
    genotype_fam['FID'][
        (sqc.used_in_pca_calculation == 0) |
        (sqc.het_missing_outliers == 1) |
        (sqc.excess_relatives == 1) |
        (sqc.putative_sex_chromosome_aneuploidy == 1) |
        (genotype_fam.FID < 0)
    ]
)
# The total number of people excluded via SQC metrics and initial redactions
print("Total number of exclusions (initial redactions + SQC): " + str(len(exclude_sqc)))

Total number of exclusions (initial redactions + SQC): 81559


In [6]:
redacted = set()
list_indivs = open('/oak/stanford/groups/mrivas/ukbb24983/sqc/w24983_20181016.csv', 'r')
for val in list_indivs.read().split():
    redacted.add(int(val))
list_indivs.close()
print("Total number of redacted individuals: " + str(len(redacted)))
# We add the newly redacted individuals to the previous set of SQC rejects/initial redactions
exclude = exclude_sqc.union(redacted)
# This leads to the total number of exclusions...
print("Total number of exclusions (redactions + SQC rejections): " + str(len(exclude)))

Total number of redacted individuals: 79
Total number of exclusions (redactions + SQC rejections): 81623


In [7]:
field21000 = pd.read_csv(
    'ukb9796.24611.f.21000.tsv.gz', compression='gzip', sep='\t'
)

In [31]:
field21000['f.21000'] = [
    {x for x in l if (not np.isnan(x) and x not in {-1, -3})} 
    for l in 
    field21000[
        ['f.21000.0.0', 'f.21000.1.0', 'f.21000.2.0']
    ].values
]

In [32]:
field21000['f.21000.n.uniq'] = field21000['f.21000'].map(lambda x: len(x))

In [33]:
field21000[field21000['f.21000.n.uniq'] > 1].shape

(756, 6)

In [34]:
field21000[field21000['f.21000.n.uniq'] > 1].head()

Unnamed: 0,f.eid,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.21000,f.21000.n.uniq
340,3894664,2003.0,1003.0,5.0,"{2003.0, 5.0, 1003.0}",3
546,2194101,1.0,1002.0,1001.0,"{1.0, 1002.0, 1001.0}",3
743,4559510,1001.0,,1003.0,"{1001.0, 1003.0}",2
1041,5094397,1001.0,2002.0,,"{1001.0, 2002.0}",2
1074,5590666,3002.0,,3001.0,"{3001.0, 3002.0}",2


In [35]:
field21000[field21000['f.21000.n.uniq'] == 1].shape

(499070, 6)

In [38]:
set([1]).pop()

1

In [40]:
field21000['f.21000.QC'] = field21000['f.21000'].map(
    lambda x: x.pop() if len(x) == 1 else np.nan
)

In [41]:
field21000.head()

Unnamed: 0,f.eid,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.21000,f.21000.n.uniq,f.21000.QC
0,4646215,1003.0,,,{},1,1003.0
1,5007659,1001.0,,,{},1,1001.0
2,1038854,1001.0,,,{},1,1001.0
3,5441319,1001.0,,,{},1,1001.0
4,5743544,1001.0,,1001.0,{},1,1001.0


In [42]:
whites = {1, 1001, 1002, 1003}

In [45]:
british = {1001}

In [44]:
field21000[field21000['f.21000.QC'].map(
    lambda x: (not np.isnan(x)) and x in whites
)].shape

(472138, 7)

In [46]:
field21000[field21000['f.21000.QC'].map(
    lambda x: (not np.isnan(x)) and (x in whites) and (x not in british)
)].shape

(29733, 7)

In [47]:
non_british_whites = set(field21000[field21000['f.21000.QC'].map(
    lambda x: (not np.isnan(x)) and (x in whites) and (x not in british)
)]['f.eid'])

In [48]:
len(non_british_whites)

29733

In [49]:
len(non_british_whites - exclude)

26471

In [50]:
key='non_british_white'

In [51]:
write_dir='/oak/stanford/groups/mrivas/ukbb24983/sqc/population_stratification_w24983_20190411'


In [53]:
ethnic_groups=dict()
ethnic_groups[key] = non_british_whites - exclude

In [54]:
pd.DataFrame({
    'FID': sorted(ethnic_groups[key]),
    'IID': sorted(ethnic_groups[key])
}).to_csv(
    os.path.join(write_dir, 'ukb24983_{}.phe'.format(key)), sep='\t', index=False, header=False
)